Bug 608066 - Update libvpx to v0.9.5. r=chris,khuey a=b-f

Timothy B. Terriberry 2010-11-08 09:47:17 +02:00
Parent 0a19a9180e
Commit 767618fe25
240 changed files with 19211 additions and 3546 deletions

View file

@@ -165,7 +165,11 @@ MOZ_TREMOR = @MOZ_TREMOR@
MOZ_WEBM = @MOZ_WEBM@
VPX_AS = @VPX_AS@
VPX_ASFLAGS = @VPX_ASFLAGS@
VPX_DASH_C_FLAG = @VPX_DASH_C_FLAG@
VPX_AS_CONVERSION = @VPX_AS_CONVERSION@
VPX_ASM_SUFFIX = @VPX_ASM_SUFFIX@
VPX_X86_ASM = @VPX_X86_ASM@
VPX_ARM_ASM = @VPX_ARM_ASM@
NS_PRINTING = @NS_PRINTING@
MOZ_CRASHREPORTER = @MOZ_CRASHREPORTER@
MOZ_HELP_VIEWER = @MOZ_HELP_VIEWER@

View file

@@ -4958,7 +4958,11 @@ MOZ_MEDIA=
MOZ_WEBM=1
VPX_AS=
VPX_ASFLAGS=
VPX_AS_DASH_C_FLAG=
VPX_AS_CONVERSION=
VPX_ASM_SUFFIX=
VPX_X86_ASM=
VPX_ARM_ASM=
MOZ_PANGO=1
MOZ_PERMISSIONS=1
MOZ_PLACES=1
@@ -6045,8 +6049,10 @@ if test -n "$MOZ_WEBM" -a -z "$MOZ_NATIVE_LIBVPX"; then
dnl Detect if we can use an assembler to compile optimized assembly for libvpx.
dnl We currently require yasm on all platforms and require yasm 1.1.0 on Win32.
dnl We currently require yasm on all x86 platforms and require yasm 1.1.0 on Win32.
dnl We currently require gcc on all arm platforms.
VPX_AS=$YASM
VPX_ASM_SUFFIX=asm
dnl See if we have assembly on this platform.
case "$OS_ARCH:$CPU_ARCH" in
@@ -6093,6 +6099,17 @@ if test -n "$MOZ_WEBM" -a -z "$MOZ_NATIVE_LIBVPX"; then
fi
fi
;;
*:arm*)
if test -n "$GNU_AS" ; then
VPX_AS=$AS
dnl These flags are a lie; they're just used to enable the requisite
dnl opcodes; actual arch detection is done at runtime.
VPX_ASFLAGS="-march=armv7-a -mfpu=neon"
VPX_DASH_C_FLAG="-c"
VPX_AS_CONVERSION="$PERL ${srcdir}/media/libvpx/build/make/ads2gas.pl"
VPX_ASM_SUFFIX="$ASM_SUFFIX"
VPX_ARM_ASM=1
fi
esac
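As the comment in the ARM case above notes, -march=armv7-a -mfpu=neon only let the assembler accept the NEON opcodes; whether those code paths actually execute is decided at runtime. A minimal C sketch of that gating, modeled on the arm_cpu_caps()/HAS_NEON flag checks that vp8_arch_arm_common_init() performs later in this commit (the helper name below is illustrative, not actual libvpx code):

#include "vpx_ports/arm.h"   /* arm_cpu_caps(), HAS_MEDIA, HAS_NEON */
#include "onyxc_int.h"       /* VP8_COMMON_RTCD */

/* Hypothetical helper: pick one function pointer from the CPU capabilities
 * probed at run time, so a binary whose assembly was built with -mfpu=neon
 * still runs safely on parts without NEON. */
static void pick_copy_mem16x16(VP8_COMMON_RTCD *rtcd)
{
    int flags = arm_cpu_caps();              /* probed once at init */

    if (flags & HAS_NEON)
        rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon;
    else if (flags & HAS_MEDIA)              /* ARMv6 media extensions */
        rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
    else
        rtcd->recon.copy16x16 = vp8_copy_mem16x16_c;  /* plain C fallback */
}
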
if test -n "$COMPILE_ENVIRONMENT" -a -n "$VPX_X86_ASM" -a -z "$VPX_AS"; then
@@ -6101,6 +6118,8 @@ if test -n "$MOZ_WEBM" -a -z "$MOZ_NATIVE_LIBVPX"; then
if test -n "$VPX_X86_ASM"; then
AC_DEFINE(VPX_X86_ASM)
elif test -n "$VPX_ARM_ASM"; then
AC_DEFINE(VPX_ARM_ASM)
else
AC_MSG_WARN([No assembler or assembly support for libvpx. Using unoptimized C routines.])
fi
@@ -9082,7 +9101,11 @@ AC_SUBST(MOZ_OGG)
AC_SUBST(MOZ_ALSA_LIBS)
AC_SUBST(VPX_AS)
AC_SUBST(VPX_ASFLAGS)
AC_SUBST(VPX_DASH_C_FLAG)
AC_SUBST(VPX_AS_CONVERSION)
AC_SUBST(VPX_ASM_SUFFIX)
AC_SUBST(VPX_X86_ASM)
AC_SUBST(VPX_ARM_ASM)
if test "$USING_HCC"; then
CC='${topsrcdir}/build/hcc'

View file

@@ -53,8 +53,10 @@ LOCAL_INCLUDES += \
-I$(topsrcdir)/media/libvpx \
-I$(topsrcdir)/media/libvpx/vp8/ \
-I$(topsrcdir)/media/libvpx/vp8/common/ \
-I$(topsrcdir)/media/libvpx/vp8/common/arm \
-I$(topsrcdir)/media/libvpx/vp8/common/x86 \
-I$(topsrcdir)/media/libvpx/vp8/decoder \
-I$(topsrcdir)/media/libvpx/vp8/decoder/arm \
-I$(topsrcdir)/media/libvpx/vp8/decoder/x86 \
-I$(topsrcdir)/media/libvpx/vpx_codec \
-I$(topsrcdir)/media/libvpx/vpx_mem/ \
@@ -64,25 +66,35 @@ LOCAL_INCLUDES += \
$(NULL)
VPATH += \
$(srcdir)/build/make \
$(srcdir)/vpx \
$(srcdir)/vpx/src \
$(srcdir)/vpx_mem \
$(srcdir)/vpx_mem/include \
$(srcdir)/vpx_ports \
$(srcdir)/vpx_scale \
$(srcdir)/vpx_scale/arm \
$(srcdir)/vpx_scale/generic \
$(srcdir)/vp8 \
$(srcdir)/vp8/common \
$(srcdir)/vp8/common/arm \
$(srcdir)/vp8/common/arm/armv6 \
$(srcdir)/vp8/common/arm/neon \
$(srcdir)/vp8/common/generic \
$(srcdir)/vp8/common/x86 \
$(srcdir)/vp8/decoder \
$(srcdir)/vp8/decoder/arm \
$(srcdir)/vp8/decoder/arm/armv6 \
$(srcdir)/vp8/decoder/arm/neon \
$(srcdir)/vp8/decoder/generic \
$(srcdir)/vp8/decoder/x86 \
$(NULL)
ASM_SUFFIX=asm
#Setup the libvpx assembler config.
AS=$(VPX_AS)
ASFLAGS=$(VPX_ASFLAGS) -I$(topsrcdir)/media/libvpx/ -I$(topsrcdir)/media/libvpx/vpx_ports/
ASFLAGS=$(VPX_ASFLAGS) -I. -I$(topsrcdir)/media/libvpx/ -I$(topsrcdir)/media/libvpx/vpx_ports/
AS_DASH_C_FLAG=$(VPX_DASH_C_FLAG)
ASM_SUFFIX=$(VPX_ASM_SUFFIX)
EXPORTS_NAMESPACES = vpx
@@ -104,6 +116,7 @@ EXPORTS_vpx = \
mem.h \
vpx_integer.h \
vpx_timer.h \
arm.h \
x86.h \
scale_mode.h \
vpxscale.h \
@@ -145,9 +158,9 @@ CSRCS += \
dboolhuff.c \
decodemv.c \
decodframe.c \
demode.c \
dequantize.c \
detokenize.c \
reconintra_mt.c \
idct_blk.c \
onyxd_if.c \
threading.c \
@@ -168,6 +181,7 @@ CSRCS += \
ifdef VPX_X86_ASM
# Building on an x86 platform with a supported assembler, include
# the optimized assembly in the build.
CSRCS += \
idct_blk_mmx.c \
idct_blk_sse2.c \
@@ -196,7 +210,116 @@ ASFILES += \
$(NULL)
endif
ifdef VPX_ARM_ASM
# Building on an ARM platform with a supported assembler, include
# the optimized assembly in the build.
# The Android NDK doesn't pre-define anything to indicate the OS it's on, so
# do it for them.
ifeq ($(OS_TARGET),Android)
DEFINES += -D__linux__
endif
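The -D__linux__ added above matters presumably because parts of libvpx (such as its runtime CPU detection) key off OS macros that the Android NDK does not define. A purely hypothetical illustration of that kind of guard, not the contents of the real arm_cpudetect.c:

#include <stdio.h>
#include <string.h>

/* Hypothetical sketch: on Linux-like systems (which Android becomes once
 * __linux__ is defined), NEON support can be probed from /proc/cpuinfo. */
static int cpu_has_neon(void)
{
#if defined(__linux__)
    FILE *f = fopen("/proc/cpuinfo", "r");
    char line[512];
    int neon = 0;

    if (!f)
        return 0;
    while (fgets(line, sizeof(line), f)) {
        if (strncmp(line, "Features", 8) == 0 && strstr(line, " neon")) {
            neon = 1;
            break;
        }
    }
    fclose(f);
    return neon;
#else
    return 0;   /* other OSes would need a different probe */
#endif
}
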
CSRCS += \
arm_cpudetect.c \
arm_systemdependent.c \
bilinearfilter_arm.c \
filter_arm.c \
loopfilter_arm.c \
reconintra_arm.c \
arm_dsystemdependent.c \
dequantize_arm.c \
idct_blk_v6.c \
idct_blk_neon.c \
recon_neon.c \
$(NULL)
VPX_ASFILES = \
detokenize.asm \
bilinearfilter_v6.asm \
copymem8x4_v6.asm \
copymem8x8_v6.asm \
copymem16x16_v6.asm \
dc_only_idct_add_v6.asm \
iwalsh_v6.asm \
filter_v6.asm \
idct_v6.asm \
loopfilter_v6.asm \
recon_v6.asm \
simpleloopfilter_v6.asm \
sixtappredict8x4_v6.asm \
bilinearpredict4x4_neon.asm \
bilinearpredict8x4_neon.asm \
bilinearpredict8x8_neon.asm \
bilinearpredict16x16_neon.asm \
copymem8x4_neon.asm \
copymem8x8_neon.asm \
copymem16x16_neon.asm \
dc_only_idct_add_neon.asm \
iwalsh_neon.asm \
loopfilter_neon.asm \
loopfiltersimplehorizontaledge_neon.asm \
loopfiltersimpleverticaledge_neon.asm \
mbloopfilter_neon.asm \
recon2b_neon.asm \
recon4b_neon.asm \
reconb_neon.asm \
shortidct4x4llm_1_neon.asm \
shortidct4x4llm_neon.asm \
sixtappredict4x4_neon.asm \
sixtappredict8x4_neon.asm \
sixtappredict8x8_neon.asm \
sixtappredict16x16_neon.asm \
recon16x16mb_neon.asm \
buildintrapredictorsmby_neon.asm \
save_neon_reg.asm \
dequant_dc_idct_v6.asm \
dequant_idct_v6.asm \
dequantize_v6.asm \
idct_dequant_dc_full_2x_neon.asm \
idct_dequant_dc_0_2x_neon.asm \
dequant_idct_neon.asm \
idct_dequant_full_2x_neon.asm \
idct_dequant_0_2x_neon.asm \
dequantizeb_neon.asm \
$(NULL)
# The ARM asm needs to extract the offsets of various C struct members.
# We need a program that runs on the host to pull them out of a .o file.
HOST_CSRCS = obj_int_extract.c
HOST_PROGRAM = host_obj_int_extract$(HOST_BIN_SUFFIX)
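The offsets mentioned above come from compiling a small C file whose globals hold offsetof() values; host_obj_int_extract then reads the resulting object file and prints each symbol as an EQU line that the .asm sources include. A hypothetical sketch of the idea (the struct and names below are illustrative, not the real vpx_asm_offsets.c):

#include <stddef.h>     /* offsetof */

/* Each global below ends up in the object file's data; the extractor reads
 * the object and emits e.g. "example_blockd_qcoeff  EQU  4". */
typedef struct {
    int stride;
    short *qcoeff;
    unsigned char eob;
} example_blockd;

#define DEFINE(sym, val) int sym = (val)

DEFINE(example_blockd_stride, offsetof(example_blockd, stride));
DEFINE(example_blockd_qcoeff, offsetof(example_blockd, qcoeff));
DEFINE(example_blockd_eob,    offsetof(example_blockd, eob));

Running something like "./host_obj_int_extract rvds vpx_asm_offsets.o" (as the rules below do via $(HOST_PROGRAM)) prints one EQU line per symbol, so the hand-written assembly always agrees with the C struct layout even when that layout changes.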
ifdef VPX_AS_CONVERSION
# The ARM asm is written in ARM RVCT syntax, but we actually build it with
# gas using GNU syntax. Add some rules to perform the conversion.
VPX_CONVERTED_ASFILES = $(addsuffix .$(ASM_SUFFIX), $(VPX_ASFILES))
ASFILES += $(VPX_CONVERTED_ASFILES)
GARBAGE += $(VPX_CONVERTED_ASFILES)
%.asm.$(ASM_SUFFIX): %.asm
$(VPX_AS_CONVERSION) < $< > $@
vpx_asm_offsets.asm: vpx_asm_offsets.$(OBJ_SUFFIX) $(HOST_PROGRAM)
./$(HOST_PROGRAM) rvds $< | $(VPX_AS_CONVERSION) > $@
detokenize.asm.$(OBJ_SUFFIX): vpx_asm_offsets.asm
else
ASFILES += $(VPX_ASFILES)
vpx_asm_offsets.asm: vpx_asm_offsets.$(OBJ_SUFFIX) $(HOST_PROGRAM)
./$(HOST_PROGRAM) rvds $< > $@
detokenize.$(OBJ_SUFFIX): vpx_asm_offsets.asm
endif
GARBAGE += vpx_asm_offsets.$(OBJ_SUFFIX) vpx_asm_offsets.asm
endif
include $(topsrcdir)/config/rules.mk
# Workaround a bug of Sun Studio (CR 6963410)

View file

@@ -1,2 +1,2 @@
Using libvpx pulled from git://review.webmproject.org/libvpx.git
Commit ID: 0dd78af3e9b089eacc9af280adfb5549fc7ecdcd
Using the v0.9.5 release pulled from
http://webm.googlecode.com/files/libvpx-v0.9.5.zip

View file

@@ -0,0 +1,150 @@
#!/usr/bin/perl
##
## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
# ads2gas.pl
# Author: Eric Fung (efung (at) acm.org)
#
# Convert ARM Developer Suite 1.0.1 syntax assembly source to GNU as format
#
# Usage: cat inputfile | perl ads2gas.pl > outputfile
#
print "@ This file was created from a .asm file\n";
print "@ using the ads2gas.pl script.\n";
print "\t.equ DO1STROUNDING, 0\n";
while (<STDIN>)
{
# Comment character
s/;/@/g;
# Hexadecimal constants prefaced by 0x
s/#&/#0x/g;
# Convert :OR: to |
s/:OR:/ | /g;
# Convert :AND: to &
s/:AND:/ & /g;
# Convert :NOT: to ~
s/:NOT:/ ~ /g;
# Convert :SHL: to <<
s/:SHL:/ << /g;
# Convert :SHR: to >>
s/:SHR:/ >> /g;
# Convert ELSE to .else
s/ELSE/.else/g;
# Convert ENDIF to .endif
s/ENDIF/.endif/g;
# Convert ELSEIF to .elseif
s/ELSEIF/.elseif/g;
# Convert LTORG to .ltorg
s/LTORG/.ltorg/g;
# Convert IF :DEF: to .if
# gcc doesn't have the ability to do a conditional
# if defined variable that is set by IF :DEF: on
# armasm, so convert it to a normal .if and then
# make sure to define a value elsewhere
if (s/\bIF :DEF:\b/.if /g)
{
s/=/==/g;
}
# Convert IF to .if
if (s/\bIF\b/.if/g)
{
s/=+/==/g;
}
# Convert INCLUDE to .INCLUDE "file"
s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
# Code directive (ARM vs Thumb)
s/CODE([0-9][0-9])/.code $1/;
# No AREA required
s/^\s*AREA.*$/.text/;
# DCD to .word
# This one is for incoming symbols
s/DCD\s+\|(\w*)\|/.long $1/;
# DCW to .short
s/DCW\s+\|(\w*)\|/.short $1/;
s/DCW(.*)/.short $1/;
# Constants defined in scope
s/DCD(.*)/.long $1/;
s/DCB(.*)/.byte $1/;
# RN to .req
if (s/RN\s+([Rr]\d+|lr)/.req $1/)
{
print;
next;
}
# Make function visible to linker, and make additional symbol with
# prepended underscore
s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
s/IMPORT\s+\|([\$\w]*)\|/.global $1/;
# No vertical bars required; make additional symbol with prepended
# underscore
s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
# Labels need trailing colon
# s/^(\w+)/$1:/ if !/EQU/;
# put the colon at the end of the line in the macro
s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
# Strip ALIGN
s/\sALIGN/@ ALIGN/g;
# Strip ARM
s/\sARM/@ ARM/g;
# Strip REQUIRE8
#s/\sREQUIRE8/@ REQUIRE8/g;
s/\sREQUIRE8/@ /g; #EQU cause problem
# Strip PRESERVE8
s/\sPRESERVE8/@ PRESERVE8/g;
# Strip PROC and ENDPROC
s/\sPROC/@/g;
s/\sENDP/@/g;
# EQU directive
s/(.*)EQU(.*)/.equ $1, $2/;
# Begin macro definition
if (/MACRO/) {
$_ = <STDIN>;
s/^/.macro/;
s/\$//g; # remove formal param reference
s/;/@/g; # change comment characters
}
# For macros, use \ to reference formal params
s/\$/\\/g; # End macro definition
s/MEND/.endm/; # No need to tell it where to stop assembling
next if /^\s*END\s*$/;
print;
}

View file

@@ -0,0 +1,756 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include <stdlib.h>
#include "vpx_config.h"
#if defined(_MSC_VER)
#include <io.h>
#include <share.h>
#include "vpx/vpx_integer.h"
#else
#include <stdint.h>
#include <unistd.h>
#endif
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdarg.h>
typedef enum
{
OUTPUT_FMT_PLAIN,
OUTPUT_FMT_RVDS,
OUTPUT_FMT_GAS,
} output_fmt_t;
int log_msg(const char *fmt, ...)
{
int res;
va_list ap;
va_start(ap, fmt);
res = vfprintf(stderr, fmt, ap);
va_end(ap);
return res;
}
#if defined(__GNUC__) && __GNUC__
#if defined(__MACH__)
#include <mach-o/loader.h>
#include <mach-o/nlist.h>
int parse_macho(uint8_t *base_buf, size_t sz)
{
int i, j;
struct mach_header header;
uint8_t *buf = base_buf;
int base_data_section = 0;
memcpy(&header, buf, sizeof(struct mach_header));
buf += sizeof(struct mach_header);
if (header.magic != MH_MAGIC)
{
log_msg("Bad magic number for object file. 0x%x expected, 0x%x found.\n",
header.magic, MH_MAGIC);
goto bail;
}
if (header.cputype != CPU_TYPE_ARM)
{
log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_ARM.\n");
goto bail;
}
if (header.filetype != MH_OBJECT)
{
log_msg("Bad filetype for object file. Currently only tested for MH_OBJECT.\n");
goto bail;
}
for (i = 0; i < header.ncmds; i++)
{
struct load_command lc;
struct symtab_command sc;
struct segment_command seg_c;
memcpy(&lc, buf, sizeof(struct load_command));
if (lc.cmd == LC_SEGMENT)
{
uint8_t *seg_buf = buf;
struct section s;
memcpy(&seg_c, buf, sizeof(struct segment_command));
seg_buf += sizeof(struct segment_command);
for (j = 0; j < seg_c.nsects; j++)
{
memcpy(&s, seg_buf + (j * sizeof(struct section)), sizeof(struct section));
// Need to get this offset which is the start of the symbol table
// before matching the strings up with symbols.
base_data_section = s.offset;
}
}
else if (lc.cmd == LC_SYMTAB)
{
uint8_t *sym_buf = base_buf;
uint8_t *str_buf = base_buf;
if (base_data_section != 0)
{
memcpy(&sc, buf, sizeof(struct symtab_command));
if (sc.cmdsize != sizeof(struct symtab_command))
log_msg("Can't find symbol table!\n");
sym_buf += sc.symoff;
str_buf += sc.stroff;
for (j = 0; j < sc.nsyms; j++)
{
struct nlist nl;
int val;
memcpy(&nl, sym_buf + (j * sizeof(struct nlist)), sizeof(struct nlist));
val = *((int *)(base_buf + base_data_section + nl.n_value));
// Location of string is calculated each time from the
// start of the string buffer. On darwin the symbols
// are prefixed by "_". On other platforms it is not
// so it needs to be removed. That is the reason for
// the +1.
printf("%-40s EQU %5d\n", str_buf + nl.n_un.n_strx + 1, val);
}
}
}
buf += lc.cmdsize;
}
return 0;
bail:
return 1;
}
int main(int argc, char **argv)
{
int fd;
char *f;
struct stat stat_buf;
uint8_t *file_buf;
int res;
if (argc < 2 || argc > 3)
{
fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
fprintf(stderr, " <obj file>\tMachO format object file to parse\n");
fprintf(stderr, "Output Formats:\n");
fprintf(stderr, " gas - compatible with GNU assembler\n");
fprintf(stderr, " rvds - compatible with armasm\n");
goto bail;
}
f = argv[2];
if (!((!strcmp(argv[1], "rvds")) || (!strcmp(argv[1], "gas"))))
f = argv[1];
fd = open(f, O_RDONLY);
if (fd < 0)
{
perror("Unable to open file");
goto bail;
}
if (fstat(fd, &stat_buf))
{
perror("stat");
goto bail;
}
file_buf = malloc(stat_buf.st_size);
if (!file_buf)
{
perror("malloc");
goto bail;
}
if (read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
{
perror("read");
goto bail;
}
if (close(fd))
{
perror("close");
goto bail;
}
res = parse_macho(file_buf, stat_buf.st_size);
free(file_buf);
if (!res)
return EXIT_SUCCESS;
bail:
return EXIT_FAILURE;
}
#else
#include "elf.h"
#define COPY_STRUCT(dst, buf, ofst, sz) do {\
if(ofst + sizeof((*(dst))) > sz) goto bail;\
memcpy(dst, buf+ofst, sizeof((*(dst))));\
} while(0)
#define ENDIAN_ASSIGN(val, memb) do {\
if(!elf->le_data) {log_msg("Big Endian data not supported yet!\n");goto bail;}\
(val) = (memb);\
} while(0)
#define ENDIAN_ASSIGN_IN_PLACE(memb) do {\
ENDIAN_ASSIGN(memb, memb);\
} while(0)
typedef struct
{
uint8_t *buf; /* Buffer containing ELF data */
size_t sz; /* Buffer size */
int le_data; /* Data is little-endian */
Elf32_Ehdr hdr;
} elf_obj_t;
int parse_elf32_header(elf_obj_t *elf)
{
int res;
/* Verify ELF32 header */
COPY_STRUCT(&elf->hdr, elf->buf, 0, elf->sz);
res = elf->hdr.e_ident[EI_MAG0] == ELFMAG0;
res &= elf->hdr.e_ident[EI_MAG1] == ELFMAG1;
res &= elf->hdr.e_ident[EI_MAG2] == ELFMAG2;
res &= elf->hdr.e_ident[EI_MAG3] == ELFMAG3;
res &= elf->hdr.e_ident[EI_CLASS] == ELFCLASS32;
res &= elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB
|| elf->hdr.e_ident[EI_DATA] == ELFDATA2MSB;
if (!res) goto bail;
elf->le_data = elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB;
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_type);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_machine);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_version);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_entry);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phoff);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shoff);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_flags);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_ehsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phentsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phnum);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shentsize);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shnum);
ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shstrndx);
return 0;
bail:
return 1;
}
int parse_elf32_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr)
{
if (idx >= elf->hdr.e_shnum)
goto bail;
COPY_STRUCT(hdr, elf->buf, elf->hdr.e_shoff + idx * elf->hdr.e_shentsize,
elf->sz);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_name);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_type);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_flags);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addr);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_offset);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_size);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_link);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_info);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addralign);
ENDIAN_ASSIGN_IN_PLACE(hdr->sh_entsize);
return 0;
bail:
return 1;
}
char *parse_elf32_string_table(elf_obj_t *elf, int s_idx, int idx)
{
Elf32_Shdr shdr;
if (parse_elf32_section(elf, s_idx, &shdr))
{
log_msg("Failed to parse ELF string table: section %d, index %d\n",
s_idx, idx);
return "";
}
return (char *)(elf->buf + shdr.sh_offset + idx);
}
int parse_elf32_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym)
{
COPY_STRUCT(sym, elf->buf, ofst, elf->sz);
ENDIAN_ASSIGN_IN_PLACE(sym->st_name);
ENDIAN_ASSIGN_IN_PLACE(sym->st_value);
ENDIAN_ASSIGN_IN_PLACE(sym->st_size);
ENDIAN_ASSIGN_IN_PLACE(sym->st_info);
ENDIAN_ASSIGN_IN_PLACE(sym->st_other);
ENDIAN_ASSIGN_IN_PLACE(sym->st_shndx);
return 0;
bail:
return 1;
}
int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
{
elf_obj_t elf;
Elf32_Shdr shdr;
unsigned int ofst;
int i;
Elf32_Off strtab_off; /* save String Table offset for later use */
memset(&elf, 0, sizeof(elf));
elf.buf = buf;
elf.sz = sz;
/* Parse Header */
if (parse_elf32_header(&elf))
{
log_msg("Parse error: File does not appear to be valid ELF32\n");
return 1;
}
for (i = 0; i < elf.hdr.e_shnum; i++)
{
parse_elf32_section(&elf, i, &shdr);
if (shdr.sh_type == SHT_STRTAB)
{
char strtsb_name[128];
strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
if (!(strcmp(strtsb_name, ".shstrtab")))
{
log_msg("found section: %s\n", strtsb_name);
strtab_off = shdr.sh_offset;
break;
}
}
}
/* Parse all Symbol Tables */
for (i = 0; i < elf.hdr.e_shnum; i++)
{
parse_elf32_section(&elf, i, &shdr);
if (shdr.sh_type == SHT_SYMTAB)
{
for (ofst = shdr.sh_offset;
ofst < shdr.sh_offset + shdr.sh_size;
ofst += shdr.sh_entsize)
{
Elf32_Sym sym;
parse_elf32_symbol(&elf, ofst, &sym);
/* For all OBJECTS (data objects), extract the value from the
* proper data segment.
*/
if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
log_msg("found data object %s\n",
parse_elf32_string_table(&elf,
shdr.sh_link,
sym.st_name));
if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
&& sym.st_size == 4)
{
Elf32_Shdr dhdr;
int32_t val;
char section_name[128];
parse_elf32_section(&elf, sym.st_shndx, &dhdr);
/* For explanation - refer to _MSC_VER version of code */
strcpy(section_name, (char *)(elf.buf + strtab_off + dhdr.sh_name));
log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type);
if (!(strcmp(section_name, ".bss")))
{
val = 0;
}
else
{
memcpy(&val,
elf.buf + dhdr.sh_offset + sym.st_value,
sizeof(val));
}
if (!elf.le_data)
{
log_msg("Big Endian data not supported yet!\n");
goto bail;
}
switch (mode)
{
case OUTPUT_FMT_RVDS:
printf("%-40s EQU %5d\n",
parse_elf32_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
break;
case OUTPUT_FMT_GAS:
printf(".equ %-40s, %5d\n",
parse_elf32_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
break;
default:
printf("%s = %d\n",
parse_elf32_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
}
}
}
}
}
if (mode == OUTPUT_FMT_RVDS)
printf(" END\n");
return 0;
bail:
log_msg("Parse error: File does not appear to be valid ELF32\n");
return 1;
}
int main(int argc, char **argv)
{
int fd;
output_fmt_t mode;
char *f;
struct stat stat_buf;
uint8_t *file_buf;
int res;
if (argc < 2 || argc > 3)
{
fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
fprintf(stderr, " <obj file>\tELF format object file to parse\n");
fprintf(stderr, "Output Formats:\n");
fprintf(stderr, " gas - compatible with GNU assembler\n");
fprintf(stderr, " rvds - compatible with armasm\n");
goto bail;
}
f = argv[2];
if (!strcmp(argv[1], "rvds"))
mode = OUTPUT_FMT_RVDS;
else if (!strcmp(argv[1], "gas"))
mode = OUTPUT_FMT_GAS;
else
f = argv[1];
fd = open(f, O_RDONLY);
if (fd < 0)
{
perror("Unable to open file");
goto bail;
}
if (fstat(fd, &stat_buf))
{
perror("stat");
goto bail;
}
file_buf = malloc(stat_buf.st_size);
if (!file_buf)
{
perror("malloc");
goto bail;
}
if (read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
{
perror("read");
goto bail;
}
if (close(fd))
{
perror("close");
goto bail;
}
res = parse_elf32(file_buf, stat_buf.st_size, mode);
//res = parse_coff(file_buf, stat_buf.st_size);
free(file_buf);
if (!res)
return EXIT_SUCCESS;
bail:
return EXIT_FAILURE;
}
#endif
#endif
#if defined(_MSC_VER)
/* See "Microsoft Portable Executable and Common Object File Format Specification"
for reference.
*/
#define get_le32(x) ((*(x)) | (*(x+1)) << 8 |(*(x+2)) << 16 | (*(x+3)) << 24 )
#define get_le16(x) ((*(x)) | (*(x+1)) << 8)
int parse_coff(unsigned __int8 *buf, size_t sz)
{
unsigned int nsections, symtab_ptr, symtab_sz, strtab_ptr;
unsigned int sectionrawdata_ptr;
unsigned int i;
unsigned __int8 *ptr;
unsigned __int32 symoffset;
FILE *fp;
char **sectionlist; //this array holds all section names in their correct order.
//it is used to check if the symbol is in .bss or .data section.
nsections = get_le16(buf + 2);
symtab_ptr = get_le32(buf + 8);
symtab_sz = get_le32(buf + 12);
strtab_ptr = symtab_ptr + symtab_sz * 18;
if (nsections > 96)
goto bail;
sectionlist = malloc(nsections * sizeof * sectionlist);
//log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections);
/*
The size of optional header is always zero for an obj file. So, the section header
follows the file header immediately.
*/
ptr = buf + 20; //section header
for (i = 0; i < nsections; i++)
{
char sectionname[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
strncpy(sectionname, ptr, 8);
//log_msg("COFF: Parsing section %s\n",sectionname);
sectionlist[i] = malloc(strlen(sectionname) + 1);
strcpy(sectionlist[i], sectionname);
if (!strcmp(sectionname, ".data")) sectionrawdata_ptr = get_le32(ptr + 20);
ptr += 40;
}
//log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
//log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);
fp = fopen("vpx_asm_offsets.asm", "w");
if (fp == NULL)
{
perror("open file");
goto bail;
}
/* The compiler puts the data with non-zero offset in .data section, but puts the data with
zero offset in .bss section. So, if the data is in the .bss section, set offset=0.
Note from Wiki: In an object module compiled from C, the bss section contains
the local variables (but not functions) that were declared with the static keyword,
except for those with non-zero initial values. (In C, static variables are initialized
to zero by default.) It also contains the non-local (both extern and static) variables
that are also initialized to zero (either explicitly or by default).
*/
//move to symbol table
/* COFF symbol table:
offset field
0 Name(*)
8 Value
12 SectionNumber
14 Type
16 StorageClass
17 NumberOfAuxSymbols
*/
ptr = buf + symtab_ptr;
for (i = 0; i < symtab_sz; i++)
{
__int16 section = get_le16(ptr + 12); //section number
if (section > 0 && ptr[16] == 2)
{
//if(section > 0 && ptr[16] == 3 && get_le32(ptr+8)) {
if (get_le32(ptr))
{
char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
strncpy(name, ptr, 8);
//log_msg("COFF: Parsing symbol %s\n",name);
fprintf(fp, "%-40s EQU ", name);
}
else
{
//log_msg("COFF: Parsing symbol %s\n",
// buf + strtab_ptr + get_le32(ptr+4));
fprintf(fp, "%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
}
if (!(strcmp(sectionlist[section-1], ".bss")))
{
symoffset = 0;
}
else
{
symoffset = get_le32(buf + sectionrawdata_ptr + get_le32(ptr + 8));
}
//log_msg(" Section: %d\n",section);
//log_msg(" Class: %d\n",ptr[16]);
//log_msg(" Address: %u\n",get_le32(ptr+8));
//log_msg(" Offset: %u\n", symoffset);
fprintf(fp, "%5d\n", symoffset);
}
ptr += 18;
}
fprintf(fp, " END\n");
fclose(fp);
for (i = 0; i < nsections; i++)
{
free(sectionlist[i]);
}
free(sectionlist);
return 0;
bail:
for (i = 0; i < nsections; i++)
{
free(sectionlist[i]);
}
free(sectionlist);
return 1;
}
int main(int argc, char **argv)
{
int fd;
output_fmt_t mode;
const char *f;
struct _stat stat_buf;
unsigned __int8 *file_buf;
int res;
if (argc < 2 || argc > 3)
{
fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
fprintf(stderr, " <obj file>\tELF format object file to parse\n");
fprintf(stderr, "Output Formats:\n");
fprintf(stderr, " gas - compatible with GNU assembler\n");
fprintf(stderr, " rvds - compatible with armasm\n");
goto bail;
}
f = argv[2];
if (!strcmp(argv[1], "rvds"))
mode = OUTPUT_FMT_RVDS;
else if (!strcmp(argv[1], "gas"))
mode = OUTPUT_FMT_GAS;
else
f = argv[1];
if (_sopen_s(&fd, f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE))
{
perror("Unable to open file");
goto bail;
}
if (_fstat(fd, &stat_buf))
{
perror("stat");
goto bail;
}
file_buf = malloc(stat_buf.st_size);
if (!file_buf)
{
perror("malloc");
goto bail;
}
if (_read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
{
perror("read");
goto bail;
}
if (_close(fd))
{
perror("close");
goto bail;
}
res = parse_coff(file_buf, stat_buf.st_size);
free(file_buf);
if (!res)
return EXIT_SUCCESS;
bail:
return EXIT_FAILURE;
}
#endif

View file

@@ -1,113 +0,0 @@
diff --git a/media/libvpx/vp8/decoder/decodframe.c b/media/libvpx/vp8/decoder/decodframe.c
--- a/media/libvpx/vp8/decoder/decodframe.c
+++ b/media/libvpx/vp8/decoder/decodframe.c
@@ -462,17 +462,17 @@ static void setup_token_decoder(VP8D_COM
{
partition_size = read_partition_size(partition_size_ptr);
}
else
{
partition_size = user_data_end - partition;
}
- if (partition + partition_size > user_data_end)
+ if (user_data_end - partition < partition_size)
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt partition "
"%d length", i + 1);
if (vp8dx_start_decode(bool_decoder, IF_RTCD(&pbi->dboolhuff),
partition, partition_size))
vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate bool decoder %d", i + 1);
@@ -564,30 +564,33 @@ static void init_frame(VP8D_COMP *pbi)
int vp8_decode_frame(VP8D_COMP *pbi)
{
vp8_reader *const bc = & pbi->bc;
VP8_COMMON *const pc = & pbi->common;
MACROBLOCKD *const xd = & pbi->mb;
const unsigned char *data = (const unsigned char *)pbi->Source;
const unsigned char *const data_end = data + pbi->source_sz;
- int first_partition_length_in_bytes;
+ unsigned int first_partition_length_in_bytes;
int mb_row;
int i, j, k, l;
const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
+ if (data_end - data < 3)
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet");
pc->frame_type = (FRAME_TYPE)(data[0] & 1);
pc->version = (data[0] >> 1) & 7;
pc->show_frame = (data[0] >> 4) & 1;
first_partition_length_in_bytes =
(data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
data += 3;
- if (data + first_partition_length_in_bytes > data_end)
+ if (data_end - data < first_partition_length_in_bytes)
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt partition 0 length");
vp8_setup_version(pc);
if (pc->frame_type == KEY_FRAME)
{
const int Width = pc->Width;
const int Height = pc->Height;
diff --git a/media/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/vp8/decoder/onyxd_if.c
--- a/media/libvpx/vp8/decoder/onyxd_if.c
+++ b/media/libvpx/vp8/decoder/onyxd_if.c
@@ -318,45 +318,49 @@ int vp8dx_receive_compressed_data(VP8D_P
if (ptr == 0)
{
return -1;
}
pbi->common.error.error_code = VPX_CODEC_OK;
+ cm->new_fb_idx = get_free_fb (cm);
+
if (setjmp(pbi->common.error.jmp))
{
pbi->common.error.setjmp = 0;
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
return -1;
}
pbi->common.error.setjmp = 1;
#if HAVE_ARMV7
vp8_push_neon(dx_store_reg);
#endif
vpx_usec_timer_start(&timer);
//cm->current_video_frame++;
pbi->Source = source;
pbi->source_sz = size;
- cm->new_fb_idx = get_free_fb (cm);
-
retcode = vp8_decode_frame(pbi);
if (retcode < 0)
{
#if HAVE_ARMV7
vp8_pop_neon(dx_store_reg);
#endif
pbi->common.error.error_code = VPX_CODEC_ERROR;
pbi->common.error.setjmp = 0;
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
return retcode;
}
if (pbi->b_multithreaded_lf && pbi->common.filter_level != 0)
vp8_stop_lfthread(pbi);
if (swap_frame_buffers (cm))
{

View file

@@ -1,168 +0,0 @@
diff --git a/media/libvpx/vp8/common/blockd.h b/media/libvpx/vp8/common/blockd.h
--- a/media/libvpx/vp8/common/blockd.h
+++ b/media/libvpx/vp8/common/blockd.h
@@ -90,17 +90,17 @@ typedef enum
MB_MODE_COUNT
} MB_PREDICTION_MODE;
// Macroblock level features
typedef enum
{
MB_LVL_ALT_Q = 0, // Use alternate Quantizer ....
MB_LVL_ALT_LF = 1, // Use alternate loop filter value...
- MB_LVL_MAX = 2, // Number of MB level features supported
+ MB_LVL_MAX = 2 // Number of MB level features supported
} MB_LVL_FEATURES;
// Segment Feature Masks
#define SEGMENT_ALTQ 0x01
#define SEGMENT_ALT_LF 0x02
#define VP8_YMODES (B_PRED + 1)
diff --git a/media/libvpx/vp8/common/ppflags.h b/media/libvpx/vp8/common/ppflags.h
--- a/media/libvpx/vp8/common/ppflags.h
+++ b/media/libvpx/vp8/common/ppflags.h
@@ -15,12 +15,12 @@ enum
{
VP8D_NOFILTERING = 0,
VP8D_DEBLOCK = 1,
VP8D_DEMACROBLOCK = 2,
VP8D_ADDNOISE = 4,
VP8D_DEBUG_LEVEL1 = 8,
VP8D_DEBUG_LEVEL2 = 16,
VP8D_DEBUG_LEVEL3 = 32,
- VP8D_DEBUG_LEVEL4 = 64,
+ VP8D_DEBUG_LEVEL4 = 64
};
#endif
diff --git a/media/libvpx/vpx/vp8.h b/media/libvpx/vpx/vp8.h
--- a/media/libvpx/vpx/vp8.h
+++ b/media/libvpx/vpx/vp8.h
@@ -48,17 +48,17 @@ enum vp8_dec_control_id
*
* The set of macros define VP8 decoder post processing flags
*/
enum vp8_postproc_level
{
VP8_NOFILTERING = 0,
VP8_DEBLOCK = 1,
VP8_DEMACROBLOCK = 2,
- VP8_ADDNOISE = 4,
+ VP8_ADDNOISE = 4
};
/*!\brief post process flags
*
* This defines a structure that describes the post processing settings. For
* the best objective measure (using the PSNR metric) set post_proc_flag
* to VP8_DEBLOCK and deblocking_level to 1.
*/
diff --git a/media/libvpx/vpx/vpx_codec.h b/media/libvpx/vpx/vpx_codec.h
--- a/media/libvpx/vpx/vpx_codec.h
+++ b/media/libvpx/vpx/vpx_codec.h
@@ -57,17 +57,17 @@ extern "C" {
#define DEPRECATED
#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
#endif
#endif
/*!\brief Decorator indicating a function is potentially unused */
#ifdef UNUSED
#elif __GNUC__
-#define UNUSED __attribute__ ((unused));
+#define UNUSED __attribute__ ((unused))
#else
#define UNUSED
#endif
/*!\brief Current ABI version number
*
* \internal
* If this file is altered in any way that changes the ABI, this value
@@ -123,17 +123,17 @@ extern "C" {
/*!\brief An application-supplied parameter is not valid.
*
*/
VPX_CODEC_INVALID_PARAM,
/*!\brief An iterator reached the end of list.
*
*/
- VPX_CODEC_LIST_END,
+ VPX_CODEC_LIST_END
}
vpx_codec_err_t;
/*! \brief Codec capabilities bitfield
*
* Each codec advertises the capabilities it supports as part of its
diff --git a/media/libvpx/vpx/vpx_decoder_compat.h b/media/libvpx/vpx/vpx_decoder_compat.h
--- a/media/libvpx/vpx/vpx_decoder_compat.h
+++ b/media/libvpx/vpx/vpx_decoder_compat.h
@@ -73,17 +73,17 @@ extern "C" {
/*!\brief An application-supplied parameter is not valid.
*
*/
VPX_DEC_INVALID_PARAM = VPX_CODEC_INVALID_PARAM,
/*!\brief An iterator reached the end of list.
*
*/
- VPX_DEC_LIST_END = VPX_CODEC_LIST_END,
+ VPX_DEC_LIST_END = VPX_CODEC_LIST_END
}
vpx_dec_err_t;
/*! \brief Decoder capabilities bitfield
*
* Each decoder advertises the capabilities it supports as part of its
* ::vpx_dec_iface_t interface structure. Capabilities are extra interfaces
diff --git a/media/libvpx/vpx/vpx_encoder.h b/media/libvpx/vpx/vpx_encoder.h
--- a/media/libvpx/vpx/vpx_encoder.h
+++ b/media/libvpx/vpx/vpx_encoder.h
@@ -166,17 +166,17 @@ extern "C" {
} vpx_rational_t; /**< alias for struct vpx_rational */
/*!\brief Multi-pass Encoding Pass */
enum vpx_enc_pass
{
VPX_RC_ONE_PASS, /**< Single pass mode */
VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */
- VPX_RC_LAST_PASS, /**< Final pass of multi-pass mode */
+ VPX_RC_LAST_PASS /**< Final pass of multi-pass mode */
};
/*!\brief Rate control mode */
enum vpx_rc_mode
{
VPX_VBR, /**< Variable Bit Rate (VBR) mode */
VPX_CBR /**< Constant Bit Rate (CBR) mode */
diff --git a/media/libvpx/vpx/vpx_image.h b/media/libvpx/vpx/vpx_image.h
--- a/media/libvpx/vpx/vpx_image.h
+++ b/media/libvpx/vpx/vpx_image.h
@@ -50,17 +50,17 @@ extern "C" {
VPX_IMG_FMT_RGB32_LE, /**< 32 bit packed BGR0 */
VPX_IMG_FMT_ARGB, /**< 32 bit packed ARGB, alpha=255 */
VPX_IMG_FMT_ARGB_LE, /**< 32 bit packed BGRA, alpha=255 */
VPX_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */
VPX_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */
VPX_IMG_FMT_YV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2,
VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with vpx color space */
- VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, /** < planar 4:2:0 format with vpx color space */
+ VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4 /** < planar 4:2:0 format with vpx color space */
}
vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
#define IMG_FMT_PLANAR VPX_IMG_FMT_PLANAR /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
#define IMG_FMT_UV_FLIP VPX_IMG_FMT_UV_FLIP /**< \deprecated Use #VPX_IMG_FMT_UV_FLIP */
#define IMG_FMT_HAS_ALPHA VPX_IMG_FMT_HAS_ALPHA /**< \deprecated Use #VPX_IMG_FMT_HAS_ALPHA */

View file

@@ -1,10 +1,7 @@
diff --git a/media/libvpx/vp8/common/loopfilter_filters.c b/media/libvpx/vp8/common/loopfilter_filters.c
--- a/media/libvpx/vp8/common/loopfilter_filters.c
+++ b/media/libvpx/vp8/common/loopfilter_filters.c
@@ -8,16 +8,19 @@
* be found in the AUTHORS file in the root of the source tree.
*/
@@ -11,10 +11,14 @@
#include <stdlib.h>
#include "loopfilter.h"
@@ -13,9 +10,7 @@ diff --git a/media/libvpx/vp8/common/loopfilter_filters.c b/media/libvpx/vp8/com
+#ifdef __SUNPRO_C
+#define __inline inline
+#endif
#define NEW_LOOPFILTER_MASK
+
typedef unsigned char uc;
static __inline signed char vp8_signed_char_clamp(int t)
@@ -109,14 +104,14 @@ diff --git a/media/libvpx/vpx_ports/mem.h b/media/libvpx/vpx_ports/mem.h
diff --git a/media/libvpx/vpx_ports/x86.h b/media/libvpx/vpx_ports/x86.h
--- a/media/libvpx/vpx_ports/x86.h
+++ b/media/libvpx/vpx_ports/x86.h
@@ -26,16 +26,36 @@
@@ -45,16 +45,36 @@
#define cpuid(func,ax,bx,cx,dx)\
__asm__ __volatile__ (\
"pushl %%ebx \n\t" \
"cpuid \n\t" \
"movl %%ebx, %1 \n\t" \
"popl %%ebx \n\t" \
: "=a" (ax), "=r" (bx), "=c" (cx), "=d" (dx) \
: "a" (func));
"mov %%ebx, %%edi \n\t" \
"cpuid \n\t" \
"xchg %%edi, %%ebx \n\t" \
: "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
: "a" (func));
#endif
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+#if ARCH_X86_64

View file

@@ -1,22 +0,0 @@
diff --git a/media/libvpx/vp8/common/x86/subpixel_sse2.asm b/media/libvpx/vp8/common/x86/subpixel_sse2.asm
--- a/media/libvpx/vp8/common/x86/subpixel_sse2.asm
+++ b/media/libvpx/vp8/common/x86/subpixel_sse2.asm
@@ -1003,17 +1003,17 @@ next_row8x8:
paddw xmm3, xmm7
movdqa xmm7, xmm4
paddw xmm3, [rd GLOBAL] ; xmm3 += round value
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
packuswb xmm3, xmm0
- movq [rdi], xmm3 ; store the results in the destination
+ movq QWORD PTR [rdi], xmm3 ; store the results in the destination
add rsp, 16 ; next line
add rdi, rdx
cmp rdi, rcx
jne next_row8x8
;add rsp, 144

View file

@@ -79,19 +79,30 @@ commonFiles=(
vp8/common/swapyv12buffer.c
vp8/common/textblit.c
vp8/common/treecoder.c
vp8/common/arm/arm_systemdependent.c
vp8/common/arm/bilinearfilter_arm.c
vp8/common/arm/filter_arm.c
vp8/common/arm/loopfilter_arm.c
vp8/common/arm/reconintra_arm.c
vp8/common/arm/vpx_asm_offsets.c
vp8/common/arm/neon/recon_neon.c
vp8/common/x86/loopfilter_x86.c
vp8/common/x86/vp8_asm_stubs.c
vp8/common/x86/x86_systemdependent.c
vp8/decoder/dboolhuff.c
vp8/decoder/decodemv.c
vp8/decoder/decodframe.c
vp8/decoder/demode.c
vp8/decoder/dequantize.c
vp8/decoder/detokenize.c
vp8/decoder/reconintra_mt.c
vp8/decoder/generic/dsystemdependent.c
vp8/decoder/idct_blk.c
vp8/decoder/onyxd_if.c
vp8/decoder/threading.c
vp8/decoder/arm/arm_dsystemdependent.c
vp8/decoder/arm/dequantize_arm.c
vp8/decoder/arm/armv6/idct_blk_v6.c
vp8/decoder/arm/neon/idct_blk_neon.c
vp8/decoder/x86/idct_blk_mmx.c
vp8/decoder/x86/idct_blk_sse2.c
vp8/decoder/x86/x86_dsystemdependent.c
@@ -138,7 +149,6 @@ commonFiles=(
vp8/common/reconinter.h
vp8/common/reconintra4x4.h
vp8/common/reconintra.h
vp8/common/segmentation_common.h
vp8/common/setupintrarecon.h
vp8/common/subpixel.h
vp8/common/swapyv12buffer.h
@@ -147,6 +157,10 @@ commonFiles=(
vp8/common/treecoder.h
vp8/common/type_aliases.h
vp8/common/vpxerrors.h
vp8/common/arm/idct_arm.h
vp8/common/arm/loopfilter_arm.h
vp8/common/arm/recon_arm.h
vp8/common/arm/subpixel_arm.h
vp8/common/x86/idct_x86.h
vp8/common/x86/loopfilter_x86.h
vp8/common/x86/postproc_x86.h
@@ -155,11 +169,14 @@ commonFiles=(
vp8/decoder/dboolhuff.h
vp8/decoder/decodemv.h
vp8/decoder/decoderthreading.h
vp8/decoder/demode.h
vp8/decoder/dequantize.h
vp8/decoder/detokenize.h
vp8/decoder/onyxd_int.h
vp8/decoder/reconintra_mt.h
vp8/decoder/treereader.h
vp8/decoder/arm/dboolhuff_arm.h
vp8/decoder/arm/dequantize_arm.h
vp8/decoder/arm/detokenize_arm.h
vp8/decoder/x86/dequantize_x86.h
vpx/internal/vpx_codec_internal.h
vpx/vp8cx.h
@@ -176,14 +193,63 @@ commonFiles=(
vpx/vpx_integer.h
vpx_mem/include/vpx_mem_intrnl.h
vpx_mem/vpx_mem.h
vpx_ports/arm_cpudetect.c
vpx_ports/config.h
vpx_ports/mem.h
vpx_ports/vpx_timer.h
vpx_ports/arm.h
vpx_ports/x86.h
vpx_scale/scale_mode.h
vpx_scale/vpxscale.h
vpx_scale/yv12config.h
vpx_scale/yv12extend.h
vp8/common/arm/armv6/bilinearfilter_v6.asm
vp8/common/arm/armv6/copymem8x4_v6.asm
vp8/common/arm/armv6/copymem8x8_v6.asm
vp8/common/arm/armv6/copymem16x16_v6.asm
vp8/common/arm/armv6/dc_only_idct_add_v6.asm
vp8/common/arm/armv6/iwalsh_v6.asm
vp8/common/arm/armv6/filter_v6.asm
vp8/common/arm/armv6/idct_v6.asm
vp8/common/arm/armv6/loopfilter_v6.asm
vp8/common/arm/armv6/recon_v6.asm
vp8/common/arm/armv6/simpleloopfilter_v6.asm
vp8/common/arm/armv6/sixtappredict8x4_v6.asm
vp8/common/arm/neon/bilinearpredict4x4_neon.asm
vp8/common/arm/neon/bilinearpredict8x4_neon.asm
vp8/common/arm/neon/bilinearpredict8x8_neon.asm
vp8/common/arm/neon/bilinearpredict16x16_neon.asm
vp8/common/arm/neon/copymem8x4_neon.asm
vp8/common/arm/neon/copymem8x8_neon.asm
vp8/common/arm/neon/copymem16x16_neon.asm
vp8/common/arm/neon/dc_only_idct_add_neon.asm
vp8/common/arm/neon/iwalsh_neon.asm
vp8/common/arm/neon/loopfilter_neon.asm
vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
vp8/common/arm/neon/mbloopfilter_neon.asm
vp8/common/arm/neon/recon2b_neon.asm
vp8/common/arm/neon/recon4b_neon.asm
vp8/common/arm/neon/reconb_neon.asm
vp8/common/arm/neon/shortidct4x4llm_1_neon.asm
vp8/common/arm/neon/shortidct4x4llm_neon.asm
vp8/common/arm/neon/sixtappredict4x4_neon.asm
vp8/common/arm/neon/sixtappredict8x4_neon.asm
vp8/common/arm/neon/sixtappredict8x8_neon.asm
vp8/common/arm/neon/sixtappredict16x16_neon.asm
vp8/common/arm/neon/recon16x16mb_neon.asm
vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
vp8/common/arm/neon/save_neon_reg.asm
vp8/decoder/arm/detokenize.asm
vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
vp8/decoder/arm/armv6/dequant_idct_v6.asm
vp8/decoder/arm/armv6/dequantize_v6.asm
vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
vp8/decoder/arm/neon/dequant_idct_neon.asm
vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
vp8/decoder/arm/neon/dequantizeb_neon.asm
vp8/common/x86/idctllm_mmx.asm
vp8/common/x86/idctllm_sse2.asm
vp8/common/x86/iwalsh_mmx.asm
@@ -200,6 +266,8 @@ commonFiles=(
vp8/decoder/x86/dequantize_mmx.asm
vpx_ports/emms.asm
vpx_ports/x86_abi_support.asm
build/make/ads2gas.pl
build/make/obj_int_extract.c
LICENSE
PATENTS
)
@@ -218,9 +286,9 @@ cp $1/objdir/x86-linux-gcc/vpx_config.asm vpx_config_x86-linux-gcc.asm
cp $1/objdir/x86-linux-gcc/vpx_config.h vpx_config_x86-linux-gcc.h
# Config files for x86_64-linux-gcc and Solaris x86_64
cp $1/objdir/x86_64-linux-gcc/vpx_config.c vpx_config_x86-linux-gcc.c
cp $1/objdir/x86_64-linux-gcc/vpx_config.asm vpx_config_x86-linux-gcc.asm
cp $1/objdir/x86_64-linux-gcc/vpx_config.h vpx_config_x86-linux-gcc.h
cp $1/objdir/x86_64-linux-gcc/vpx_config.c vpx_config_x86_64-linux-gcc.c
cp $1/objdir/x86_64-linux-gcc/vpx_config.asm vpx_config_x86_64-linux-gcc.asm
cp $1/objdir/x86_64-linux-gcc/vpx_config.h vpx_config_x86_64-linux-gcc.h
# Copy config files for mac...
cp $1/objdir/x86-darwin9-gcc/vpx_config.c vpx_config_x86-darwin9-gcc.c
@@ -232,6 +300,10 @@ cp $1/objdir/x86_64-darwin9-gcc/vpx_config.c vpx_config_x86_64-darwin9-gcc.c
cp $1/objdir/x86_64-darwin9-gcc/vpx_config.asm vpx_config_x86_64-darwin9-gcc.asm
cp $1/objdir/x86_64-darwin9-gcc/vpx_config.h vpx_config_x86_64-darwin9-gcc.h
# Config files for arm-linux-gcc
cp $1/objdir/armv7-linux-gcc/vpx_config.c vpx_config_arm-linux-gcc.c
cp $1/objdir/armv7-linux-gcc/vpx_config.h vpx_config_arm-linux-gcc.h
# Config files for generic-gnu
cp $1/objdir/generic-gnu/vpx_config.c vpx_config_generic-gnu.c
cp $1/objdir/generic-gnu/vpx_config.h vpx_config_generic-gnu.h
@@ -243,11 +315,5 @@ do
cp -v $1/$f $f
done
# Patch to reduce compiler warnings, so we can compile with -Werror in mozilla.
patch -p3 < reduce-warnings-1.patch
patch -p3 < subpixel-qword.patch
# Patch to compile with Sun Studio on Solaris
patch -p3 < solaris.patch
# Patch to fix frame buffer reference counting and parition length overflow
# checks.
patch -p3 < frame_buf_ref.patch

View file

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -56,7 +56,7 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
vp8_de_alloc_frame_buffers(oci);
// our internal buffers are always multiples of 16
/* our internal buffers are always multiples of 16 */
if ((width & 0xf) != 0)
width += 16 - (width & 0xf);
@@ -153,7 +153,7 @@ void vp8_setup_version(VP8_COMMON *cm)
cm->full_pixel = 1;
break;
default:
//4,5,6,7 are reserved for future use
/*4,5,6,7 are reserved for future use*/
cm->no_lpf = 0;
cm->simpler_lpf = 0;
cm->use_bilinear_mc_filter = 0;
@@ -177,10 +177,10 @@ void vp8_create_common(VP8_COMMON *oci)
oci->clr_type = REG_YUV;
oci->clamp_type = RECON_CLAMP_REQUIRED;
// Initialise reference frame sign bias structure to defaults
/* Initialise reference frame sign bias structure to defaults */
vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
// Default disable buffer to buffer copying
/* Default disable buffer to buffer copying */
oci->copy_buffer_to_gf = 0;
oci->copy_buffer_to_arf = 0;
}

View file

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@@ -0,0 +1,136 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "vpx_ports/arm.h"
#include "g_common.h"
#include "pragmas.h"
#include "subpixel.h"
#include "loopfilter.h"
#include "recon.h"
#include "idct.h"
#include "onyxc_int.h"
extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
void vp8_arch_arm_common_init(VP8_COMMON *ctx)
{
#if CONFIG_RUNTIME_CPU_DETECT
VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
int flags = arm_cpu_caps();
int has_edsp = flags & HAS_EDSP;
int has_media = flags & HAS_MEDIA;
int has_neon = flags & HAS_NEON;
rtcd->flags = flags;
/* Override default functions with fastest ones for this CPU. */
#if HAVE_ARMV6
if (has_media)
{
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_armv6;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_armv6;
rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_armv6;
rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_armv6;
rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_armv6;
rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_armv6;
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_armv6;
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6;
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6;
rtcd->recon.recon = vp8_recon_b_armv6;
rtcd->recon.recon2 = vp8_recon2b_armv6;
rtcd->recon.recon4 = vp8_recon4b_armv6;
}
#endif
#if HAVE_ARMV7
if (has_neon)
{
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_neon;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_neon;
rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_neon;
rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_neon;
rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_neon;
rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_neon;
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_neon;
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon;
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_neon;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_neon;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_neon;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_neon;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_neon;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_neon;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_neon;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_neon;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_neon;
rtcd->recon.recon = vp8_recon_b_neon;
rtcd->recon.recon2 = vp8_recon2b_neon;
rtcd->recon.recon4 = vp8_recon4b_neon;
rtcd->recon.recon_mb = vp8_recon_mb_neon;
}
#endif
#endif
#if HAVE_ARMV6
#if CONFIG_RUNTIME_CPU_DETECT
if (has_media)
#endif
{
vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
}
#endif
#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
if (has_neon)
#endif
{
vp8_build_intra_predictors_mby_ptr =
vp8_build_intra_predictors_mby_neon;
vp8_build_intra_predictors_mby_s_ptr =
vp8_build_intra_predictors_mby_s_neon;
}
#endif
}

View file

@@ -0,0 +1,238 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_filter_block2d_bil_first_pass_armv6|
EXPORT |vp8_filter_block2d_bil_second_pass_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
;-------------------------------------
; r0 unsigned char *src_ptr,
; r1 unsigned short *output_ptr,
; r2 unsigned int src_pixels_per_line,
; r3 unsigned int output_height,
; stack unsigned int output_width,
; stack const short *vp8_filter
;-------------------------------------
; The output is transposed and stored in the output array to make it easy for second pass filtering.
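The transposed store is what lets the second pass reuse the same horizontal walk as a vertical filter. A rough C sketch of the first pass (conceptual only; the argument handling and the extra edge row in the armv6 code differ):

/* Conceptual sketch of a 2-tap bilinear horizontal pass that writes its
 * output transposed; rounding matches the asm (+64, then arithmetic shift
 * right by 7, with the two filter taps summing to 128). */
static void bil_first_pass_c(const unsigned char *src, unsigned int src_stride,
                             unsigned short *dst,          /* transposed */
                             unsigned int height, unsigned int width,
                             const short *filter)
{
    unsigned int r, c;

    for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++)
            dst[c * height + r] =
                (unsigned short)((src[c] * filter[0] +
                                  src[c + 1] * filter[1] + 64) >> 7);
        src += src_stride;
    }
}
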
|vp8_filter_block2d_bil_first_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp8_filter address
ldr r4, [sp, #36] ; output width
mov r12, r3 ; outer-loop counter
sub r2, r2, r4 ; src increment for height loop
;;IF ARCHITECTURE=6
pld [r0]
;;ENDIF
ldr r5, [r11] ; load up filter coefficients
mov r3, r3, lsl #1 ; output_height*2
add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
mov r11, r1 ; save output_ptr for each row
cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_1st_filter
|bil_height_loop_1st_v6|
ldrb r6, [r0] ; load source data
ldrb r7, [r0, #1]
ldrb r8, [r0, #2]
mov lr, r4, lsr #2 ; 4-in-parallel loop counter
|bil_width_loop_1st_v6|
ldrb r9, [r0, #3]
ldrb r10, [r0, #4]
pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
smuad r6, r6, r5 ; apply the filter
pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
smuad r7, r7, r5
pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
smuad r8, r8, r5
smuad r9, r9, r5
add r0, r0, #4
subs lr, lr, #1
add r6, r6, #0x40 ; round_shift_and_clamp
add r7, r7, #0x40
usat r6, #16, r6, asr #7
usat r7, #16, r7, asr #7
strh r6, [r1], r3 ; result is transposed and stored
add r8, r8, #0x40 ; round_shift_and_clamp
strh r7, [r1], r3
add r9, r9, #0x40
usat r8, #16, r8, asr #7
usat r9, #16, r9, asr #7
strh r8, [r1], r3 ; result is transposed and stored
ldrneb r6, [r0] ; load source data
strh r9, [r1], r3
ldrneb r7, [r0, #1]
ldrneb r8, [r0, #2]
bne bil_width_loop_1st_v6
add r0, r0, r2 ; move to next input row
subs r12, r12, #1
;;IF ARCHITECTURE=6
pld [r0]
;;ENDIF
add r11, r11, #2 ; move over to next column
mov r1, r11
bne bil_height_loop_1st_v6
ldmia sp!, {r4 - r11, pc}
|bil_null_1st_filter|
|bil_height_loop_null_1st|
mov lr, r4, lsr #2 ; loop counter
|bil_width_loop_null_1st|
ldrb r6, [r0] ; load data
ldrb r7, [r0, #1]
ldrb r8, [r0, #2]
ldrb r9, [r0, #3]
strh r6, [r1], r3 ; store it to immediate buffer
add r0, r0, #4
strh r7, [r1], r3
subs lr, lr, #1
strh r8, [r1], r3
strh r9, [r1], r3
bne bil_width_loop_null_1st
subs r12, r12, #1
add r0, r0, r2 ; move to next input line
add r11, r11, #2 ; move over to next column
mov r1, r11
bne bil_height_loop_null_1st
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_filter_block2d_bil_first_pass_armv6|
;---------------------------------
; r0 unsigned short *src_ptr,
; r1 unsigned char *output_ptr,
; r2 int output_pitch,
; r3 unsigned int output_height,
; stack unsigned int output_width,
; stack const short *vp8_filter
;---------------------------------
|vp8_filter_block2d_bil_second_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp8_filter address
ldr r4, [sp, #36] ; output width
ldr r5, [r11] ; load up filter coefficients
mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
mov r11, r1
cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_2nd_filter
|bil_height_loop_2nd|
ldr r6, [r0] ; load the data
ldr r8, [r0, #4]
ldrh r10, [r0, #8]
mov lr, r3, lsr #2 ; loop counter
|bil_width_loop_2nd|
pkhtb r7, r6, r8 ; src[1] | src[2]
pkhtb r9, r8, r10 ; src[3] | src[4]
smuad r6, r6, r5 ; apply filter
smuad r8, r8, r5 ; apply filter
subs lr, lr, #1
smuadx r7, r7, r5 ; apply filter
smuadx r9, r9, r5 ; apply filter
add r0, r0, #8
add r6, r6, #0x40 ; round_shift_and_clamp
add r7, r7, #0x40
usat r6, #8, r6, asr #7
usat r7, #8, r7, asr #7
strb r6, [r1], r2 ; the result is transposed back and stored
add r8, r8, #0x40 ; round_shift_and_clamp
strb r7, [r1], r2
add r9, r9, #0x40
usat r8, #8, r8, asr #7
usat r9, #8, r9, asr #7
strb r8, [r1], r2 ; the result is transposed back and stored
ldrne r6, [r0] ; load data
strb r9, [r1], r2
ldrne r8, [r0, #4]
ldrneh r10, [r0, #8]
bne bil_width_loop_2nd
subs r12, r12, #1
add r0, r0, #4 ; update src for next row
add r11, r11, #1
mov r1, r11
bne bil_height_loop_2nd
ldmia sp!, {r4 - r11, pc}
|bil_null_2nd_filter|
|bil_height_loop_null_2nd|
mov lr, r3, lsr #2
|bil_width_loop_null_2nd|
ldr r6, [r0], #4 ; load data
subs lr, lr, #1
ldr r8, [r0], #4
strb r6, [r1], r2 ; store data
mov r7, r6, lsr #16
strb r7, [r1], r2
mov r9, r8, lsr #16
strb r8, [r1], r2
strb r9, [r1], r2
bne bil_width_loop_null_2nd
subs r12, r12, #1
add r0, r0, #4
add r11, r11, #1
mov r1, r11
bne bil_height_loop_null_2nd
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_filter_block2d_second_pass_armv6|
END

View file

@@ -0,0 +1,182 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem16x16_v6|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
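For reference, the alignment-dispatched assembly below implements the same operation as this simple C loop (a sketch of the generic C fallback, not a verbatim copy of libvpx's vp8_copy_mem16x16_c):

#include <string.h>

/* Copy a 16x16 block of pixels; the asm variants differ only in how they
 * batch loads/stores once the source alignment is known. */
static void copy_mem16x16_sketch(const unsigned char *src, int src_stride,
                                 unsigned char *dst, int dst_stride)
{
    int r;

    for (r = 0; r < 16; r++) {
        memcpy(dst, src, 16);
        src += src_stride;
        dst += dst_stride;
    }
}
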
|vp8_copy_mem16x16_v6| PROC
stmdb sp!, {r4 - r7}
;push {r4-r7}
;preload
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
ands r4, r0, #15
beq copy_mem16x16_fast
ands r4, r0, #7
beq copy_mem16x16_8
ands r4, r0, #3
beq copy_mem16x16_4
;copy one byte each time
ldrb r4, [r0]
ldrb r5, [r0, #1]
ldrb r6, [r0, #2]
ldrb r7, [r0, #3]
mov r12, #16
copy_mem16x16_1_loop
strb r4, [r2]
strb r5, [r2, #1]
strb r6, [r2, #2]
strb r7, [r2, #3]
ldrb r4, [r0, #4]
ldrb r5, [r0, #5]
ldrb r6, [r0, #6]
ldrb r7, [r0, #7]
subs r12, r12, #1
strb r4, [r2, #4]
strb r5, [r2, #5]
strb r6, [r2, #6]
strb r7, [r2, #7]
ldrb r4, [r0, #8]
ldrb r5, [r0, #9]
ldrb r6, [r0, #10]
ldrb r7, [r0, #11]
strb r4, [r2, #8]
strb r5, [r2, #9]
strb r6, [r2, #10]
strb r7, [r2, #11]
ldrb r4, [r0, #12]
ldrb r5, [r0, #13]
ldrb r6, [r0, #14]
ldrb r7, [r0, #15]
add r0, r0, r1
strb r4, [r2, #12]
strb r5, [r2, #13]
strb r6, [r2, #14]
strb r7, [r2, #15]
add r2, r2, r3
ldrneb r4, [r0]
ldrneb r5, [r0, #1]
ldrneb r6, [r0, #2]
ldrneb r7, [r0, #3]
bne copy_mem16x16_1_loop
ldmia sp!, {r4 - r7}
;pop {r4-r7}
mov pc, lr
;copy 4 bytes each time
copy_mem16x16_4
ldr r4, [r0]
ldr r5, [r0, #4]
ldr r6, [r0, #8]
ldr r7, [r0, #12]
mov r12, #16
copy_mem16x16_4_loop
subs r12, r12, #1
add r0, r0, r1
str r4, [r2]
str r5, [r2, #4]
str r6, [r2, #8]
str r7, [r2, #12]
add r2, r2, r3
ldrne r4, [r0]
ldrne r5, [r0, #4]
ldrne r6, [r0, #8]
ldrne r7, [r0, #12]
bne copy_mem16x16_4_loop
ldmia sp!, {r4 - r7}
;pop {r4-r7}
mov pc, lr
;copy 8 bytes each time
copy_mem16x16_8
sub r1, r1, #16
sub r3, r3, #16
mov r12, #16
copy_mem16x16_8_loop
ldmia r0!, {r4-r5}
;ldm r0, {r4-r5}
ldmia r0!, {r6-r7}
add r0, r0, r1
stmia r2!, {r4-r5}
subs r12, r12, #1
;stm r2, {r4-r5}
stmia r2!, {r6-r7}
add r2, r2, r3
bne copy_mem16x16_8_loop
ldmia sp!, {r4 - r7}
;pop {r4-r7}
mov pc, lr
;copy 16 bytes each time
copy_mem16x16_fast
;sub r1, r1, #16
;sub r3, r3, #16
mov r12, #16
copy_mem16x16_fast_loop
ldmia r0, {r4-r7}
;ldm r0, {r4-r7}
add r0, r0, r1
subs r12, r12, #1
stmia r2, {r4-r7}
;stm r2, {r4-r7}
add r2, r2, r3
bne copy_mem16x16_fast_loop
ldmia sp!, {r4 - r7}
;pop {r4-r7}
mov pc, lr
ENDP ; |vp8_copy_mem16x16_v6|
END

View file

@@ -0,0 +1,128 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem8x4_v6|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem8x4_v6| PROC
;push {r4-r5}
stmdb sp!, {r4-r5}
;preload
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
ands r4, r0, #7
beq copy_mem8x4_fast
ands r4, r0, #3
beq copy_mem8x4_4
;copy 1 byte each time
ldrb r4, [r0]
ldrb r5, [r0, #1]
mov r12, #4
copy_mem8x4_1_loop
strb r4, [r2]
strb r5, [r2, #1]
ldrb r4, [r0, #2]
ldrb r5, [r0, #3]
subs r12, r12, #1
strb r4, [r2, #2]
strb r5, [r2, #3]
ldrb r4, [r0, #4]
ldrb r5, [r0, #5]
strb r4, [r2, #4]
strb r5, [r2, #5]
ldrb r4, [r0, #6]
ldrb r5, [r0, #7]
add r0, r0, r1
strb r4, [r2, #6]
strb r5, [r2, #7]
add r2, r2, r3
ldrneb r4, [r0]
ldrneb r5, [r0, #1]
bne copy_mem8x4_1_loop
ldmia sp!, {r4 - r5}
;pop {r4-r5}
mov pc, lr
;copy 4 bytes each time
copy_mem8x4_4
ldr r4, [r0]
ldr r5, [r0, #4]
mov r12, #4
copy_mem8x4_4_loop
subs r12, r12, #1
add r0, r0, r1
str r4, [r2]
str r5, [r2, #4]
add r2, r2, r3
ldrne r4, [r0]
ldrne r5, [r0, #4]
bne copy_mem8x4_4_loop
ldmia sp!, {r4-r5}
;pop {r4-r5}
mov pc, lr
;copy 8 bytes each time
copy_mem8x4_fast
;sub r1, r1, #8
;sub r3, r3, #8
mov r12, #4
copy_mem8x4_fast_loop
ldmia r0, {r4-r5}
;ldm r0, {r4-r5}
add r0, r0, r1
subs r12, r12, #1
stmia r2, {r4-r5}
;stm r2, {r4-r5}
add r2, r2, r3
bne copy_mem8x4_fast_loop
ldmia sp!, {r4-r5}
;pop {r4-r5}
mov pc, lr
ENDP ; |vp8_copy_mem8x4_v6|
END

@@ -0,0 +1,128 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem8x8_v6|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem8x8_v6| PROC
;push {r4-r5}
stmdb sp!, {r4-r5}
;preload
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
ands r4, r0, #7
beq copy_mem8x8_fast
ands r4, r0, #3
beq copy_mem8x8_4
;copy 1 byte each time
ldrb r4, [r0]
ldrb r5, [r0, #1]
mov r12, #8
copy_mem8x8_1_loop
strb r4, [r2]
strb r5, [r2, #1]
ldrb r4, [r0, #2]
ldrb r5, [r0, #3]
subs r12, r12, #1
strb r4, [r2, #2]
strb r5, [r2, #3]
ldrb r4, [r0, #4]
ldrb r5, [r0, #5]
strb r4, [r2, #4]
strb r5, [r2, #5]
ldrb r4, [r0, #6]
ldrb r5, [r0, #7]
add r0, r0, r1
strb r4, [r2, #6]
strb r5, [r2, #7]
add r2, r2, r3
ldrneb r4, [r0]
ldrneb r5, [r0, #1]
bne copy_mem8x8_1_loop
ldmia sp!, {r4 - r5}
;pop {r4-r5}
mov pc, lr
;copy 4 bytes each time
copy_mem8x8_4
ldr r4, [r0]
ldr r5, [r0, #4]
mov r12, #8
copy_mem8x8_4_loop
subs r12, r12, #1
add r0, r0, r1
str r4, [r2]
str r5, [r2, #4]
add r2, r2, r3
ldrne r4, [r0]
ldrne r5, [r0, #4]
bne copy_mem8x8_4_loop
ldmia sp!, {r4 - r5}
;pop {r4-r5}
mov pc, lr
;copy 8 bytes each time
copy_mem8x8_fast
;sub r1, r1, #8
;sub r3, r3, #8
mov r12, #8
copy_mem8x8_fast_loop
ldmia r0, {r4-r5}
;ldm r0, {r4-r5}
add r0, r0, r1
subs r12, r12, #1
stmia r2, {r4-r5}
;stm r2, {r4-r5}
add r2, r2, r3
bne copy_mem8x8_fast_loop
ldmia sp!, {r4-r5}
;pop {r4-r5}
mov pc, lr
ENDP ; |vp8_copy_mem8x8_v6|
END

@@ -0,0 +1,67 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dc_only_idct_add_v6|
AREA |.text|, CODE, READONLY
;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
; unsigned char *dst_ptr, int pitch, int stride)
; r0 input_dc
; r1 pred_ptr
; r2 dest_ptr
; r3 pitch
; sp stride
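; What this computes, as an illustrative C sketch (the assembly below works on a
; whole 4-byte row per uxtab16/usat16 pair):
;
;   int a1 = (input_dc + 4) >> 3;
;   for (r = 0; r < 4; r++)
;   {
;       for (c = 0; c < 4; c++)
;       {
;           int t = pred_ptr[c] + a1;
;           dst_ptr[c] = t < 0 ? 0 : (t > 255 ? 255 : t);
;       }
;       pred_ptr += pitch;
;       dst_ptr  += stride;
;   }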
|vp8_dc_only_idct_add_v6| PROC
stmdb sp!, {r4 - r7, lr}
add r0, r0, #4 ; input_dc += 4
ldr r12, c0x0000FFFF
ldr r4, [r1], r3
ldr r6, [r1], r3
and r0, r12, r0, asr #3 ; a1 = ((input_dc + 4) >> 3) & 0xFFFF
ldr lr, [sp, #20]
orr r0, r0, r0, lsl #16 ; a1 | a1
uxtab16 r5, r0, r4 ; a1+2 | a1+0
uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
uxtab16 r7, r0, r6
uxtab16 r6, r0, r6, ror #8
usat16 r5, #8, r5
usat16 r4, #8, r4
usat16 r7, #8, r7
usat16 r6, #8, r6
orr r5, r5, r4, lsl #8
orr r7, r7, r6, lsl #8
ldr r4, [r1], r3
ldr r6, [r1]
str r5, [r2], lr
str r7, [r2], lr
uxtab16 r5, r0, r4
uxtab16 r4, r0, r4, ror #8
uxtab16 r7, r0, r6
uxtab16 r6, r0, r6, ror #8
usat16 r5, #8, r5
usat16 r4, #8, r4
usat16 r7, #8, r7
usat16 r6, #8, r6
orr r5, r5, r4, lsl #8
orr r7, r7, r6, lsl #8
str r5, [r2], lr
str r7, [r2]
ldmia sp!, {r4 - r7, pc}
ENDP ; |vp8_dc_only_idct_add_v6|
; Constant Pool
c0x0000FFFF DCD 0x0000FFFF
END

@@ -0,0 +1,443 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_filter_block2d_first_pass_armv6|
EXPORT |vp8_filter_block2d_second_pass_armv6|
EXPORT |vp8_filter4_block2d_second_pass_armv6|
EXPORT |vp8_filter_block2d_first_pass_only_armv6|
EXPORT |vp8_filter_block2d_second_pass_only_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
;-------------------------------------
; r0 unsigned char *src_ptr
; r1 short *output_ptr
; r2 unsigned int src_pixels_per_line
; r3 unsigned int output_width
; stack unsigned int output_height
; stack const short *vp8_filter
;-------------------------------------
; Apply the 6-tap FIR filter to the input and put the result in the output array,
; with 2-byte (short) output values and 1-byte input values.
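; Per output pixel the arithmetic is, as an illustrative C sketch (the assembly
; stores the results transposed so that the second pass can read them linearly):
;
;   for (i = 0; i < output_height; i++)
;   {
;       for (j = 0; j < output_width; j++)
;       {
;           int t = src_ptr[-2] * vp8_filter[0] + src_ptr[-1] * vp8_filter[1]
;                 + src_ptr[ 0] * vp8_filter[2] + src_ptr[ 1] * vp8_filter[3]
;                 + src_ptr[ 2] * vp8_filter[4] + src_ptr[ 3] * vp8_filter[5]
;                 + 64;                            /* rounding: VP8_FILTER_WEIGHT / 2 */
;           t >>= 7;                               /* VP8_FILTER_SHIFT */
;           output_ptr[j] = t < 0 ? 0 : (t > 255 ? 255 : t);
;           src_ptr++;
;       }
;       src_ptr    += src_pixels_per_line - output_width;
;       output_ptr += output_width;
;   }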
|vp8_filter_block2d_first_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp8_filter address
ldr r7, [sp, #36] ; output height
sub r2, r2, r3 ; inside loop increments input array,
; so the height loop only needs to add
; r2 - width to the input pointer
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
add r12, r3, #16 ; square off the output
sub sp, sp, #4
;;IF ARCHITECTURE=6
;pld [r0, #-2]
;;pld [r0, #30]
;;ENDIF
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
str r1, [sp] ; push destination to stack
mov r7, r7, lsl #16 ; height is top part of counter
; six tap filter
|height_loop_1st_6|
ldrb r8, [r0, #-2] ; load source data
ldrb r9, [r0, #-1]
ldrb r10, [r0], #2
orr r7, r7, r3, lsr #2 ; construct loop counter
|width_loop_1st_6|
ldrb r11, [r0, #-1]
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
ldrb r9, [r0]
smuad lr, lr, r4 ; apply the filter
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smuad r8, r8, r4
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
smlad lr, r10, r5, lr
ldrb r10, [r0, #1]
smlad r8, r11, r5, r8
ldrb r11, [r0, #2]
sub r7, r7, #1
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smlad lr, r9, r6, lr
smlad r11, r10, r6, r8
ands r10, r7, #0xff ; test loop counter
add lr, lr, #0x40 ; round_shift_and_clamp
ldrneb r8, [r0, #-2] ; load data for next loop
usat lr, #8, lr, asr #7
add r11, r11, #0x40
ldrneb r9, [r0, #-1]
usat r11, #8, r11, asr #7
strh lr, [r1], r12 ; result is transposed and stored, which
; will make second pass filtering easier.
ldrneb r10, [r0], #2
strh r11, [r1], r12
bne width_loop_1st_6
;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
;;IF ARCHITECTURE=6
;pld [r0, r2]
;;pld [r0, r9]
;;ENDIF
ldr r1, [sp] ; load and update dst address
subs r7, r7, #0x10000
add r0, r0, r2 ; move to next input line
add r1, r1, #2 ; move over to next column
str r1, [sp]
bne height_loop_1st_6
add sp, sp, #4
ldmia sp!, {r4 - r11, pc}
ENDP
;---------------------------------
; r0 short *src_ptr,
; r1 unsigned char *output_ptr,
; r2 unsigned int output_pitch,
; r3 unsigned int cnt,
; stack const short *vp8_filter
;---------------------------------
|vp8_filter_block2d_second_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #36] ; vp8_filter address
sub sp, sp, #4
mov r7, r3, lsl #16 ; height is top part of counter
str r1, [sp] ; push destination to stack
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
pkhbt r12, r5, r4 ; pack the filter differently
pkhbt r11, r6, r5
sub r0, r0, #4 ; offset input buffer
|height_loop_2nd|
ldr r8, [r0] ; load the data
ldr r9, [r0, #4]
orr r7, r7, r3, lsr #1 ; loop counter
|width_loop_2nd|
smuad lr, r4, r8 ; apply filter
sub r7, r7, #1
smulbt r8, r4, r8
ldr r10, [r0, #8]
smlad lr, r5, r9, lr
smladx r8, r12, r9, r8
ldrh r9, [r0, #12]
smlad lr, r6, r10, lr
smladx r8, r11, r10, r8
add r0, r0, #4
smlatb r10, r6, r9, r8
add lr, lr, #0x40 ; round_shift_and_clamp
ands r8, r7, #0xff
usat lr, #8, lr, asr #7
add r10, r10, #0x40
strb lr, [r1], r2 ; the result is transposed back and stored
usat r10, #8, r10, asr #7
ldrne r8, [r0] ; load data for next loop
ldrne r9, [r0, #4]
strb r10, [r1], r2
bne width_loop_2nd
ldr r1, [sp] ; update dst for next loop
subs r7, r7, #0x10000
add r0, r0, #16 ; update src for next loop
add r1, r1, #1
str r1, [sp]
bne height_loop_2nd
add sp, sp, #4
ldmia sp!, {r4 - r11, pc}
ENDP
;---------------------------------
; r0 short *src_ptr,
; r1 unsigned char *output_ptr,
; r2 unsigned int output_pitch,
; r3 unsigned int cnt,
; stack const short *vp8_filter
;---------------------------------
|vp8_filter4_block2d_second_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #36] ; vp8_filter address
mov r7, r3, lsl #16 ; height is top part of counter
ldr r4, [r11] ; load up packed filter coefficients
add lr, r1, r3 ; save final destination pointer
ldr r5, [r11, #4]
ldr r6, [r11, #8]
pkhbt r12, r5, r4 ; pack the filter differently
pkhbt r11, r6, r5
mov r4, #0x40 ; rounding factor (for smlad{x})
|height_loop_2nd_4|
ldrd r8, [r0, #-4] ; load the data
orr r7, r7, r3, lsr #1 ; loop counter
|width_loop_2nd_4|
ldr r10, [r0, #4]!
smladx r6, r9, r12, r4 ; apply filter
pkhbt r8, r9, r8
smlad r5, r8, r12, r4
pkhbt r8, r10, r9
smladx r6, r10, r11, r6
sub r7, r7, #1
smlad r5, r8, r11, r5
mov r8, r9 ; shift the data for the next loop
mov r9, r10
usat r6, #8, r6, asr #7 ; shift and clamp
usat r5, #8, r5, asr #7
strb r5, [r1], r2 ; the result is transposed back and stored
tst r7, #0xff
strb r6, [r1], r2
bne width_loop_2nd_4
subs r7, r7, #0x10000
add r0, r0, #16 ; update src for next loop
sub r1, lr, r7, lsr #16 ; update dst for next loop
bne height_loop_2nd_4
ldmia sp!, {r4 - r11, pc}
ENDP
;------------------------------------
; r0 unsigned char *src_ptr
; r1 unsigned char *output_ptr,
; r2 unsigned int src_pixels_per_line
; r3 unsigned int cnt,
; stack unsigned int output_pitch,
; stack const short *vp8_filter
;------------------------------------
|vp8_filter_block2d_first_pass_only_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r4, [sp, #36] ; output pitch
ldr r11, [sp, #40] ; HFilter address
sub sp, sp, #8
mov r7, r3
sub r2, r2, r3 ; inside loop increments input array,
; so the height loop only needs to add
; r2 - width to the input pointer
sub r4, r4, r3
str r4, [sp] ; save modified output pitch
str r2, [sp, #4]
mov r2, #0x40
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
; six tap filter
|height_loop_1st_only_6|
ldrb r8, [r0, #-2] ; load data
ldrb r9, [r0, #-1]
ldrb r10, [r0], #2
mov r12, r3, lsr #1 ; loop counter
|width_loop_1st_only_6|
ldrb r11, [r0, #-1]
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
ldrb r9, [r0]
;; smuad lr, lr, r4
smlad lr, lr, r4, r2
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
;; smuad r8, r8, r4
smlad r8, r8, r4, r2
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
smlad lr, r10, r5, lr
ldrb r10, [r0, #1]
smlad r8, r11, r5, r8
ldrb r11, [r0, #2]
subs r12, r12, #1
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smlad lr, r9, r6, lr
smlad r10, r10, r6, r8
;; add lr, lr, #0x40 ; round_shift_and_clamp
ldrneb r8, [r0, #-2] ; load data for next loop
usat lr, #8, lr, asr #7
;; add r10, r10, #0x40
strb lr, [r1], #1 ; store the result
usat r10, #8, r10, asr #7
ldrneb r9, [r0, #-1]
strb r10, [r1], #1
ldrneb r10, [r0], #2
bne width_loop_1st_only_6
;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
;;IF ARCHITECTURE=6
;pld [r0, r2]
;;pld [r0, r9]
;;ENDIF
ldr lr, [sp] ; load back output pitch
ldr r12, [sp, #4] ; load back output pitch
subs r7, r7, #1
add r0, r0, r12 ; update src for next loop
add r1, r1, lr ; update dst for next loop
bne height_loop_1st_only_6
add sp, sp, #8
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_filter_block2d_first_pass_only_armv6|
;------------------------------------
; r0 unsigned char *src_ptr,
; r1 unsigned char *output_ptr,
; r2 unsigned int src_pixels_per_line
; r3 unsigned int cnt,
; stack unsigned int output_pitch,
; stack const short *vp8_filter
;------------------------------------
|vp8_filter_block2d_second_pass_only_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; VFilter address
ldr r12, [sp, #36] ; output pitch
mov r7, r3, lsl #16 ; height is top part of counter
sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
sub sp, sp, #8
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
str r0, [sp] ; save r0 to stack
str r1, [sp, #4] ; save dst to stack
; six tap filter
|width_loop_2nd_only_6|
ldrb r8, [r0], r2 ; load data
orr r7, r7, r3 ; loop counter
ldrb r9, [r0], r2
ldrb r10, [r0], r2
|height_loop_2nd_only_6|
; filter the first column in this inner loop, then move to the next column.
ldrb r11, [r0], r2
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
ldrb r9, [r0], r2
smuad lr, lr, r4
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smuad r8, r8, r4
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
smlad lr, r10, r5, lr
ldrb r10, [r0], r2
smlad r8, r11, r5, r8
ldrb r11, [r0]
sub r7, r7, #2
sub r0, r0, r2, lsl #2
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smlad lr, r9, r6, lr
smlad r10, r10, r6, r8
ands r9, r7, #0xff
add lr, lr, #0x40 ; round_shift_and_clamp
ldrneb r8, [r0], r2 ; load data for next loop
usat lr, #8, lr, asr #7
add r10, r10, #0x40
strb lr, [r1], r12 ; store the result for the column
usat r10, #8, r10, asr #7
ldrneb r9, [r0], r2
strb r10, [r1], r12
ldrneb r10, [r0], r2
bne height_loop_2nd_only_6
ldr r0, [sp]
ldr r1, [sp, #4]
subs r7, r7, #0x10000
add r0, r0, #1 ; move to filter next column
str r0, [sp]
add r1, r1, #1
str r1, [sp, #4]
bne width_loop_2nd_only_6
add sp, sp, #8
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_filter_block2d_second_pass_only_armv6|
END

@@ -0,0 +1,345 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
EXPORT |vp8_short_idct4x4llm_1_v6|
EXPORT |vp8_short_idct4x4llm_v6|
EXPORT |vp8_short_idct4x4llm_v6_scott|
EXPORT |vp8_short_idct4x4llm_v6_dual|
AREA |.text|, CODE, READONLY
;********************************************************************************
;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench: 3/5
;********************************************************************************
|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
;
ldrsh r0, [r0] ; load input[0] 1, r0 un 2
add r0, r0, #4 ; 1 +4
stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
mov r5, r4 ; expand expand
strd r4, [r1], r2 ; *output = r0, post inc 1
strd r4, [r1], r2 ; 1
strd r4, [r1], r2 ; 1
strd r4, [r1] ; 1
;
ldmia sp!, {r4, r5, pc} ; replace vars, return restore
ENDP ; |vp8_short_idct4x4llm_1_v6|
;********************************************************************************
;********************************************************************************
;********************************************************************************
;********************************************************************************
;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench:
;********************************************************************************
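; Column-pass arithmetic as an illustrative C sketch (cospi8sqrt2minus1 = 0x4E7B
; and sinpi8sqrt2 = 0x8A8C are the Q16 constants built by the mov/orr pairs below;
; the row pass repeats the same butterfly and rounds with (x + 4) >> 3):
;
;   a1 = ip[0] + ip[8];
;   b1 = ip[0] - ip[8];
;   temp1 = (ip[4] * sinpi8sqrt2) >> 16;
;   temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
;   c1 = temp1 - temp2;
;   temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
;   temp2 = (ip[12] * sinpi8sqrt2) >> 16;
;   d1 = temp1 + temp2;
;   op[0 * sp] = a1 + d1;          /* sp: output pitch in 16-bit units */
;   op[1 * sp] = b1 + c1;
;   op[2 * sp] = b1 - c1;
;   op[3 * sp] = a1 - d1;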
|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
;
stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
;
mov r4, #0x00004E00 ; 1 cst
orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
mov r5, #0x00008A00 ; 1 cst
orr r5, r5, #0x0000008C ; sinpi8sqrt2
;
mov r6, #4 ; i=4 1 i
loop1 ;
ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
add r9, r7, r8 ; a1 = [0] + [8] 1 a1
sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
add r11, r3, r11 ; temp2 1
rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
add r8, r7, r11 ; b1 + c1 1 b+c
strh r8, [r1, r2] ; out[pitch] = b1+c1 1
sub r7, r7, r11 ; b1 - c1 1 b-c
add r10, r12, r10 ; temp1 1
add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
add r10, r9, r3 ; a1 + d1 1 a+d
sub r3, r9, r3 ; a1 - d1 1 a-d
add r8, r2, r2 ; pitch * 2 1 p*2
strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
subs r6, r6, #1 ; i-- 1 --
strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
bne loop1 ; if i>0, continue
;
sub r1, r1, #8 ; set up out for next loop 1 -4
; for this iteration, input=prev output
mov r6, #4 ; i=4 1 i
; b returnfull
loop2 ;
ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
add r7, r0, r3 ; a1 = [0] + [2] 1 a1
sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
add r10, r8, r10 ; temp2 1
rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
add r3, r0, r9 ; b1+c1 1 b+c
add r3, r3, #4 ; b1+c1+4 1 +4
add r10, r11, r10 ; temp1 1
mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
strh r3, [r1, #2] ; out[1] = b1+c1 1
add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
add r3, r7, r10 ; a1+d1 1 a+d
add r3, r3, #4 ; a1+d1+4 1 +4
sub r7, r7, r10 ; a1-d1 1 a-d
add r7, r7, #4 ; a1-d1+4 1 +4
mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
strh r7, [r1, #6] ; out[3] = a1-d1 1
sub r0, r0, r9 ; b1-c1 1 b-c
add r0, r0, #4 ; b1-c1+4 1 +4
subs r6, r6, #1 ; i-- 1 --
mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
strh r0, [r1, #4] ; out[2] = b1-c1 1
strh r3, [r1], r2 ; out[0] = a1+d1 1
; add r1, r1, r2 ; out += pitch 1 ++
bne loop2 ; if i>0, continue
returnfull ;
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP
;********************************************************************************
;********************************************************************************
;********************************************************************************
;********************************************************************************
;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench:
;********************************************************************************
|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
; mov r0, #0 ;
; ldr r0, [r0] ;
stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
;
mov r3, #0x00004E00 ; cos
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
mov r4, #0x00008A00 ; sin
orr r4, r4, #0x0000008C ; sinpi8sqrt2
;
mov r5, #0x2 ; i i
;
short_idct4x4llm_v6_scott_loop1 ;
ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
;
smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
;
smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2
smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
;
add r6, r6, r7 ; partial c1 lt1-lt2
add r12, r12, r14 ; partial d1 l2t2+l2t1
;
smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
;
smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
;
add r7, r14, r7 ; partial c1_2 ht1+ht2
sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
;
pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
;
usub16 r6, r6, r10 ; c1_2 | c1_1 c
uadd16 r12, r12, r11 ; d1_2 | d1_1 d
;
ldr r10, [r0, #0] ; i1 | i0 1,0
ldr r11, [r0, #(8*2)] ; i9 | i10 9,10
;
;;;;;; add r0, r0, #0x4 ; +4
;;;;;; add r1, r1, #0x4 ; +4
;
uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
;
uadd16 r7, r8, r12 ; a1 + d1 pair a+d
usub16 r14, r8, r12 ; a1 - d1 pair a-d
;
str r7, [r1] ; op[0] = a1 + d1
str r14, [r1, r2] ; op[pitch*3] = a1 - d1
;
add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
;
subs r5, r5, #0x1 ; --
bne short_idct4x4llm_v6_scott_loop1 ;
;
sub r1, r1, #16 ; reset output ptr
mov r5, #0x4 ;
mov r0, r1 ; input = output
;
short_idct4x4llm_v6_scott_loop2 ;
;
subs r5, r5, #0x1 ;
bne short_idct4x4llm_v6_scott_loop2 ;
;
ldmia sp!, {r4 - r11, pc} ;
ENDP ;
;
;********************************************************************************
;********************************************************************************
;********************************************************************************
;********************************************************************************
;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench:
;********************************************************************************
|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
;
stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
mov r3, #0x00004E00 ; cos
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
mov r4, #0x00008A00 ; sin
orr r4, r4, #0x0000008C ; sinpi8sqrt2
mov r5, #0x2 ; i=2 i
loop1_dual
ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
subs r5, r5, #0x1 ; i-- --
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
usub16 r7, r8, r7 ; c c
uadd16 r6, r6, r10 ; d d
uadd16 r10, r11, r14 ; a a
usub16 r8, r11, r14 ; b b
uadd16 r9, r10, r6 ; a+d a+d
usub16 r10, r10, r6 ; a-d a-d
uadd16 r6, r8, r7 ; b+c b+c
usub16 r7, r8, r7 ; b-c b-c
str r6, [r1, r2] ; o5 | o4
add r6, r2, r2 ; pitch * 2 p2
str r7, [r1, r6] ; o9 | o8
add r6, r6, r2 ; pitch * 3 p3
str r10, [r1, r6] ; o13 | o12
str r9, [r1], #0x4 ; o1 | o0 ++
bne loop1_dual ;
mov r5, #0x2 ; i=2 i
sub r0, r1, #8 ; reset input/output i/o
loop2_dual
ldr r6, [r0, r2] ; i5 | i4 5|4
ldr r1, [r0] ; i1 | i0 1|0
ldr r12, [r0, #0x4] ; i3 | i2 3|2
add r14, r2, #0x4 ; pitch + 2 p+2
ldr r14, [r0, r14] ; i7 | i6 7|6
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
uadd16 r10, r11, r9 ; a a
usub16 r9, r11, r9 ; b b
pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
subs r5, r5, #0x1 ; i-- --
smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
usub16 r12, r8, r6 ; c (o1 | o5) c
uadd16 r6, r11, r1 ; d (o3 | o7) d
uadd16 r7, r10, r6 ; a+d a+d
mov r8, #0x4 ; set up 4's 4
orr r8, r8, #0x40000 ; 4|4
usub16 r6, r10, r6 ; a-d a-d
uadd16 r6, r6, r8 ; a-d+4 3|7
uadd16 r7, r7, r8 ; a+d+4 0|4
uadd16 r10, r9, r12 ; b+c b+c
usub16 r1, r9, r12 ; b-c b-c
uadd16 r10, r10, r8 ; b+c+4 1|5
uadd16 r1, r1, r8 ; b-c+4 2|6
mov r8, r10, asr #19 ; o1 >> 3
strh r8, [r0, #2] ; o1
mov r8, r1, asr #19 ; o2 >> 3
strh r8, [r0, #4] ; o2
mov r8, r6, asr #19 ; o3 >> 3
strh r8, [r0, #6] ; o3
mov r8, r7, asr #19 ; o0 >> 3
strh r8, [r0], r2 ; o0 +p
sxth r10, r10 ;
mov r8, r10, asr #3 ; o5 >> 3
strh r8, [r0, #2] ; o5
sxth r1, r1 ;
mov r8, r1, asr #3 ; o6 >> 3
strh r8, [r0, #4] ; o6
sxth r6, r6 ;
mov r8, r6, asr #3 ; o7 >> 3
strh r8, [r0, #6] ; o7
sxth r7, r7 ;
mov r8, r7, asr #3 ; o4 >> 3
strh r8, [r0], r2 ; o4 +p
;;;;; subs r5, r5, #0x1 ; i-- --
bne loop2_dual ;
;
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP
END

@@ -0,0 +1,152 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_v6|
EXPORT |vp8_short_inv_walsh4x4_1_v6|
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY ; name this block of code
;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
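; Illustrative C sketch of the inverse Walsh-Hadamard transform implemented below
; (columns first, then rows, with a final (x + 3) >> 3 rounding; the assembly keeps
; the column-pass results in registers instead of writing them out):
;
;   for (i = 0; i < 4; i++)              /* columns */
;   {
;       a1 = ip[0] + ip[12];  b1 = ip[4] + ip[8];
;       c1 = ip[4] - ip[8];   d1 = ip[0] - ip[12];
;       op[0] = a1 + b1;  op[4]  = c1 + d1;
;       op[8] = a1 - b1;  op[12] = d1 - c1;
;       ip++;  op++;
;   }
;   ip = output;  op = output;           /* row pass operates in place */
;   for (i = 0; i < 4; i++)              /* rows */
;   {
;       a1 = ip[0] + ip[3];  b1 = ip[1] + ip[2];
;       c1 = ip[1] - ip[2];  d1 = ip[0] - ip[3];
;       op[0] = (a1 + b1 + 3) >> 3;  op[1] = (c1 + d1 + 3) >> 3;
;       op[2] = (a1 - b1 + 3) >> 3;  op[3] = (d1 - c1 + 3) >> 3;
;       ip += 4;  op += 4;
;   }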
|vp8_short_inv_walsh4x4_v6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r2, [r0], #4 ; [1 | 0]
ldr r3, [r0], #4 ; [3 | 2]
ldr r4, [r0], #4 ; [5 | 4]
ldr r5, [r0], #4 ; [7 | 6]
ldr r6, [r0], #4 ; [9 | 8]
ldr r7, [r0], #4 ; [11 | 10]
ldr r8, [r0], #4 ; [13 | 12]
ldr r9, [r0] ; [15 | 14]
qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
; first transform complete
qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
ldr r10, c0x00030003
qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
qadd16 r2, r2, r10 ; [b2+3|c2+3]
qadd16 r3, r3, r10 ; [a2+3|d2+3]
qadd16 r4, r4, r10 ; [b2+3|c2+3]
qadd16 r5, r5, r10 ; [a2+3|d2+3]
asr r12, r2, #3 ; [1 | x]
pkhtb r12, r12, r3, asr #19; [1 | 0]
lsl lr, r3, #16 ; [~3 | x]
lsl r2, r2, #16 ; [~2 | x]
asr lr, lr, #3 ; [3 | x]
pkhtb lr, lr, r2, asr #19 ; [3 | 2]
asr r2, r4, #3 ; [5 | x]
pkhtb r2, r2, r5, asr #19 ; [5 | 4]
lsl r3, r5, #16 ; [~7 | x]
lsl r4, r4, #16 ; [~6 | x]
asr r3, r3, #3 ; [7 | x]
pkhtb r3, r3, r4, asr #19 ; [7 | 6]
str r12, [r1], #4
str lr, [r1], #4
str r2, [r1], #4
str r3, [r1], #4
qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
qadd16 r6, r6, r10 ; [b2+3|c2+3]
qadd16 r7, r7, r10 ; [a2+3|d2+3]
qadd16 r8, r8, r10 ; [b2+3|c2+3]
qadd16 r9, r9, r10 ; [a2+3|d2+3]
asr r2, r6, #3 ; [9 | x]
pkhtb r2, r2, r7, asr #19 ; [9 | 8]
lsl r3, r7, #16 ; [~11| x]
lsl r4, r6, #16 ; [~10| x]
asr r3, r3, #3 ; [11 | x]
pkhtb r3, r3, r4, asr #19 ; [11 | 10]
asr r4, r8, #3 ; [13 | x]
pkhtb r4, r4, r9, asr #19 ; [13 | 12]
lsl r5, r9, #16 ; [~15| x]
lsl r6, r8, #16 ; [~14| x]
asr r5, r5, #3 ; [15 | x]
pkhtb r5, r5, r6, asr #19 ; [15 | 14]
str r2, [r1], #4
str r3, [r1], #4
str r4, [r1], #4
str r5, [r1]
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_short_inv_walsh4x4_v6|
;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
|vp8_short_inv_walsh4x4_1_v6| PROC
ldrsh r2, [r0] ; [0]
add r2, r2, #3 ; [0] + 3
asr r2, r2, #3 ; a1 ([0]+3) >> 3
lsl r2, r2, #16 ; [a1 | x]
orr r2, r2, r2, lsr #16 ; [a1 | a1]
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1]
bx lr
ENDP ; |vp8_short_inv_walsh4x4_1_v6|
; Constant Pool
c0x00030003 DCD 0x00030003
END

(The diff for one file is not shown because of its large size.)
@@ -0,0 +1,281 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon_b_armv6|
EXPORT |vp8_recon2b_armv6|
EXPORT |vp8_recon4b_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
prd RN r0
dif RN r1
dst RN r2
stride RN r3
;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
; R0 char* pred_ptr
; R1 short * dif_ptr
; R2 char * dst_ptr
; R3 int stride
; Description:
; Loop through the block adding the Pred and Diff together. Clamp and then
; store back into the Dst.
; Restrictions :
; all buffers are expected to be 4 byte aligned coming in and
; going out.
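; Per element this is, as an illustrative C sketch (the assembly below handles four
; pixels per 32-bit word with uxtab16/usat16):
;
;   for (r = 0; r < 4; r++)
;   {
;       for (c = 0; c < 4; c++)
;       {
;           int t = pred_ptr[c] + diff_ptr[c];
;           dst_ptr[c] = t < 0 ? 0 : (t > 255 ? 255 : t);
;       }
;       pred_ptr += 16;       /* prediction buffer rows are 16 bytes apart */
;       diff_ptr += 16;       /* diff buffer rows are 16 shorts apart */
;       dst_ptr  += stride;
;   }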
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_recon_b_armv6| PROC
stmdb sp!, {r4 - r9, lr}
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #8] ; 1 | 0
;; ldr r7, [dif, #12] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #16] ; 1 | 0
;; ldr r7, [dif, #20] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #24] ; 1 | 0
;; ldr r7, [dif, #28] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst], stride
ldmia sp!, {r4 - r9, pc}
ENDP ; |recon_b|
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; R0 char *pred_ptr
; R1 short *dif_ptr
; R2 char *dst_ptr
; R3 int stride
|vp8_recon4b_armv6| PROC
stmdb sp!, {r4 - r9, lr}
mov lr, #4
recon4b_loop
;0, 1, 2, 3
ldr r4, [prd], #4 ; 3 | 2 | 1 | 0
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst]
;4, 5, 6, 7
ldr r4, [prd], #4
;; ldr r6, [dif, #32]
;; ldr r7, [dif, #36]
ldr r6, [dif, #8]
ldr r7, [dif, #12]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #4]
;8, 9, 10, 11
ldr r4, [prd], #4
;; ldr r6, [dif, #64]
;; ldr r7, [dif, #68]
ldr r6, [dif, #16]
ldr r7, [dif, #20]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #8]
;12, 13, 14, 15
ldr r4, [prd], #4
;; ldr r6, [dif, #96]
;; ldr r7, [dif, #100]
ldr r6, [dif, #24]
ldr r7, [dif, #28]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #12]
add dst, dst, stride
;; add dif, dif, #8
add dif, dif, #32
subs lr, lr, #1
bne recon4b_loop
ldmia sp!, {r4 - r9, pc}
ENDP ; |Recon4B|
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; R0 char *pred_ptr
; R1 short *dif_ptr
; R2 char *dst_ptr
; R3 int stride
|vp8_recon2b_armv6| PROC
stmdb sp!, {r4 - r9, lr}
mov lr, #4
recon2b_loop
;0, 1, 2, 3
ldr r4, [prd], #4
ldr r6, [dif, #0]
ldr r7, [dif, #4]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst]
;4, 5, 6, 7
ldr r4, [prd], #4
;; ldr r6, [dif, #32]
;; ldr r7, [dif, #36]
ldr r6, [dif, #8]
ldr r7, [dif, #12]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #4]
add dst, dst, stride
;; add dif, dif, #8
add dif, dif, #16
subs lr, lr, #1
bne recon2b_loop
ldmia sp!, {r4 - r9, pc}
ENDP ; |Recon2B|
END

@@ -0,0 +1,287 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
MACRO
TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
; a0: 03 02 01 00
; a1: 13 12 11 10
; a2: 23 22 21 20
; a3: 33 32 31 30
; b3 b2 b1 b0
uxtb16 $b1, $a1 ; xx 12 xx 10
uxtb16 $b0, $a0 ; xx 02 xx 00
uxtb16 $b3, $a3 ; xx 32 xx 30
uxtb16 $b2, $a2 ; xx 22 xx 20
orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
MEND
src RN r0
pstep RN r1
;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
;r2 const char *flimit,
;r3 const char *limit,
;stack const char *thresh,
;stack int count
; All 16 elements in flimit are equal. So, in the code, only one load is needed
; for flimit. The same applies to limit. thresh is not used in the simple loop filter.
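; Per pixel the simple filter below computes, as an illustrative C sketch (the
; assembly works on four pixels at a time with the uqsub8/qadd8 SIMD instructions,
; and p1/p0/q0/q1 are first made signed by XORing with 0x80):
;
;   mask = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= 2 * flimit + limit) ? 0xff : 0;
;   f    = clamp(p1 - q1 + 3 * (q0 - p0)) & mask;  /* clamp() saturates to [-128, 127] */
;   Filter1 = clamp(f + 4) >> 3;   q0 = clamp(q0 - Filter1);
;   Filter2 = clamp(f + 3) >> 3;   p0 = clamp(p0 + Filter2);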
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
ldr r12, [r3] ; limit
ldr r3, [src, -pstep, lsl #1] ; p1
ldr r4, [src, -pstep] ; p0
ldr r5, [src] ; q0
ldr r6, [src, pstep] ; q1
ldr r7, [r2] ; flimit
ldr r2, c0x80808080
ldr r9, [sp, #40] ; count for 8-in-parallel
uadd8 r7, r7, r7 ; flimit * 2
mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time
uadd8 r12, r7, r12 ; flimit * 2 + limit
mov lr, #0 ; need 0 in a couple places
|simple_hnext8|
; vp8_simple_filter_mask()
uqsub8 r7, r3, r6 ; p1 - q1
uqsub8 r8, r6, r3 ; q1 - p1
uqsub8 r10, r4, r5 ; p0 - q0
uqsub8 r11, r5, r4 ; q0 - p0
orr r8, r8, r7 ; abs(p1 - q1)
orr r10, r10, r11 ; abs(p0 - q0)
uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
uhadd8 r8, r8, lr ; abs(p1 - q1) >> 1
uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
mvn r8, #0
usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags
sel r10, r8, lr ; filter mask: F or 0
cmp r10, #0
beq simple_hskip_filter ; skip filtering if all masks are 0x00
;vp8_simple_filter()
eor r3, r3, r2 ; p1 offset to convert to a signed value
eor r6, r6, r2 ; q1 offset to convert to a signed value
eor r4, r4, r2 ; p0 offset to convert to a signed value
eor r5, r5, r2 ; q0 offset to convert to a signed value
qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
qsub8 r6, r5, r4 ; q0 - p0
qadd8 r3, r3, r6 ; += q0 - p0
ldr r7, c0x04040404
qadd8 r3, r3, r6 ; += q0 - p0
ldr r8, c0x03030303
qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
;STALL
and r3, r3, r10 ; vp8_filter &= mask
qadd8 r7 , r3 , r7 ; Filter1 = vp8_filter + 4
qadd8 r8 , r3 , r8 ; Filter2 = vp8_filter + 3
shadd8 r7 , r7 , lr
shadd8 r8 , r8 , lr
shadd8 r7 , r7 , lr
shadd8 r8 , r8 , lr
shadd8 r7 , r7 , lr ; Filter1 >>= 3
shadd8 r8 , r8 , lr ; Filter2 >>= 3
qsub8 r5 ,r5, r7 ; u = q0 - Filter1
qadd8 r4, r4, r8 ; u = p0 + Filter2
eor r5, r5, r2 ; *oq0 = u^0x80
str r5, [src] ; store oq0 result
eor r4, r4, r2 ; *op0 = u^0x80
str r4, [src, -pstep] ; store op0 result
|simple_hskip_filter|
subs r9, r9, #1
addne src, src, #4 ; next row
ldrne r3, [src, -pstep, lsl #1] ; p1
ldrne r4, [src, -pstep] ; p0
ldrne r5, [src] ; q0
ldrne r6, [src, pstep] ; q1
bne simple_hnext8
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_loop_filter_simple_horizontal_edge_armv6|
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
ldr r12, [r2] ; r12: flimit
ldr r2, c0x80808080
ldr r7, [r3] ; limit
; load source data to r7, r8, r9, r10
ldrh r3, [src, #-2]
ldrh r4, [src], pstep
uadd8 r12, r12, r12 ; flimit * 2
ldrh r5, [src, #-2]
ldrh r6, [src], pstep
uadd8 r12, r12, r7 ; flimit * 2 + limit
pkhbt r7, r3, r4, lsl #16
ldrh r3, [src, #-2]
ldrh r4, [src], pstep
ldr r11, [sp, #40] ; count (r11) for 8-in-parallel
pkhbt r8, r5, r6, lsl #16
ldrh r5, [src, #-2]
ldrh r6, [src], pstep
mov r11, r11, lsl #1 ; 4-in-parallel
|simple_vnext8|
; vp8_simple_filter_mask() function
pkhbt r9, r3, r4, lsl #16
pkhbt r10, r5, r6, lsl #16
;transpose r7, r8, r9, r10 to r3, r4, r5, r6
TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
uqsub8 r7, r3, r6 ; p1 - q1
uqsub8 r8, r6, r3 ; q1 - p1
uqsub8 r9, r4, r5 ; p0 - q0
uqsub8 r10, r5, r4 ; q0 - p0
orr r7, r7, r8 ; abs(p1 - q1)
orr r9, r9, r10 ; abs(p0 - q0)
mov r8, #0
uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
mvn r10, #0 ; r10 == -1
usub8 r7, r12, r7 ; compare to flimit
sel lr, r10, r8 ; filter mask
cmp lr, #0
beq simple_vskip_filter ; skip filtering
;vp8_simple_filter() function
eor r3, r3, r2 ; p1 offset to convert to a signed value
eor r6, r6, r2 ; q1 offset to convert to a signed value
eor r4, r4, r2 ; p0 offset to convert to a signed value
eor r5, r5, r2 ; q0 offset to convert to a signed value
qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
qsub8 r6, r5, r4 ; q0 - p0
qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
ldr r9, c0x03030303 ; r9 = 3
qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
ldr r7, c0x04040404
qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
;STALL
and r3, r3, lr ; vp8_filter &= mask
qadd8 r9 , r3 , r9 ; Filter2 = vp8_filter + 3
qadd8 r3 , r3 , r7 ; Filter1 = vp8_filter + 4
shadd8 r9 , r9 , r8
shadd8 r3 , r3 , r8
shadd8 r9 , r9 , r8
shadd8 r3 , r3 , r8
shadd8 r9 , r9 , r8 ; Filter2 >>= 3
shadd8 r3 , r3 , r8 ; Filter1 >>= 3
;calculate output
sub src, src, pstep, lsl #2
qadd8 r4, r4, r9 ; u = p0 + Filter2
qsub8 r5, r5, r3 ; u = q0 - Filter1
eor r4, r4, r2 ; *op0 = u^0x80
eor r5, r5, r2 ; *oq0 = u^0x80
strb r4, [src, #-1] ; store the result
mov r4, r4, lsr #8
strb r5, [src], pstep
mov r5, r5, lsr #8
strb r4, [src, #-1]
mov r4, r4, lsr #8
strb r5, [src], pstep
mov r5, r5, lsr #8
strb r4, [src, #-1]
mov r4, r4, lsr #8
strb r5, [src], pstep
mov r5, r5, lsr #8
strb r4, [src, #-1]
strb r5, [src], pstep
|simple_vskip_filter|
subs r11, r11, #1
; load source data to r7, r8, r9, r10
ldrneh r3, [src, #-2]
ldrneh r4, [src], pstep
ldrneh r5, [src, #-2]
ldrneh r6, [src], pstep
pkhbt r7, r3, r4, lsl #16
ldrneh r3, [src, #-2]
ldrneh r4, [src], pstep
pkhbt r8, r5, r6, lsl #16
ldrneh r5, [src, #-2]
ldrneh r6, [src], pstep
bne simple_vnext8
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6|
; Constant Pool
c0x80808080 DCD 0x80808080
c0x03030303 DCD 0x03030303
c0x04040404 DCD 0x04040404
END

@@ -0,0 +1,271 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict8x4_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
;-------------------------------------
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack unsigned char *dst_ptr,
; stack int dst_pitch
;-------------------------------------
;note: In the first pass, the result is stored transposed (8 lines x 9 columns) on the stack; the temporary stack size is 184.
;The line width is 20 bytes, i.e. 9 shorts plus 2 bytes of padding for 4-byte alignment. In the second pass, the data is
;loaded back from the stack and the result is again stored transposed.
|vp8_sixtap_predict8x4_armv6| PROC
stmdb sp!, {r4 - r11, lr}
str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
cmp r2, #0 ;skip first_pass filter if xoffset=0
add lr, sp, #4 ;point to temporary buffer
beq skip_firstpass_filter
;first-pass filter
ldr r12, _filter8_coeff_
sub r0, r0, r1, lsl #1
add r2, r12, r2, lsl #4 ;calculate filter location
add r0, r0, #3 ;adjust src only for loading convenience
ldr r3, [r2] ; load up packed filter coefficients
ldr r4, [r2, #4]
ldr r5, [r2, #8]
mov r2, #0x90000 ; height=9 is top part of counter
sub r1, r1, #8
|first_pass_hloop_v6|
ldrb r6, [r0, #-5] ; load source data
ldrb r7, [r0, #-4]
ldrb r8, [r0, #-3]
ldrb r9, [r0, #-2]
ldrb r10, [r0, #-1]
orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
pkhbt r6, r6, r7, lsl #16 ; r7 | r6
pkhbt r7, r7, r8, lsl #16 ; r8 | r7
pkhbt r8, r8, r9, lsl #16 ; r9 | r8
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|first_pass_wloop_v6|
smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]
smuad r12, r7, r3
ldrb r6, [r0], #1
smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3]
ldrb r7, [r0], #1
smlad r12, r9, r4, r12
pkhbt r10, r10, r6, lsl #16 ; r10 | r9
pkhbt r6, r6, r7, lsl #16 ; r11 | r10
smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5]
smlad r12, r6, r5, r12
sub r2, r2, #1
add r11, r11, #0x40 ; round_shift_and_clamp
tst r2, #0xff ; test loop counter
usat r11, #8, r11, asr #7
add r12, r12, #0x40
strh r11, [lr], #20 ; result is transposed and stored, which makes the second pass easier
usat r12, #8, r12, asr #7
strh r12, [lr], #20
movne r11, r6
movne r12, r7
movne r6, r8
movne r7, r9
movne r8, r10
movne r9, r11
movne r10, r12
bne first_pass_wloop_v6
;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
;;IF ARCHITECTURE=6
;pld [src, ppl]
;;pld [src, r9]
;;ENDIF
subs r2, r2, #0x10000
sub lr, lr, #158
add r0, r0, r1 ; move to next input line
bne first_pass_hloop_v6
;second pass filter
secondpass_filter
ldr r3, [sp], #4 ; load back yoffset
ldr r0, [sp, #216] ; load dst address from stack 180+36
ldr r1, [sp, #220] ; load dst stride from stack 180+40
cmp r3, #0
beq skip_secondpass_filter
ldr r12, _filter8_coeff_
add lr, r12, r3, lsl #4 ;calculate filter location
mov r2, #0x00080000
ldr r3, [lr] ; load up packed filter coefficients
ldr r4, [lr, #4]
ldr r5, [lr, #8]
pkhbt r12, r4, r3 ; pack the filter differently
pkhbt r11, r5, r4
second_pass_hloop_v6
ldr r6, [sp] ; load the data
ldr r7, [sp, #4]
orr r2, r2, #2 ; loop counter
second_pass_wloop_v6
smuad lr, r3, r6 ; apply filter
smulbt r10, r3, r6
ldr r8, [sp, #8]
smlad lr, r4, r7, lr
smladx r10, r12, r7, r10
ldrh r9, [sp, #12]
smlad lr, r5, r8, lr
smladx r10, r11, r8, r10
add sp, sp, #4
smlatb r10, r5, r9, r10
sub r2, r2, #1
add lr, lr, #0x40 ; round_shift_and_clamp
tst r2, #0xff
usat lr, #8, lr, asr #7
add r10, r10, #0x40
strb lr, [r0], r1 ; the result is transposed back and stored
usat r10, #8, r10, asr #7
strb r10, [r0],r1
movne r6, r7
movne r7, r8
bne second_pass_wloop_v6
subs r2, r2, #0x10000
add sp, sp, #12 ; update src for next loop (20-8)
sub r0, r0, r1, lsl #2
add r0, r0, #1
bne second_pass_hloop_v6
add sp, sp, #20
ldmia sp!, {r4 - r11, pc}
;--------------------
skip_firstpass_filter
sub r0, r0, r1, lsl #1
sub r1, r1, #8
mov r2, #9
skip_firstpass_hloop
ldrb r4, [r0], #1 ; load data
subs r2, r2, #1
ldrb r5, [r0], #1
strh r4, [lr], #20 ; store it to immediate buffer
ldrb r6, [r0], #1 ; load data
strh r5, [lr], #20
ldrb r7, [r0], #1
strh r6, [lr], #20
ldrb r8, [r0], #1
strh r7, [lr], #20
ldrb r9, [r0], #1
strh r8, [lr], #20
ldrb r10, [r0], #1
strh r9, [lr], #20
ldrb r11, [r0], #1
strh r10, [lr], #20
add r0, r0, r1 ; move to next input line
strh r11, [lr], #20
sub lr, lr, #158 ; move over to next column
bne skip_firstpass_hloop
b secondpass_filter
;--------------------
skip_secondpass_filter
mov r2, #8
add sp, sp, #4 ;start from src[0] instead of src[-2]
skip_secondpass_hloop
ldr r6, [sp], #4
subs r2, r2, #1
ldr r8, [sp], #4
mov r7, r6, lsr #16 ; unpack
strb r6, [r0], r1
mov r9, r8, lsr #16
strb r7, [r0], r1
add sp, sp, #12 ; 20-8
strb r8, [r0], r1
strb r9, [r0], r1
sub r0, r0, r1, lsl #2
add r0, r0, #1
bne skip_secondpass_hloop
add sp, sp, #16 ; 180 - (160 +4)
ldmia sp!, {r4 - r11, pc}
ENDP
;-----------------
AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
;Data section with the name subpelfilters8_dat is specified. Each DCD reserves one word of storage for the packed data.
;The label filter8_coeff can be used to access the data.
;Data addresses: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
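;Note on the packed layout below (as read from the values): each row holds one
;6-tap filter with its taps c0..c5 packed into halfword pairs (c1|c0), (c3|c2),
;(c5|c4) and padded to 16 bytes; the plain coefficients are kept in the comments
;underneath for reference.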
_filter8_coeff_
DCD filter8_coeff
filter8_coeff
DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
;DCD 0, 0, 128, 0, 0, 0
;DCD 0, -6, 123, 12, -1, 0
;DCD 2, -11, 108, 36, -8, 1
;DCD 0, -9, 93, 50, -6, 0
;DCD 3, -16, 77, 77, -16, 3
;DCD 0, -6, 50, 93, -9, 0
;DCD 1, -8, 36, 108, -11, 2
;DCD 0, -1, 12, 123, -6, 0
END

@@ -0,0 +1,212 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include "subpixel.h"
#define BLOCK_HEIGHT_WIDTH 4
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT 7
static const short bilinear_filters[8][2] =
{
{ 128, 0 },
{ 112, 16 },
{ 96, 32 },
{ 80, 48 },
{ 64, 64 },
{ 48, 80 },
{ 32, 96 },
{ 16, 112 }
};
extern void vp8_filter_block2d_bil_first_pass_armv6
(
unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
);
extern void vp8_filter_block2d_bil_second_pass_armv6
(
unsigned short *src_ptr,
unsigned char *output_ptr,
int output_pitch,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
);
#if 0
void vp8_filter_block2d_bil_first_pass_6
(
unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
)
{
unsigned int i, j;
for ( i=0; i<output_height; i++ )
{
for ( j=0; j<output_width; j++ )
{
/* Apply bilinear filter */
output_ptr[j] = ( ( (int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[1] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;
src_ptr++;
}
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
void vp8_filter_block2d_bil_second_pass_6
(
unsigned short *src_ptr,
unsigned char *output_ptr,
int output_pitch,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
)
{
unsigned int i,j;
int Temp;
for ( i=0; i<output_height; i++ )
{
for ( j=0; j<output_width; j++ )
{
/* Apply filter */
Temp = ((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[output_width] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT/2);
output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
src_ptr++;
}
/* Next row... */
/*src_ptr += src_pixels_per_line - output_width;*/
output_ptr += output_pitch;
}
}
#endif
void vp8_filter_block2d_bil_armv6
(
unsigned char *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
unsigned int dst_pitch,
const short *HFilter,
const short *VFilter,
int Width,
int Height
)
{
unsigned short FData[36*16]; /* Temp data buffer used in filtering */
/* First filter 1-D horizontally... */
/* pixel_step = 1; */
vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter);
}
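/* The first pass filters Height + 1 rows because the second (vertical) pass is a
 * two-tap filter that reads one extra row of intermediate data below the block.
 * The bilinear kernel itself is just a weighted average; e.g. for xoffset == 4 the
 * filter is { 64, 64 }, so each output pixel is
 *     out = (64 * src[0] + 64 * src[1] + 64) >> 7
 * i.e. the rounded average of two neighbouring samples. */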
void vp8_bilinear_predict4x4_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
HFilter = bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset];
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
}
void vp8_bilinear_predict8x8_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
HFilter = bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset];
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
}
void vp8_bilinear_predict8x4_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
HFilter = bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset];
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
}
void vp8_bilinear_predict16x16_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
HFilter = bilinear_filters[xoffset];
VFilter = bilinear_filters[yoffset];
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
}

@@ -0,0 +1,256 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include <math.h>
#include "subpixel.h"
#include "vpx_ports/mem.h"
#define BLOCK_HEIGHT_WIDTH 4
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT 7
DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
{
{ 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
{ 0, -6, 123, 12, -1, 0 },
{ 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
{ 0, -9, 93, 50, -6, 0 },
{ 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */
{ 0, -6, 50, 93, -9, 0 },
{ 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
{ 0, -1, 12, 123, -6, 0 },
};
extern void vp8_filter_block2d_first_pass_armv6
(
unsigned char *src_ptr,
short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_width,
unsigned int output_height,
const short *vp8_filter
);
extern void vp8_filter_block2d_second_pass_armv6
(
short *src_ptr,
unsigned char *output_ptr,
unsigned int output_pitch,
unsigned int cnt,
const short *vp8_filter
);
extern void vp8_filter4_block2d_second_pass_armv6
(
short *src_ptr,
unsigned char *output_ptr,
unsigned int output_pitch,
unsigned int cnt,
const short *vp8_filter
);
extern void vp8_filter_block2d_first_pass_only_armv6
(
unsigned char *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
unsigned int cnt,
unsigned int output_pitch,
const short *vp8_filter
);
extern void vp8_filter_block2d_second_pass_only_armv6
(
unsigned char *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
unsigned int cnt,
unsigned int output_pitch,
const short *vp8_filter
);
#if HAVE_ARMV6
void vp8_sixtap_predict_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */
/* Vfilter is null. First pass only */
if (xoffset && !yoffset)
{
/*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
}
/* Hfilter is null. Second pass only */
else if (!xoffset && yoffset)
{
vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
}
else
{
/* Vfilter is a 4 tap filter */
if (yoffset & 0x1)
{
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
}
/* Vfilter is 6 tap filter */
else
{
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
}
}
}
#if 0
void vp8_sixtap_predict8x4_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */
/*if (xoffset && !yoffset)
{
vp8_filter_block2d_first_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
}*/
/* Hfilter is null. Second pass only */
/*else if (!xoffset && yoffset)
{
vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
}
else
{
if (yoffset & 0x1)
vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
else*/
vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
/*}*/
}
#endif
void vp8_sixtap_predict8x8_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */
if (xoffset && !yoffset)
{
vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
}
/* Hfilter is null. Second pass only */
else if (!xoffset && yoffset)
{
vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
}
else
{
if (yoffset & 0x1)
{
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
}
else
{
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
}
}
}
void vp8_sixtap_predict16x16_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
)
{
const short *HFilter;
const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); /* Temp data buffer used in filtering */
HFilter = sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */
if (xoffset && !yoffset)
{
vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
}
/* Hfilter is null. Second pass only */
else if (!xoffset && yoffset)
{
vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
}
else
{
if (yoffset & 0x1)
{
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
}
else
{
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
}
}
}
#endif
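Note: the (yoffset & 0x1) branches above exploit a property of the sub_pel_filters table: its odd-indexed rows (the 1/8-, 3/8-, 5/8- and 7/8-pel positions) have zero outer taps, so the vertical filter is effectively 4-tap. Those paths can therefore call vp8_filter4_block2d_second_pass_armv6 and only need output_height + 3 intermediate rows from the first pass (7/11/19) instead of the output_height + 5 rows (9/13/21) required by the full 6-tap second pass. A hypothetical self-check of that property (not part of the source):

#include <assert.h>

/* Illustrative helper: the odd-offset rows of an 8x6 sub-pel filter table are
 * effectively 4-tap because their first and last taps are zero. */
static void check_four_tap_rows(const short filters[8][6])
{
    int i;
    for (i = 1; i < 8; i += 2)
        assert(filters[i][0] == 0 && filters[i][5] == 0);
}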

View file

@ -0,0 +1,65 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef IDCT_ARM_H
#define IDCT_ARM_H
#if HAVE_ARMV6
extern prototype_idct(vp8_short_idct4x4llm_1_v6);
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
#endif
#endif
#if HAVE_ARMV7
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
extern prototype_idct(vp8_short_idct4x4llm_neon);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
#endif
#endif
#endif

View file

@ -0,0 +1,237 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include <math.h>
#include "loopfilter.h"
#include "onyxc_int.h"
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon);
extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon);
extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon);
extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon;
extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon;
extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
#if HAVE_ARMV6
/*ARMV6 loopfilter functions*/
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
if (v_ptr)
vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
}
void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
if (v_ptr)
vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
}
void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
if (v_ptr)
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
}
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
if (v_ptr)
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
}
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
#endif
#if HAVE_ARMV7
/* NEON loopfilter functions */
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
}
void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
}
void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
}
void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
}
void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
#endif

View file

@ -0,0 +1,89 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LOOPFILTER_ARM_H
#define LOOPFILTER_ARM_H
#if HAVE_ARMV6
extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_armv6
#undef vp8_lf_normal_b_v
#define vp8_lf_normal_b_v vp8_loop_filter_bv_armv6
#undef vp8_lf_normal_mb_h
#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_armv6
#undef vp8_lf_normal_b_h
#define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
#undef vp8_lf_simple_mb_v
#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6
#undef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
#undef vp8_lf_simple_mb_h
#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
#endif
#endif
#if HAVE_ARMV7
extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bv_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bh_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
#define vp8_lf_normal_mb_v vp8_loop_filter_mbv_neon
#undef vp8_lf_normal_b_v
#define vp8_lf_normal_b_v vp8_loop_filter_bv_neon
#undef vp8_lf_normal_mb_h
#define vp8_lf_normal_mb_h vp8_loop_filter_mbh_neon
#undef vp8_lf_normal_b_h
#define vp8_lf_normal_b_h vp8_loop_filter_bh_neon
#undef vp8_lf_simple_mb_v
#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_neon
#undef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_neon
#undef vp8_lf_simple_mb_h
#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_neon
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
#endif
#endif
#endif

View file

@ -0,0 +1,362 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict16x16_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_bilinear_predict16x16_neon| PROC
push {r4-r5, lr}
ldr r12, _bifilter16_coeff_
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16_only
add r2, r12, r2, lsl #3 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {d31}, [r2] ;load first_pass filter
beq firstpass_bfilter16x16_only
sub sp, sp, #272 ;reserve space on stack for temporary storage
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
mov lr, sp
vld1.u8 {d5, d6, d7}, [r0], r1
mov r2, #3 ;loop counter
vld1.u8 {d8, d9, d10}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {d11, d12, d13}, [r0], r1
vdup.8 d1, d31[4]
;First Pass: output_height lines x output_width columns (17x16)
filt_blk2d_fp16x16_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vqrshrn.u16 d21, q14, #7
vld1.u8 {d5, d6, d7}, [r0], r1
vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
vld1.u8 {d8, d9, d10}, [r0], r1
vst1.u8 {d18, d19, d20, d21}, [lr]!
vld1.u8 {d11, d12, d13}, [r0], r1
bne filt_blk2d_fp16x16_loop_neon
;First-pass filtering for the remaining 5 lines
vld1.u8 {d14, d15, d16}, [r0], r1
vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q10, d3, d0
vmull.u8 q11, d5, d0
vmull.u8 q12, d6, d0
vmull.u8 q13, d8, d0
vmull.u8 q14, d9, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vmlal.u8 q9, d2, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q11, d5, d1
vmlal.u8 q13, d8, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vmlal.u8 q10, d3, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q12, d6, d1
vmlal.u8 q14, d9, d1
vmull.u8 q1, d11, d0
vmull.u8 q2, d12, d0
vmull.u8 q3, d14, d0
vmull.u8 q4, d15, d0
vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
vext.8 d14, d14, d15, #1
vmlal.u8 q1, d11, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q3, d14, d1
vext.8 d12, d12, d13, #1
vext.8 d15, d15, d16, #1
vmlal.u8 q2, d12, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q4, d15, d1
vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
vqrshrn.u16 d11, q10, #7
vqrshrn.u16 d12, q11, #7
vqrshrn.u16 d13, q12, #7
vqrshrn.u16 d14, q13, #7
vqrshrn.u16 d15, q14, #7
vqrshrn.u16 d16, q1, #7
vqrshrn.u16 d17, q2, #7
vqrshrn.u16 d18, q3, #7
vqrshrn.u16 d19, q4, #7
vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
vst1.u8 {d14, d15, d16, d17}, [lr]!
vst1.u8 {d18, d19}, [lr]!
;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
sub lr, lr, #272
vld1.u32 {d31}, [r3] ;load second_pass filter
vld1.u8 {d22, d23}, [lr]! ;load src data
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
mov r12, #4 ;loop counter
filt_blk2d_sp16x16_loop_neon
vld1.u8 {d24, d25}, [lr]!
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
vld1.u8 {d26, d27}, [lr]!
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [lr]!
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [lr]!
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
subs r12, r12, #1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r4], r5 ;store result
vst1.u8 {d4, d5}, [r4], r5
vst1.u8 {d6, d7}, [r4], r5
vmov q11, q15
vst1.u8 {d8, d9}, [r4], r5
bne filt_blk2d_sp16x16_loop_neon
add sp, sp, #272
pop {r4-r5,pc}
;--------------------
firstpass_bfilter16x16_only
mov r2, #4 ;loop counter
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vdup.8 d1, d31[4]
;First Pass: output_height lines x output_width columns (16x16)
filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vld1.u8 {d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10}, [r0], r1
vld1.u8 {d11, d12, d13}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vst1.u8 {d14, d15}, [r4], r5 ;store result
vqrshrn.u16 d21, q14, #7
vst1.u8 {d16, d17}, [r4], r5
vst1.u8 {d18, d19}, [r4], r5
vst1.u8 {d20, d21}, [r4], r5
bne filt_blk2d_fpo16x16_loop_neon
pop {r4-r5,pc}
;---------------------
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
mov r12, #4 ;loop counter
vld1.u32 {d31}, [r3] ;load second_pass filter
vld1.u8 {d22, d23}, [r0], r1 ;load src data
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
filt_blk2d_spo16x16_loop_neon
vld1.u8 {d24, d25}, [r0], r1
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
vld1.u8 {d26, d27}, [r0], r1
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [r0], r1
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [r0], r1
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r4], r5 ;store result
subs r12, r12, #1
vst1.u8 {d4, d5}, [r4], r5
vmov q11, q15
vst1.u8 {d6, d7}, [r4], r5
vst1.u8 {d8, d9}, [r4], r5
bne filt_blk2d_spo16x16_loop_neon
pop {r4-r5,pc}
ENDP
;-----------------
AREA bifilters16_dat, DATA, READWRITE ;read/write by default
;Data section bifilters16_dat holds the bilinear coefficient table. DCD reserves one
;32-bit word per value (16 words in total). The values are accessed through the
;bifilter16_coeff label: bifilter16_coeff, bifilter16_coeff+4, bifilter16_coeff+8, ...
_bifilter16_coeff_
DCD bifilter16_coeff
bifilter16_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END
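Note: bifilter16_coeff (and the identical tables in the other bilinear prediction files) stores the weight pair (128 - 16*offset, 16*offset) for each of the eight 1/8-pel positions, i.e. the two bilinear taps scaled by VP8_FILTER_WEIGHT = 128. A throwaway generator that reproduces the DCD line above (illustrative only, not part of the source):

#include <stdio.h>

int main(void)
{
    int i;
    /* Prints: 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 */
    for (i = 0; i < 8; i++)
        printf("%d, %d%s", 128 - 16 * i, 16 * i, i == 7 ? "\n" : ", ");
    return 0;
}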

View file

@ -0,0 +1,135 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_bilinear_predict4x4_neon| PROC
push {r4, lr}
ldr r12, _bifilter4_coeff_
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (5x4)
vld1.u8 {d2}, [r0], r1 ;load src data
add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes)
vld1.u8 {d3}, [r0], r1
vld1.u32 {d31}, [r2] ;first_pass filter
vld1.u8 {d4}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
vld1.u8 {d5}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {d6}, [r0], r1
vshr.u64 q4, q1, #8 ;construct src_ptr[1]
vshr.u64 q5, q2, #8
vshr.u64 d12, d6, #8
vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d4, d5
vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q8, d4, d0
vmull.u8 q9, d6, d0
vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q8, d10, d1
vmlal.u8 q9, d12, d1
vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d29, q8, #7
vqrshrn.u16 d30, q9, #7
;Second pass: 4x4
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq skip_secondpass_filter
add r3, r12, r3, lsl #3 ;calculate Vfilter location
vld1.u32 {d31}, [r3] ;load second_pass filter
vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d31[4]
vmull.u8 q1, d28, d0
vmull.u8 q2, d29, d0
vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
vext.8 d27, d29, d30, #4
vmlal.u8 q1, d26, d1
vmlal.u8 q2, d27, d1
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vst1.32 {d2[0]}, [r4] ;store result
vst1.32 {d2[1]}, [r0]
vst1.32 {d3[0]}, [r1]
vst1.32 {d3[1]}, [r2]
pop {r4, pc}
;--------------------
skip_firstpass_filter
vld1.32 {d28[0]}, [r0], r1 ;load src data
vld1.32 {d28[1]}, [r0], r1
vld1.32 {d29[0]}, [r0], r1
vld1.32 {d29[1]}, [r0], r1
vld1.32 {d30[0]}, [r0], r1
b secondpass_filter
;---------------------
skip_secondpass_filter
vst1.32 {d28[0]}, [r4], lr ;store result
vst1.32 {d28[1]}, [r4], lr
vst1.32 {d29[0]}, [r4], lr
vst1.32 {d29[1]}, [r4], lr
pop {r4, pc}
ENDP
;-----------------
AREA bilinearfilters4_dat, DATA, READWRITE ;read/write by default
;Data section bilinearfilters4_dat holds the bilinear coefficient table. DCD reserves one
;32-bit word per value (16 words in total). The values are accessed through the
;bifilter4_coeff label: bifilter4_coeff, bifilter4_coeff+4, bifilter4_coeff+8, ...
_bifilter4_coeff_
DCD bifilter4_coeff
bifilter4_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END

View file

@ -0,0 +1,140 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict8x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_bilinear_predict8x4_neon| PROC
push {r4, lr}
ldr r12, _bifilter8x4_coeff_
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (5x8)
add r2, r12, r2, lsl #3 ;calculate filter location
vld1.u8 {q1}, [r0], r1 ;load src data
vld1.u32 {d31}, [r2] ;load first_pass filter
vld1.u8 {q2}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {q3}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {q4}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vld1.u8 {q5}, [r0], r1
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vext.8 d11, d10, d11, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vmlal.u8 q10, d11, d1
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
vqrshrn.u16 d23, q7, #7
vqrshrn.u16 d24, q8, #7
vqrshrn.u16 d25, q9, #7
vqrshrn.u16 d26, q10, #7
;Second pass: 4x8
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq skip_secondpass_filter
add r3, r12, r3, lsl #3
add r0, r4, lr
vld1.u32 {d31}, [r3] ;load second_pass filter
add r1, r0, lr
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q2, d23, d0
vmull.u8 q3, d24, d0
vmull.u8 q4, d25, d0
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
vmlal.u8 q2, d24, d1
vmlal.u8 q3, d25, d1
vmlal.u8 q4, d26, d1
add r2, r1, lr
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vst1.u8 {d2}, [r4] ;store result
vst1.u8 {d3}, [r0]
vst1.u8 {d4}, [r1]
vst1.u8 {d5}, [r2]
pop {r4, pc}
;--------------------
skip_firstpass_filter
vld1.u8 {d22}, [r0], r1 ;load src data
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
b secondpass_filter
;---------------------
skip_secondpass_filter
vst1.u8 {d22}, [r4], lr ;store result
vst1.u8 {d23}, [r4], lr
vst1.u8 {d24}, [r4], lr
vst1.u8 {d25}, [r4], lr
pop {r4, pc}
ENDP
;-----------------
AREA bifilters8x4_dat, DATA, READWRITE ;read/write by default
;Data section bifilters8x4_dat holds the bilinear coefficient table. DCD reserves one
;32-bit word per value (16 words in total). The values are accessed through the
;bifilter8x4_coeff label: bifilter8x4_coeff, bifilter8x4_coeff+4, bifilter8x4_coeff+8, ...
_bifilter8x4_coeff_
DCD bifilter8x4_coeff
bifilter8x4_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END

View file

@ -0,0 +1,188 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_bilinear_predict8x8_neon| PROC
push {r4, lr}
ldr r12, _bifilter8_coeff_
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (9x8)
add r2, r12, r2, lsl #3 ;calculate filter location
vld1.u8 {q1}, [r0], r1 ;load src data
vld1.u32 {d31}, [r2] ;load first_pass filter
vld1.u8 {q2}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {q3}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {q4}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vld1.u8 {q1}, [r0], r1 ;load src data
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
vld1.u8 {q2}, [r0], r1
vqrshrn.u16 d23, q7, #7
vld1.u8 {q3}, [r0], r1
vqrshrn.u16 d24, q8, #7
vld1.u8 {q4}, [r0], r1
vqrshrn.u16 d25, q9, #7
;first_pass filtering on the remaining 5 lines of data
vld1.u8 {q5}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vext.8 d11, d10, d11, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vmlal.u8 q10, d11, d1
vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
vqrshrn.u16 d27, q7, #7
vqrshrn.u16 d28, q8, #7
vqrshrn.u16 d29, q9, #7
vqrshrn.u16 d30, q10, #7
;Second pass: 8x8
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq skip_secondpass_filter
add r3, r12, r3, lsl #3
add r0, r4, lr
vld1.u32 {d31}, [r3] ;load second_pass filter
add r1, r0, lr
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q2, d23, d0
vmull.u8 q3, d24, d0
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
vmlal.u8 q2, d24, d1
vmlal.u8 q3, d25, d1
vmlal.u8 q4, d26, d1
vmlal.u8 q5, d27, d1
vmlal.u8 q6, d28, d1
vmlal.u8 q7, d29, d1
vmlal.u8 q8, d30, d1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2}, [r4] ;store result
vst1.u8 {d3}, [r0]
vst1.u8 {d4}, [r1], lr
vst1.u8 {d5}, [r1], lr
vst1.u8 {d6}, [r1], lr
vst1.u8 {d7}, [r1], lr
vst1.u8 {d8}, [r1], lr
vst1.u8 {d9}, [r1], lr
pop {r4, pc}
;--------------------
skip_firstpass_filter
vld1.u8 {d22}, [r0], r1 ;load src data
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
vld1.u8 {d27}, [r0], r1
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1
b secondpass_filter
;---------------------
skip_secondpass_filter
vst1.u8 {d22}, [r4], lr ;store result
vst1.u8 {d23}, [r4], lr
vst1.u8 {d24}, [r4], lr
vst1.u8 {d25}, [r4], lr
vst1.u8 {d26}, [r4], lr
vst1.u8 {d27}, [r4], lr
vst1.u8 {d28}, [r4], lr
vst1.u8 {d29}, [r4], lr
pop {r4, pc}
ENDP
;-----------------
AREA bifilters8_dat, DATA, READWRITE ;read/write by default
;Data section bifilters8_dat holds the bilinear coefficient table. DCD reserves one
;32-bit word per value (16 words in total). The values are accessed through the
;bifilter8_coeff label: bifilter8_coeff, bifilter8_coeff+4, bifilter8_coeff+8, ...
_bifilter8_coeff_
DCD bifilter8_coeff
bifilter8_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END

View file

@ -0,0 +1,584 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_build_intra_predictors_mby_neon_func|
EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *y_buffer
; r1 unsigned char *ypred_ptr
; r2 int y_stride
; r3 int mode
; stack int Up
; stack int Left
|vp8_build_intra_predictors_mby_neon_func| PROC
push {r4-r8, lr}
cmp r3, #0
beq case_dc_pred
cmp r3, #1
beq case_v_pred
cmp r3, #2
beq case_h_pred
cmp r3, #3
beq case_tm_pred
case_dc_pred
ldr r4, [sp, #24] ; Up
ldr r5, [sp, #28] ; Left
; Default the DC average to 128
mov r12, #128
vdup.u8 q0, r12
; Zero out running sum
mov r12, #0
; compute shift and jump
adds r7, r4, r5
beq skip_dc_pred_up_left
; Load above row, if it exists
cmp r4, #0
beq skip_dc_pred_up
sub r6, r0, r2
vld1.8 {q1}, [r6]
vpaddl.u8 q2, q1
vpaddl.u16 q3, q2
vpaddl.u32 q4, q3
vmov.32 r4, d8[0]
vmov.32 r6, d9[0]
add r12, r4, r6
; Move back to integer registers
skip_dc_pred_up
cmp r5, #0
beq skip_dc_pred_left
sub r0, r0, #1
; Load left column, if it exists
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0]
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
skip_dc_pred_left
add r7, r7, #3 ; Shift
sub r4, r7, #1
mov r5, #1
add r12, r12, r5, lsl r4
mov r5, r12, lsr r7 ; expected_dc
vdup.u8 q0, r5
skip_dc_pred_up_left
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
pop {r4-r8,pc}
case_v_pred
; Copy down above row
sub r6, r0, r2
vld1.8 {q0}, [r6]
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
pop {r4-r8,pc}
case_h_pred
; Load 4x yleft_col
sub r0, r0, #1
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
pop {r4-r8,pc}
case_tm_pred
; Load yabove_row
sub r3, r0, r2
vld1.8 {q8}, [r3]
; Load ytop_left
sub r3, r3, #1
ldrb r7, [r3]
vdup.u16 q7, r7
; Compute yabove_row - ytop_left
mov r3, #1
vdup.u8 q0, r3
vmull.u8 q4, d16, d0
vmull.u8 q5, d17, d0
vsub.s16 q4, q4, q7
vsub.s16 q5, q5, q7
; Load 4x yleft_col
sub r0, r0, #1
mov r12, #4
case_tm_pred_loop
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u16 q0, r3
vdup.u16 q1, r4
vdup.u16 q2, r5
vdup.u16 q3, r6
vqadd.s16 q8, q0, q4
vqadd.s16 q9, q0, q5
vqadd.s16 q10, q1, q4
vqadd.s16 q11, q1, q5
vqadd.s16 q12, q2, q4
vqadd.s16 q13, q2, q5
vqadd.s16 q14, q3, q4
vqadd.s16 q15, q3, q5
vqshrun.s16 d0, q8, #0
vqshrun.s16 d1, q9, #0
vqshrun.s16 d2, q10, #0
vqshrun.s16 d3, q11, #0
vqshrun.s16 d4, q12, #0
vqshrun.s16 d5, q13, #0
vqshrun.s16 d6, q14, #0
vqshrun.s16 d7, q15, #0
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
subs r12, r12, #1
bne case_tm_pred_loop
pop {r4-r8,pc}
ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0 unsigned char *y_buffer
; r1 unsigned char *ypred_ptr
; r2 int y_stride
; r3 int mode
; stack int Up
; stack int Left
|vp8_build_intra_predictors_mby_s_neon_func| PROC
push {r4-r8, lr}
mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
cmp r3, #0
beq case_dc_pred_s
cmp r3, #1
beq case_v_pred_s
cmp r3, #2
beq case_h_pred_s
cmp r3, #3
beq case_tm_pred_s
case_dc_pred_s
ldr r4, [sp, #24] ; Up
ldr r5, [sp, #28] ; Left
; Default the DC average to 128
mov r12, #128
vdup.u8 q0, r12
; Zero out running sum
mov r12, #0
; compute shift and jump
adds r7, r4, r5
beq skip_dc_pred_up_left_s
; Load above row, if it exists
cmp r4, #0
beq skip_dc_pred_up_s
sub r6, r0, r2
vld1.8 {q1}, [r6]
vpaddl.u8 q2, q1
vpaddl.u16 q3, q2
vpaddl.u32 q4, q3
vmov.32 r4, d8[0]
vmov.32 r6, d9[0]
add r12, r4, r6
; Move back to integer registers
skip_dc_pred_up_s
cmp r5, #0
beq skip_dc_pred_left_s
sub r0, r0, #1
; Load left column, if it exists
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0]
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
skip_dc_pred_left_s
add r7, r7, #3 ; Shift
sub r4, r7, #1
mov r5, #1
add r12, r12, r5, lsl r4
mov r5, r12, lsr r7 ; expected_dc
vdup.u8 q0, r5
skip_dc_pred_up_left_s
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
pop {r4-r8,pc}
case_v_pred_s
; Copy down above row
sub r6, r0, r2
vld1.8 {q0}, [r6]
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
pop {r4-r8,pc}
case_h_pred_s
; Load 4x yleft_col
sub r0, r0, #1
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
pop {r4-r8,pc}
case_tm_pred_s
; Load yabove_row
sub r3, r0, r2
vld1.8 {q8}, [r3]
; Load ytop_left
sub r3, r3, #1
ldrb r7, [r3]
vdup.u16 q7, r7
; Compute yabove_row - ytop_left
mov r3, #1
vdup.u8 q0, r3
vmull.u8 q4, d16, d0
vmull.u8 q5, d17, d0
vsub.s16 q4, q4, q7
vsub.s16 q5, q5, q7
; Load 4x yleft_col
sub r0, r0, #1
mov r12, #4
case_tm_pred_loop_s
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u16 q0, r3
vdup.u16 q1, r4
vdup.u16 q2, r5
vdup.u16 q3, r6
vqadd.s16 q8, q0, q4
vqadd.s16 q9, q0, q5
vqadd.s16 q10, q1, q4
vqadd.s16 q11, q1, q5
vqadd.s16 q12, q2, q4
vqadd.s16 q13, q2, q5
vqadd.s16 q14, q3, q4
vqadd.s16 q15, q3, q5
vqshrun.s16 d0, q8, #0
vqshrun.s16 d1, q9, #0
vqshrun.s16 d2, q10, #0
vqshrun.s16 d3, q11, #0
vqshrun.s16 d4, q12, #0
vqshrun.s16 d5, q13, #0
vqshrun.s16 d6, q14, #0
vqshrun.s16 d7, q15, #0
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
subs r12, r12, #1
bne case_tm_pred_loop_s
pop {r4-r8,pc}
ENDP
END
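Note: in scalar terms the DC and TM cases above compute the following. This is a sketch, assuming Up and Left are 0/1 availability flags (which is what the shift computation Up + Left + 3 implies) and, as in the first (non-_s) variant, a 16x16 prediction buffer with a pitch of 16; the names are illustrative:

static unsigned char clamp_u8(int v)
{
    return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* y points at the top-left luma sample of the macroblock being predicted. */
static void mb_dc_tm_pred_sketch(const unsigned char *y, int stride,
                                 unsigned char *pred, int up, int left, int tm_mode)
{
    int r, c;

    if (tm_mode) {                       /* TM: left + above - top_left, saturated */
        const unsigned char *above = y - stride;
        int top_left = above[-1];
        for (r = 0; r < 16; r++)
            for (c = 0; c < 16; c++)
                pred[r * 16 + c] = clamp_u8(y[r * stride - 1] + above[c] - top_left);
        return;
    }

    /* DC: rounded average of the available above row and/or left column, else 128. */
    {
        int expected_dc = 128;
        if (up || left) {
            int shift = 3 + up + left;
            int sum = 0;
            if (up)
                for (c = 0; c < 16; c++) sum += y[c - stride];
            if (left)
                for (r = 0; r < 16; r++) sum += y[r * stride - 1];
            expected_dc = (sum + (1 << (shift - 1))) >> shift;
        }
        for (r = 0; r < 16 * 16; r++)
            pred[r] = (unsigned char)expected_dc;
    }
}

The V and H cases are plain copies of the above row and of each left-column pixel respectively, which is all the long runs of vst1.u8 above do.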

View file

@ -0,0 +1,59 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem16x16_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem16x16_neon| PROC
vld1.u8 {q0}, [r0], r1
vld1.u8 {q1}, [r0], r1
vld1.u8 {q2}, [r0], r1
vst1.u8 {q0}, [r2], r3
vld1.u8 {q3}, [r0], r1
vst1.u8 {q1}, [r2], r3
vld1.u8 {q4}, [r0], r1
vst1.u8 {q2}, [r2], r3
vld1.u8 {q5}, [r0], r1
vst1.u8 {q3}, [r2], r3
vld1.u8 {q6}, [r0], r1
vst1.u8 {q4}, [r2], r3
vld1.u8 {q7}, [r0], r1
vst1.u8 {q5}, [r2], r3
vld1.u8 {q8}, [r0], r1
vst1.u8 {q6}, [r2], r3
vld1.u8 {q9}, [r0], r1
vst1.u8 {q7}, [r2], r3
vld1.u8 {q10}, [r0], r1
vst1.u8 {q8}, [r2], r3
vld1.u8 {q11}, [r0], r1
vst1.u8 {q9}, [r2], r3
vld1.u8 {q12}, [r0], r1
vst1.u8 {q10}, [r2], r3
vld1.u8 {q13}, [r0], r1
vst1.u8 {q11}, [r2], r3
vld1.u8 {q14}, [r0], r1
vst1.u8 {q12}, [r2], r3
vld1.u8 {q15}, [r0], r1
vst1.u8 {q13}, [r2], r3
vst1.u8 {q14}, [r2], r3
vst1.u8 {q15}, [r2], r3
mov pc, lr
ENDP ; |vp8_copy_mem16x16_neon|
END

View file

@ -0,0 +1,34 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem8x4_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem8x4_neon| PROC
vld1.u8 {d0}, [r0], r1
vld1.u8 {d1}, [r0], r1
vst1.u8 {d0}, [r2], r3
vld1.u8 {d2}, [r0], r1
vst1.u8 {d1}, [r2], r3
vld1.u8 {d3}, [r0], r1
vst1.u8 {d2}, [r2], r3
vst1.u8 {d3}, [r2], r3
mov pc, lr
ENDP ; |vp8_copy_mem8x4_neon|
END

View file

@ -0,0 +1,43 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem8x8_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem8x8_neon| PROC
vld1.u8 {d0}, [r0], r1
vld1.u8 {d1}, [r0], r1
vst1.u8 {d0}, [r2], r3
vld1.u8 {d2}, [r0], r1
vst1.u8 {d1}, [r2], r3
vld1.u8 {d3}, [r0], r1
vst1.u8 {d2}, [r2], r3
vld1.u8 {d4}, [r0], r1
vst1.u8 {d3}, [r2], r3
vld1.u8 {d5}, [r0], r1
vst1.u8 {d4}, [r2], r3
vld1.u8 {d6}, [r0], r1
vst1.u8 {d5}, [r2], r3
vld1.u8 {d7}, [r0], r1
vst1.u8 {d6}, [r2], r3
vst1.u8 {d7}, [r2], r3
mov pc, lr
ENDP ; |vp8_copy_mem8x8_neon|
END
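Note: the three copy_mem*_neon routines above are plain block copies; the scalar equivalent is just a row-wise memcpy (a sketch, generic over block size):

#include <string.h>

static void copy_mem_sketch(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride,
                            int width, int height)
{
    int r;
    for (r = 0; r < height; r++)
        memcpy(dst + r * dst_stride, src + r * src_stride, (size_t)width);
}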

View file

@ -0,0 +1,49 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dc_only_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
; unsigned char *dst_ptr, int pitch, int stride)
; r0 input_dc
; r1 pred_ptr
; r2 dst_ptr
; r3 pitch
; sp stride
|vp8_dc_only_idct_add_neon| PROC
add r0, r0, #4
asr r0, r0, #3
ldr r12, [sp]
vdup.16 q0, r0
vld1.32 {d2[0]}, [r1], r3
vld1.32 {d2[1]}, [r1], r3
vld1.32 {d4[0]}, [r1], r3
vld1.32 {d4[1]}, [r1]
vaddw.u8 q1, q0, d2
vaddw.u8 q2, q0, d4
vqmovun.s16 d2, q1
vqmovun.s16 d4, q2
vst1.32 {d2[0]}, [r2], r12
vst1.32 {d2[1]}, [r2], r12
vst1.32 {d4[0]}, [r2], r12
vst1.32 {d4[1]}, [r2]
bx lr
ENDP
END
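Note: a scalar sketch of what vp8_dc_only_idct_add_neon computes — the lone DC coefficient is inverse-transformed to (input_dc + 4) >> 3 and added, with clamping, to every sample of the 4x4 predictor block (the function name is illustrative):

static void dc_only_idct_add_sketch(short input_dc, const unsigned char *pred_ptr,
                                    unsigned char *dst_ptr, int pitch, int stride)
{
    int a1 = (input_dc + 4) >> 3;   /* same rounding as "add r0, r0, #4; asr r0, r0, #3" */
    int r, c;

    for (r = 0; r < 4; r++) {
        for (c = 0; c < 4; c++) {
            int v = pred_ptr[c] + a1;
            dst_ptr[c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
        pred_ptr += pitch;
        dst_ptr += stride;
    }
}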

View file

@ -0,0 +1,96 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_neon|
EXPORT |vp8_short_inv_walsh4x4_1_neon|
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY ; name this block of code
;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
|vp8_short_inv_walsh4x4_neon| PROC
; read in all four lines of values: d0->d3
vldm.64 r0, {q0, q1}
; first for loop
vadd.s16 d4, d0, d3 ;a = [0] + [12]
vadd.s16 d5, d1, d2 ;b = [4] + [8]
vsub.s16 d6, d1, d2 ;c = [4] - [8]
vsub.s16 d7, d0, d3 ;d = [0] - [12]
vadd.s16 d0, d4, d5 ;a + b
vadd.s16 d1, d6, d7 ;c + d
vsub.s16 d2, d4, d5 ;a - b
vsub.s16 d3, d7, d6 ;d - c
vtrn.32 d0, d2 ;d0: 0 1 8 9
;d2: 2 3 10 11
vtrn.32 d1, d3 ;d1: 4 5 12 13
;d3: 6 7 14 15
vtrn.16 d0, d1 ;d0: 0 4 8 12
;d1: 1 5 9 13
vtrn.16 d2, d3 ;d2: 2 6 10 14
;d3: 3 7 11 15
; second for loop
vadd.s16 d4, d0, d3 ;a = [0] + [3]
vadd.s16 d5, d1, d2 ;b = [1] + [2]
vsub.s16 d6, d1, d2 ;c = [1] - [2]
vsub.s16 d7, d0, d3 ;d = [0] - [3]
vadd.s16 d0, d4, d5 ;e = a + b
vadd.s16 d1, d6, d7 ;f = c + d
vsub.s16 d2, d4, d5 ;g = a - b
vsub.s16 d3, d7, d6 ;h = d - c
vmov.i16 q2, #3
vadd.i16 q0, q0, q2 ;e/f += 3
vadd.i16 q1, q1, q2 ;g/h += 3
vshr.s16 q0, q0, #3 ;e/f >> 3
vshr.s16 q1, q1, #3 ;g/h >> 3
vtrn.32 d0, d2
vtrn.32 d1, d3
vtrn.16 d0, d1
vtrn.16 d2, d3
vstmia.16 r1!, {q0}
vstmia.16 r1!, {q1}
bx lr
ENDP ; |vp8_short_inv_walsh4x4_neon|
;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
|vp8_short_inv_walsh4x4_1_neon| PROC
; load a full line into a neon register
vld1.16 {q0}, [r0]
; extract first element and replicate
vdup.16 q1, d0[0]
; add 3 to all values
vmov.i16 q2, #3
vadd.i16 q3, q1, q2
; right shift
vshr.s16 q3, q3, #3
; write it back
vstmia.16 r1!, {q3}
vstmia.16 r1!, {q3}
bx lr
ENDP ; |vp8_short_inv_walsh4x4_1_neon|
END
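Note: a C transcription of the arithmetic annotated in vp8_short_inv_walsh4x4_neon above (a sketch to document the data flow, not the tree's generic C version verbatim):

/* Columns first (a = [0]+[12], b = [4]+[8], ...), then rows with +3 >> 3 rounding. */
static void inv_walsh4x4_sketch(const short *input, short *output)
{
    int i, a, b, c, d;
    short tmp[16];

    for (i = 0; i < 4; i++) {
        a = input[i] + input[i + 12];
        b = input[i + 4] + input[i + 8];
        c = input[i + 4] - input[i + 8];
        d = input[i] - input[i + 12];
        tmp[i]      = (short)(a + b);
        tmp[i + 4]  = (short)(c + d);
        tmp[i + 8]  = (short)(a - b);
        tmp[i + 12] = (short)(d - c);
    }
    for (i = 0; i < 4; i++) {
        a = tmp[4 * i] + tmp[4 * i + 3];
        b = tmp[4 * i + 1] + tmp[4 * i + 2];
        c = tmp[4 * i + 1] - tmp[4 * i + 2];
        d = tmp[4 * i] - tmp[4 * i + 3];
        output[4 * i]     = (short)((a + b + 3) >> 3);
        output[4 * i + 1] = (short)((c + d + 3) >> 3);
        output[4 * i + 2] = (short)((a - b + 3) >> 3);
        output[4 * i + 3] = (short)((d - c + 3) >> 3);
    }
}

vp8_short_inv_walsh4x4_1_neon is the degenerate case where only the DC term is present: every output value is (input[0] + 3) >> 3.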

View file

@ -0,0 +1,409 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
EXPORT |vp8_loop_filter_vertical_edge_y_neon|
EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; flimit, limit, and thresh should be positive numbers.
; All 16 elements in these variables are equal.
; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src
; r1 int pitch
; r2 const signed char *flimit
; r3 const signed char *limit
; sp const signed char *thresh,
; sp+4 int count (unused)
|vp8_loop_filter_horizontal_edge_y_neon| PROC
stmdb sp!, {lr}
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r12, [sp, #4] ; load thresh pointer
vld1.u8 {q3}, [r2], r1 ; p3
vld1.u8 {q4}, [r2], r1 ; p2
vld1.u8 {q5}, [r2], r1 ; p1
vld1.u8 {q6}, [r2], r1 ; p0
vld1.u8 {q7}, [r2], r1 ; q0
vld1.u8 {q8}, [r2], r1 ; q1
vld1.u8 {q9}, [r2], r1 ; q2
vld1.u8 {q10}, [r2] ; q3
vld1.s8 {d4[], d5[]}, [r12] ; thresh
sub r0, r0, r1, lsl #1
bl vp8_loop_filter_neon
vst1.u8 {q5}, [r0], r1 ; store op1
vst1.u8 {q6}, [r0], r1 ; store op0
vst1.u8 {q7}, [r0], r1 ; store oq0
vst1.u8 {q8}, [r0], r1 ; store oq1
ldmia sp!, {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
stmdb sp!, {lr}
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
ldr r2, [sp, #8] ; load v ptr
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
vld1.u8 {d6}, [r3], r1 ; p3
vld1.u8 {d8}, [r3], r1 ; p2
vld1.u8 {d10}, [r3], r1 ; p1
vld1.u8 {d12}, [r3], r1 ; p0
vld1.u8 {d14}, [r3], r1 ; q0
vld1.u8 {d16}, [r3], r1 ; q1
vld1.u8 {d18}, [r3], r1 ; q2
vld1.u8 {d20}, [r3] ; q3
ldr r3, [sp, #4] ; load thresh pointer
sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d7}, [r12], r1 ; p3
vld1.u8 {d9}, [r12], r1 ; p2
vld1.u8 {d11}, [r12], r1 ; p1
vld1.u8 {d13}, [r12], r1 ; p0
vld1.u8 {d15}, [r12], r1 ; q0
vld1.u8 {d17}, [r12], r1 ; q1
vld1.u8 {d19}, [r12], r1 ; q2
vld1.u8 {d21}, [r12] ; q3
vld1.s8 {d4[], d5[]}, [r3] ; thresh
bl vp8_loop_filter_neon
sub r0, r0, r1, lsl #1
sub r2, r2, r1, lsl #1
vst1.u8 {d10}, [r0], r1 ; store u op1
vst1.u8 {d11}, [r2], r1 ; store v op1
vst1.u8 {d12}, [r0], r1 ; store u op0
vst1.u8 {d13}, [r2], r1 ; store v op0
vst1.u8 {d14}, [r0], r1 ; store u oq0
vst1.u8 {d15}, [r2], r1 ; store v oq0
vst1.u8 {d16}, [r0] ; store u oq1
vst1.u8 {d17}, [r2] ; store v oq1
ldmia sp!, {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 int count (unused)
|vp8_loop_filter_vertical_edge_y_neon| PROC
stmdb sp!, {lr}
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
sub r2, r0, #4 ; src ptr down by 4 columns
sub r0, r0, #2 ; dst ptr
ldr r12, [sp, #4] ; load thresh pointer
vld1.u8 {d6}, [r2], r1 ; load first 8-line src data
vld1.u8 {d8}, [r2], r1
vld1.u8 {d10}, [r2], r1
vld1.u8 {d12}, [r2], r1
vld1.u8 {d14}, [r2], r1
vld1.u8 {d16}, [r2], r1
vld1.u8 {d18}, [r2], r1
vld1.u8 {d20}, [r2], r1
vld1.s8 {d4[], d5[]}, [r12] ; thresh
vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
vld1.u8 {d9}, [r2], r1
vld1.u8 {d11}, [r2], r1
vld1.u8 {d13}, [r2], r1
vld1.u8 {d15}, [r2], r1
vld1.u8 {d17}, [r2], r1
vld1.u8 {d19}, [r2], r1
vld1.u8 {d21}, [r2]
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
bl vp8_loop_filter_neon
vswp d12, d11
vswp d16, d13
vswp d14, d12
vswp d16, d15
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r0], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r0], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0], r1
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r0]
ldmia sp!, {pc}
ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_vertical_edge_uv_neon| PROC
stmdb sp!, {lr}
sub r12, r0, #4 ; move u pointer down by 4 columns
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
ldr r2, [sp, #8] ; load v ptr
vld1.u8 {d6}, [r12], r1 ;load u data
vld1.u8 {d8}, [r12], r1
vld1.u8 {d10}, [r12], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d14}, [r12], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d18}, [r12], r1
vld1.u8 {d20}, [r12]
sub r3, r2, #4 ; move v pointer down by 4 columns
vld1.u8 {d7}, [r3], r1 ;load v data
vld1.u8 {d9}, [r3], r1
vld1.u8 {d11}, [r3], r1
vld1.u8 {d13}, [r3], r1
vld1.u8 {d15}, [r3], r1
vld1.u8 {d17}, [r3], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d21}, [r3]
ldr r12, [sp, #4] ; load thresh pointer
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
vld1.s8 {d4[], d5[]}, [r12] ; thresh
bl vp8_loop_filter_neon
sub r0, r0, #2
sub r2, r2, #2
vswp d12, d11
vswp d16, d13
vswp d14, d12
vswp d16, d15
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
ldmia sp!, {pc}
ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
; void vp8_loop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store.
; r0-r3 PRESERVE
; q0 flimit
; q1 limit
; q2 thresh
; q3 p3
; q4 p2
; q5 p1
; q6 p0
; q7 q0
; q8 q1
; q9 q2
; q10 q3
|vp8_loop_filter_neon| PROC
ldr r12, _lf_coeff_
; vp8_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
vabd.u8 q4, q10, q9 ; abs(q3 - q2)
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q3, q3, q4
vmax.u8 q15, q11, q12
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
vmax.u8 q15, q15, q3
vadd.u8 q0, q0, q0 ; flimit * 2
vadd.u8 q0, q0, q1 ; flimit * 2 + limit
vcge.u8 q15, q1, q15
vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
vshr.u8 q2, q2, #1 ; a = a / 2
vqadd.u8 q9, q9, q2 ; a = b + a
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
vld1.u8 {q0}, [r12]!
; vp8_filter() function
; convert to signed
veor q7, q7, q0 ; qs0
veor q6, q6, q0 ; ps0
veor q5, q5, q0 ; ps1
veor q8, q8, q0 ; qs1
vld1.u8 {q10}, [r12]!
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q11, d15, d13
vmovl.u8 q4, d20
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
vorr q14, q13, q14 ; vp8_hevmask
vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
vmul.i16 q11, q11, q4
vand q1, q1, q14 ; vp8_filter &= hev
vand q15, q15, q9 ; vp8_filter_mask
vaddw.s8 q2, q2, d2
vaddw.s8 q11, q11, d3
vld1.u8 {q9}, [r12]!
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
vqmovn.s16 d3, q11
vand q1, q1, q15 ; vp8_filter &= mask
vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
; outer tap adjustments: ++vp8_filter >> 1
vrshr.s8 q1, q1, #1
vbic q1, q1, q14 ; vp8_filter &= ~hev
vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
veor q5, q13, q0 ; *op1 = u^0x80
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
veor q8, q12, q0 ; *oq1 = u^0x80
bx lr
ENDP ; |vp8_loop_filter_neon|
AREA loopfilter_dat, DATA, READONLY
_lf_coeff_
DCD lf_coeff
lf_coeff
DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
END
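The helper above filters 16 pixel positions per call; for readability, here is a per-pixel scalar sketch of the same arithmetic, reconstructed from the comments in the code (the helper and parameter names are illustrative, not the upstream C reference).

#include <stdlib.h>

typedef signed char s8;          /* one 8-bit lane */

static s8 sclamp(int v)          /* signed saturation, as vqadd.s8/vqsub.s8 */
{
    return (s8)(v > 127 ? 127 : (v < -128 ? -128 : v));
}

/* One pixel position across the edge: p3..p0 on one side, q0..q3 on the
 * other. flimit, limit and thresh are the per-lane filter parameters. */
static void loop_filter_px(unsigned char *p3, unsigned char *p2,
                           unsigned char *p1, unsigned char *p0,
                           unsigned char *q0, unsigned char *q1,
                           unsigned char *q2, unsigned char *q3,
                           int flimit, int limit, int thresh)
{
    int mask, hev, f, f1, f2;
    s8 ps1, ps0, qs0, qs1;

    /* vp8_filter_mask: every neighbour difference within limit, and the
     * edge difference within flimit * 2 + limit */
    mask = abs(*p3 - *p2) <= limit && abs(*p2 - *p1) <= limit &&
           abs(*p1 - *p0) <= limit && abs(*q1 - *q0) <= limit &&
           abs(*q2 - *q1) <= limit && abs(*q3 - *q2) <= limit &&
           abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 <= flimit * 2 + limit;

    /* vp8_hevmask: high edge variance */
    hev = abs(*p1 - *p0) > thresh || abs(*q1 - *q0) > thresh;

    /* convert to signed by flipping the sign bit (the veor with 0x80) */
    ps1 = (s8)(*p1 ^ 0x80);  ps0 = (s8)(*p0 ^ 0x80);
    qs0 = (s8)(*q0 ^ 0x80);  qs1 = (s8)(*q1 ^ 0x80);

    f = hev ? sclamp(ps1 - qs1) : 0;            /* vp8_filter &= hev  */
    f = mask ? sclamp(f + 3 * (qs0 - ps0)) : 0; /* vp8_filter &= mask */

    f1 = sclamp(f + 4) >> 3;                    /* Filter1 */
    f2 = sclamp(f + 3) >> 3;                    /* Filter2 */

    *q0 = (unsigned char)(sclamp(qs0 - f1) ^ 0x80);   /* oq0 */
    *p0 = (unsigned char)(sclamp(ps0 + f2) ^ 0x80);   /* op0 */

    f = (f1 + 1) >> 1;                          /* outer tap, vrshr.s8 #1 */
    if (hev) f = 0;                             /* vbic: vp8_filter &= ~hev */
    *q1 = (unsigned char)(sclamp(qs1 - f) ^ 0x80);    /* oq1 */
    *p1 = (unsigned char)(sclamp(ps1 + f) ^ 0x80);    /* op1 */
}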


@ -0,0 +1,118 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
;are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
; r0 unsigned char *s,
; r1 int p, //pitch
; r2 const signed char *flimit,
; r3 const signed char *limit,
; stack(r4) const signed char *thresh,
; //stack(r5) int count --unused
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines
ldr r12, _lfhy_coeff_
vld1.u8 {q5}, [r0], r1 ; p1
vld1.s8 {d2[], d3[]}, [r2] ; flimit
vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13
vld1.u8 {q6}, [r0], r1 ; p0
vld1.u8 {q0}, [r12]! ; 0x80
vld1.u8 {q7}, [r0], r1 ; q0
vld1.u8 {q10}, [r12]! ; 0x03
vld1.u8 {q8}, [r0] ; q1
;vp8_filter_mask() function
vabd.u8 q15, q6, q7 ; abs(p0 - q0)
vabd.u8 q14, q5, q8 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
;vp8_filter() function
veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
vadd.u8 q1, q1, q1 ; flimit * 2
vadd.u8 q1, q1, q13 ; flimit * 2 + limit
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
;;;;;;;;;;
;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q3, d15, d13
vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0)
vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0)
vadd.s16 q12, q3, q3
vld1.u8 {q9}, [r12]! ; 0x04
vadd.s16 q2, q2, q11
vadd.s16 q3, q3, q12
vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q3, q3, d9
;vqadd.s8 q4, q4, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d9, q3
;;;;;;;;;;;;;
vand q4, q4, q15 ; vp8_filter &= mask
vqadd.s8 q2, q4, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q4, q4, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q4, q4, #3 ; Filter1 >>= 3
sub r0, r0, r1, lsl #1
;calculate output
vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
add r3, r0, r1
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
vst1.u8 {q6}, [r0] ; store op0
vst1.u8 {q7}, [r3] ; store oq0
bx lr
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
;-----------------
AREA hloopfiltery_dat, DATA, READWRITE ;read/write by default
;A data section named hloopfiltery_dat is declared. Each DCD below reserves one word of memory.
;The label lfhy_coeff can be used to access the data.
;Data addresses: lfhy_coeff, lfhy_coeff+4, lfhy_coeff+8, ...
_lfhy_coeff_
DCD lfhy_coeff
lfhy_coeff
DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
END
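For reference, a per-pixel scalar sketch of this "simple" filter: it reads only p1..q1, has no hev logic, and rewrites only p0/q0. The same arithmetic is used by the vertical variant in the next file. Names are illustrative.

#include <stdlib.h>

static signed char sc8(int v)    /* signed saturation, as vqadd.s8/vqsub.s8 */
{
    return (signed char)(v > 127 ? 127 : (v < -128 ? -128 : v));
}

static void simple_filter_px(unsigned char *p1, unsigned char *p0,
                             unsigned char *q0, unsigned char *q1,
                             int flimit, int limit)
{
    signed char ps1 = (signed char)(*p1 ^ 0x80);
    signed char ps0 = (signed char)(*p0 ^ 0x80);
    signed char qs0 = (signed char)(*q0 ^ 0x80);
    signed char qs1 = (signed char)(*q1 ^ 0x80);
    int mask, f, f1, f2;

    mask = abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 <= flimit * 2 + limit;

    f = sc8(ps1 - qs1);                  /* clamp(ps1 - qs1)           */
    f = sc8(f + 3 * (qs0 - ps0));        /* clamp(f + 3 * (qs0 - ps0)) */
    if (!mask) f = 0;                    /* vand q4, q4, q15           */

    f1 = sc8(f + 4) >> 3;                /* Filter1 */
    f2 = sc8(f + 3) >> 3;                /* Filter2 */

    *q0 = (unsigned char)(sc8(qs0 - f1) ^ 0x80);   /* oq0 */
    *p0 = (unsigned char)(sc8(ps0 + f2) ^ 0x80);   /* op0 */
}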


@ -0,0 +1,159 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
;are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
; r0 unsigned char *s,
; r1 int p, //pitch
; r2 const signed char *flimit,
; r3 const signed char *limit,
; stack(r4) const signed char *thresh,
; //stack(r5) int count --unused
|vp8_loop_filter_simple_vertical_edge_neon| PROC
sub r0, r0, #2 ; move src pointer down by 2 columns
vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r1
vld1.s8 {d2[], d3[]}, [r2] ; flimit
vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1
ldr r12, _vlfy_coeff_
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1
vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r0], r1
vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r1
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vld1.u8 {q0}, [r12]! ; 0x80
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
vld1.u8 {q11}, [r12]! ; 0x03
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vld1.u8 {q12}, [r12]! ; 0x04
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
vswp d7, d10
vswp d12, d9
;vswp q4, q5 ; p1:q3, p0:q5, q0:q4, q1:q6
;vp8_filter_mask() function
;vp8_hevmask() function
sub r0, r0, r1, lsl #4
vabd.u8 q15, q5, q4 ; abs(p0 - q0)
vabd.u8 q14, q3, q6 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
vadd.u8 q1, q1, q1 ; flimit * 2
vadd.u8 q1, q1, q13 ; flimit * 2 + limit
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
;vp8_filter() function
;;;;;;;;;;
;vqsub.s8 q2, q5, q4 ; ( qs0 - ps0)
vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
vsubl.s8 q13, d9, d11
vqsub.s8 q1, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
;vmul.i8 q2, q2, q11 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
vadd.s16 q14, q13, q13
vadd.s16 q2, q2, q10
vadd.s16 q13, q13, q14
;vqadd.s8 q1, q1, q2
vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d3
vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d3, q13
add r0, r0, #1
add r2, r0, r1
;;;;;;;;;;;
vand q1, q1, q15 ; vp8_filter &= mask
vqadd.s8 q2, q1, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q1, q1, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
;calculate output
vqsub.s8 q10, q4, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1)
vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
veor q7, q10, q0 ; *oq0 = u^0x80
veor q6, q11, q0 ; *op0 = u^0x80
add r3, r2, r1
vswp d13, d14
add r12, r3, r1
;store op1, op0, oq0, oq1
vst2.8 {d12[0], d13[0]}, [r0]
vst2.8 {d12[1], d13[1]}, [r2]
vst2.8 {d12[2], d13[2]}, [r3]
vst2.8 {d12[3], d13[3]}, [r12], r1
add r0, r12, r1
vst2.8 {d12[4], d13[4]}, [r12]
vst2.8 {d12[5], d13[5]}, [r0], r1
add r2, r0, r1
vst2.8 {d12[6], d13[6]}, [r0]
vst2.8 {d12[7], d13[7]}, [r2], r1
add r3, r2, r1
vst2.8 {d14[0], d15[0]}, [r2]
vst2.8 {d14[1], d15[1]}, [r3], r1
add r12, r3, r1
vst2.8 {d14[2], d15[2]}, [r3]
vst2.8 {d14[3], d15[3]}, [r12], r1
add r0, r12, r1
vst2.8 {d14[4], d15[4]}, [r12]
vst2.8 {d14[5], d15[5]}, [r0], r1
add r2, r0, r1
vst2.8 {d14[6], d15[6]}, [r0]
vst2.8 {d14[7], d15[7]}, [r2]
bx lr
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
;-----------------
AREA vloopfiltery_dat, DATA, READWRITE ;read/write by default
;A data section named vloopfiltery_dat is declared. Each DCD below reserves one word of memory.
;The label vlfy_coeff can be used to access the data.
;Data addresses: vlfy_coeff, vlfy_coeff+4, vlfy_coeff+8, ...
_vlfy_coeff_
DCD vlfy_coeff
vlfy_coeff
DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
END


@ -0,0 +1,519 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; flimit, limit, and thresh should be positive numbers.
; All 16 elements in these variables are equal.
; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 int count (unused)
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
stmdb sp!, {lr}
sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r12, [sp, #4] ; load thresh pointer
vld1.u8 {q3}, [r0], r1 ; p3
vld1.s8 {d2[], d3[]}, [r3] ; limit
vld1.u8 {q4}, [r0], r1 ; p2
vld1.s8 {d4[], d5[]}, [r12] ; thresh
vld1.u8 {q5}, [r0], r1 ; p1
vld1.u8 {q6}, [r0], r1 ; p0
vld1.u8 {q7}, [r0], r1 ; q0
vld1.u8 {q8}, [r0], r1 ; q1
vld1.u8 {q9}, [r0], r1 ; q2
vld1.u8 {q10}, [r0], r1 ; q3
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
add r0, r0, r1
add r2, r0, r1
add r3, r2, r1
vst1.u8 {q4}, [r0] ; store op2
vst1.u8 {q5}, [r2] ; store op1
vst1.u8 {q6}, [r3], r1 ; store op0
add r12, r3, r1
vst1.u8 {q7}, [r3] ; store oq0
vst1.u8 {q8}, [r12], r1 ; store oq1
vst1.u8 {q9}, [r12] ; store oq2
ldmia sp!, {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
stmdb sp!, {lr}
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
vld1.s8 {d2[], d3[]}, [r3] ; limit
ldr r3, [sp, #8] ; load v ptr
ldr r12, [sp, #4] ; load thresh pointer
sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r0], r1 ; p3
vld1.u8 {d7}, [r3], r1 ; p3
vld1.u8 {d8}, [r0], r1 ; p2
vld1.u8 {d9}, [r3], r1 ; p2
vld1.u8 {d10}, [r0], r1 ; p1
vld1.u8 {d11}, [r3], r1 ; p1
vld1.u8 {d12}, [r0], r1 ; p0
vld1.u8 {d13}, [r3], r1 ; p0
vld1.u8 {d14}, [r0], r1 ; q0
vld1.u8 {d15}, [r3], r1 ; q0
vld1.u8 {d16}, [r0], r1 ; q1
vld1.u8 {d17}, [r3], r1 ; q1
vld1.u8 {d18}, [r0], r1 ; q2
vld1.u8 {d19}, [r3], r1 ; q2
vld1.u8 {d20}, [r0], r1 ; q3
vld1.u8 {d21}, [r3], r1 ; q3
vld1.s8 {d4[], d5[]}, [r12] ; thresh
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
sub r3, r3, r1, lsl #3
add r0, r0, r1
add r3, r3, r1
vst1.u8 {d8}, [r0], r1 ; store u op2
vst1.u8 {d9}, [r3], r1 ; store v op2
vst1.u8 {d10}, [r0], r1 ; store u op1
vst1.u8 {d11}, [r3], r1 ; store v op1
vst1.u8 {d12}, [r0], r1 ; store u op0
vst1.u8 {d13}, [r3], r1 ; store v op0
vst1.u8 {d14}, [r0], r1 ; store u oq0
vst1.u8 {d15}, [r3], r1 ; store v oq0
vst1.u8 {d16}, [r0], r1 ; store u oq1
vst1.u8 {d17}, [r3], r1 ; store v oq1
vst1.u8 {d18}, [r0], r1 ; store u oq2
vst1.u8 {d19}, [r3], r1 ; store v oq2
ldmia sp!, {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 int count (unused)
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
stmdb sp!, {lr}
sub r0, r0, #4 ; move src pointer down by 4 columns
vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
ldr r12, [sp, #4] ; load thresh pointer
vld1.u8 {d8}, [r0], r1
sub sp, sp, #32
vld1.u8 {d10}, [r0], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d7}, [r0], r1 ; load second 8-line src data
vld1.u8 {d9}, [r0], r1
vld1.u8 {d11}, [r0], r1
vld1.u8 {d13}, [r0], r1
vld1.u8 {d15}, [r0], r1
vld1.u8 {d17}, [r0], r1
vld1.u8 {d19}, [r0], r1
vld1.u8 {d21}, [r0], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
vld1.s8 {d4[], d5[]}, [r12] ; thresh
vld1.s8 {d2[], d3[]}, [r3] ; limit
mov r12, sp
vst1.u8 {q3}, [r12]!
vst1.u8 {q10}, [r12]!
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #4
add r2, r0, r1
add r3, r2, r1
vld1.u8 {q3}, [sp]!
vld1.u8 {q10}, [sp]!
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
add r12, r3, r1
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0]
vst1.8 {d8}, [r2]
vst1.8 {d10}, [r3]
vst1.8 {d12}, [r12], r1
add r0, r12, r1
vst1.8 {d14}, [r12]
vst1.8 {d16}, [r0], r1
add r2, r0, r1
vst1.8 {d18}, [r0]
vst1.8 {d20}, [r2], r1
add r3, r2, r1
vst1.8 {d7}, [r2]
vst1.8 {d9}, [r3], r1
add r12, r3, r1
vst1.8 {d11}, [r3]
vst1.8 {d13}, [r12], r1
add r0, r12, r1
vst1.8 {d15}, [r12]
vst1.8 {d17}, [r0], r1
add r2, r0, r1
vst1.8 {d19}, [r0]
vst1.8 {d21}, [r2]
ldmia sp!, {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
stmdb sp!, {lr}
sub r0, r0, #4 ; move src pointer down by 4 columns
vld1.s8 {d2[], d3[]}, [r3] ; limit
ldr r3, [sp, #8] ; load v ptr
ldr r12, [sp, #4] ; load thresh pointer
sub r3, r3, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r0], r1 ;load u data
vld1.u8 {d7}, [r3], r1 ;load v data
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r3], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r3], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r3], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r3], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r3], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r3], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
sub sp, sp, #32
vld1.s8 {d4[], d5[]}, [r12] ; thresh
mov r12, sp
vst1.u8 {q3}, [r12]!
vst1.u8 {q10}, [r12]!
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
sub r3, r3, r1, lsl #3
vld1.u8 {q3}, [sp]!
vld1.u8 {q10}, [sp]!
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r3], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r3], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r3], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r3], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r3], r1
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r3], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r3], r1
vst1.8 {d20}, [r0], r1
vst1.8 {d21}, [r3], r1
ldmia sp!, {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
; void vp8_mbloop_filter_neon()
; This is a helper function for the macroblock loopfilters. The individual
; functions do the necessary load, transpose (if necessary), preserve (if
; necessary) and store.
; TODO:
; The vertical filter writes p3/q3 back out because two 4 element writes are
; much simpler than ordering and writing two 3-element sets (or three 2-element
; sets, or whatever other combinations are possible).
; If we can preserve q3 and q10, the vertical filter will be able to avoid
; storing those values on the stack and reading them back after the filter.
; r0,r1 PRESERVE
; r2 flimit
; r3 PRESERVE
; q1 limit
; q2 thresh
; q3 p3
; q4 p2
; q5 p1
; q6 p0
; q7 q0
; q8 q1
; q9 q2
; q10 q3
|vp8_mbloop_filter_neon| PROC
ldr r12, _mblf_coeff_
; vp8_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
vabd.u8 q0, q10, q9 ; abs(q3 - q2)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q3, q3, q0
vmax.u8 q15, q11, q12
vabd.u8 q12, q6, q7 ; abs(p0 - q0)
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
vmax.u8 q15, q15, q3
vld1.s8 {d4[], d5[]}, [r2] ; flimit
vld1.u8 {q0}, [r12]!
vadd.u8 q2, q2, q2 ; flimit * 2
vadd.u8 q2, q2, q1 ; flimit * 2 + limit
vcge.u8 q15, q1, q15
vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
vshr.u8 q1, q1, #1 ; a = a / 2
vqadd.u8 q12, q12, q1 ; a = b + a
vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
; vp8_filter
; convert to signed
veor q7, q7, q0 ; qs0
veor q6, q6, q0 ; ps0
veor q5, q5, q0 ; ps1
veor q8, q8, q0 ; qs1
veor q4, q4, q0 ; ps2
veor q9, q9, q0 ; qs2
vorr q14, q13, q14 ; vp8_hevmask
vsubl.s8 q2, d14, d12 ; qs0 - ps0
vsubl.s8 q13, d15, d13
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
vadd.s16 q10, q2, q2 ; 3 * (qs0 - ps0)
vadd.s16 q11, q13, q13
vand q15, q15, q12 ; vp8_filter_mask
vadd.s16 q2, q2, q10
vadd.s16 q13, q13, q11
vld1.u8 {q12}, [r12]! ; #3
vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d3
vld1.u8 {q11}, [r12]! ; #4
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
vqmovn.s16 d3, q13
vand q1, q1, q15 ; vp8_filter &= mask
vld1.u8 {q15}, [r12]! ; #63
;
vand q13, q1, q14 ; Filter2 &= hev
vld1.u8 {d7}, [r12]! ; #9
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
vld1.u8 {d6}, [r12]! ; #18
vshr.s8 q2, q2, #3 ; Filter1 >>= 3
vshr.s8 q13, q13, #3 ; Filter2 >>= 3
vmov q10, q15
vmov q12, q15
vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
vld1.u8 {d5}, [r12]! ; #27
vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
vbic q1, q1, q14 ; vp8_filter &= ~hev
; roughly 1/7th difference across boundary
; roughly 2/7th difference across boundary
; roughly 3/7th difference across boundary
vmov q11, q15
vmov q13, q15
vmov q14, q15
vmlal.s8 q10, d2, d7 ; Filter2 * 9
vmlal.s8 q11, d3, d7
vmlal.s8 q12, d2, d6 ; Filter2 * 18
vmlal.s8 q13, d3, d6
vmlal.s8 q14, d2, d5 ; Filter2 * 27
vmlal.s8 q15, d3, d5
vqshrn.s16 d20, q10, #7 ; u = clamp((63 + Filter2 * 9)>>7)
vqshrn.s16 d21, q11, #7
vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
vqshrn.s16 d25, q13, #7
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
vqshrn.s16 d29, q15, #7
vqsub.s8 q11, q9, q10 ; s = clamp(qs2 - u)
vqadd.s8 q10, q4, q10 ; s = clamp(ps2 + u)
vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
veor q9, q11, q0 ; *oq2 = s^0x80
veor q4, q10, q0 ; *op2 = s^0x80
veor q8, q13, q0 ; *oq1 = s^0x80
veor q5, q12, q0 ; *op1 = s^0x80
veor q7, q15, q0 ; *oq0 = s^0x80
veor q6, q14, q0 ; *op0 = s^0x80
bx lr
ENDP ; |vp8_mbloop_filter_neon|
AREA mbloopfilter_dat, DATA, READONLY
_mblf_coeff_
DCD mblf_coeff
mblf_coeff
DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
DCD 0x1b1b1b1b, 0x1b1b1b1b
END
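A per-pixel scalar sketch of the wide-filter tail above, i.e. the "roughly 1/7th, 2/7th, 3/7th difference across boundary" taps. Here `filter` is the clamped vp8_filter value after masking and after removing the hev component, the six pixel values are already in the 0x80-offset signed domain, and the hev-gated Filter1/Filter2 step has already adjusted qs0/ps0, exactly as in the code above. Names are illustrative.

static signed char msc8(int v)   /* signed saturation, as vqadd.s8/vqshrn.s16 */
{
    return (signed char)(v > 127 ? 127 : (v < -128 ? -128 : v));
}

static void mb_filter_taps(signed char *ps2, signed char *ps1, signed char *ps0,
                           signed char *qs0, signed char *qs1, signed char *qs2,
                           int filter)
{
    int u;

    u = msc8((63 + filter * 27) >> 7);   /* ~3/7 of the difference */
    *qs0 = msc8(*qs0 - u);
    *ps0 = msc8(*ps0 + u);

    u = msc8((63 + filter * 18) >> 7);   /* ~2/7 */
    *qs1 = msc8(*qs1 - u);
    *ps1 = msc8(*ps1 + u);

    u = msc8((63 + filter * 9) >> 7);    /* ~1/7 */
    *qs2 = msc8(*qs2 - u);
    *ps2 = msc8(*ps2 + u);
}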


@ -0,0 +1,131 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon16x16mb_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int ystride,
; stack unsigned char *udst_ptr,
; stack unsigned char *vdst_ptr
|vp8_recon16x16mb_neon| PROC
mov r12, #4 ;loop counter for Y loop
recon16x16mb_loop_y
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
vld1.u8 {q14, q15}, [r0]!
vld1.16 {q10, q11}, [r1]!
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d25
vmovl.u8 q2, d26
vmovl.u8 q3, d27
vmovl.u8 q4, d28
vmovl.u8 q5, d29
vmovl.u8 q6, d30
vld1.16 {q12, q13}, [r1]!
vmovl.u8 q7, d31
vld1.16 {q14, q15}, [r1]!
pld [r0]
pld [r1]
pld [r1, #64]
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
vadd.s16 q1, q1, q9
vadd.s16 q2, q2, q10
vadd.s16 q3, q3, q11
vadd.s16 q4, q4, q12
vadd.s16 q5, q5, q13
vadd.s16 q6, q6, q14
vadd.s16 q7, q7, q15
vqmovun.s16 d0, q0 ;CLAMP() saturation
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
vqmovun.s16 d4, q4
vqmovun.s16 d5, q5
vst1.u8 {q0}, [r2], r3 ;store result
vqmovun.s16 d6, q6
vst1.u8 {q1}, [r2], r3
vqmovun.s16 d7, q7
vst1.u8 {q2}, [r2], r3
subs r12, r12, #1
moveq r12, #2 ;loop counter for UV loop
vst1.u8 {q3}, [r2], r3
bne recon16x16mb_loop_y
mov r3, r3, lsr #1 ;uv_stride = ystride>>1
ldr r2, [sp] ;load upred_ptr
recon16x16mb_loop_uv
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
vld1.u8 {q14, q15}, [r0]!
vld1.16 {q10, q11}, [r1]!
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d25
vmovl.u8 q2, d26
vmovl.u8 q3, d27
vmovl.u8 q4, d28
vmovl.u8 q5, d29
vmovl.u8 q6, d30
vld1.16 {q12, q13}, [r1]!
vmovl.u8 q7, d31
vld1.16 {q14, q15}, [r1]!
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
vadd.s16 q1, q1, q9
vadd.s16 q2, q2, q10
vadd.s16 q3, q3, q11
vadd.s16 q4, q4, q12
vadd.s16 q5, q5, q13
vadd.s16 q6, q6, q14
vqmovun.s16 d0, q0 ;CLAMP() saturation
vadd.s16 q7, q7, q15
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
vst1.u8 {d0}, [r2], r3 ;store result
vqmovun.s16 d4, q4
vst1.u8 {d1}, [r2], r3
vqmovun.s16 d5, q5
vst1.u8 {d2}, [r2], r3
vqmovun.s16 d6, q6
vst1.u8 {d3}, [r2], r3
vqmovun.s16 d7, q7
vst1.u8 {d4}, [r2], r3
subs r12, r12, #1
vst1.u8 {d5}, [r2], r3
vst1.u8 {d6}, [r2], r3
vst1.u8 {d7}, [r2], r3
ldrne r2, [sp, #4] ;load vpred_ptr
bne recon16x16mb_loop_uv
bx lr
ENDP
END
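All of the recon routines in these files compute the same thing: add the signed residual to the 8-bit prediction and saturate back to [0, 255]. The 16x16 routine above does this for the Y plane and then the two 8x8 chroma planes; the 2b/4b/b variants that follow do the same over smaller blocks. A scalar sketch, assuming contiguous pred/diff buffers as in the 16x16 case (names are illustrative):

static void recon_block(const unsigned char *pred, const short *diff,
                        unsigned char *dst, int width, int height, int stride)
{
    int r, c, v;

    for (r = 0; r < height; r++)
    {
        for (c = 0; c < width; c++)
        {
            v = pred[c] + diff[c];                     /* vadd.s16     */
            dst[c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
                                                       /* vqmovun.s16  */
        }
        pred += width;
        diff += width;
        dst += stride;
    }
}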


@ -0,0 +1,54 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon2b_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int stride
|vp8_recon2b_neon| PROC
vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr
vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr
vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits
vld1.16 {q6, q7}, [r1]!
vmovl.u8 q1, d17
vmovl.u8 q2, d18
vmovl.u8 q3, d19
vadd.s16 q0, q0, q4 ;add Diff data and Pred data together
vadd.s16 q1, q1, q5
vadd.s16 q2, q2, q6
vadd.s16 q3, q3, q7
vqmovun.s16 d0, q0 ;CLAMP() saturation
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
add r0, r2, r3
vst1.u8 {d0}, [r2] ;store result
vst1.u8 {d1}, [r0], r3
add r2, r0, r3
vst1.u8 {d2}, [r0]
vst1.u8 {d3}, [r2], r3
bx lr
ENDP
END


@ -0,0 +1,69 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon4b_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int stride
|vp8_recon4b_neon| PROC
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
vld1.u8 {q14, q15}, [r0]
vld1.16 {q10, q11}, [r1]!
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d25
vmovl.u8 q2, d26
vmovl.u8 q3, d27
vmovl.u8 q4, d28
vmovl.u8 q5, d29
vmovl.u8 q6, d30
vld1.16 {q12, q13}, [r1]!
vmovl.u8 q7, d31
vld1.16 {q14, q15}, [r1]
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
vadd.s16 q1, q1, q9
vadd.s16 q2, q2, q10
vadd.s16 q3, q3, q11
vadd.s16 q4, q4, q12
vadd.s16 q5, q5, q13
vadd.s16 q6, q6, q14
vadd.s16 q7, q7, q15
vqmovun.s16 d0, q0 ;CLAMP() saturation
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
vqmovun.s16 d4, q4
vqmovun.s16 d5, q5
vqmovun.s16 d6, q6
vqmovun.s16 d7, q7
add r0, r2, r3
vst1.u8 {q0}, [r2] ;store result
vst1.u8 {q1}, [r0], r3
add r2, r0, r3
vst1.u8 {q2}, [r0]
vst1.u8 {q3}, [r2], r3
bx lr
ENDP
END


@ -0,0 +1,29 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "recon.h"
#include "blockd.h"
extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
unsigned char *pred_ptr = &x->predictor[0];
short *diff_ptr = &x->diff[0];
unsigned char *dst_ptr = x->dst.y_buffer;
unsigned char *udst_ptr = x->dst.u_buffer;
unsigned char *vdst_ptr = x->dst.v_buffer;
int ystride = x->dst.y_stride;
/*int uv_stride = x->dst.uv_stride;*/
vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr);
}


@ -0,0 +1,61 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon_b_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int stride
|vp8_recon_b_neon| PROC
mov r12, #16
vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr
vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr
vld1.u8 {d29}, [r0], r12
vld1.16 {q11, q12}, [r1]!
vld1.u8 {d30}, [r0], r12
vld1.16 {q12, q13}, [r1]!
vld1.u8 {d31}, [r0], r12
vld1.16 {q13}, [r1]
vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6
vmovl.u8 q2, d30
vmovl.u8 q3, d31
vadd.s16 d0, d0, d20 ;add Diff data and Pred data together
vadd.s16 d2, d2, d22
vadd.s16 d4, d4, d24
vadd.s16 d6, d6, d26
vqmovun.s16 d0, q0 ;CLAMP() saturation
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
add r1, r2, r3
vst1.32 {d0[0]}, [r2] ;store result
vst1.32 {d1[0]}, [r1], r3
add r2, r1, r3
vst1.32 {d2[0]}, [r1]
vst1.32 {d3[0]}, [r2], r3
bx lr
ENDP
END


@ -0,0 +1,36 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_push_neon|
EXPORT |vp8_pop_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
|vp8_push_neon| PROC
vst1.i64 {d8, d9, d10, d11}, [r0]!
vst1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
|vp8_pop_neon| PROC
vld1.i64 {d8, d9, d10, d11}, [r0]!
vld1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
END


@ -0,0 +1,67 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_idct4x4llm_1_neon|
EXPORT |vp8_dc_only_idct_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
; r0 short *input;
; r1 short *output;
; r2 int pitch;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|vp8_short_idct4x4llm_1_neon| PROC
vld1.16 {d0[]}, [r0] ;load input[0]
add r3, r1, r2
add r12, r3, r2
vrshr.s16 d0, d0, #3
add r0, r12, r2
vst1.16 {d0}, [r1]
vst1.16 {d0}, [r3]
vst1.16 {d0}, [r12]
vst1.16 {d0}, [r0]
bx lr
ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
; r0 short input_dc;
; r1 short *output;
; r2 int pitch;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|vp8_dc_only_idct_neon| PROC
vdup.16 d0, r0
add r3, r1, r2
add r12, r3, r2
vrshr.s16 d0, d0, #3
add r0, r12, r2
vst1.16 {d0}, [r1]
vst1.16 {d0}, [r3]
vst1.16 {d0}, [r12]
vst1.16 {d0}, [r0]
bx lr
ENDP
END
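Both routines above reduce to the same scalar operation: round the single DC value, shift it down by 3, and replicate it over the 4x4 block. A sketch, assuming pitch is in bytes as in the assembly (so output rows are pitch/2 shorts apart); names are illustrative.

#include <stdint.h>

static void dc_only_idct_sketch(int16_t dc, int16_t *output, int pitch)
{
    int r, c;
    int16_t v = (int16_t)((dc + 4) >> 3);   /* vrshr.s16 #3 */

    for (r = 0; r < 4; r++)
        for (c = 0; c < 4; c++)
            output[r * (pitch / 2) + c] = v;
}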


@ -0,0 +1,127 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_idct4x4llm_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;*************************************************************
;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
;r0 short * input
;r1 short * output
;r2 int pitch
;*************************************************************
;static const int cospi8sqrt2minus1=20091;
;static const int sinpi8sqrt2 =35468;
;static const int rounding = 0;
;Optimization note: the data produced by dequantization are signed 13-bit values in the
;range [-4096, 4095]. This allows the "vqdmulh" (NEON) instruction to be used, since the
;intermediate result cannot go out of range (13+16+1 = 30 bits < 32 bits). The instruction
;returns the high half of the multiplication, which is exactly what the IDCT needs.
;(A short scalar sketch of this identity follows the END of this file.)
|vp8_short_idct4x4llm_neon| PROC
ldr r12, _idct_coeff_
vld1.16 {q1, q2}, [r0]
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
vqadd.s16 q4, q4, q2
;d6 - c1:temp1
;d7 - d1:temp2
;d8 - d1:temp1
;d9 - c1:temp2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vswp d3, d4
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vrshr.s16 d2, d2, #3
vrshr.s16 d3, d3, #3
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
add r3, r1, r2
add r12, r3, r2
add r0, r12, r2
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vst1.16 {d2}, [r1]
vst1.16 {d3}, [r3]
vst1.16 {d4}, [r12]
vst1.16 {d5}, [r0]
bx lr
ENDP
;-----------------
AREA idct4x4_dat, DATA, READWRITE ;read/write by default
;A data section named idct4x4_dat is declared. Each DCD below reserves one word of memory.
;The label idct_coeff can be used to access the data.
;Data addresses: idct_coeff, idct_coeff+4, ...
_idct_coeff_
DCD idct_coeff
idct_coeff
DCD 0x4e7b4e7b, 0x8a8c8a8c
;20091, 20091, 35468, 35468
END
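A small C sketch of the fixed-point identity behind the optimization note above. vqdmulh.s16 returns the high half of 2*a*b with saturation; because the dequantized inputs fit in 13 bits, the doubled product never overflows or saturates, so the vqdmulh/vshr #1 pair is an exact (a*b) >> 16. The constant 35468 does not fit in a signed 16-bit lane, which is why it is stored as 35468 - 65536 and one extra copy of x is added back afterwards (the vqadd that the "negative number" comments refer to). Helper names are illustrative.

#include <stdint.h>

static int16_t mul_high16(int16_t x, int16_t k)   /* vqdmulh.s16 + vshr.s16 #1 */
{
    return (int16_t)(((int32_t)x * k) >> 16);
}

static int16_t mul_sinpi8sqrt2(int16_t x)         /* (x * 35468) >> 16 */
{
    return (int16_t)(mul_high16(x, 35468 - 65536) + x);
}

static int16_t mul_cospi8sqrt2(int16_t x)         /* x + ((x * 20091) >> 16) */
{
    return (int16_t)(x + mul_high16(x, 20091));
}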


@ -0,0 +1,495 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict16x16_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch
;Note: to take advantage of the 8-bit multiplication instructions in NEON, first apply abs() to
; the filter coeffs to make them u8, then use vmlsl for the negative coeffs. After multiplication
; the result can be negative, so it is treated as s16. However, the result can also be a large
; positive number (> 2^15-1), which would be misread as a negative value. To avoid that error,
; the filter coeffs are applied in the order 0, 1, 4, 5, 2, which keeps the running sum inside
; the s16 range. Finally, the term for the 3rd filter coeff is added with saturation. The same
; applies to the other filter functions. (A scalar sketch of this ordering follows the END of
; this file.)
|vp8_sixtap_predict16x16_neon| PROC
push {r4-r5, lr}
ldr r12, _filter16_coeff_
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter16x16_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter16x16_only
sub sp, sp, #336 ;reserve space on stack for temporary storage
mov lr, sp
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #7 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First Pass: output_height lines x output_width columns (21x16)
filt_blk2d_fp16x16_loop_neon
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
vld1.u8 {d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d7, d0
vmull.u8 q10, d9, d0
vmull.u8 q11, d10, d0
vmull.u8 q12, d12, d0
vmull.u8 q13, d13, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d9, d10, #1
vext.8 d30, d12, d13, #1
vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q10, d29, d1
vmlsl.u8 q12, d30, d1
vext.8 d28, d7, d8, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d13, d14, #1
vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q11, d29, d1
vmlsl.u8 q13, d30, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d9, d10, #4
vext.8 d30, d12, d13, #4
vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q10, d29, d4
vmlsl.u8 q12, d30, d4
vext.8 d28, d7, d8, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d13, d14, #4
vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q11, d29, d4
vmlsl.u8 q13, d30, d4
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d9, d10, #5
vext.8 d30, d12, d13, #5
vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q10, d29, d5
vmlal.u8 q12, d30, d5
vext.8 d28, d7, d8, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d13, d14, #5
vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q11, d29, d5
vmlal.u8 q13, d30, d5
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d9, d10, #2
vext.8 d30, d12, d13, #2
vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q10, d29, d2
vmlal.u8 q12, d30, d2
vext.8 d28, d7, d8, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d13, d14, #2
vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q11, d29, d2
vmlal.u8 q13, d30, d2
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d9, d10, #3
vext.8 d30, d12, d13, #3
vext.8 d15, d7, d8, #3
vext.8 d31, d10, d11, #3
vext.8 d6, d13, d14, #3
vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
vqadd.s16 q10, q5
vqadd.s16 q12, q6
vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q7, d31, d3
vmull.u8 q3, d6, d3
subs r2, r2, #1
vqadd.s16 q9, q6
vqadd.s16 q11, q7
vqadd.s16 q13, q3
vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q9, #7
vqrshrun.s16 d8, q10, #7
vqrshrun.s16 d9, q11, #7
vqrshrun.s16 d10, q12, #7
vqrshrun.s16 d11, q13, #7
vst1.u8 {d6, d7, d8}, [lr]! ;store result
vst1.u8 {d9, d10, d11}, [lr]!
bne filt_blk2d_fp16x16_loop_neon
;Second pass: 16x16
;secondpass_filter - do first 8-columns and then second 8-columns
add r3, r12, r3, lsl #5
sub lr, lr, #336
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
mov r3, #2 ;loop counter
vabs.s32 q7, q5
vabs.s32 q8, q6
mov r2, #16
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_sp16x16_outloop_neon
vld1.u8 {d18}, [lr], r2 ;load src data
vld1.u8 {d19}, [lr], r2
vld1.u8 {d20}, [lr], r2
vld1.u8 {d21}, [lr], r2
mov r12, #4 ;loop counter
vld1.u8 {d22}, [lr], r2
secondpass_inner_loop_neon
vld1.u8 {d23}, [lr], r2 ;load src data
vld1.u8 {d24}, [lr], r2
vld1.u8 {d25}, [lr], r2
vld1.u8 {d26}, [lr], r2
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r12, r12, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vmov q9, q11
vst1.u8 {d7}, [r4], r5
vmov q10, q12
vst1.u8 {d8}, [r4], r5
vmov d22, d26
vst1.u8 {d9}, [r4], r5
bne secondpass_inner_loop_neon
subs r3, r3, #1
sub lr, lr, #336
add lr, lr, #8
sub r4, r4, r5, lsl #4
add r4, r4, #8
bne filt_blk2d_sp16x16_outloop_neon
add sp, sp, #336
pop {r4-r5,pc}
;--------------------
firstpass_filter16x16_only
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #8 ;loop counter
sub r0, r0, #2 ;move srcptr back to (column-2)
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First Pass: output_height lines x output_width columns (16x16)
filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
vld1.u8 {d9, d10, d11}, [r0], r1
pld [r0]
pld [r0, r1]
vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q7, d7, d0
vmull.u8 q8, d9, d0
vmull.u8 q9, d10, d0
vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d21, d9, d10, #1
vext.8 d22, d7, d8, #1
vext.8 d23, d10, d11, #1
vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
vext.8 d25, d9, d10, #4
vext.8 d26, d7, d8, #4
vext.8 d27, d10, d11, #4
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d9, d10, #5
vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d21, d1
vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d23, d1
vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d25, d4
vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d27, d4
vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vext.8 d20, d7, d8, #5
vext.8 d21, d10, d11, #5
vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
vext.8 d23, d9, d10, #2
vext.8 d24, d7, d8, #2
vext.8 d25, d10, d11, #2
vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
vext.8 d27, d9, d10, #3
vext.8 d28, d7, d8, #3
vext.8 d29, d10, d11, #3
vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d21, d5
vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d23, d2
vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d25, d2
vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q11, d27, d3
vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q15, d29, d3
vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q11
vqadd.s16 q7, q12
vqadd.s16 q9, q15
subs r2, r2, #1
vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q7, #7
vqrshrun.s16 d8, q8, #7
vqrshrun.s16 d9, q9, #7
vst1.u8 {q3}, [r4], r5 ;store result
vst1.u8 {q4}, [r4], r5
bne filt_blk2d_fpo16x16_loop_neon
pop {r4-r5,pc}
;--------------------
secondpass_filter16x16_only
;Second pass: 16x16
add r3, r12, r3, lsl #5
sub r0, r0, r1, lsl #1
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
mov r3, #2 ;loop counter
vabs.s32 q7, q5
vabs.s32 q8, q6
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_spo16x16_outloop_neon
vld1.u8 {d18}, [r0], r1 ;load src data
vld1.u8 {d19}, [r0], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r0], r1
mov r12, #4 ;loop counter
vld1.u8 {d22}, [r0], r1
secondpass_only_inner_loop_neon
vld1.u8 {d23}, [r0], r1 ;load src data
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r12, r12, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vmov q9, q11
vst1.u8 {d7}, [r4], r5
vmov q10, q12
vst1.u8 {d8}, [r4], r5
vmov d22, d26
vst1.u8 {d9}, [r4], r5
bne secondpass_only_inner_loop_neon
subs r3, r3, #1
sub r0, r0, r1, lsl #4
sub r0, r0, r1, lsl #2
sub r0, r0, r1
add r0, r0, #8
sub r4, r4, r5, lsl #4
add r4, r4, #8
bne filt_blk2d_spo16x16_outloop_neon
pop {r4-r5,pc}
ENDP
;-----------------
AREA subpelfilters16_dat, DATA, READWRITE ;read/write by default
;A data section named subpelfilters16_dat is declared. Each DCD below reserves one word of memory.
;The label filter16_coeff can be used to access the data.
;Data addresses: filter16_coeff, filter16_coeff+4, filter16_coeff+8, ...
_filter16_coeff_
DCD filter16_coeff
filter16_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
END
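A per-pixel scalar sketch of the accumulation order described in the note at the top of this file (the same scheme is used by the other six-tap variants): taps 0, 1, 4, 5 and 2 are folded into one non-saturating 16-bit running sum, and the large centre tap 3 is added last with saturation before the rounding shift. Here `filter` is one of the eight coefficient rows above, `src` points at the pixel being filtered, and the names are illustrative.

#include <stdint.h>

static int16_t sat16(int32_t v)                    /* vqadd.s16 saturation */
{
    return (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
}

static uint8_t sixtap_px(const uint8_t *src, const int16_t *filter)
{
    int16_t sum, centre;
    int32_t v;

    /* taps 0, 1, 4, 5, 2: plain 16-bit accumulation; this order keeps the
     * partial sum inside the s16 range (the assembly uses |coeff| with
     * vmlal.u8/vmlsl.u8 to the same effect) */
    sum = (int16_t)(src[-2] * filter[0]);
    sum = (int16_t)(sum + src[-1] * filter[1]);
    sum = (int16_t)(sum + src[ 2] * filter[4]);
    sum = (int16_t)(sum + src[ 3] * filter[5]);
    sum = (int16_t)(sum + src[ 0] * filter[2]);

    /* large centre tap added last with saturation (vqadd.s16), then
     * round, shift by 7 and saturate to u8 (vqrshrun.s16 #7) */
    centre = (int16_t)(src[1] * filter[3]);
    v = ((int32_t)sat16((int32_t)sum + centre) + 64) >> 7;

    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}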


@ -0,0 +1,426 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_sixtap_predict_neon| PROC
push {r4, lr}
ldr r12, _filter4_coeff_
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter4x4_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter4x4_only
vabs.s32 q12, q14 ;get abs(filer_parameters)
vabs.s32 q13, q15
sub r0, r0, #2 ;go back 2 columns of src data
sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
;First pass: output_height lines x output_width columns (9x4)
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3
vld1.u8 {q3}, [r0], r1 ;load the remaining 5 lines of src data
vld1.u8 {q4}, [r0], r1
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vld1.u8 {q5}, [r0], r1
vld1.u8 {q6}, [r0], r1
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d28, q8, #7
;First pass on the remaining 5 lines of data
vld1.u8 {q11}, [r0], r1
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5
vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5])
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0
vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1
vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1])
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4
vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4])
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2
vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2])
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3
vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3])
add r3, r12, r3, lsl #5
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vqadd.s16 q12, q11
vext.8 d23, d27, d28, #4
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d30, q8, #7
vqrshrun.s16 d31, q12, #7
;Second pass: 4x4
vabs.s32 q7, q5
vabs.s32 q8, q6
vext.8 d24, d28, d29, #4
vext.8 d25, d29, d30, #4
vext.8 d26, d30, d31, #4
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d28, d0
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q6, d26, d5
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d30, d4
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q6, d24, d1
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d29, d2
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
vmlal.u8 q6, d25, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q6, q4
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
vqrshrun.s16 d4, q6, #7
vst1.32 {d3[0]}, [r4] ;store result
vst1.32 {d3[1]}, [r0]
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]
pop {r4, pc}
;---------------------
firstpass_filter4x4_only
vabs.s32 q12, q14 ;get abs(filter_parameters)
vabs.s32 q13, q15
sub r0, r0, #2 ;go back 2 columns of src data
;First pass: output_height lines x output_width columns (4x4)
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d28, q8, #7
vst1.32 {d27[0]}, [r4] ;store result
vst1.32 {d27[1]}, [r0]
vst1.32 {d28[0]}, [r1]
vst1.32 {d28[1]}, [r2]
pop {r4, pc}
;---------------------
secondpass_filter4x4_only
sub r0, r0, r1, lsl #1
add r3, r12, r3, lsl #5
vld1.32 {d27[0]}, [r0], r1 ;load src data
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.32 {d27[1]}, [r0], r1
vabs.s32 q7, q5
vld1.32 {d28[0]}, [r0], r1
vabs.s32 q8, q6
vld1.32 {d28[1]}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.32 {d29[0]}, [r0], r1
vdup.8 d1, d14[4]
vld1.32 {d29[1]}, [r0], r1
vdup.8 d2, d15[0]
vld1.32 {d30[0]}, [r0], r1
vdup.8 d3, d15[4]
vld1.32 {d30[1]}, [r0], r1
vdup.8 d4, d16[0]
vld1.32 {d31[0]}, [r0], r1
vdup.8 d5, d16[4]
vext.8 d23, d27, d28, #4
vext.8 d24, d28, d29, #4
vext.8 d25, d29, d30, #4
vext.8 d26, d30, d31, #4
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d28, d0
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q6, d26, d5
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d30, d4
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q6, d24, d1
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d29, d2
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
vmlal.u8 q6, d25, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q6, q4
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
vqrshrun.s16 d4, q6, #7
vst1.32 {d3[0]}, [r4] ;store result
vst1.32 {d3[1]}, [r0]
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]
pop {r4, pc}
ENDP
;-----------------
AREA subpelfilters4_dat, DATA, READWRITE ;read/write by default
;Data section subpelfilters4_dat holds the six-tap sub-pixel filters. DCD reserves one word per entry:
;8 filters x 8 words = 64 words (48 real taps plus zero padding). The label filter4_coeff can be used
;to access the data: filter4_coeff, filter4_coeff+4, filter4_coeff+8, ...
_filter4_coeff_
DCD filter4_coeff
filter4_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
END
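The filter4_coeff table above is stored as 8 rows of 8 words (32 bytes per row), one row per eighth-pel offset, even though only the first 6 words of a row are real taps; that padding is what allows the simple indexing add r2, r12, r2, lsl #5 (offset times 32 bytes). A hedged C view of the same layout, with illustrative names:

/* Same values as the DCD rows above: 8 sub-pel positions x 8 words,
 * with the last two words of each row as zero padding so a row is
 * exactly 32 bytes ("lsl #5" in the assembly). */
static const int filter4_coeff_sketch[8][8] = {
    { 0,   0, 128,   0,   0,  0, 0, 0 },
    { 0,  -6, 123,  12,  -1,  0, 0, 0 },
    { 2, -11, 108,  36,  -8,  1, 0, 0 },
    { 0,  -9,  93,  50,  -6,  0, 0, 0 },
    { 3, -16,  77,  77, -16,  3, 0, 0 },
    { 0,  -6,  50,  93,  -9,  0, 0, 0 },
    { 1,  -8,  36, 108, -11,  2, 0, 0 },
    { 0,  -1,  12, 123,  -6,  0, 0, 0 },
};

/* offset is the 1/8-pel sub-pixel position (0..7), i.e. xoffset or yoffset. */
static const int *select_filter(int offset)
{
    return filter4_coeff_sketch[offset];   /* == base + offset * 32 bytes */
}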


@@ -0,0 +1,477 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict8x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_sixtap_predict8x4_neon| PROC
push {r4-r5, lr}
ldr r12, _filter8_coeff_
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x4_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter8x4_only
sub sp, sp, #32 ;reserve space on stack for temporary storage
vabs.s32 q12, q14
vabs.s32 q13, q15
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
mov lr, sp
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
;First pass: output_height lines x output_width columns (9x8)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d3, d25[4]
vld1.u8 {q4}, [r0], r1
vdup.8 d4, d26[0]
vld1.u8 {q5}, [r0], r1
vdup.8 d5, d26[4]
vld1.u8 {q6}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vld1.u8 {q3}, [r0], r1 ;load src data
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vld1.u8 {q4}, [r0], r1
vst1.u8 {d22}, [lr]! ;store result
vld1.u8 {q5}, [r0], r1
vst1.u8 {d23}, [lr]!
vld1.u8 {q6}, [r0], r1
vst1.u8 {d24}, [lr]!
vld1.u8 {q7}, [r0], r1
vst1.u8 {d25}, [lr]!
;first_pass filtering on the remaining 5 lines of data
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vmull.u8 q11, d12, d0
vmull.u8 q12, d14, d0
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d28, d8, d9, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d12, d13, #1
vext.8 d31, d14, d15, #1
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d28, d1
vmlsl.u8 q10, d29, d1
vmlsl.u8 q11, d30, d1
vmlsl.u8 q12, d31, d1
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
vext.8 d28, d8, d9, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d12, d13, #4
vext.8 d31, d14, d15, #4
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d28, d4
vmlsl.u8 q10, d29, d4
vmlsl.u8 q11, d30, d4
vmlsl.u8 q12, d31, d4
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
vext.8 d28, d8, d9, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d12, d13, #2
vext.8 d31, d14, d15, #2
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d28, d2
vmlal.u8 q10, d29, d2
vmlal.u8 q11, d30, d2
vmlal.u8 q12, d31, d2
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
vext.8 d28, d8, d9, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d12, d13, #5
vext.8 d31, d14, d15, #5
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d28, d5
vmlal.u8 q10, d29, d5
vmlal.u8 q11, d30, d5
vmlal.u8 q12, d31, d5
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
vext.8 d28, d8, d9, #3
vext.8 d29, d10, d11, #3
vext.8 d30, d12, d13, #3
vext.8 d31, d14, d15, #3
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d28, d3
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vmull.u8 q7, d31, d3
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q9, q4
vqadd.s16 q10, q5
vqadd.s16 q11, q6
vqadd.s16 q12, q7
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
vqrshrun.s16 d27, q9, #7
vqrshrun.s16 d28, q10, #7
vqrshrun.s16 d29, q11, #7
vqrshrun.s16 d30, q12, #7
;Second pass: 8x4
;secondpass_filter
add r3, r12, r3, lsl #5
sub lr, lr, #32
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.u8 {q11}, [lr]!
vabs.s32 q7, q5
vabs.s32 q8, q6
vld1.u8 {q12}, [lr]!
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d23, d0
vmull.u8 q5, d24, d0
vmull.u8 q6, d25, d0
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d24, d1
vmlsl.u8 q5, d25, d1
vmlsl.u8 q6, d26, d1
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d27, d4
vmlsl.u8 q5, d28, d4
vmlsl.u8 q6, d29, d4
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d25, d2
vmlal.u8 q5, d26, d2
vmlal.u8 q6, d27, d2
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d28, d5
vmlal.u8 q5, d29, d5
vmlal.u8 q6, d30, d5
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d26, d3
vmull.u8 q9, d27, d3
vmull.u8 q10, d28, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vst1.u8 {d7}, [r4], r5
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5
add sp, sp, #32
pop {r4-r5,pc}
;--------------------
firstpass_filter8x4_only
vabs.s32 q12, q14
vabs.s32 q13, q15
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First pass: output_height lines x output_width columns (4x8)
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [r4], r5 ;store result
vst1.u8 {d23}, [r4], r5
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5
pop {r4-r5,pc}
;---------------------
secondpass_filter8x4_only
;Second pass: 8x4
add r3, r12, r3, lsl #5
sub r0, r0, r1, lsl #1
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vabs.s32 q7, q5
vabs.s32 q8, q6
vld1.u8 {d22}, [r0], r1
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.u8 {d25}, [r0], r1
vdup.8 d1, d14[4]
vld1.u8 {d26}, [r0], r1
vdup.8 d2, d15[0]
vld1.u8 {d27}, [r0], r1
vdup.8 d3, d15[4]
vld1.u8 {d28}, [r0], r1
vdup.8 d4, d16[0]
vld1.u8 {d29}, [r0], r1
vdup.8 d5, d16[4]
vld1.u8 {d30}, [r0], r1
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d23, d0
vmull.u8 q5, d24, d0
vmull.u8 q6, d25, d0
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d24, d1
vmlsl.u8 q5, d25, d1
vmlsl.u8 q6, d26, d1
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d27, d4
vmlsl.u8 q5, d28, d4
vmlsl.u8 q6, d29, d4
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d25, d2
vmlal.u8 q5, d26, d2
vmlal.u8 q6, d27, d2
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d28, d5
vmlal.u8 q5, d29, d5
vmlal.u8 q6, d30, d5
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d26, d3
vmull.u8 q9, d27, d3
vmull.u8 q10, d28, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vst1.u8 {d7}, [r4], r5
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5
pop {r4-r5,pc}
ENDP
;-----------------
AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
;Data section subpelfilters8_dat holds the six-tap sub-pixel filters. DCD reserves one word per entry:
;8 filters x 8 words = 64 words (48 real taps plus zero padding). The label filter8_coeff can be used
;to access the data: filter8_coeff, filter8_coeff+4, filter8_coeff+8, ...
_filter8_coeff_
DCD filter8_coeff
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
END
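On its main path, |vp8_sixtap_predict8x4_neon| above is a separable two-pass filter: a horizontal pass over height+5 source rows whose results go partly to the 32-byte stack buffer and partly stay in registers, then a vertical pass into the destination. A scalar sketch of that structure, reusing the coefficient values from the table above; the function and helper names are illustrative, not the libvpx C reference code.

#include <stdint.h>

/* Coefficient rows as in the DCD table above (6 real taps per row). */
static const int six_tap_sketch[8][6] = {
    { 0,   0, 128,   0,   0, 0 }, { 0,  -6, 123,  12,  -1, 0 },
    { 2, -11, 108,  36,  -8, 1 }, { 0,  -9,  93,  50,  -6, 0 },
    { 3, -16,  77,  77, -16, 3 }, { 0,  -6,  50,  93,  -9, 0 },
    { 1,  -8,  36, 108, -11, 2 }, { 0,  -1,  12, 123,  -6, 0 },
};

static uint8_t round_sat_u8(int v)      /* matches vqrshrun.s16 ..., #7 */
{
    v = (v + 64) >> 7;
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Two-pass 8x4 prediction: horizontal filter over 4+5 rows into tmp[],
 * then vertical filter into dst.  The caller must guarantee that the
 * 2 samples left/above and 3 right/below the block are readable. */
static void sixtap_8x4_sketch(const uint8_t *src, int src_stride,
                              int xoffset, int yoffset,
                              uint8_t *dst, int dst_pitch)
{
    const int *hf = six_tap_sketch[xoffset];
    const int *vf = six_tap_sketch[yoffset];
    uint8_t tmp[(4 + 5) * 8];           /* intermediate rows */
    int r, c, k;

    /* First pass: horizontal, starting at src - 2 - 2*src_stride. */
    for (r = 0; r < 4 + 5; r++)
        for (c = 0; c < 8; c++) {
            const uint8_t *s = src + (r - 2) * src_stride + (c - 2);
            int sum = 0;
            for (k = 0; k < 6; k++)
                sum += hf[k] * s[k];
            tmp[r * 8 + c] = round_sat_u8(sum);
        }

    /* Second pass: vertical, over six consecutive intermediate rows. */
    for (r = 0; r < 4; r++)
        for (c = 0; c < 8; c++) {
            int sum = 0;
            for (k = 0; k < 6; k++)
                sum += vf[k] * tmp[(r + k) * 8 + c];
            dst[r * dst_pitch + c] = round_sat_u8(sum);
        }
}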


@@ -0,0 +1,528 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_sixtap_predict8x8_neon| PROC
push {r4-r5, lr}
ldr r12, _filter8_coeff_
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x8_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter8x8_only
sub sp, sp, #64 ;reserve space on stack for temporary storage
mov lr, sp
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #2 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
;First pass: output_height lines x output_width columns (13x8)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d3, d25[4]
vld1.u8 {q4}, [r0], r1
vdup.8 d4, d26[0]
vld1.u8 {q5}, [r0], r1
vdup.8 d5, d26[4]
vld1.u8 {q6}, [r0], r1
filt_blk2d_fp8x8_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
subs r2, r2, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vld1.u8 {q3}, [r0], r1 ;load src data
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [lr]! ;store result
vld1.u8 {q4}, [r0], r1
vst1.u8 {d23}, [lr]!
vld1.u8 {q5}, [r0], r1
vst1.u8 {d24}, [lr]!
vld1.u8 {q6}, [r0], r1
vst1.u8 {d25}, [lr]!
bne filt_blk2d_fp8x8_loop_neon
;first_pass filtering on the remaining 5 lines of data
;vld1.u8 {q3}, [r0], r1 ;load src data
;vld1.u8 {q4}, [r0], r1
;vld1.u8 {q5}, [r0], r1
;vld1.u8 {q6}, [r0], r1
vld1.u8 {q7}, [r0], r1
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vmull.u8 q11, d12, d0
vmull.u8 q12, d14, d0
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d28, d8, d9, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d12, d13, #1
vext.8 d31, d14, d15, #1
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d28, d1
vmlsl.u8 q10, d29, d1
vmlsl.u8 q11, d30, d1
vmlsl.u8 q12, d31, d1
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
vext.8 d28, d8, d9, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d12, d13, #4
vext.8 d31, d14, d15, #4
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d28, d4
vmlsl.u8 q10, d29, d4
vmlsl.u8 q11, d30, d4
vmlsl.u8 q12, d31, d4
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
vext.8 d28, d8, d9, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d12, d13, #2
vext.8 d31, d14, d15, #2
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d28, d2
vmlal.u8 q10, d29, d2
vmlal.u8 q11, d30, d2
vmlal.u8 q12, d31, d2
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
vext.8 d28, d8, d9, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d12, d13, #5
vext.8 d31, d14, d15, #5
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d28, d5
vmlal.u8 q10, d29, d5
vmlal.u8 q11, d30, d5
vmlal.u8 q12, d31, d5
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
vext.8 d28, d8, d9, #3
vext.8 d29, d10, d11, #3
vext.8 d30, d12, d13, #3
vext.8 d31, d14, d15, #3
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d28, d3
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vmull.u8 q7, d31, d3
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q9, q4
vqadd.s16 q10, q5
vqadd.s16 q11, q6
vqadd.s16 q12, q7
add r3, r12, r3, lsl #5
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
sub lr, lr, #64
vqrshrun.s16 d27, q9, #7
vld1.u8 {q9}, [lr]! ;load intermediate data from stack
vqrshrun.s16 d28, q10, #7
vld1.u8 {q10}, [lr]!
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vqrshrun.s16 d29, q11, #7
vld1.u8 {q11}, [lr]!
vabs.s32 q7, q5
vabs.s32 q8, q6
vqrshrun.s16 d30, q12, #7
vld1.u8 {q12}, [lr]!
;Second pass: 8x8
mov r3, #2 ;loop counter
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_sp8x8_loop_neon
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r3, r3, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vmov q9, q11
vst1.u8 {d6}, [r4], r5 ;store result
vmov q10, q12
vst1.u8 {d7}, [r4], r5
vmov q11, q13
vst1.u8 {d8}, [r4], r5
vmov q12, q14
vst1.u8 {d9}, [r4], r5
vmov d26, d30
bne filt_blk2d_sp8x8_loop_neon
add sp, sp, #64
pop {r4-r5,pc}
;---------------------
firstpass_filter8x8_only
;add r2, r12, r2, lsl #5 ;calculate filter location
;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #2 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First pass: output_height lines x output_width columns (8x8)
filt_blk2d_fpo8x8_loop_neon
vld1.u8 {q3}, [r0], r1 ;load src data
vld1.u8 {q4}, [r0], r1
vld1.u8 {q5}, [r0], r1
vld1.u8 {q6}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
;
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
subs r2, r2, #1
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [r4], r5 ;store result
vst1.u8 {d23}, [r4], r5
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5
bne filt_blk2d_fpo8x8_loop_neon
pop {r4-r5,pc}
;---------------------
secondpass_filter8x8_only
sub r0, r0, r1, lsl #1
add r3, r12, r3, lsl #5
vld1.u8 {d18}, [r0], r1 ;load src data
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.u8 {d19}, [r0], r1
vabs.s32 q7, q5
vld1.u8 {d20}, [r0], r1
vabs.s32 q8, q6
vld1.u8 {d21}, [r0], r1
mov r3, #2 ;loop counter
vld1.u8 {d22}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.u8 {d23}, [r0], r1
vdup.8 d1, d14[4]
vld1.u8 {d24}, [r0], r1
vdup.8 d2, d15[0]
vld1.u8 {d25}, [r0], r1
vdup.8 d3, d15[4]
vld1.u8 {d26}, [r0], r1
vdup.8 d4, d16[0]
vld1.u8 {d27}, [r0], r1
vdup.8 d5, d16[4]
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1
;Second pass: 8x8
filt_blk2d_spo8x8_loop_neon
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r3, r3, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vmov q9, q11
vst1.u8 {d6}, [r4], r5 ;store result
vmov q10, q12
vst1.u8 {d7}, [r4], r5
vmov q11, q13
vst1.u8 {d8}, [r4], r5
vmov q12, q14
vst1.u8 {d9}, [r4], r5
vmov d26, d30
bne filt_blk2d_spo8x8_loop_neon
pop {r4-r5,pc}
ENDP
;-----------------
AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
;Data section subpelfilters8_dat holds the six-tap sub-pixel filters. DCD reserves one word per entry:
;8 filters x 8 words = 64 words (48 real taps plus zero padding). The label filter8_coeff can be used
;to access the data: filter8_coeff, filter8_coeff+4, filter8_coeff+8, ...
_filter8_coeff_
DCD filter8_coeff
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
END
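All three sixtap_predict*_neon routines above share the same entry dispatch: a zero xoffset skips the horizontal pass, a zero yoffset skips the vertical pass, and otherwise both passes run through an intermediate buffer. A hedged scalar sketch of that branch structure follows (names are illustrative); the filtering loops themselves are sketched after the 8x4 file earlier.

/* Branch structure shared by the sixtap_predict*_neon routines above,
 * in scalar form; returns which filter passes would run. */
enum { PASS_VERTICAL_ONLY, PASS_HORIZONTAL_ONLY, PASS_BOTH };

int sixtap_pass_select_sketch(int xoffset, int yoffset)
{
    if (xoffset == 0)                 /* beq secondpass_filter8x8_only */
        return PASS_VERTICAL_ONLY;    /* vertical 6-tap straight to dst */
    if (yoffset == 0)                 /* beq firstpass_filter8x8_only */
        return PASS_HORIZONTAL_ONLY;  /* horizontal 6-tap straight to dst */
    return PASS_BOTH;                 /* horizontal pass into the stack
                                         buffer (sub sp, sp, #64 here),
                                         then vertical pass to dst */
}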


@@ -0,0 +1,80 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef RECON_ARM_H
#define RECON_ARM_H
#if HAVE_ARMV6
extern prototype_recon_block(vp8_recon_b_armv6);
extern prototype_recon_block(vp8_recon2b_armv6);
extern prototype_recon_block(vp8_recon4b_armv6);
extern prototype_copy_block(vp8_copy_mem8x8_v6);
extern prototype_copy_block(vp8_copy_mem8x4_v6);
extern prototype_copy_block(vp8_copy_mem16x16_v6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp8_recon_b_armv6
#undef vp8_recon_recon2
#define vp8_recon_recon2 vp8_recon2b_armv6
#undef vp8_recon_recon4
#define vp8_recon_recon4 vp8_recon4b_armv6
#undef vp8_recon_copy8x8
#define vp8_recon_copy8x8 vp8_copy_mem8x8_v6
#undef vp8_recon_copy8x4
#define vp8_recon_copy8x4 vp8_copy_mem8x4_v6
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_v6
#endif
#endif
#if HAVE_ARMV7
extern prototype_recon_block(vp8_recon_b_neon);
extern prototype_recon_block(vp8_recon2b_neon);
extern prototype_recon_block(vp8_recon4b_neon);
extern prototype_copy_block(vp8_copy_mem8x8_neon);
extern prototype_copy_block(vp8_copy_mem8x4_neon);
extern prototype_copy_block(vp8_copy_mem16x16_neon);
extern prototype_recon_macroblock(vp8_recon_mb_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp8_recon_b_neon
#undef vp8_recon_recon2
#define vp8_recon_recon2 vp8_recon2b_neon
#undef vp8_recon_recon4
#define vp8_recon_recon4 vp8_recon4b_neon
#undef vp8_recon_copy8x8
#define vp8_recon_copy8x8 vp8_copy_mem8x8_neon
#undef vp8_recon_copy8x4
#define vp8_recon_copy8x4 vp8_copy_mem8x4_neon
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_neon
#undef vp8_recon_recon_mb
#define vp8_recon_recon_mb vp8_recon_mb_neon
#endif
#endif
#endif
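The header above (guarded by RECON_ARM_H) binds the generic vp8_recon_* hooks directly to the ARMv6 or NEON symbols only when CONFIG_RUNTIME_CPU_DETECT is disabled; with runtime detection enabled, the same functions are instead installed into a function-pointer table after the CPU has been probed. A hedged sketch of the two binding styles follows; the copy-block signature mirrors the prototype_copy_block() convention and is an assumption here, and the dispatch table and init function are illustrative, not the exact libvpx RTCD machinery.

/* Both implementations share the copy-block signature declared via
 * prototype_copy_block() in the header above (signature assumed). */
extern void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride,
                                   unsigned char *dst, int dst_stride);
extern void vp8_copy_mem16x16_c(unsigned char *src, int src_stride,
                                unsigned char *dst, int dst_stride);

#if !CONFIG_RUNTIME_CPU_DETECT
/* Compile-time binding: every call site resolves straight to the NEON
 * symbol, exactly like the #undef/#define pairs above. */
#define recon_copy16x16 vp8_copy_mem16x16_neon
#else
/* Runtime binding: CPU detection fills a small dispatch table instead. */
typedef void (*copy_block_fn)(unsigned char *, int, unsigned char *, int);
struct recon_dispatch_sketch { copy_block_fn copy16x16; };

static void init_recon_dispatch(struct recon_dispatch_sketch *d, int has_neon)
{
    d->copy16x16 = has_neon ? vp8_copy_mem16x16_neon : vp8_copy_mem16x16_c;
}
#endif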


@@ -0,0 +1,62 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "blockd.h"
#include "reconintra.h"
#include "vpx_mem/vpx_mem.h"
#include "recon.h"
#if HAVE_ARMV7
extern void vp8_build_intra_predictors_mby_neon_func(
unsigned char *y_buffer,
unsigned char *ypred_ptr,
int y_stride,
int mode,
int Up,
int Left);
void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
{
unsigned char *y_buffer = x->dst.y_buffer;
unsigned char *ypred_ptr = x->predictor;
int y_stride = x->dst.y_stride;
int mode = x->mode_info_context->mbmi.mode;
int Up = x->up_available;
int Left = x->left_available;
vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
}
#endif
#if HAVE_ARMV7
extern void vp8_build_intra_predictors_mby_s_neon_func(
unsigned char *y_buffer,
unsigned char *ypred_ptr,
int y_stride,
int mode,
int Up,
int Left);
void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
{
unsigned char *y_buffer = x->dst.y_buffer;
unsigned char *ypred_ptr = x->predictor;
int y_stride = x->dst.y_stride;
int mode = x->mode_info_context->mbmi.mode;
int Up = x->up_available;
int Left = x->left_available;
vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
}
#endif


@@ -0,0 +1,89 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef SUBPIXEL_ARM_H
#define SUBPIXEL_ARM_H
#if HAVE_ARMV6
extern prototype_subpixel_predict(vp8_sixtap_predict16x16_armv6);
extern prototype_subpixel_predict(vp8_sixtap_predict8x8_armv6);
extern prototype_subpixel_predict(vp8_sixtap_predict8x4_armv6);
extern prototype_subpixel_predict(vp8_sixtap_predict_armv6);
extern prototype_subpixel_predict(vp8_bilinear_predict16x16_armv6);
extern prototype_subpixel_predict(vp8_bilinear_predict8x8_armv6);
extern prototype_subpixel_predict(vp8_bilinear_predict8x4_armv6);
extern prototype_subpixel_predict(vp8_bilinear_predict4x4_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_subpix_sixtap16x16
#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_armv6
#undef vp8_subpix_sixtap8x8
#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_armv6
#undef vp8_subpix_sixtap8x4
#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_armv6
#undef vp8_subpix_sixtap4x4
#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_armv6
#undef vp8_subpix_bilinear16x16
#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_armv6
#undef vp8_subpix_bilinear8x8
#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_armv6
#undef vp8_subpix_bilinear8x4
#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_armv6
#undef vp8_subpix_bilinear4x4
#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_armv6
#endif
#endif
#if HAVE_ARMV7
extern prototype_subpixel_predict(vp8_sixtap_predict16x16_neon);
extern prototype_subpixel_predict(vp8_sixtap_predict8x8_neon);
extern prototype_subpixel_predict(vp8_sixtap_predict8x4_neon);
extern prototype_subpixel_predict(vp8_sixtap_predict_neon);
extern prototype_subpixel_predict(vp8_bilinear_predict16x16_neon);
extern prototype_subpixel_predict(vp8_bilinear_predict8x8_neon);
extern prototype_subpixel_predict(vp8_bilinear_predict8x4_neon);
extern prototype_subpixel_predict(vp8_bilinear_predict4x4_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_subpix_sixtap16x16
#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_neon
#undef vp8_subpix_sixtap8x8
#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_neon
#undef vp8_subpix_sixtap8x4
#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_neon
#undef vp8_subpix_sixtap4x4
#define vp8_subpix_sixtap4x4 vp8_sixtap_predict_neon
#undef vp8_subpix_bilinear16x16
#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_neon
#undef vp8_subpix_bilinear8x8
#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_neon
#undef vp8_subpix_bilinear8x4
#define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_neon
#undef vp8_subpix_bilinear4x4
#define vp8_subpix_bilinear4x4 vp8_bilinear_predict4x4_neon
#endif
#endif
#endif


@@ -0,0 +1,87 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include <stddef.h>
#if CONFIG_VP8_ENCODER
#include "vpx_scale/yv12config.h"
#endif
#if CONFIG_VP8_DECODER
#include "onyxd_int.h"
#endif
#define DEFINE(sym, val) int sym = val;
/*
#define BLANK() asm volatile("\n->" : : )
*/
/*
* int main(void)
* {
*/
#if CONFIG_VP8_DECODER || CONFIG_VP8_ENCODER
DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
#endif
#if CONFIG_VP8_DECODER
DEFINE(mb_diff, offsetof(MACROBLOCKD, diff));
DEFINE(mb_predictor, offsetof(MACROBLOCKD, predictor));
DEFINE(mb_dst_y_stride, offsetof(MACROBLOCKD, dst.y_stride));
DEFINE(mb_dst_y_buffer, offsetof(MACROBLOCKD, dst.y_buffer));
DEFINE(mb_dst_u_buffer, offsetof(MACROBLOCKD, dst.u_buffer));
DEFINE(mb_dst_v_buffer, offsetof(MACROBLOCKD, dst.v_buffer));
DEFINE(mb_up_available, offsetof(MACROBLOCKD, up_available));
DEFINE(mb_left_available, offsetof(MACROBLOCKD, left_available));
DEFINE(detok_scan, offsetof(DETOK, scan));
DEFINE(detok_ptr_block2leftabove, offsetof(DETOK, ptr_block2leftabove));
DEFINE(detok_coef_tree_ptr, offsetof(DETOK, vp8_coef_tree_ptr));
DEFINE(detok_teb_base_ptr, offsetof(DETOK, teb_base_ptr));
DEFINE(detok_norm_ptr, offsetof(DETOK, norm_ptr));
DEFINE(detok_ptr_coef_bands_x, offsetof(DETOK, ptr_coef_bands_x));
DEFINE(detok_A, offsetof(DETOK, A));
DEFINE(detok_L, offsetof(DETOK, L));
DEFINE(detok_qcoeff_start_ptr, offsetof(DETOK, qcoeff_start_ptr));
DEFINE(detok_current_bc, offsetof(DETOK, current_bc));
DEFINE(detok_coef_probs, offsetof(DETOK, coef_probs));
DEFINE(detok_eob, offsetof(DETOK, eob));
DEFINE(bool_decoder_user_buffer_end, offsetof(BOOL_DECODER, user_buffer_end));
DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer));
DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value));
DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count));
DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range));
DEFINE(tokenextrabits_min_val, offsetof(TOKENEXTRABITS, min_val));
DEFINE(tokenextrabits_length, offsetof(TOKENEXTRABITS, Length));
#endif
//add asserts for any offset that is not supported by assembly code
//add asserts for any size that is not supported by assembly code
/*
* return 0;
* }
*/
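The file above is an asm-offsets source: DEFINE(sym, val) expands to a global int holding an offsetof() result, the compiler bakes each value into the object file, and the build extracts those integers and exposes them to the hand-written ARM assembly, so the .asm files never hard-code structure layouts. The closing note about asserts can be serviced with a compile-time check; the macro and struct below are an illustrative sketch under that assumption, not libvpx code.

#include <stddef.h>

/* Compile-time assert in the spirit of the "add asserts" note above:
 * the build breaks if an offset drifts outside what the assembly can
 * encode (for example, an immediate that must fit in 8 bits). */
#define ct_sassert(name, cond) typedef char assert_##name[(cond) ? 1 : -1]

typedef struct {                 /* stand-in for YV12_BUFFER_CONFIG */
    int y_width;
    int y_height;
    int y_stride;
} yv12_sketch;

ct_sassert(y_stride_offset_fits_imm8, offsetof(yv12_sketch, y_stride) < 256);

/* The extracted constant is then used from assembly roughly as
 *     ldr r2, [r0, #yv12_buffer_config_y_stride]
 * with the value coming from a generated include, not a hand-typed literal. */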


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -24,7 +24,7 @@ void vpx_log(const char *format, ...);
#define TRUE 1
#define FALSE 0
//#define DCPRED 1
/*#define DCPRED 1*/
#define DCPREDSIMTHRESH 0
#define DCPREDCNTTHRESH 3
@@ -39,7 +39,7 @@ void vpx_log(const char *format, ...);
#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 4
// Segment Feature Masks
/* Segment Feature Masks */
#define SEGMENT_DELTADATA 0
#define SEGMENT_ABSDATA 1
@@ -75,11 +75,11 @@ typedef enum
typedef enum
{
DC_PRED, // average of above and left pixels
V_PRED, // vertical prediction
H_PRED, // horizontal prediction
TM_PRED, // Truemotion prediction
B_PRED, // block based prediction, each block has its own prediction mode
DC_PRED, /* average of above and left pixels */
V_PRED, /* vertical prediction */
H_PRED, /* horizontal prediction */
TM_PRED, /* Truemotion prediction */
B_PRED, /* block based prediction, each block has its own prediction mode */
NEARESTMV,
NEARMV,
@@ -90,16 +90,16 @@ typedef enum
MB_MODE_COUNT
} MB_PREDICTION_MODE;
// Macroblock level features
/* Macroblock level features */
typedef enum
{
MB_LVL_ALT_Q = 0, // Use alternate Quantizer ....
MB_LVL_ALT_LF = 1, // Use alternate loop filter value...
MB_LVL_MAX = 2 // Number of MB level features supported
MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */
MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */
MB_LVL_MAX = 2 /* Number of MB level features supported */
} MB_LVL_FEATURES;
// Segment Feature Masks
/* Segment Feature Masks */
#define SEGMENT_ALTQ 0x01
#define SEGMENT_ALT_LF 0x02
@@ -110,11 +110,11 @@ typedef enum
typedef enum
{
B_DC_PRED, // average of above and left pixels
B_DC_PRED, /* average of above and left pixels */
B_TM_PRED,
B_VE_PRED, // vertical prediction
B_HE_PRED, // horizontal prediction
B_VE_PRED, /* vertical prediction */
B_HE_PRED, /* horizontal prediction */
B_LD_PRED,
B_RD_PRED,
@@ -168,14 +168,15 @@ typedef struct
int as_int;
MV as_mv;
} mv;
int partitioning;
int partition_count;
int mb_skip_coeff; //does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens
int dc_diff;
unsigned char segment_id; // Which set of segmentation parameters should be used for this MB
int force_no_skip;
int need_to_clamp_mvs;
B_MODE_INFO partition_bmi[16];
unsigned char partitioning;
unsigned char mb_skip_coeff; /* does this mb have coefficients at all, 1=no coefficients, 0=need decode tokens */
unsigned char dc_diff;
unsigned char need_to_clamp_mvs;
unsigned char segment_id; /* Which set of segmentation parameters should be used for this MB */
unsigned char force_no_skip; /* encoder only */
} MB_MODE_INFO;
@@ -194,9 +195,9 @@ typedef struct
short *diff;
short *reference;
short(*dequant)[4];
short *dequant;
// 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
/* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
unsigned char **base_pre;
int pre;
int pre_stride;
@@ -213,22 +214,20 @@ typedef struct
typedef struct
{
DECLARE_ALIGNED(16, short, diff[400]); // from idct diff
DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
DECLARE_ALIGNED(16, unsigned char, predictor[384]);
//not used DECLARE_ALIGNED(16, short, reference[384]);
/* not used DECLARE_ALIGNED(16, short, reference[384]); */
DECLARE_ALIGNED(16, short, qcoeff[400]);
DECLARE_ALIGNED(16, short, dqcoeff[400]);
DECLARE_ALIGNED(16, char, eobs[25]);
// 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries.
/* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
BLOCKD block[25];
YV12_BUFFER_CONFIG pre; // Filtered copy of previous frame reconstruction
YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
YV12_BUFFER_CONFIG dst;
MODE_INFO *mode_info_context;
MODE_INFO *mode_info;
int mode_info_stride;
FRAME_TYPE frame_type;
@@ -236,39 +235,39 @@ typedef struct
int up_available;
int left_available;
// Y,U,V,Y2
/* Y,U,V,Y2 */
ENTROPY_CONTEXT_PLANES *above_context;
ENTROPY_CONTEXT_PLANES *left_context;
// 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active.
/* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
unsigned char segmentation_enabled;
// 0 (do not update) 1 (update) the macroblock segmentation map.
/* 0 (do not update) 1 (update) the macroblock segmentation map. */
unsigned char update_mb_segmentation_map;
// 0 (do not update) 1 (update) the macroblock segmentation feature data.
/* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
unsigned char update_mb_segmentation_data;
// 0 (do not update) 1 (update) the macroblock segmentation feature data.
/* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
unsigned char mb_segement_abs_delta;
// Per frame flags that define which MB level features (such as quantizer or loop filter level)
// are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO
vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; // Probability Tree used to code Segment number
/* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
/* are enabled and when enabled the probabilities used to decode the per MB flags in MB_MODE_INFO */
vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; /* Probability Tree used to code Segment number */
signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; // Segment parameters
signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; /* Segment parameters */
// mode_based Loop filter adjustment
/* mode_based Loop filter adjustment */
unsigned char mode_ref_lf_delta_enabled;
unsigned char mode_ref_lf_delta_update;
// Delta values have the range +/- MAX_LOOP_FILTER
//char ref_lf_deltas[MAX_REF_LF_DELTAS]; // 0 = Intra, Last, GF, ARF
//char mode_lf_deltas[MAX_MODE_LF_DELTAS]; // 0 = BPRED, ZERO_MV, MV, SPLIT
signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; // 0 = Intra, Last, GF, ARF
signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; // 0 = BPRED, ZERO_MV, MV, SPLIT
/* Delta values have the range +/- MAX_LOOP_FILTER */
signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
// Distance of MB away from frame edges
/* Distance of MB away from frame edges */
int mb_to_left_edge;
int mb_to_right_edge;
int mb_to_top_edge;


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -21,7 +21,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
int mb_index = 0;
FILE *mvs = fopen("mvs.stt", "a");
// print out the macroblock Y modes
/* print out the macroblock Y modes */
mb_index = 0;
fprintf(mvs, "Mb Modes for Frame %d\n", frame);
@@ -60,7 +60,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
fprintf(mvs, "\n");
// print out the macroblock UV modes
/* print out the macroblock UV modes */
mb_index = 0;
fprintf(mvs, "UV Modes for Frame %d\n", frame);
@@ -80,7 +80,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
fprintf(mvs, "\n");
// print out the block modes
/* print out the block modes */
mb_index = 0;
fprintf(mvs, "Mbs for Frame %d\n", frame);
{
@@ -108,7 +108,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
}
fprintf(mvs, "\n");
// print out the macroblock mvs
/* print out the macroblock mvs */
mb_index = 0;
fprintf(mvs, "MVs for Frame %d\n", frame);
@@ -128,7 +128,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
fprintf(mvs, "\n");
// print out the block modes
/* print out the block modes */
mb_index = 0;
fprintf(mvs, "MVs for Frame %d\n", frame);
{


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -15,204 +15,204 @@ static const unsigned int default_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_C
{
{
// Block Type ( 0 )
/* Block Type ( 0 ) */
{
// Coeff Band ( 0 )
/* Coeff Band ( 0 ) */
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
},
{
// Coeff Band ( 1 )
/* Coeff Band ( 1 ) */
{30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
{26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
{10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
},
{
// Coeff Band ( 2 )
/* Coeff Band ( 2 ) */
{25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
{9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
{1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
},
{
// Coeff Band ( 3 )
/* Coeff Band ( 3 ) */
{26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
{8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
{ 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
},
{
// Coeff Band ( 4 )
/* Coeff Band ( 4 ) */
{10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
{2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
{ 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
},
{
// Coeff Band ( 5 )
/* Coeff Band ( 5 ) */
{10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
{2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
{ 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
},
{
// Coeff Band ( 6 )
/* Coeff Band ( 6 ) */
{40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
{6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
{ 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
},
{
// Coeff Band ( 7 )
/* Coeff Band ( 7 ) */
{ 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
},
},
{
// Block Type ( 1 )
/* Block Type ( 1 ) */
{
// Coeff Band ( 0 )
/* Coeff Band ( 0 ) */
{3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
{8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
{9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
},
{
// Coeff Band ( 1 )
/* Coeff Band ( 1 ) */
{12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
{11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
{7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
},
{
// Coeff Band ( 2 )
/* Coeff Band ( 2 ) */
{15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
{7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
{1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
},
{
// Coeff Band ( 3 )
/* Coeff Band ( 3 ) */
{19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
{9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
{1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
},
{
// Coeff Band ( 4 )
/* Coeff Band ( 4 ) */
{12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
{4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
{ 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
},
{
// Coeff Band ( 5 )
/* Coeff Band ( 5 ) */
{12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
{4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
{ 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
},
{
// Coeff Band ( 6 )
/* Coeff Band ( 6 ) */
{61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
{15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
{ 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
},
{
// Coeff Band ( 7 )
/* Coeff Band ( 7 ) */
{ 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
{ 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
},
},
{
// Block Type ( 2 )
/* Block Type ( 2 ) */
{
// Coeff Band ( 0 )
/* Coeff Band ( 0 ) */
{ 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
{1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
{1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
},
{
// Coeff Band ( 1 )
/* Coeff Band ( 1 ) */
{1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
{1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
{1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
},
{
// Coeff Band ( 2 )
/* Coeff Band ( 2 ) */
{ 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
{ 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
{ 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
},
{
// Coeff Band ( 3 )
/* Coeff Band ( 3 ) */
{ 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
{ 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
},
{
// Coeff Band ( 4 )
/* Coeff Band ( 4 ) */
{ 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
{ 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
},
{
// Coeff Band ( 5 )
/* Coeff Band ( 5 ) */
{ 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
{ 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
},
{
// Coeff Band ( 6 )
/* Coeff Band ( 6 ) */
{ 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
{ 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
},
{
// Coeff Band ( 7 )
/* Coeff Band ( 7 ) */
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
},
},
{
// Block Type ( 3 )
/* Block Type ( 3 ) */
{
// Coeff Band ( 0 )
/* Coeff Band ( 0 ) */
{2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
{8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
{11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
},
{
// Coeff Band ( 1 )
/* Coeff Band ( 1 ) */
{9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
{12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
{10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
},
{
// Coeff Band ( 2 )
/* Coeff Band ( 2 ) */
{6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
{6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
{3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
},
{
// Coeff Band ( 3 )
/* Coeff Band ( 3 ) */
{11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
{9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
{4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
},
{
// Coeff Band ( 4 )
/* Coeff Band ( 4 ) */
{4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
{3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
{1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
},
{
// Coeff Band ( 5 )
/* Coeff Band ( 5 ) */
{8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
{3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
{1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
},
{
// Coeff Band ( 6 )
/* Coeff Band ( 6 ) */
{27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
{5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
{1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
},
{
// Coeff Band ( 7 )
/* Coeff Band ( 7 ) */
{ 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
{ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -17,18 +17,18 @@
/* Coefficient token alphabet */
#define ZERO_TOKEN 0 //0 Extra Bits 0+0
#define ONE_TOKEN 1 //1 Extra Bits 0+1
#define TWO_TOKEN 2 //2 Extra Bits 0+1
#define THREE_TOKEN 3 //3 Extra Bits 0+1
#define FOUR_TOKEN 4 //4 Extra Bits 0+1
#define DCT_VAL_CATEGORY1 5 //5-6 Extra Bits 1+1
#define DCT_VAL_CATEGORY2 6 //7-10 Extra Bits 2+1
#define DCT_VAL_CATEGORY3 7 //11-26 Extra Bits 4+1
#define DCT_VAL_CATEGORY4 8 //11-26 Extra Bits 5+1
#define DCT_VAL_CATEGORY5 9 //27-58 Extra Bits 5+1
#define DCT_VAL_CATEGORY6 10 //59+ Extra Bits 11+1
#define DCT_EOB_TOKEN 11 //EOB Extra Bits 0+0
#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */
#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */
#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */
#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
#define DCT_VAL_CATEGORY3 7 /* 11-26 Extra Bits 4+1 */
#define DCT_VAL_CATEGORY4 8 /* 11-26 Extra Bits 5+1 */
#define DCT_VAL_CATEGORY5 9 /* 27-58 Extra Bits 5+1 */
#define DCT_VAL_CATEGORY6 10 /* 59+ Extra Bits 11+1 */
#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
#define vp8_coef_tokens 12
#define MAX_ENTROPY_TOKENS vp8_coef_tokens
@ -83,7 +83,7 @@ extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
coefficient band (and since zigzag positions 0, 1, and 2 are in
distinct bands). */
/*# define DC_TOKEN_CONTEXTS 3 // 00, 0!0, !0!0 */
/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
# define PREV_COEF_CONTEXTS 3
extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[vp8_coef_tokens]);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -29,21 +29,21 @@ const MV_CONTEXT vp8_mv_update_probs[2] =
const MV_CONTEXT vp8_default_mv_context[2] =
{
{{
// row
162, // is short
128, // sign
225, 146, 172, 147, 214, 39, 156, // short tree
128, 129, 132, 75, 145, 178, 206, 239, 254, 254 // long bits
/* row */
162, /* is short */
128, /* sign */
225, 146, 172, 147, 214, 39, 156, /* short tree */
128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */
}},
{{
// same for column
164, // is short
/* same for column */
164, /* is short */
128,
204, 170, 119, 235, 140, 230, 228,
128, 130, 130, 74, 148, 180, 203, 236, 254, 254 // long bits
128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */
}}
};

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -15,14 +15,14 @@
static void extend_plane_borders
(
unsigned char *s, // source
int sp, // pitch
int h, // height
int w, // width
int et, // extend top border
int el, // extend left border
int eb, // extend bottom border
int er // extend right border
unsigned char *s, /* source */
int sp, /* pitch */
int h, /* height */
int w, /* width */
int et, /* extend top border */
int el, /* extend left border */
int eb, /* extend bottom border */
int er /* extend right border */
)
{
@ -31,7 +31,7 @@ static void extend_plane_borders
unsigned char *dest_ptr1, *dest_ptr2;
int linesize;
// copy the left and right most columns out
/* copy the left and right most columns out */
src_ptr1 = s;
src_ptr2 = s + w - 1;
dest_ptr1 = s - el;
@ -39,8 +39,9 @@ static void extend_plane_borders
for (i = 0; i < h - 0 + 1; i++)
{
// Some linkers will complain if we call vpx_memset with el set to a
// constant 0.
/* Some linkers will complain if we call vpx_memset with el set to a
* constant 0.
*/
if (el)
vpx_memset(dest_ptr1, src_ptr1[0], el);
vpx_memset(dest_ptr2, src_ptr2[0], er);
@ -50,7 +51,7 @@ static void extend_plane_borders
dest_ptr2 += sp;
}
// Now copy the top and bottom source lines into each line of the respective borders
/* Now copy the top and bottom source lines into each line of the respective borders */
src_ptr1 = s - el;
src_ptr2 = s + sp * (h - 1) - el;
dest_ptr1 = s + sp * (-et) - el;
@ -76,12 +77,12 @@ void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height)
int er = 0xf & (16 - (width & 0xf));
int eb = 0xf & (16 - (height & 0xf));
// check for non multiples of 16
/* check for non multiples of 16 */
if (er != 0 || eb != 0)
{
extend_plane_borders(ybf->y_buffer, ybf->y_stride, height, width, 0, 0, eb, er);
//adjust for uv
/* adjust for uv */
height = (height + 1) >> 1;
width = (width + 1) >> 1;
er = 0x7 & (8 - (width & 0x7));
@ -95,7 +96,7 @@ void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height)
}
}
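
The two loops above do nothing more exotic than replicate edge pixels: each row's first and last samples are smeared into the left/right border, then the widened top and bottom rows are copied upwards and downwards. A self-contained sketch of that idea (not the libvpx routine itself; the el/er/et/eb parameter names are kept from the function above, and the buffer sizes are arbitrary):

    #include <stdio.h>
    #include <string.h>

    /* Minimal sketch of plane border extension: replicate the edge pixels of a
     * w x h plane (pitch 'sp') into borders of width el/er and height et/eb.
     * Mirrors the structure of extend_plane_borders() above, nothing more. */
    static void extend_borders(unsigned char *s, int sp, int h, int w,
                               int et, int el, int eb, int er)
    {
        int i;

        /* left / right columns: smear the first and last pixel of each row */
        for (i = 0; i < h; i++)
        {
            if (el) memset(s + i * sp - el, s[i * sp],         el);
            if (er) memset(s + i * sp + w,  s[i * sp + w - 1], er);
        }

        /* top / bottom rows: copy the (already widened) first and last rows */
        for (i = 1; i <= et; i++)
            memcpy(s - i * sp - el, s - el, w + el + er);
        for (i = 1; i <= eb; i++)
            memcpy(s + (h - 1 + i) * sp - el, s + (h - 1) * sp - el, w + el + er);
    }

    int main(void)
    {
        /* an 8x4 plane embedded in a 16x8 buffer: 4-pixel side borders, 2-row
         * top/bottom borders */
        unsigned char buf[16 * 8];
        unsigned char *plane = buf + 2 * 16 + 4;   /* top-left of the real pixels */
        int x, y;

        memset(buf, 0, sizeof(buf));
        for (y = 0; y < 4; y++)
            for (x = 0; x < 8; x++)
                plane[y * 16 + x] = (unsigned char)(10 * y + x + 1);

        extend_borders(plane, 16, 4, 8, 2, 4, 2, 4);

        for (y = 0; y < 8; y++, putchar('\n'))
            for (x = 0; x < 16; x++)
                printf("%3d ", buf[y * 16 + x]);
        return 0;
    }
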
// note the extension is only for the last row, for intra prediction purpose
/* note the extension is only for the last row, for intra prediction purpose */
void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr)
{
int i;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -32,13 +32,13 @@ static const int bilinear_filters[8][2] =
static const short sub_pel_filters[8][6] =
{
{ 0, 0, 128, 0, 0, 0 }, // note that 1/8 pel positions are just as per alpha -0.5 bicubic
{ 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
{ 0, -6, 123, 12, -1, 0 },
{ 2, -11, 108, 36, -8, 1 }, // New 1/4 pel 6 tap filter
{ 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
{ 0, -9, 93, 50, -6, 0 },
{ 3, -16, 77, 77, -16, 3 }, // New 1/2 pel 6 tap filter
{ 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */
{ 0, -6, 50, 93, -9, 0 },
{ 1, -8, 36, 108, -11, 2 }, // New 1/4 pel 6 tap filter
{ 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
{ 0, -1, 12, 123, -6, 0 },
@ -69,9 +69,9 @@ void vp8_filter_block2d_first_pass
((int)src_ptr[pixel_step] * vp8_filter[3]) +
((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
(VP8_FILTER_WEIGHT >> 1); // Rounding
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
// Normalize back to 0-255
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
@ -83,7 +83,7 @@ void vp8_filter_block2d_first_pass
src_ptr++;
}
// Next row...
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
@ -108,16 +108,16 @@ void vp8_filter_block2d_second_pass
{
for (j = 0; j < output_width; j++)
{
// Apply filter
/* Apply filter */
Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
((int)src_ptr[0] * vp8_filter[2]) +
((int)src_ptr[pixel_step] * vp8_filter[3]) +
((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
(VP8_FILTER_WEIGHT >> 1); // Rounding
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
// Normalize back to 0-255
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
@ -129,7 +129,7 @@ void vp8_filter_block2d_second_pass
src_ptr++;
}
// Start next row
/* Start next row */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_pitch;
}
@ -146,12 +146,12 @@ void vp8_filter_block2d
const short *VFilter
)
{
int FData[9*4]; // Temp data bufffer used in filtering
int FData[9*4]; /* Temp data bufffer used in filtering */
// First filter 1-D horizontally...
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
// then filter verticaly...
/* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
}
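
The calls above implement a separable 6-tap filter: the first pass filters horizontally into an intermediate buffer five rows taller than the output block (9, 13 or 21 rows), and the second pass filters that buffer vertically starting two rows in (FData + 8/16/32). The per-pixel arithmetic is the same in both passes; a standalone sketch of it is below, assuming the usual libvpx scaling in which each kernel sums to 128 (i.e. VP8_FILTER_WEIGHT 128 and VP8_FILTER_SHIFT 7). The half-pel taps are copied from the sub_pel_filters table above; the sample row is arbitrary.

    #include <stdio.h>

    /* One 6-tap filter application with libvpx-style rounding and clamping.
     * The weight/shift values are assumptions stated in the note above. */
    #define FILTER_WEIGHT 128
    #define FILTER_SHIFT    7

    static unsigned char sixtap_pixel(const unsigned char *src, int step,
                                      const short *taps)
    {
        int sum = src[-2 * step] * taps[0] +
                  src[-1 * step] * taps[1] +
                  src[ 0 * step] * taps[2] +
                  src[ 1 * step] * taps[3] +
                  src[ 2 * step] * taps[4] +
                  src[ 3 * step] * taps[5] +
                  (FILTER_WEIGHT >> 1);          /* rounding */

        sum >>= FILTER_SHIFT;                    /* normalize back to 0..255 */
        if (sum < 0)   sum = 0;
        if (sum > 255) sum = 255;
        return (unsigned char)sum;
    }

    int main(void)
    {
        static const short half_pel[6] = { 3, -16, 77, 77, -16, 3 };
        unsigned char row[12] = { 10, 20, 40, 80, 120, 160, 200, 220, 230, 235, 238, 240 };
        int x;

        /* filter the interior samples horizontally (step = 1) */
        for (x = 2; x < 9; x++)
            printf("%d ", sixtap_pixel(row + x, 1, half_pel));
        putchar('\n');
        return 0;
    }
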
@ -195,8 +195,8 @@ void vp8_sixtap_predict_c
const short *HFilter;
const short *VFilter;
HFilter = sub_pel_filters[xoffset]; // 6 tap
VFilter = sub_pel_filters[yoffset]; // 6 tap
HFilter = sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */
vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
}
@ -212,16 +212,16 @@ void vp8_sixtap_predict8x8_c
{
const short *HFilter;
const short *VFilter;
int FData[13*16]; // Temp data bufffer used in filtering
int FData[13*16]; /* Temp data bufffer used in filtering */
HFilter = sub_pel_filters[xoffset]; // 6 tap
VFilter = sub_pel_filters[yoffset]; // 6 tap
HFilter = sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */
// First filter 1-D horizontally...
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
// then filter verticaly...
/* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
}
@ -238,16 +238,16 @@ void vp8_sixtap_predict8x4_c
{
const short *HFilter;
const short *VFilter;
int FData[13*16]; // Temp data bufffer used in filtering
int FData[13*16]; /* Temp data bufffer used in filtering */
HFilter = sub_pel_filters[xoffset]; // 6 tap
VFilter = sub_pel_filters[yoffset]; // 6 tap
HFilter = sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */
// First filter 1-D horizontally...
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
// then filter verticaly...
/* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
}
@ -264,16 +264,16 @@ void vp8_sixtap_predict16x16_c
{
const short *HFilter;
const short *VFilter;
int FData[21*24]; // Temp data bufffer used in filtering
int FData[21*24]; /* Temp data bufffer used in filtering */
HFilter = sub_pel_filters[xoffset]; // 6 tap
VFilter = sub_pel_filters[yoffset]; // 6 tap
HFilter = sub_pel_filters[xoffset]; /* 6 tap */
VFilter = sub_pel_filters[yoffset]; /* 6 tap */
// First filter 1-D horizontally...
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
// then filter verticaly...
/* then filter verticaly... */
vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
}
@ -324,14 +324,14 @@ void vp8_filter_block2d_bil_first_pass
{
for (j = 0; j < output_width; j++)
{
// Apply bilinear filter
/* Apply bilinear filter */
output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[pixel_step] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
src_ptr++;
}
// Next row...
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
@ -384,7 +384,7 @@ void vp8_filter_block2d_bil_second_pass
{
for (j = 0; j < output_width; j++)
{
// Apply filter
/* Apply filter */
Temp = ((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[pixel_step] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2);
@ -392,7 +392,7 @@ void vp8_filter_block2d_bil_second_pass
src_ptr++;
}
// Next row...
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_pitch;
}
@ -432,12 +432,12 @@ void vp8_filter_block2d_bil
)
{
unsigned short FData[17*16]; // Temp data bufffer used in filtering
unsigned short FData[17*16]; /* Temp data bufffer used in filtering */
// First filter 1-D horizontally...
/* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter);
// then 1-D vertically...
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter);
}
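
The bilinear path repeats the same two-pass structure with two-tap kernels. Each bilinear_filters entry pairs two weights that sum to 128 (the table itself sits outside this hunk), so after the rounding shift the result already fits in 0..255 and no clamp is needed, which is why the first pass above has none. A tiny sketch, with the (112, 16) pair used purely as an illustrative entry:

    #include <stdio.h>

    /* Two-tap (bilinear) variant of the same rounding scheme.  The weights are
     * illustrative; the only property relied on is that they sum to 128. */
    static unsigned char bilinear_pixel(unsigned char a, unsigned char b,
                                        const int *taps)
    {
        return (unsigned char)((a * taps[0] + b * taps[1] + 64) >> 7);
    }

    int main(void)
    {
        static const int taps[2] = { 112, 16 };
        printf("%d\n", bilinear_pixel(100, 200, taps));   /* prints 113 */
        return 0;
    }
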

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -168,7 +168,7 @@ void vp8_find_near_mvs
vp8_clamp_mv(nearest, xd);
vp8_clamp_mv(nearby, xd);
vp8_clamp_mv(best_mv, xd); //TODO: move this up before the copy
vp8_clamp_mv(best_mv, xd); /*TODO: move this up before the copy*/
}
vp8_prob *vp8_mv_ref_probs(
@ -179,7 +179,7 @@ vp8_prob *vp8_mv_ref_probs(
p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1];
p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2];
p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3];
//p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];
/*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/
return p;
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -18,6 +18,7 @@
#include "onyxc_int.h"
extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
@ -39,9 +40,11 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->recon.copy16x16 = vp8_copy_mem16x16_c;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_c;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_c;
rtcd->recon.recon = vp8_recon_b_c;
rtcd->recon.recon = vp8_recon_b_c;
rtcd->recon.recon2 = vp8_recon2b_c;
rtcd->recon.recon4 = vp8_recon4b_c;
rtcd->recon.recon4 = vp8_recon4b_c;
rtcd->recon.recon_mb = vp8_recon_mb_c;
rtcd->recon.recon_mby = vp8_recon_mby_c;
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
@ -66,10 +69,11 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
rtcd->postproc.downacross = vp8_post_proc_down_and_across_c;
rtcd->postproc.addnoise = vp8_plane_add_noise_c;
rtcd->postproc.blend_mb = vp8_blend_mb_c;
#endif
#endif
// Pure C:
/* Pure C: */
vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
@ -77,4 +81,8 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
vp8_arch_x86_common_init(ctx);
#endif
#if ARCH_ARM
vp8_arch_arm_common_init(ctx);
#endif
}
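
The function above is the generic half of the run-time dispatch pattern: every entry point in the rtcd vtable first gets a portable _c default, then the #if ARCH_X86 / ARCH_ARM blocks let a platform-specific init overwrite individual pointers with optimized versions. A stripped-down, compilable sketch of that pattern (all names here are invented for illustration):

    #include <stdio.h>

    /* Toy version of the rtcd vtable: fill every slot with a C fallback,
     * then let an arch-specific step override what it can. */
    typedef void (*copy_fn)(const unsigned char *src, unsigned char *dst, int n);

    typedef struct {
        copy_fn copy16x16;
        copy_fn copy8x8;
    } toy_rtcd_vtable;

    static void copy_c(const unsigned char *src, unsigned char *dst, int n)
    { int i; for (i = 0; i < n; i++) dst[i] = src[i]; }

    #if defined(__SSE2__)
    static void copy_sse2(const unsigned char *src, unsigned char *dst, int n)
    { int i; for (i = 0; i < n; i++) dst[i] = src[i]; /* stand-in for a SIMD body */ }
    #endif

    static void toy_machine_specific_config(toy_rtcd_vtable *rtcd)
    {
        rtcd->copy16x16 = copy_c;      /* portable defaults first */
        rtcd->copy8x8   = copy_c;
    #if defined(__SSE2__)
        rtcd->copy16x16 = copy_sse2;   /* arch init overrides what it supports */
    #endif
    }

    int main(void)
    {
        toy_rtcd_vtable rtcd;
        unsigned char a[16] = { 1 }, b[16];
        toy_machine_specific_config(&rtcd);
        rtcd.copy16x16(a, b, 16);
        printf("%d\n", b[0]);          /* prints 1 */
        return 0;
    }
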

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -38,7 +38,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *
{
int i;
// do 2nd order transform on the dc block
/* do 2nd order transform on the dc block */
IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
recon_dcblock(x);
@ -68,7 +68,7 @@ void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x
if (x->mode_info_context->mbmi.mode != B_PRED &&
x->mode_info_context->mbmi.mode != SPLITMV)
{
// do 2nd order transform on the dc block
/* do 2nd order transform on the dc block */
IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
recon_dcblock(x);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -23,7 +23,7 @@ prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
// Horizontal MB filtering
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@ -47,7 +47,7 @@ void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
// Vertical MB Filtering
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@ -71,7 +71,7 @@ void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
}
// Horizontal B Filtering
/* Horizontal B Filtering */
void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@ -99,7 +99,7 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
// Vertical B Filtering
/* Vertical B Filtering */
void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
@ -140,7 +140,7 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
const int yhedge_boost = 2;
const int uvhedge_boost = 2;
// For each possible value for the loop filter fill out a "loop_filter_info" entry.
/* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
{
int filt_lvl = i;
@ -166,7 +166,7 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
HEVThresh = 0;
}
// Set loop filter paramaeters that control sharpness.
/* Set loop filter paramaeters that control sharpness. */
block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
@ -195,7 +195,7 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
}
// Set up the function pointers depending on the type of loop filtering selected
/* Set up the function pointers depending on the type of loop filtering selected */
if (lft == NORMAL_LOOPFILTER)
{
cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v);
@ -212,14 +212,15 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
}
}
// Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
// each frame. Check last_frame_type to skip the function most of times.
/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
* each frame. Check last_frame_type to skip the function most of times.
*/
void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
{
int HEVThresh;
int i, j;
// For each possible value for the loop filter fill out a "loop_filter_info" entry.
/* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
{
int filt_lvl = i;
@ -247,15 +248,15 @@ void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
for (j = 0; j < 16; j++)
{
//lfi[i].lim[j] = block_inside_limit;
//lfi[i].mbflim[j] = filt_lvl+yhedge_boost;
/*lfi[i].lim[j] = block_inside_limit;
lfi[i].mbflim[j] = filt_lvl+yhedge_boost;*/
lfi[i].mbthr[j] = HEVThresh;
//lfi[i].flim[j] = filt_lvl;
/*lfi[i].flim[j] = filt_lvl;*/
lfi[i].thr[j] = HEVThresh;
//lfi[i].uvlim[j] = block_inside_limit;
//lfi[i].uvmbflim[j] = filt_lvl+uvhedge_boost;
/*lfi[i].uvlim[j] = block_inside_limit;
lfi[i].uvmbflim[j] = filt_lvl+uvhedge_boost;*/
lfi[i].uvmbthr[j] = HEVThresh;
//lfi[i].uvflim[j] = filt_lvl;
/*lfi[i].uvflim[j] = filt_lvl;*/
lfi[i].uvthr[j] = HEVThresh;
}
}
@ -268,32 +269,32 @@ void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level)
if (mbd->mode_ref_lf_delta_enabled)
{
// Aplly delta for reference frame
/* Apply delta for reference frame */
*filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
// Apply delta for mode
/* Apply delta for mode */
if (mbmi->ref_frame == INTRA_FRAME)
{
// Only the split mode BPRED has a further special case
/* Only the split mode BPRED has a further special case */
if (mbmi->mode == B_PRED)
*filter_level += mbd->mode_lf_deltas[0];
}
else
{
// Zero motion mode
/* Zero motion mode */
if (mbmi->mode == ZEROMV)
*filter_level += mbd->mode_lf_deltas[1];
// Split MB motion mode
/* Split MB motion mode */
else if (mbmi->mode == SPLITMV)
*filter_level += mbd->mode_lf_deltas[3];
// All other inter motion modes (Nearest, Near, New)
/* All other inter motion modes (Nearest, Near, New) */
else
*filter_level += mbd->mode_lf_deltas[2];
}
// Range check
/* Range check */
if (*filter_level > MAX_LOOP_FILTER)
*filter_level = MAX_LOOP_FILTER;
else if (*filter_level < 0)
@ -311,7 +312,7 @@ void vp8_loop_filter_frame
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
loop_filter_info *lfi = cm->lf_info;
int frame_type = cm->frame_type;
FRAME_TYPE frame_type = cm->frame_type;
int mb_row;
int mb_col;
@ -324,21 +325,21 @@ void vp8_loop_filter_frame
int i;
unsigned char *y_ptr, *u_ptr, *v_ptr;
mbd->mode_info_context = cm->mi; // Point at base of Mb MODE_INFO list
mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
// Note the baseline filter values for each segment
/* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
// Abs value
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
// Delta Value
/* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
@ -348,18 +349,18 @@ void vp8_loop_filter_frame
baseline_filter_level[i] = default_filt_lvl;
}
// Initialize the loop filter for this frame.
/* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
// Set up the buffer pointers
/* Set up the buffer pointers */
y_ptr = post->y_buffer;
u_ptr = post->u_buffer;
v_ptr = post->v_buffer;
// vp8_filter each macro block
/* vp8_filter each macro block */
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
@ -368,9 +369,10 @@ void vp8_loop_filter_frame
filter_level = baseline_filter_level[Segment];
// Distance of Mb to the various image edges.
// These specified to 8th pel as they are always compared to values that are in 1/8th pel units
// Apply any context driven MB level adjustment
/* Distance of Mb to the various image edges.
* These specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
vp8_adjust_mb_lf_value(mbd, &filter_level);
if (filter_level)
@ -381,7 +383,7 @@ void vp8_loop_filter_frame
if (mbd->mode_info_context->mbmi.dc_diff > 0)
cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
// don't apply across umv border
/* don't apply across umv border */
if (mb_row > 0)
cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
@ -393,14 +395,14 @@ void vp8_loop_filter_frame
u_ptr += 8;
v_ptr += 8;
mbd->mode_info_context++; // step to next MB
mbd->mode_info_context++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
u_ptr += post->uv_stride * 8 - post->uv_width;
v_ptr += post->uv_stride * 8 - post->uv_width;
mbd->mode_info_context++; // Skip border mb
mbd->mode_info_context++; /* Skip border mb */
}
}
@ -424,26 +426,26 @@ void vp8_loop_filter_frame_yonly
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
int frame_type = cm->frame_type;
FRAME_TYPE frame_type = cm->frame_type;
(void) sharpness_lvl;
//MODE_INFO * this_mb_mode_info = cm->mi; // Point at base of Mb MODE_INFO list
mbd->mode_info_context = cm->mi; // Point at base of Mb MODE_INFO list
/*MODE_INFO * this_mb_mode_info = cm->mi;*/ /* Point at base of Mb MODE_INFO list */
mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
// Note the baseline filter values for each segment
/* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
// Abs value
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
// Delta Value
/* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
@ -453,16 +455,16 @@ void vp8_loop_filter_frame_yonly
baseline_filter_level[i] = default_filt_lvl;
}
// Initialize the loop filter for this frame.
/* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
// Set up the buffer pointers
/* Set up the buffer pointers */
y_ptr = post->y_buffer;
// vp8_filter each macro block
/* vp8_filter each macro block */
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
@ -470,7 +472,7 @@ void vp8_loop_filter_frame_yonly
int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
filter_level = baseline_filter_level[Segment];
// Apply any context driven MB level adjustment
/* Apply any context driven MB level adjustment */
vp8_adjust_mb_lf_value(mbd, &filter_level);
if (filter_level)
@ -481,7 +483,7 @@ void vp8_loop_filter_frame_yonly
if (mbd->mode_info_context->mbmi.dc_diff > 0)
cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
// don't apply across umv border
/* don't apply across umv border */
if (mb_row > 0)
cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
@ -490,12 +492,12 @@ void vp8_loop_filter_frame_yonly
}
y_ptr += 16;
mbd->mode_info_context ++; // step to next MB
mbd->mode_info_context ++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
mbd->mode_info_context ++; // Skip border mb
mbd->mode_info_context ++; /* Skip border mb */
}
}
@ -516,7 +518,7 @@ void vp8_loop_filter_partial_frame
unsigned char *y_ptr;
int mb_row;
int mb_col;
//int mb_rows = post->y_height >> 4;
/*int mb_rows = post->y_height >> 4;*/
int mb_cols = post->y_width >> 4;
int linestocopy;
@ -525,12 +527,12 @@ void vp8_loop_filter_partial_frame
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
int frame_type = cm->frame_type;
FRAME_TYPE frame_type = cm->frame_type;
(void) sharpness_lvl;
//MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1); // Point at base of Mb MODE_INFO list
mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); // Point at base of Mb MODE_INFO list
/*MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1);*/ /* Point at base of Mb MODE_INFO list */
mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); /* Point at base of Mb MODE_INFO list */
linestocopy = (post->y_height >> (4 + Fraction));
@ -539,19 +541,19 @@ void vp8_loop_filter_partial_frame
linestocopy <<= 4;
// Note the baseline filter values for each segment
/* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
// Abs value
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
// Delta Value
/* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; // Clamp to valid range
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
@ -561,16 +563,16 @@ void vp8_loop_filter_partial_frame
baseline_filter_level[i] = default_filt_lvl;
}
// Initialize the loop filter for this frame.
/* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
// Set up the buffer pointers
/* Set up the buffer pointers */
y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
// vp8_filter each macro block
/* vp8_filter each macro block */
for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++)
{
for (mb_col = 0; mb_col < mb_cols; mb_col++)
@ -593,10 +595,10 @@ void vp8_loop_filter_partial_frame
}
y_ptr += 16;
mbd->mode_info_context += 1; // step to next MB
mbd->mode_info_context += 1; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
mbd->mode_info_context += 1; // Skip border mb
mbd->mode_info_context += 1; /* Skip border mb */
}
}
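
Throughout this file the per-segment baseline level is derived the same way: taken directly from the segment data when mb_segement_abs_delta is SEGMENT_ABSDATA, otherwise added to default_filt_lvl and clamped to the legal range. Pulled out as a standalone helper, the thrice-repeated ternary reads more simply as the sketch below (MAX_LOOP_FILTER is assumed to be 63 here, and the SEGMENT_ABSDATA value is illustrative):

    #include <stdio.h>

    #define MAX_LOOP_FILTER 63   /* assumed for this sketch */
    #define SEGMENT_ABSDATA  1   /* illustrative value, not the real enum */

    static int baseline_filter_level(int abs_delta, int default_filt_lvl,
                                     int segment_alt_lf)
    {
        int level;

        if (abs_delta == SEGMENT_ABSDATA)
        {
            /* absolute per-segment value, used as-is (as in the code above) */
            level = segment_alt_lf;
        }
        else
        {
            /* delta on top of the frame default, clamped to the legal range */
            level = default_filt_lvl + segment_alt_lf;
            if (level < 0)               level = 0;
            if (level > MAX_LOOP_FILTER) level = MAX_LOOP_FILTER;
        }
        return level;
    }

    int main(void)
    {
        printf("%d\n", baseline_filter_level(0, 40, -10));  /* 30 */
        printf("%d\n", baseline_filter_level(0, 40,  50));  /* clamped to 63 */
        return 0;
    }
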

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -22,10 +22,10 @@ typedef enum
SIMPLE_LOOPFILTER = 1
} LOOPFILTERTYPE;
// FRK
// Need to align this structure so when it is declared and
// passed it can be loaded into vector registers.
// FRK
/* FRK
* Need to align this structure so when it is declared and
* passed it can be loaded into vector registers.
*/
typedef struct
{
DECLARE_ALIGNED(16, signed char, lim[16]);
@ -119,8 +119,8 @@ typedef struct
typedef void loop_filter_uvfunction
(
unsigned char *u, // source pointer
int p, // pitch
unsigned char *u, /* source pointer */
int p, /* pitch */
const signed char *flimit,
const signed char *limit,
const signed char *thresh,

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -17,8 +17,6 @@
#define __inline inline
#endif
#define NEW_LOOPFILTER_MASK
typedef unsigned char uc;
static __inline signed char vp8_signed_char_clamp(int t)
@ -29,7 +27,7 @@ static __inline signed char vp8_signed_char_clamp(int t)
}
// should we apply any filter at all ( 11111111 yes, 00000000 no)
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_filter_mask(signed char limit, signed char flimit,
uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)
{
@ -40,16 +38,12 @@ static __inline signed char vp8_filter_mask(signed char limit, signed char flimi
mask |= (abs(q1 - q0) > limit) * -1;
mask |= (abs(q2 - q1) > limit) * -1;
mask |= (abs(q3 - q2) > limit) * -1;
#ifndef NEW_LOOPFILTER_MASK
mask |= (abs(p0 - q0) > flimit) * -1;
#else
mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1;
#endif
mask = ~mask;
return mask;
}
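
The mask construction above is deliberately branch-free: each comparison yields 0 or 1, multiplying by -1 turns that into 0x00 or 0xFF in a signed char, the results are OR-ed together, and the final inversion leaves an all-ones mask only when every threshold test passed. A small, compilable demonstration of the idiom (simplified to a single limit; the real code uses separate limit/flimit thresholds):

    #include <stdio.h>
    #include <stdlib.h>

    /* Demonstrates the (condition) * -1 trick used by vp8_filter_mask(): the
     * result is 0x00 or 0xFF, which can later be AND-ed with the filter output
     * to enable or disable it without a branch. */
    int main(void)
    {
        int limit = 10;
        int p1 = 100, p0 = 105, q0 = 120, q1 = 122;
        signed char mask = 0;

        mask |= (abs(p1 - p0) > limit) * -1;   /*  5 > 10 ? no  -> 0x00 */
        mask |= (abs(q1 - q0) > limit) * -1;   /*  2 > 10 ? no  -> 0x00 */
        mask |= (abs(p0 - q0) > limit) * -1;   /* 15 > 10 ? yes -> 0xff */
        mask = ~mask;                          /* all-ones only if every test passed */

        printf("mask = 0x%02x\n", (unsigned char)mask);   /* prints 0x00 */
        return 0;
    }
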
// is there high variance internal edge ( 11111111 yes, 00000000 no)
/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
{
signed char hev = 0;
@ -71,17 +65,18 @@ static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *
qs0 = (signed char) * oq0 ^ 0x80;
qs1 = (signed char) * oq1 ^ 0x80;
// add outer taps if we have high edge variance
/* add outer taps if we have high edge variance */
vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
vp8_filter &= hev;
// inner taps
/* inner taps */
vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
vp8_filter &= mask;
// save bottom 3 bits so that we round one side +4 and the other +3
// if it equals 4 we'll set to adjust by -1 to account for the fact
// we'd round 3 the other way
/* save bottom 3 bits so that we round one side +4 and the other +3
* if it equals 4 we'll set to adjust by -1 to account for the fact
* we'd round 3 the other way
*/
Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
Filter1 >>= 3;
@ -92,7 +87,7 @@ static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *
*op0 = u ^ 0x80;
vp8_filter = Filter1;
// outer tap adjustments
/* outer tap adjustments */
vp8_filter += 1;
vp8_filter >>= 1;
vp8_filter &= ~hev;
@ -106,19 +101,20 @@ static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *
void vp8_loop_filter_horizontal_edge_c
(
unsigned char *s,
int p, //pitch
int p, /* pitch */
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count
)
{
int hev = 0; // high edge variance
int hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
/* loop filter designed to work using chars so that we can make maximum use
* of 8 bit simd instructions.
*/
do
{
mask = vp8_filter_mask(limit[i], flimit[i],
@ -144,12 +140,13 @@ void vp8_loop_filter_vertical_edge_c
int count
)
{
int hev = 0; // high edge variance
int hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
/* loop filter designed to work using chars so that we can make maximum use
* of 8 bit simd instructions.
*/
do
{
mask = vp8_filter_mask(limit[i], flimit[i],
@ -176,7 +173,7 @@ static __inline void vp8_mbfilter(signed char mask, signed char hev,
signed char qs1 = (signed char) * oq1 ^ 0x80;
signed char qs2 = (signed char) * oq2 ^ 0x80;
// add outer taps if we have high edge variance
/* add outer taps if we have high edge variance */
vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
vp8_filter &= mask;
@ -184,7 +181,7 @@ static __inline void vp8_mbfilter(signed char mask, signed char hev,
Filter2 = vp8_filter;
Filter2 &= hev;
// save bottom 3 bits so that we round one side +4 and the other +3
/* save bottom 3 bits so that we round one side +4 and the other +3 */
Filter1 = vp8_signed_char_clamp(Filter2 + 4);
Filter2 = vp8_signed_char_clamp(Filter2 + 3);
Filter1 >>= 3;
@ -193,25 +190,25 @@ static __inline void vp8_mbfilter(signed char mask, signed char hev,
ps0 = vp8_signed_char_clamp(ps0 + Filter2);
// only apply wider filter if not high edge variance
/* only apply wider filter if not high edge variance */
vp8_filter &= ~hev;
Filter2 = vp8_filter;
// roughly 3/7th difference across boundary
/* roughly 3/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
s = vp8_signed_char_clamp(qs0 - u);
*oq0 = s ^ 0x80;
s = vp8_signed_char_clamp(ps0 + u);
*op0 = s ^ 0x80;
// roughly 2/7th difference across boundary
/* roughly 2/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
s = vp8_signed_char_clamp(qs1 - u);
*oq1 = s ^ 0x80;
s = vp8_signed_char_clamp(ps1 + u);
*op1 = s ^ 0x80;
// roughly 1/7th difference across boundary
/* roughly 1/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
s = vp8_signed_char_clamp(qs2 - u);
*oq2 = s ^ 0x80;
@ -229,12 +226,13 @@ void vp8_mbloop_filter_horizontal_edge_c
int count
)
{
signed char hev = 0; // high edge variance
signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
/* loop filter designed to work using chars so that we can make maximum use
* of 8 bit simd instructions.
*/
do
{
@ -263,7 +261,7 @@ void vp8_mbloop_filter_vertical_edge_c
int count
)
{
signed char hev = 0; // high edge variance
signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
@ -283,17 +281,14 @@ void vp8_mbloop_filter_vertical_edge_c
}
// should we apply any filter at all ( 11111111 yes, 00000000 no)
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1)
{
// Why does this cause problems for win32?
// error C2143: syntax error : missing ';' before 'type'
// (void) limit;
#ifndef NEW_LOOPFILTER_MASK
signed char mask = (abs(p0 - q0) <= flimit) * -1;
#else
/* Why does this cause problems for win32?
* error C2143: syntax error : missing ';' before 'type'
* (void) limit;
*/
signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1;
#endif
return mask;
}
@ -310,7 +305,7 @@ static __inline void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *o
vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0));
vp8_filter &= mask;
// save bottom 3 bits so that we round one side +4 and the other +3
/* save bottom 3 bits so that we round one side +4 and the other +3 */
Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
Filter1 >>= 3;
u = vp8_signed_char_clamp(q0 - Filter1);
@ -338,7 +333,7 @@ void vp8_loop_filter_simple_horizontal_edge_c
do
{
//mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);
/*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);*/
mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s;
@ -362,7 +357,7 @@ void vp8_loop_filter_simple_vertical_edge_c
do
{
//mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);
/*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);*/
mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], s[1]);
vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
s += p;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -14,7 +14,7 @@
typedef enum
{
PRED = 0,
DEST = 1,
DEST = 1
} BLOCKSET;
void vp8_setup_block
@ -62,13 +62,13 @@ void vp8_setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
v = &x->pre.v_buffer;
}
for (block = 0; block < 16; block++) // y blocks
for (block = 0; block < 16; block++) /* y blocks */
{
vp8_setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
(block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs);
}
for (block = 16; block < 20; block++) // U and V blocks
for (block = 16; block < 20; block++) /* U and V blocks */
{
vp8_setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
@ -123,7 +123,7 @@ void vp8_setup_block_dptrs(MACROBLOCKD *x)
void vp8_build_block_doffsets(MACROBLOCKD *x)
{
// handle the destination pitch features
/* handle the destination pitch features */
vp8_setup_macroblock(x, DEST);
vp8_setup_macroblock(x, PRED);
}
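
The offset expressions above lay the sixteen 4x4 luma blocks out in raster order inside the macroblock ((block >> 2) selects the row of blocks, (block & 3) the column) and the chroma blocks 2x2 within an 8x8 plane. Printing the luma offsets for an assumed stride makes the layout obvious:

    #include <stdio.h>

    /* Reproduces the y-block offset formula from vp8_setup_macroblock() for an
     * assumed destination stride of 32 bytes: each block starts 4 pixels to the
     * right of its neighbour and 4 rows below the block above it. */
    int main(void)
    {
        const int y_stride = 32;   /* illustrative stride, not a libvpx constant */
        int block;

        for (block = 0; block < 16; block++)
        {
            int offset = (block >> 2) * 4 * y_stride + (block & 3) * 4;
            printf("%4d%c", offset, (block & 3) == 3 ? '\n' : ' ');
        }
        return 0;
    }
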

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -14,27 +14,27 @@
const int vp8_mode_contexts[6][4] =
{
{
// 0
/* 0 */
7, 1, 1, 143,
},
{
// 1
/* 1 */
14, 18, 14, 107,
},
{
// 2
/* 2 */
135, 64, 57, 68,
},
{
// 3
/* 3 */
60, 56, 128, 65,
},
{
// 4
/* 4 */
159, 134, 128, 34,
},
{
// 5
/* 5 */
234, 188, 128, 28,
},
};

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -14,133 +14,133 @@
const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =
{
{
//Above Mode : 0
{ 43438, 2195, 470, 316, 615, 171, 217, 412, 124, 160, }, // left_mode 0
{ 5722, 2751, 296, 291, 81, 68, 80, 101, 100, 170, }, // left_mode 1
{ 1629, 201, 307, 25, 47, 16, 34, 72, 19, 28, }, // left_mode 2
{ 332, 266, 36, 500, 20, 65, 23, 14, 154, 106, }, // left_mode 3
{ 450, 97, 10, 24, 117, 10, 2, 12, 8, 71, }, // left_mode 4
{ 384, 49, 29, 44, 12, 162, 51, 5, 87, 42, }, // left_mode 5
{ 495, 53, 157, 27, 14, 57, 180, 17, 17, 34, }, // left_mode 6
{ 695, 64, 62, 9, 27, 5, 3, 147, 10, 26, }, // left_mode 7
{ 230, 54, 20, 124, 16, 125, 29, 12, 283, 37, }, // left_mode 8
{ 260, 87, 21, 120, 32, 16, 33, 16, 33, 203, }, // left_mode 9
/*Above Mode : 0*/
{ 43438, 2195, 470, 316, 615, 171, 217, 412, 124, 160, }, /* left_mode 0 */
{ 5722, 2751, 296, 291, 81, 68, 80, 101, 100, 170, }, /* left_mode 1 */
{ 1629, 201, 307, 25, 47, 16, 34, 72, 19, 28, }, /* left_mode 2 */
{ 332, 266, 36, 500, 20, 65, 23, 14, 154, 106, }, /* left_mode 3 */
{ 450, 97, 10, 24, 117, 10, 2, 12, 8, 71, }, /* left_mode 4 */
{ 384, 49, 29, 44, 12, 162, 51, 5, 87, 42, }, /* left_mode 5 */
{ 495, 53, 157, 27, 14, 57, 180, 17, 17, 34, }, /* left_mode 6 */
{ 695, 64, 62, 9, 27, 5, 3, 147, 10, 26, }, /* left_mode 7 */
{ 230, 54, 20, 124, 16, 125, 29, 12, 283, 37, }, /* left_mode 8 */
{ 260, 87, 21, 120, 32, 16, 33, 16, 33, 203, }, /* left_mode 9 */
},
{
//Above Mode : 1
{ 3934, 2573, 355, 137, 128, 87, 133, 117, 37, 27, }, // left_mode 0
{ 1036, 1929, 278, 135, 27, 37, 48, 55, 41, 91, }, // left_mode 1
{ 223, 256, 253, 15, 13, 9, 28, 64, 3, 3, }, // left_mode 2
{ 120, 129, 17, 316, 15, 11, 9, 4, 53, 74, }, // left_mode 3
{ 129, 58, 6, 11, 38, 2, 0, 5, 2, 67, }, // left_mode 4
{ 53, 22, 11, 16, 8, 26, 14, 3, 19, 12, }, // left_mode 5
{ 59, 26, 61, 11, 4, 9, 35, 13, 8, 8, }, // left_mode 6
{ 101, 52, 40, 8, 5, 2, 8, 59, 2, 20, }, // left_mode 7
{ 48, 34, 10, 52, 8, 15, 6, 6, 63, 20, }, // left_mode 8
{ 96, 48, 22, 63, 11, 14, 5, 8, 9, 96, }, // left_mode 9
/*Above Mode : 1*/
{ 3934, 2573, 355, 137, 128, 87, 133, 117, 37, 27, }, /* left_mode 0 */
{ 1036, 1929, 278, 135, 27, 37, 48, 55, 41, 91, }, /* left_mode 1 */
{ 223, 256, 253, 15, 13, 9, 28, 64, 3, 3, }, /* left_mode 2 */
{ 120, 129, 17, 316, 15, 11, 9, 4, 53, 74, }, /* left_mode 3 */
{ 129, 58, 6, 11, 38, 2, 0, 5, 2, 67, }, /* left_mode 4 */
{ 53, 22, 11, 16, 8, 26, 14, 3, 19, 12, }, /* left_mode 5 */
{ 59, 26, 61, 11, 4, 9, 35, 13, 8, 8, }, /* left_mode 6 */
{ 101, 52, 40, 8, 5, 2, 8, 59, 2, 20, }, /* left_mode 7 */
{ 48, 34, 10, 52, 8, 15, 6, 6, 63, 20, }, /* left_mode 8 */
{ 96, 48, 22, 63, 11, 14, 5, 8, 9, 96, }, /* left_mode 9 */
},
{
//Above Mode : 2
{ 709, 461, 506, 36, 27, 33, 151, 98, 24, 6, }, // left_mode 0
{ 201, 375, 442, 27, 13, 8, 46, 58, 6, 19, }, // left_mode 1
{ 122, 140, 417, 4, 13, 3, 33, 59, 4, 2, }, // left_mode 2
{ 36, 17, 22, 16, 6, 8, 12, 17, 9, 21, }, // left_mode 3
{ 51, 15, 7, 1, 14, 0, 4, 5, 3, 22, }, // left_mode 4
{ 18, 11, 30, 9, 7, 20, 11, 5, 2, 6, }, // left_mode 5
{ 38, 21, 103, 9, 4, 12, 79, 13, 2, 5, }, // left_mode 6
{ 64, 17, 66, 2, 12, 4, 2, 65, 4, 5, }, // left_mode 7
{ 14, 7, 7, 16, 3, 11, 4, 13, 15, 16, }, // left_mode 8
{ 36, 8, 32, 9, 9, 4, 14, 7, 6, 24, }, // left_mode 9
/*Above Mode : 2*/
{ 709, 461, 506, 36, 27, 33, 151, 98, 24, 6, }, /* left_mode 0 */
{ 201, 375, 442, 27, 13, 8, 46, 58, 6, 19, }, /* left_mode 1 */
{ 122, 140, 417, 4, 13, 3, 33, 59, 4, 2, }, /* left_mode 2 */
{ 36, 17, 22, 16, 6, 8, 12, 17, 9, 21, }, /* left_mode 3 */
{ 51, 15, 7, 1, 14, 0, 4, 5, 3, 22, }, /* left_mode 4 */
{ 18, 11, 30, 9, 7, 20, 11, 5, 2, 6, }, /* left_mode 5 */
{ 38, 21, 103, 9, 4, 12, 79, 13, 2, 5, }, /* left_mode 6 */
{ 64, 17, 66, 2, 12, 4, 2, 65, 4, 5, }, /* left_mode 7 */
{ 14, 7, 7, 16, 3, 11, 4, 13, 15, 16, }, /* left_mode 8 */
{ 36, 8, 32, 9, 9, 4, 14, 7, 6, 24, }, /* left_mode 9 */
},
{
//Above Mode : 3
{ 1340, 173, 36, 119, 30, 10, 13, 10, 20, 26, }, // left_mode 0
{ 156, 293, 26, 108, 5, 16, 2, 4, 23, 30, }, // left_mode 1
{ 60, 34, 13, 7, 3, 3, 0, 8, 4, 5, }, // left_mode 2
{ 72, 64, 1, 235, 3, 9, 2, 7, 28, 38, }, // left_mode 3
{ 29, 14, 1, 3, 5, 0, 2, 2, 5, 13, }, // left_mode 4
{ 22, 7, 4, 11, 2, 5, 1, 2, 6, 4, }, // left_mode 5
{ 18, 14, 5, 6, 4, 3, 14, 0, 9, 2, }, // left_mode 6
{ 41, 10, 7, 1, 2, 0, 0, 10, 2, 1, }, // left_mode 7
{ 23, 19, 2, 33, 1, 5, 2, 0, 51, 8, }, // left_mode 8
{ 33, 26, 7, 53, 3, 9, 3, 3, 9, 19, }, // left_mode 9
/*Above Mode : 3*/
{ 1340, 173, 36, 119, 30, 10, 13, 10, 20, 26, }, /* left_mode 0 */
{ 156, 293, 26, 108, 5, 16, 2, 4, 23, 30, }, /* left_mode 1 */
{ 60, 34, 13, 7, 3, 3, 0, 8, 4, 5, }, /* left_mode 2 */
{ 72, 64, 1, 235, 3, 9, 2, 7, 28, 38, }, /* left_mode 3 */
{ 29, 14, 1, 3, 5, 0, 2, 2, 5, 13, }, /* left_mode 4 */
{ 22, 7, 4, 11, 2, 5, 1, 2, 6, 4, }, /* left_mode 5 */
{ 18, 14, 5, 6, 4, 3, 14, 0, 9, 2, }, /* left_mode 6 */
{ 41, 10, 7, 1, 2, 0, 0, 10, 2, 1, }, /* left_mode 7 */
{ 23, 19, 2, 33, 1, 5, 2, 0, 51, 8, }, /* left_mode 8 */
{ 33, 26, 7, 53, 3, 9, 3, 3, 9, 19, }, /* left_mode 9 */
},
{
//Above Mode : 4
{ 410, 165, 43, 31, 66, 15, 30, 54, 8, 17, }, // left_mode 0
{ 115, 64, 27, 18, 30, 7, 11, 15, 4, 19, }, // left_mode 1
{ 31, 23, 25, 1, 7, 2, 2, 10, 0, 5, }, // left_mode 2
{ 17, 4, 1, 6, 8, 2, 7, 5, 5, 21, }, // left_mode 3
{ 120, 12, 1, 2, 83, 3, 0, 4, 1, 40, }, // left_mode 4
{ 4, 3, 1, 2, 1, 2, 5, 0, 3, 6, }, // left_mode 5
{ 10, 2, 13, 6, 6, 6, 8, 2, 4, 5, }, // left_mode 6
{ 58, 10, 5, 1, 28, 1, 1, 33, 1, 9, }, // left_mode 7
{ 8, 2, 1, 4, 2, 5, 1, 1, 2, 10, }, // left_mode 8
{ 76, 7, 5, 7, 18, 2, 2, 0, 5, 45, }, // left_mode 9
/*Above Mode : 4*/
{ 410, 165, 43, 31, 66, 15, 30, 54, 8, 17, }, /* left_mode 0 */
{ 115, 64, 27, 18, 30, 7, 11, 15, 4, 19, }, /* left_mode 1 */
{ 31, 23, 25, 1, 7, 2, 2, 10, 0, 5, }, /* left_mode 2 */
{ 17, 4, 1, 6, 8, 2, 7, 5, 5, 21, }, /* left_mode 3 */
{ 120, 12, 1, 2, 83, 3, 0, 4, 1, 40, }, /* left_mode 4 */
{ 4, 3, 1, 2, 1, 2, 5, 0, 3, 6, }, /* left_mode 5 */
{ 10, 2, 13, 6, 6, 6, 8, 2, 4, 5, }, /* left_mode 6 */
{ 58, 10, 5, 1, 28, 1, 1, 33, 1, 9, }, /* left_mode 7 */
{ 8, 2, 1, 4, 2, 5, 1, 1, 2, 10, }, /* left_mode 8 */
{ 76, 7, 5, 7, 18, 2, 2, 0, 5, 45, }, /* left_mode 9 */
},
{
//Above Mode : 5
{ 444, 46, 47, 20, 14, 110, 60, 14, 60, 7, }, // left_mode 0
{ 59, 57, 25, 18, 3, 17, 21, 6, 14, 6, }, // left_mode 1
{ 24, 17, 20, 6, 4, 13, 7, 2, 3, 2, }, // left_mode 2
{ 13, 11, 5, 14, 4, 9, 2, 4, 15, 7, }, // left_mode 3
{ 8, 5, 2, 1, 4, 0, 1, 1, 2, 12, }, // left_mode 4
{ 19, 5, 5, 7, 4, 40, 6, 3, 10, 4, }, // left_mode 5
{ 16, 5, 9, 1, 1, 16, 26, 2, 10, 4, }, // left_mode 6
{ 11, 4, 8, 1, 1, 4, 4, 5, 4, 1, }, // left_mode 7
{ 15, 1, 3, 7, 3, 21, 7, 1, 34, 5, }, // left_mode 8
{ 18, 5, 1, 3, 4, 3, 7, 1, 2, 9, }, // left_mode 9
/*Above Mode : 5*/
{ 444, 46, 47, 20, 14, 110, 60, 14, 60, 7, }, /* left_mode 0 */
{ 59, 57, 25, 18, 3, 17, 21, 6, 14, 6, }, /* left_mode 1 */
{ 24, 17, 20, 6, 4, 13, 7, 2, 3, 2, }, /* left_mode 2 */
{ 13, 11, 5, 14, 4, 9, 2, 4, 15, 7, }, /* left_mode 3 */
{ 8, 5, 2, 1, 4, 0, 1, 1, 2, 12, }, /* left_mode 4 */
{ 19, 5, 5, 7, 4, 40, 6, 3, 10, 4, }, /* left_mode 5 */
{ 16, 5, 9, 1, 1, 16, 26, 2, 10, 4, }, /* left_mode 6 */
{ 11, 4, 8, 1, 1, 4, 4, 5, 4, 1, }, /* left_mode 7 */
{ 15, 1, 3, 7, 3, 21, 7, 1, 34, 5, }, /* left_mode 8 */
{ 18, 5, 1, 3, 4, 3, 7, 1, 2, 9, }, /* left_mode 9 */
},
{
//Above Mode : 6
{ 476, 149, 94, 13, 14, 77, 291, 27, 23, 3, }, // left_mode 0
{ 79, 83, 42, 14, 2, 12, 63, 2, 4, 14, }, // left_mode 1
{ 43, 36, 55, 1, 3, 8, 42, 11, 5, 1, }, // left_mode 2
{ 9, 9, 6, 16, 1, 5, 6, 3, 11, 10, }, // left_mode 3
{ 10, 3, 1, 3, 10, 1, 0, 1, 1, 4, }, // left_mode 4
{ 14, 6, 15, 5, 1, 20, 25, 2, 5, 0, }, // left_mode 5
{ 28, 7, 51, 1, 0, 8, 127, 6, 2, 5, }, // left_mode 6
{ 13, 3, 3, 2, 3, 1, 2, 8, 1, 2, }, // left_mode 7
{ 10, 3, 3, 3, 3, 8, 2, 2, 9, 3, }, // left_mode 8
{ 13, 7, 11, 4, 0, 4, 6, 2, 5, 8, }, // left_mode 9
/*Above Mode : 6*/
{ 476, 149, 94, 13, 14, 77, 291, 27, 23, 3, }, /* left_mode 0 */
{ 79, 83, 42, 14, 2, 12, 63, 2, 4, 14, }, /* left_mode 1 */
{ 43, 36, 55, 1, 3, 8, 42, 11, 5, 1, }, /* left_mode 2 */
{ 9, 9, 6, 16, 1, 5, 6, 3, 11, 10, }, /* left_mode 3 */
{ 10, 3, 1, 3, 10, 1, 0, 1, 1, 4, }, /* left_mode 4 */
{ 14, 6, 15, 5, 1, 20, 25, 2, 5, 0, }, /* left_mode 5 */
{ 28, 7, 51, 1, 0, 8, 127, 6, 2, 5, }, /* left_mode 6 */
{ 13, 3, 3, 2, 3, 1, 2, 8, 1, 2, }, /* left_mode 7 */
{ 10, 3, 3, 3, 3, 8, 2, 2, 9, 3, }, /* left_mode 8 */
{ 13, 7, 11, 4, 0, 4, 6, 2, 5, 8, }, /* left_mode 9 */
},
{
//Above Mode : 7
{ 376, 135, 119, 6, 32, 8, 31, 224, 9, 3, }, // left_mode 0
{ 93, 60, 54, 6, 13, 7, 8, 92, 2, 12, }, // left_mode 1
{ 74, 36, 84, 0, 3, 2, 9, 67, 2, 1, }, // left_mode 2
{ 19, 4, 4, 8, 8, 2, 4, 7, 6, 16, }, // left_mode 3
{ 51, 7, 4, 1, 77, 3, 0, 14, 1, 15, }, // left_mode 4
{ 7, 7, 5, 7, 4, 7, 4, 5, 0, 3, }, // left_mode 5
{ 18, 2, 19, 2, 2, 4, 12, 11, 1, 2, }, // left_mode 6
{ 129, 6, 27, 1, 21, 3, 0, 189, 0, 6, }, // left_mode 7
{ 9, 1, 2, 8, 3, 7, 0, 5, 3, 3, }, // left_mode 8
{ 20, 4, 5, 10, 4, 2, 7, 17, 3, 16, }, // left_mode 9
/*Above Mode : 7*/
{ 376, 135, 119, 6, 32, 8, 31, 224, 9, 3, }, /* left_mode 0 */
{ 93, 60, 54, 6, 13, 7, 8, 92, 2, 12, }, /* left_mode 1 */
{ 74, 36, 84, 0, 3, 2, 9, 67, 2, 1, }, /* left_mode 2 */
{ 19, 4, 4, 8, 8, 2, 4, 7, 6, 16, }, /* left_mode 3 */
{ 51, 7, 4, 1, 77, 3, 0, 14, 1, 15, }, /* left_mode 4 */
{ 7, 7, 5, 7, 4, 7, 4, 5, 0, 3, }, /* left_mode 5 */
{ 18, 2, 19, 2, 2, 4, 12, 11, 1, 2, }, /* left_mode 6 */
{ 129, 6, 27, 1, 21, 3, 0, 189, 0, 6, }, /* left_mode 7 */
{ 9, 1, 2, 8, 3, 7, 0, 5, 3, 3, }, /* left_mode 8 */
{ 20, 4, 5, 10, 4, 2, 7, 17, 3, 16, }, /* left_mode 9 */
},
{
//Above Mode : 8
{ 617, 68, 34, 79, 11, 27, 25, 14, 75, 13, }, // left_mode 0
{ 51, 82, 21, 26, 6, 12, 13, 1, 26, 16, }, // left_mode 1
{ 29, 9, 12, 11, 3, 7, 1, 10, 2, 2, }, // left_mode 2
{ 17, 19, 11, 74, 4, 3, 2, 0, 58, 13, }, // left_mode 3
{ 10, 1, 1, 3, 4, 1, 0, 2, 1, 8, }, // left_mode 4
{ 14, 4, 5, 5, 1, 13, 2, 0, 27, 8, }, // left_mode 5
{ 10, 3, 5, 4, 1, 7, 6, 4, 5, 1, }, // left_mode 6
{ 10, 2, 6, 2, 1, 1, 1, 4, 2, 1, }, // left_mode 7
{ 14, 8, 5, 23, 2, 12, 6, 2, 117, 5, }, // left_mode 8
{ 9, 6, 2, 19, 1, 6, 3, 2, 9, 9, }, // left_mode 9
/*Above Mode : 8*/
{ 617, 68, 34, 79, 11, 27, 25, 14, 75, 13, }, /* left_mode 0 */
{ 51, 82, 21, 26, 6, 12, 13, 1, 26, 16, }, /* left_mode 1 */
{ 29, 9, 12, 11, 3, 7, 1, 10, 2, 2, }, /* left_mode 2 */
{ 17, 19, 11, 74, 4, 3, 2, 0, 58, 13, }, /* left_mode 3 */
{ 10, 1, 1, 3, 4, 1, 0, 2, 1, 8, }, /* left_mode 4 */
{ 14, 4, 5, 5, 1, 13, 2, 0, 27, 8, }, /* left_mode 5 */
{ 10, 3, 5, 4, 1, 7, 6, 4, 5, 1, }, /* left_mode 6 */
{ 10, 2, 6, 2, 1, 1, 1, 4, 2, 1, }, /* left_mode 7 */
{ 14, 8, 5, 23, 2, 12, 6, 2, 117, 5, }, /* left_mode 8 */
{ 9, 6, 2, 19, 1, 6, 3, 2, 9, 9, }, /* left_mode 9 */
},
{
//Above Mode : 9
{ 680, 73, 22, 38, 42, 5, 11, 9, 6, 28, }, // left_mode 0
{ 113, 112, 21, 22, 10, 2, 8, 4, 6, 42, }, // left_mode 1
{ 44, 20, 24, 6, 5, 4, 3, 3, 1, 2, }, // left_mode 2
{ 40, 23, 7, 71, 5, 2, 4, 1, 7, 22, }, // left_mode 3
{ 85, 9, 4, 4, 17, 2, 0, 3, 2, 23, }, // left_mode 4
{ 13, 4, 2, 6, 1, 7, 0, 1, 7, 6, }, // left_mode 5
{ 26, 6, 8, 3, 2, 3, 8, 1, 5, 4, }, // left_mode 6
{ 54, 8, 9, 6, 7, 0, 1, 11, 1, 3, }, // left_mode 7
{ 9, 10, 4, 13, 2, 5, 4, 2, 14, 8, }, // left_mode 8
{ 92, 9, 5, 19, 15, 3, 3, 1, 6, 58, }, // left_mode 9
/*Above Mode : 9*/
{ 680, 73, 22, 38, 42, 5, 11, 9, 6, 28, }, /* left_mode 0 */
{ 113, 112, 21, 22, 10, 2, 8, 4, 6, 42, }, /* left_mode 1 */
{ 44, 20, 24, 6, 5, 4, 3, 3, 1, 2, }, /* left_mode 2 */
{ 40, 23, 7, 71, 5, 2, 4, 1, 7, 22, }, /* left_mode 3 */
{ 85, 9, 4, 4, 17, 2, 0, 3, 2, 23, }, /* left_mode 4 */
{ 13, 4, 2, 6, 1, 7, 0, 1, 7, 6, }, /* left_mode 5 */
{ 26, 6, 8, 3, 2, 3, 8, 1, 5, 4, }, /* left_mode 6 */
{ 54, 8, 9, 6, 7, 0, 1, 11, 1, 3, }, /* left_mode 7 */
{ 9, 10, 4, 13, 2, 5, 4, 2, 14, 8, }, /* left_mode 8 */
{ 92, 9, 5, 19, 15, 3, 3, 1, 6, 58, }, /* left_mode 9 */
},
};

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -21,9 +21,9 @@
#include "recon.h"
#include "postproc.h"
//#ifdef PACKET_TESTING
/*#ifdef PACKET_TESTING*/
#include "header.h"
//#endif
/*#endif*/
/* Create/destroy static data structures. */
@ -43,7 +43,7 @@ typedef struct frame_contexts
vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1];
MV_CONTEXT mvc[2];
MV_CONTEXT pre_mvc[2]; //not to caculate the mvcost for the frame if mvc doesn't change.
MV_CONTEXT pre_mvc[2]; /* not to caculate the mvcost for the frame if mvc doesn't change. */
} FRAME_CONTEXT;
typedef enum
@ -74,6 +74,7 @@ typedef struct VP8_COMMON_RTCD
vp8_subpix_rtcd_vtable_t subpix;
vp8_loopfilter_rtcd_vtable_t loopfilter;
vp8_postproc_rtcd_vtable_t postproc;
int flags;
#else
int unused;
#endif
@ -83,9 +84,9 @@ typedef struct VP8Common
{
struct vpx_internal_error_info error;
DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][4][4]);
DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][4][4]);
DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][4][4]);
DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
int Width;
int Height;
@ -104,7 +105,7 @@ typedef struct VP8Common
YV12_BUFFER_CONFIG post_proc_buffer;
YV12_BUFFER_CONFIG temp_scale_frame;
FRAME_TYPE last_frame_type; //Add to check if vp8_frame_init_loop_filter() can be skipped.
FRAME_TYPE last_frame_type; /* Add to check if vp8_frame_init_loop_filter() can be skipped. */
FRAME_TYPE frame_type;
int show_frame;
@ -115,7 +116,7 @@ typedef struct VP8Common
int mb_cols;
int mode_info_stride;
// prfile settings
/* profile settings */
int mb_no_coeff_skip;
int no_lpf;
int simpler_lpf;
@ -123,7 +124,7 @@ typedef struct VP8Common
int full_pixel;
int base_qindex;
int last_kf_gf_q; // Q used on the last GF or KF
int last_kf_gf_q; /* Q used on the last GF or KF */
int y1dc_delta_q;
int y2dc_delta_q;
@ -153,31 +154,31 @@ typedef struct VP8Common
int last_sharpness_level;
int sharpness_level;
int refresh_last_frame; // Two state 0 = NO, 1 = YES
int refresh_golden_frame; // Two state 0 = NO, 1 = YES
int refresh_alt_ref_frame; // Two state 0 = NO, 1 = YES
int refresh_last_frame; /* Two state 0 = NO, 1 = YES */
int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */
int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */
int copy_buffer_to_gf; // 0 none, 1 Last to GF, 2 ARF to GF
int copy_buffer_to_arf; // 0 none, 1 Last to ARF, 2 GF to ARF
int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */
int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */
int refresh_entropy_probs; // Two state 0 = NO, 1 = YES
int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */
int ref_frame_sign_bias[MAX_REF_FRAMES]; // Two state 0, 1
int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
// Y,U,V,Y2
ENTROPY_CONTEXT_PLANES *above_context; // row of context for each plane
ENTROPY_CONTEXT_PLANES left_context; // (up to) 4 contexts ""
/* Y,U,V,Y2 */
ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */
ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */
// keyframe block modes are predicted by their above, left neighbors
/* keyframe block modes are predicted by their above, left neighbors */
vp8_prob kf_bmode_prob [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1];
vp8_prob kf_ymode_prob [VP8_YMODES-1]; /* keyframe "" */
vp8_prob kf_uv_mode_prob [VP8_UV_MODES-1];
FRAME_CONTEXT lfc; // last frame entropy
FRAME_CONTEXT fc; // this frame entropy
FRAME_CONTEXT lfc; /* last frame entropy */
FRAME_CONTEXT fc; /* this frame entropy */
unsigned int current_video_frame;
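One note on the dequant tables above (an inference from the declarations, not stated in the commit): Y1dequant, Y2dequant and UVdequant are flattened from [QINDEX_RANGE][4][4] to [QINDEX_RANGE][16].
/* Inference only, not part of the commit: the flattened layout keeps the same
 * memory image and 16-byte alignment, so an element formerly addressed as
 * Y1dequant[q][r][c] now corresponds to Y1dequant[q][r * 4 + c]. */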

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -19,7 +19,35 @@
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
// global constants
#define RGB_TO_YUV(t) \
( (0.257*(float)(t>>16)) + (0.504*(float)(t>>8&0xff)) + (0.098*(float)(t&0xff)) + 16), \
(-(0.148*(float)(t>>16)) - (0.291*(float)(t>>8&0xff)) + (0.439*(float)(t&0xff)) + 128), \
( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
/* global constants */
static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
{
{ RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
{ RGB_TO_YUV(0x00FF00) }, /* Green */
{ RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */
{ RGB_TO_YUV(0x228B22) }, /* ForestGreen */
{ RGB_TO_YUV(0x006400) }, /* DarkGreen */
{ RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */
{ RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */
{ RGB_TO_YUV(0x00008B) }, /* Dark blue */
{ RGB_TO_YUV(0x551A8B) }, /* Purple */
{ RGB_TO_YUV(0xFF0000) } /* Red */
};
static const unsigned char MV_REFERENCE_FRAME_colors[MB_MODE_COUNT][3] =
{
{ RGB_TO_YUV(0x00ff00) }, /* Blue */
{ RGB_TO_YUV(0x0000ff) }, /* Green */
{ RGB_TO_YUV(0xffff00) }, /* Yellow */
{ RGB_TO_YUV(0xff0000) }, /* Red */
};
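The RGB_TO_YUV macro above is the standard BT.601 studio-swing RGB-to-YCbCr conversion, applied at compile time to the hex color constants in the two overlay tables. As a standalone sanity check (a sketch, not part of the libvpx sources), the same arithmetic in plain C:

#include <stdio.h>

/* Same coefficients as the RGB_TO_YUV macro above; illustrative only. */
static void rgb_to_yuv(unsigned int t, float yuv[3])
{
    float r = (float)(t >> 16), g = (float)(t >> 8 & 0xff), b = (float)(t & 0xff);
    yuv[0] =  0.257f * r + 0.504f * g + 0.098f * b + 16;   /* Y      */
    yuv[1] = -0.148f * r - 0.291f * g + 0.439f * b + 128;  /* U (Cb) */
    yuv[2] =  0.439f * r - 0.368f * g - 0.071f * b + 128;  /* V (Cr) */
}

int main(void)
{
    float yuv[3];
    rgb_to_yuv(0x98FB98, yuv); /* PaleGreen, the first MB_PREDICTION_MODE_colors entry */
    printf("Y=%.1f U=%.1f V=%.1f\n", yuv[0], yuv[1], yuv[2]); /* ~196.5 99.2 91.6 */
    return 0;
}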
static const short kernel5[] =
{
@ -76,7 +104,7 @@ const short vp8_rv[] =
extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
/***********************************************************************************************************
*/
void vp8_post_proc_down_and_across_c
@ -101,7 +129,7 @@ void vp8_post_proc_down_and_across_c
for (row = 0; row < rows; row++)
{
// post_proc_down for one row
/* post_proc_down for one row */
p_src = src_ptr;
p_dst = dst_ptr;
@ -124,7 +152,7 @@ void vp8_post_proc_down_and_across_c
p_dst[col] = v;
}
// now post_proc_across
/* now post_proc_across */
p_src = dst_ptr;
p_dst = dst_ptr;
@ -153,12 +181,12 @@ void vp8_post_proc_down_and_across_c
p_dst[col-2] = d[(col-2)&7];
}
//handle the last two pixels
/* handle the last two pixels */
p_dst[col-2] = d[(col-2)&7];
p_dst[col-1] = d[(col-1)&7];
//next row
/* next row */
src_ptr += pitch;
dst_ptr += pitch;
}
@ -351,9 +379,9 @@ static void fillrd(struct postproc_state *state, int q, int a)
sigma = ai + .5 + .6 * (63 - qi) / 63.0;
// set up a lookup table of 256 entries that matches
// a gaussian distribution with sigma determined by q.
//
/* set up a lookup table of 256 entries that matches
* a gaussian distribution with sigma determined by q.
*/
{
double i;
int next, j;
@ -444,6 +472,89 @@ void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
}
}
/* Blend the macro block with a solid colored square. Leave the
* edges unblended to give distinction to macro blocks in areas
* filled with the same color block.
*/
void vp8_blend_mb_c (unsigned char *y, unsigned char *u, unsigned char *v,
int y1, int u1, int v1, int alpha, int stride)
{
int i, j;
int y1_const = y1*((1<<16)-alpha);
int u1_const = u1*((1<<16)-alpha);
int v1_const = v1*((1<<16)-alpha);
y += stride + 2;
for (i = 0; i < 14; i++)
{
for (j = 0; j < 14; j++)
{
y[j] = (y[j]*alpha + y1_const)>>16;
}
y += stride;
}
stride >>= 1;
u += stride + 1;
v += stride + 1;
for (i = 0; i < 6; i++)
{
for (j = 0; j < 6; j++)
{
u[j] = (u[j]*alpha + u1_const)>>16;
v[j] = (v[j]*alpha + v1_const)>>16;
}
u += stride;
v += stride;
}
}
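The blend above is 16-bit fixed point, and the loops deliberately skip the outer edge of the macroblock so adjacent blocks drawn in the same color stay distinguishable. A minimal single-pixel sketch of the arithmetic (not from the commit):

/* Illustrative only: the per-pixel operation performed by vp8_blend_mb_c. */
static unsigned char blend_pixel(unsigned char src, unsigned char color, int alpha)
{
    return (unsigned char)((src * alpha + color * ((1 << 16) - alpha)) >> 16);
}
/* With alpha = 0xc000, as passed by the debug overlays below, the source pixel
 * keeps 3/4 of its weight:
 * blend_pixel(200, 64, 0xc000) == (200*49152 + 64*16384) >> 16 == 166. */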
static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int height)
{
int dx;
int dy;
if (*x1 > width)
{
dx = *x1 - x0;
dy = *y1 - y0;
*x1 = width;
if (dy)
*y1 = ((width-x0)*dy)/dx + y0;
}
if (*x1 < 0)
{
dx = *x1 - x0;
dy = *y1 - y0;
*x1 = 0;
if (dy)
*y1 = ((0-x0)*dy)/dx + y0;
}
if (*y1 > height)
{
dx = *x1 - x0;
dy = *y1 - y0;
*y1 = height;
if (dx)
*x1 = ((height-y0)*dx)/dy + x0;
}
if (*y1 < 0)
{
dx = *x1 - x0;
dy = *y1 - y0;
*y1 = 0;
if (dx)
*x1 = ((0-y0)*dx)/dy + x0;
}
}
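constrain_line() clips the free endpoint of a motion-vector line against each frame edge by similar triangles. A worked example with illustrative numbers only: clipping (x0, y0) = (600, 100) to (x1, y1) = (660, 130) against width = 640 gives dx = 60, dy = 30, so x1 is pinned to 640 and y1 becomes (640 - 600) * 30 / 60 + 100 = 120, preserving the slope of the original segment.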
#if CONFIG_RUNTIME_CPU_DETECT
#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
#else
@ -465,7 +576,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
{
*dest = *oci->frame_to_show;
// handle problem with extending borders
/* handle problem with extending borders */
dest->y_width = oci->Width;
dest->y_height = oci->Height;
dest->uv_height = dest->y_height / 2;
@ -521,7 +632,8 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
oci->mb_cols, oci->mb_rows);
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
}
else if (flags & VP8D_DEBUG_LEVEL2)
if (flags & VP8D_DEBUG_LEVEL2)
{
int i, j;
unsigned char *y_ptr;
@ -533,7 +645,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
// vp8_filter each macro block
/* vp8_filter each macro block */
for (i = 0; i < mb_rows; i++)
{
for (j = 0; j < mb_cols; j++)
@ -547,12 +659,13 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr += 16;
}
mb_index ++; //border
mb_index ++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
}
}
else if (flags & VP8D_DEBUG_LEVEL3)
if (flags & VP8D_DEBUG_LEVEL3)
{
int i, j;
unsigned char *y_ptr;
@ -564,7 +677,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
// vp8_filter each macro block
/* vp8_filter each macro block */
for (i = 0; i < mb_rows; i++)
{
for (j = 0; j < mb_cols; j++)
@ -581,12 +694,13 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr += 16;
}
mb_index ++; //border
mb_index ++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
}
}
else if (flags & VP8D_DEBUG_LEVEL4)
if (flags & VP8D_DEBUG_LEVEL4)
{
sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
@ -601,7 +715,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
// vp8_filter each macro block
/* vp8_filter each macro block */
for (i = 0; i < mb_rows; i++)
{
for (j = 0; j < mb_cols; j++)
@ -614,7 +728,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
y_ptr += 16;
}
mb_index ++; //border
mb_index ++; /* border */
y_ptr += post->y_stride * 16 - post->y_width;
}
@ -623,11 +737,122 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_l
}
/* Draw motion vectors */
if (flags & VP8D_DEBUG_LEVEL5)
{
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
int mb_cols = width >> 4;
unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
int y_stride = oci->post_proc_buffer.y_stride;
MODE_INFO *mi = oci->mi;
int x0, y0;
for (y0 = 8; y0 < (height + 8); y0 += 16)
{
for (x0 = 8; x0 < (width + 8); x0 += 16)
{
int x1, y1;
if (mi->mbmi.mode >= NEARESTMV)
{
MV *mv = &mi->mbmi.mv.as_mv;
x1 = x0 + (mv->col >> 3);
y1 = y0 + (mv->row >> 3);
if (x1 != x0 && y1 != y0)
{
constrain_line (x0, &x1, y0-1, &y1, width, height);
vp8_blit_line (x0, x1, y0-1, y1, y_buffer, y_stride);
constrain_line (x0, &x1, y0+1, &y1, width, height);
vp8_blit_line (x0, x1, y0+1, y1, y_buffer, y_stride);
}
else
vp8_blit_line (x0, x1, y0, y1, y_buffer, y_stride);
}
mi++;
}
mi++;
}
}
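The >> 3 in the motion-vector overlay above converts the stored MV components, which are in eighth-pel units in this code, to whole pixels before drawing; the pair of lines offset by one row simply thickens the stroke. Illustrative arithmetic only: a vector of (row = 16, col = 24) drawn from block centre (x0, y0) ends at (x0 + 24/8, y0 + 16/8) = (x0 + 3, y0 + 2), with constrain_line() keeping the endpoint inside the frame.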
/* Color in block modes */
if (flags & VP8D_DEBUG_LEVEL6)
{
int i, j;
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
int y_stride = oci->post_proc_buffer.y_stride;
MODE_INFO *mi = oci->mi;
for (i = 0; i < height; i += 16)
{
for (j = 0; j < width; j += 16)
{
int Y = 0, U = 0, V = 0;
Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)
(&y_ptr[j], &u_ptr[j>>1], &v_ptr[j>>1], Y, U, V, 0xc000, y_stride);
mi++;
}
y_ptr += y_stride*16;
u_ptr += y_stride*4;
v_ptr += y_stride*4;
mi++;
}
}
/* Color in frame reference blocks */
if (flags & VP8D_DEBUG_LEVEL7)
{
int i, j;
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
int y_stride = oci->post_proc_buffer.y_stride;
MODE_INFO *mi = oci->mi;
for (i = 0; i < height; i += 16)
{
for (j = 0; j < width; j +=16)
{
int Y = 0, U = 0, V = 0;
Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)
(&y_ptr[j], &u_ptr[j>>1], &v_ptr[j>>1], Y, U, V, 0xc000, y_stride);
mi++;
}
y_ptr += y_stride*16;
u_ptr += y_stride*4;
v_ptr += y_stride*4;
mi++;
}
}
*dest = oci->post_proc_buffer;
// handle problem with extending borders
/* handle problem with extending borders */
dest->y_width = oci->Width;
dest->y_height = oci->Height;
dest->uv_height = dest->y_height / 2;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -24,6 +24,10 @@
char whiteclamp[16], char bothclamp[16],\
unsigned int w, unsigned int h, int pitch)
#define prototype_postproc_blend_mb(sym)\
void sym (unsigned char *y, unsigned char *u, unsigned char *v,\
int y1, int u1, int v1, int alpha, int stride)
#if ARCH_X86 || ARCH_X86_64
#include "x86/postproc_x86.h"
#endif
@ -48,16 +52,22 @@ extern prototype_postproc(vp8_postproc_downacross);
#endif
extern prototype_postproc_addnoise(vp8_postproc_addnoise);
#ifndef vp8_postproc_blend_mb
#define vp8_postproc_blend_mb vp8_blend_mb_c
#endif
extern prototype_postproc_blend_mb(vp8_postproc_blend_mb);
typedef prototype_postproc((*vp8_postproc_fn_t));
typedef prototype_postproc_inplace((*vp8_postproc_inplace_fn_t));
typedef prototype_postproc_addnoise((*vp8_postproc_addnoise_fn_t));
typedef prototype_postproc_blend_mb((*vp8_postproc_blend_mb_fn_t));
typedef struct
{
vp8_postproc_inplace_fn_t down;
vp8_postproc_inplace_fn_t across;
vp8_postproc_fn_t downacross;
vp8_postproc_addnoise_fn_t addnoise;
vp8_postproc_blend_mb_fn_t blend_mb;
} vp8_postproc_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
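The #if CONFIG_RUNTIME_CPU_DETECT guard above selects between the vtable and direct calls to the C symbols. As a sketch of the dispatch, assuming POSTPROC_INVOKE follows the same shape as the other *_INVOKE macros in this tree (its definition is not shown in this hunk):

#if CONFIG_RUNTIME_CPU_DETECT
#define POSTPROC_INVOKE(ctx, fn) (ctx)->fn
#else
#define POSTPROC_INVOKE(ctx, fn) vp8_postproc_##fn
#endif
/* so POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb)(...) resolves to either
 * oci->rtcd.postproc.blend_mb or, via the vp8_postproc_blend_mb default
 * defined above, vp8_blend_mb_c, depending on the build. */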

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -14,13 +14,16 @@
enum
{
VP8D_NOFILTERING = 0,
VP8D_DEBLOCK = 1,
VP8D_DEMACROBLOCK = 2,
VP8D_ADDNOISE = 4,
VP8D_DEBUG_LEVEL1 = 8,
VP8D_DEBUG_LEVEL2 = 16,
VP8D_DEBUG_LEVEL3 = 32,
VP8D_DEBUG_LEVEL4 = 64
VP8D_DEBLOCK = 1<<0,
VP8D_DEMACROBLOCK = 1<<1,
VP8D_ADDNOISE = 1<<2,
VP8D_DEBUG_LEVEL1 = 1<<3,
VP8D_DEBUG_LEVEL2 = 1<<4,
VP8D_DEBUG_LEVEL3 = 1<<5,
VP8D_DEBUG_LEVEL4 = 1<<6,
VP8D_DEBUG_LEVEL5 = 1<<7,
VP8D_DEBUG_LEVEL6 = 1<<8,
VP8D_DEBUG_LEVEL7 = 1<<9
};
#endif
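With these definitions the debug levels become independent bit flags rather than mutually exclusive values, which is why the else-if chain in vp8_post_proc_frame was changed to plain ifs. A hypothetical caller-side sketch:

/* Illustrative only: with bit flags, several overlays can be requested at once. */
static void request_overlays(void)
{
    int ppflags = VP8D_DEBLOCK | VP8D_DEBUG_LEVEL5 | VP8D_DEBUG_LEVEL6;

    if (ppflags & VP8D_DEBUG_LEVEL5)
        ;   /* draw motion vectors */
    if (ppflags & VP8D_DEBUG_LEVEL6)
        ;   /* color in block modes */
}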

Some files were not shown because too many files changed in this diff.