Rough merge of master into experimental

Creates a merge between the master and experimental branches. Fixes a number of conflicts in the build system to allow *either* VP8 or VP9 to be built. Specifically either: $ configure --disable-vp9 $ configure --disable-vp8 --disable-unit-tests VP9 still exports its symbols and files as VP8, so that will be resolved in the next commit. Unit tests are broken in VP9, but this isn't a new issue. They are fixed upstream on origin/experimental as of this writing, but rebasing this merge proved difficult, so will tackle that in a second merge commit. Change-Id: I2b7d852c18efd58d1ebc621b8041fe0260442c21
2012-11-06 16:59:01 -08:00 · 2012-11-06 16:59:01 -08:00 · 7b8dfcb5a2
--- a/.gitignore
+++ b/.gitignore
@ -32,6 +32,8 @@
 /ivfdec.dox
 /ivfenc
 /ivfenc.dox
+/libvpx.so*
+/libvpx.ver
 /obj_int_extract
 /postproc
 /postproc.c
@ -43,12 +45,12 @@
 /simple_encoder
 /simple_encoder.c
 /simple_encoder.dox
+/test_libvpx
 /twopass_encoder
 /twopass_encoder.c
 /twopass_encoder.dox
 /vp8_api1_migration.dox
 /vp8_scalable_patterns
-/vp8_scalable_patterns.c
 /vp8_scalable_patterns.dox
 /vp8_set_maps
 /vp8_set_maps.c
@ -56,7 +58,14 @@
 /vp8cx_set_ref
 /vp8cx_set_ref.c
 /vp8cx_set_ref.dox
+/vpx.pc
 /vpx_config.c
 /vpx_config.h
+/vpx_rtcd.h
 /vpx_version.h
+/vpxdec
+/vpxenc
 TAGS
+.cproject
+.project
+.settings
--- a/.mailmap
+++ b/.mailmap
@ -3,3 +3,6 @@ Johann Koenig <johannkoenig@google.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
 Tom Finegan <tomfinegan@google.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Deb Mukherjee <debargha@google.com>
--- a/8
+++ b/8
@ -6,10 +6,12 @@ Adrian Grange <agrange@google.com>
 Alex Converse <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
+Alpha Lam <hclam@google.com>
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
+Deb Mukherjee <debargha@google.com>
 Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
@ -21,6 +23,7 @@ Henrik Lundin <hlundin@google.com>
 James Berry <jamesberry@google.com>
 James Zern <jzern@google.com>
 Jan Kratochvil <jan.kratochvil@redhat.com>
+Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
@ -28,9 +31,11 @@ John Koleszar <jkoleszar@google.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
+KO Myung-Hun <komh@chollian.net>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
 Makoto Kato <makoto.kt@gmail.com>
+Marco Paniconi <marpan@google.com>
 Martin Ettl <ettl.martin78@googlemail.com>
 Michael Kohler <michaelkohler@live.com>
 Mike Hommey <mhommey@mozilla.com>
@ -40,12 +45,15 @@ Patrik Westin <patrik.westin@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Philip Jägenstedt <philipj@opera.com>
+Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
+Rafaël Carré <funman@videolan.org>
 Ralph Giles <giles@xiph.org>
 Ronald S. Bultje <rbultje@google.com>
 Scott LaVarnway <slavarnway@google.com>
 Stefan Holmer <holmer@google.com>
 Taekhyun Kim <takim@nvidia.com>
+Takanori MATSUURA <t.matsuu@gmail.com>
 Tero Rintaluoma <teror@google.com>
 Thijs Vermeir <thijsvermeir@gmail.com>
 Timothy B. Terriberry <tterribe@xiph.org>
--- a/140
+++ b/140
@ -1,3 +1,143 @@
+2012-05-09 v1.1.0 "Eider"
+  This introduces a number of enhancements, mostly focused on real-time
+  encoding. In addition, it fixes a decoder bug (first introduced in
+  Duclair) so all users of that release are encouraged to upgrade.
+
+  - Upgrading:
+    This release is ABI and API compatible with Duclair (v1.0.0). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+    This release introduces a new temporal denoiser, controlled by the
+    VP8E_SET_NOISE_SENSITIVITY control. The temporal denoiser does not
+    currently take a strength parameter, so the control is effectively
+    a boolean - zero (off) or non-zero (on). For compatibility with
+    existing applications, the values accepted are the same as those
+    for the spatial denoiser (0-6). The temporal denoiser is enabled
+    by default, and the older spatial denoiser may be restored by
+    configuring with --disable-temporal-denoising. The temporal denoiser
+    is more computationally intensive than the spatial one.
+
+    This release removes support for a legacy, decode only API that was
+    supported, but deprecated, at the initial release of libvpx
+    (v0.9.0). This is not expected to have any impact. If you are
+    impacted, you can apply a reversion to commit 2bf8fb58 locally.
+    Please update to the latest libvpx API if you are affected.
+
+  - Enhancements:
+      Adds a motion compensated temporal denoiser to the encoder, which
+      gives higher quality than the older spatial denoiser. (See above
+      for notes on upgrading).
+
+      In addition, support for new compilers and platforms were added,
+      including:
+        improved support for XCode
+        Android x86 NDK build
+        OS/2 support
+        SunCC support
+
+      Changing resolution with vpx_codec_enc_config_set() is now
+      supported. Previously, reinitializing the codec was required to
+      change the input resolution.
+
+      The vpxenc application has initial support for producing multiple
+      encodes from the same input in one call. Resizing is not yet
+      supported, but varying other codec parameters is. Use -- to
+      delineate output streams. Options persist from one stream to the
+      next.
+
+      Also, the vpxenc application will now use a keyframe interval of
+      5 seconds by default. Use the --kf-max-dist option to override.
+
+  - Speed:
+      Decoder performance improved 2.5% versus Duclair. Encoder speed is
+      consistent with Duclair for most material. Two pass encoding of
+      slideshow-like material will see significant improvements.
+
+      Large realtime encoding speed gains at a small quality expense are
+      possible by configuring the on-the-fly bitpacking experiment with
+      --enable-onthefly-bitpacking. Realtime encoder can be up to 13%
+      faster (ARM) depending on the number of threads and bitrate
+      settings. This technique sees constant gain over the 5-16 speed
+      range. For VC style input the loss seen is up to 0.2dB. See commit
+      52cf4dca for further details.
+
+  - Quality:
+      On the whole, quality is consistent with the Duclair release. Some
+      tweaks:
+
+        Reduced blockiness in easy sections by applying a penalty to
+        intra modes.
+
+        Improved quality of static sections (like slideshows) with
+        two pass encoding.
+
+        Improved keyframe sizing with multiple temporal layers
+
+  - Bug Fixes:
+      Corrected alt-ref contribution to frame rate for visible updates
+      to the alt-ref buffer. This affected applications making manual
+      usage of the frame reference flags, or temporal layers.
+
+      Additional constraints were added to disable multi-frame quality
+      enhancement (MFQE) in sections of the frame where there is motion.
+      (#392)
+
+      Fixed corruption issues when vpx_codec_enc_config_set() was called
+      with spatial resampling enabled.
+
+      Fixed a decoder error introduced in Duclair where the segmentation
+      map was not being reinitialized on keyframes (#378)
+
+
+2012-01-27 v1.0.0 "Duclair"
+  Our fourth named release, focused on performance and features related to
+  real-time encoding. It also fixes a decoder crash bug introduced in
+  v0.9.7, so all users of that release are encouraged to upgrade.
+
+  - Upgrading:
+      This release is ABI incompatible with prior releases of libvpx, so the
+      "major" version number has been bumped to 1. You must recompile your
+      applications against the latest version of the libvpx headers. The
+      API remains compatible, and this should not require code changes in most
+      applications.
+
+  - Enhancements:
+      This release introduces several substantial new features to the encoder,
+      of particular interest to real time streaming applications.
+
+      Temporal scalability allows the encoder to produce a stream that can
+      be decimated to different frame rates, with independent rate targetting
+      for each substream.
+
+      Multiframe quality enhancement postprocessing can make visual quality
+      more consistent in the presence of frames that are substantially
+      different quality than the surrounding frames, as in the temporal
+      scalability case and in some forced keyframe scenarios.
+
+      Multiple-resolution encoding support allows the encoding of the
+      same content at different resolutions faster than encoding them
+      separately.
+
+  - Speed:
+      Optimization targets for this release included the decoder and the real-
+      time modes of the encoder. Decoder speed on x86 has improved 10.5% with
+      this release. Encoder improvements followed a curve where speeds 1-3
+      improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved
+      1.5% to 10.5%, respectively. "Best" mode speed is consistent with the
+      Cayuga release.
+
+  - Quality:
+      Encoder quality in the single stream case is consistent with the Cayuga
+      release.
+
+  - Bug Fixes:
+      This release fixes an OOB read decoder crash bug present in v0.9.7
+      related to the clamping of motion vectors in SPLITMV blocks. This
+      behavior could be triggered by corrupt input or by starting
+      decoding from a P-frame.
+
+
 2011-08-15 v0.9.7-p1 "Cayuga" patch 1
  This is an incremental bugfix release against Cayuga. All users of that
  release are strongly encouraged to upgrade.
--- a/9
+++ b/9
@ -1,4 +1,4 @@
-Copyright (c) 2010, Google Inc. All rights reserved.
+Copyright (c) 2010, The WebM Project authors. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@ -12,9 +12,10 @@ met:
    the documentation and/or other materials provided with the
    distribution.

-  * Neither the name of Google nor the names of its contributors may
-    be used to endorse or promote products derived from this software
-    without specific prior written permission.
+  * Neither the name of Google, nor the WebM Project, nor the names
+    of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written
+    permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
--- a/20
+++ b/20
@ -1,5 +1,5 @@
 vpx Multi-Format Codec SDK
-README - 19 May 2010
+README - 21 June 2012

 Welcome to the WebM VP8 Codec SDK!

@ -15,11 +15,19 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    * Building the documentation requires PHP[3] and Doxygen[4]. If you do not
      have these packages, you must pass --disable-install-docs to the
      configure script.
+    * Downloading the data for the unit tests requires curl[5] and sha1sum.
+      sha1sum is provided via the GNU coreutils, installed by default on
+      many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
+      available, a compatible version of sha1sum can be built from
+      source[6]. These requirements are optional if not running the unit
+      tests.

    [1]: http://www.tortall.net/projects/yasm
    [2]: http://www.cygwin.com
    [3]: http://php.net
    [4]: http://www.doxygen.org
+    [5]: http://curl.haxx.se
+    [6]: http://www.microbrew.org/tools/md5sha1sum/

  2. Out-of-tree builds
  Out of tree builds are a supported method of building the application. For
@ -42,17 +50,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:
  --help output of the configure script. As of this writing, the list of
  available targets is:

+    armv5te-android-gcc
    armv5te-linux-rvct
    armv5te-linux-gcc
-    armv5te-symbian-gcc
    armv6-darwin-gcc
    armv6-linux-rvct
    armv6-linux-gcc
-    armv6-symbian-gcc
-    iwmmxt-linux-rvct
-    iwmmxt-linux-gcc
-    iwmmxt2-linux-rvct
-    iwmmxt2-linux-gcc
+    armv7-android-gcc
    armv7-linux-rvct
    armv7-linux-gcc
    mips32-linux-gcc
@ -98,5 +102,5 @@ COMPILING THE APPLICATIONS/LIBRARIES:

 SUPPORT
  This library is an open source project supported by its community. Please
-  please email webm-users@webmproject.org for help.
+  please email webm-discuss@webmproject.org for help.

--- a/122
+++ b/122
@ -20,27 +20,36 @@ show_help(){
    show_help_pre
    cat << EOF
 Advanced options:
-  ${toggle_libs}                  don't build libraries
-  ${toggle_examples}              don't build examples
-  ${toggle_unit_tests}            build unit tests
+  ${toggle_libs}                  libraries
+  ${toggle_examples}              examples
+  ${toggle_docs}                  documentation
+  ${toggle_unit_tests}            unit tests
  --libc=PATH                     path to alternate libc
  --as={yasm|nasm|auto}           use specified assembler [auto, yasm preferred]
+  --sdk-path=PATH                 path to root of sdk (iOS, android builds only)
  ${toggle_fast_unaligned}        don't use unaligned accesses, even when
                                  supported by hardware [auto]
  ${toggle_codec_srcs}            in/exclude codec library source code
  ${toggle_debug_libs}            in/exclude debug version of libraries
  ${toggle_md5}                   support for output of checksum data
  ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
+  ${toggle_vp8}                   VP8 codec support
  ${toggle_vp9}                   VP9 codec support
  ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
  ${toggle_mem_tracker}           track memory usage
  ${toggle_postproc}              postprocessing
+  ${toggle_multithread}           multithreaded encoding and decoding
  ${toggle_spatial_resampling}    spatial sampling (scaling) support
+  ${toggle_realtime_only}         enable this option while building for real-time encoding
+  ${toggle_onthefly_bitpacking}   enable on-the-fly bitpacking in real-time encoding
+  ${toggle_error_concealment}     enable this option to get a decoder which is able to conceal losses
  ${toggle_runtime_cpu_detect}    runtime cpu detection
  ${toggle_shared}                shared library support
  ${toggle_static}                static library support
  ${toggle_small}                 favor smaller size over speed
  ${toggle_postproc_visualizer}   macro block / block level visualizers
+  ${toggle_multi_res_encoding}    enable multiple-resolution encoding
+  ${toggle_temporal_denoising}    enable temporal denoising and disable the spatial denoiser

 Codecs:
  Codecs can be selectively enabled or disabled individually, or by family:
@ -76,19 +85,15 @@ EOF

 # all_platforms is a list of all supported target platforms. Maintain
 # alphabetically by architecture, generic-gnu last.
+all_platforms="${all_platforms} armv5te-android-gcc"
 all_platforms="${all_platforms} armv5te-linux-rvct"
 all_platforms="${all_platforms} armv5te-linux-gcc"
 all_platforms="${all_platforms} armv5te-none-rvct"
-all_platforms="${all_platforms} armv5te-symbian-gcc"
 all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
-all_platforms="${all_platforms} armv6-symbian-gcc"
-all_platforms="${all_platforms} iwmmxt-linux-rvct"
-all_platforms="${all_platforms} iwmmxt-linux-gcc"
-all_platforms="${all_platforms} iwmmxt2-linux-rvct"
-all_platforms="${all_platforms} iwmmxt2-linux-gcc"
+all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
@ -105,8 +110,12 @@ all_platforms="${all_platforms} x86-darwin8-gcc"
 all_platforms="${all_platforms} x86-darwin8-icc"
 all_platforms="${all_platforms} x86-darwin9-gcc"
 all_platforms="${all_platforms} x86-darwin9-icc"
+all_platforms="${all_platforms} x86-darwin10-gcc"
+all_platforms="${all_platforms} x86-darwin11-gcc"
+all_platforms="${all_platforms} x86-darwin12-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
+all_platforms="${all_platforms} x86-os2-gcc"
 all_platforms="${all_platforms} x86-solaris-gcc"
 all_platforms="${all_platforms} x86-win32-gcc"
 all_platforms="${all_platforms} x86-win32-vs7"
@ -115,13 +124,18 @@ all_platforms="${all_platforms} x86-win32-vs9"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-darwin10-gcc"
 all_platforms="${all_platforms} x86_64-darwin11-gcc"
+all_platforms="${all_platforms} x86_64-darwin12-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
+all_platforms="${all_platforms} x86_64-win64-gcc"
 all_platforms="${all_platforms} x86_64-win64-vs8"
 all_platforms="${all_platforms} x86_64-win64-vs9"
 all_platforms="${all_platforms} universal-darwin8-gcc"
 all_platforms="${all_platforms} universal-darwin9-gcc"
+all_platforms="${all_platforms} universal-darwin10-gcc"
+all_platforms="${all_platforms} universal-darwin11-gcc"
+all_platforms="${all_platforms} universal-darwin12-gcc"
 all_platforms="${all_platforms} generic-gnu"

 # all_targets is a list of all targets that can be configured
@ -158,20 +172,29 @@ enable optimizations
 enable fast_unaligned #allow unaligned accesses, if supported by hw
 enable md5
 enable spatial_resampling
+enable multithread
 enable os_support
+enable temporal_denoising

 [ -d ${source_path}/../include ] && enable alt_tree_layout
-for d in vp9; do
+for d in vp8 vp9; do
    [ -d ${source_path}/${d} ] && disable alt_tree_layout;
 done

 if ! enabled alt_tree_layout; then
 # development environment
+[ -d ${source_path}/vp8 ] && CODECS="${CODECS} vp8_encoder vp8_decoder"
 [ -d ${source_path}/vp9 ] && CODECS="${CODECS} vp9_encoder vp9_decoder"
 else
 # customer environment
-[ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp9_encoder"
-[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp9_decoder"
+[ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp8_encoder"
+[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder"
+[ -f ${source_path}/../include/vpx/vp9cx.h ] && CODECS="${CODECS} vp9_encoder"
+[ -f ${source_path}/../include/vpx/vp9dx.h ] && CODECS="${CODECS} vp9_decoder"
+[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable vp8_encoder
+[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable vp8_decoder
+[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable vp9_encoder
+[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable vp9_decoder

 [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt
 fi
@ -188,13 +211,12 @@ ARCH_LIST="
    ppc64
 "
 ARCH_EXT_LIST="
-    armv5te
-    armv6
-    armv7
-    iwmmxt
-    iwmmxt2
+    edsp
+    media
+    neon

    mips32
+    dspr2

    mmx
    sse
@ -252,6 +274,7 @@ CONFIG_LIST="
    dc_recon
    runtime_cpu_detect
    postproc
+    multithread
    internal_stats
    ${CODECS}
    ${CODEC_FAMILIES}
@ -259,12 +282,17 @@ CONFIG_LIST="
    decoders
    static_msvcrt
    spatial_resampling
+    realtime_only
+    onthefly_bitpacking
+    error_concealment
    shared
    static
    small
    postproc_visualizer
    os_support
    unit_tests
+    multi_res_encoding
+    temporal_denoising
    experimental
    ${EXPERIMENT_LIST}
 "
@ -285,6 +313,7 @@ CMDLINE_SELECT="

    libs
    examples
+    docs
    libc
    as
    fast_unaligned
@ -295,17 +324,23 @@ CMDLINE_SELECT="
    dequant_tokens
    dc_recon
    postproc
+    multithread
    internal_stats
    ${CODECS}
    ${CODEC_FAMILIES}
    static_msvcrt
    mem_tracker
    spatial_resampling
+    realtime_only
+    onthefly_bitpacking
+    error_concealment
    shared
    static
    small
    postproc_visualizer
    unit_tests
+    multi_res_encoding
+    temporal_denoising
    experimental
 "

@ -394,6 +429,7 @@ process_targets() {
    enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
    enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
    ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost"
+    ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
    ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
    DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
    case "${tgt_os}" in
@ -448,6 +484,18 @@ EOF
 }

 process_detect() {
+    if enabled shared; then
+        # Can only build shared libs on a subset of platforms. Doing this check
+        # here rather than at option parse time because the target auto-detect
+        # magic happens after the command line has been parsed.
+        if ! enabled linux; then
+            if enabled gnu; then
+                echo "--enable-shared is only supported on ELF; assuming this is OK"
+            else
+                die "--enable-shared only supported on ELF for now"
+            fi
+        fi
+    fi
    if [ -z "$CC" ]; then
        echo "Bypassing toolchain for environment detection."
        enable external_build
@ -492,11 +540,20 @@ process_toolchain() {
    case $toolchain in
        universal-darwin*)
            local darwin_ver=${tgt_os##darwin}
-            fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc"

-            # Intel
-            fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}"
-            if [ $darwin_ver -gt 8 ]; then
+            # Snow Leopard (10.6/darwin10) dropped support for PPC
+            # Include PPC support for all prior versions
+            if [ $darwin_ver -lt 10 ]; then
+                fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc"
+            fi
+
+            # Tiger (10.4/darwin8) brought support for x86
+            if [ $darwin_ver -ge 8 ]; then
+                fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}"
+            fi
+
+            # Leopard (10.5/darwin9) brought 64 bit support
+            if [ $darwin_ver -ge 9 ]; then
                fat_bin_archs="$fat_bin_archs x86_64-${tgt_os}-${tgt_cc}"
            fi
            ;;
@ -512,8 +569,11 @@ process_toolchain() {
        check_add_cflags -Wpointer-arith
        check_add_cflags -Wtype-limits
        check_add_cflags -Wcast-qual
-        check_add_cflags -Wundef
        check_add_cflags -Wvla
+        check_add_cflags -Wimplicit-function-declaration
+        check_add_cflags -Wuninitialized
+        check_add_cflags -Wunused-variable
+        check_add_cflags -Wunused-but-set-variable
        enabled extra_warnings || check_add_cflags -Wno-unused-function
    fi

@ -568,6 +628,21 @@ process_toolchain() {
    if enabled postproc_visualizer; then
        enabled postproc || die "postproc_visualizer requires postproc to be enabled"
    fi
+
+    # Enable unit tests if we have a working C++ compiler
+    case "$toolchain" in
+        *-vs*)
+            soft_enable unit_tests
+        ;;
+        *-android-*)
+            # GTestLog must be modified to use Android logging utilities.
+        ;;
+        *)
+            check_cxx "$@" <<EOF && soft_enable unit_tests
+int z;
+EOF
+        ;;
+    esac
 }


@ -576,7 +651,8 @@ process_toolchain() {
 ##
 CONFIGURE_ARGS="$@"
 process "$@"
-cat <<EOF > ${BUILD_PFX}vpx_config.c
+print_webm_license ${BUILD_PFX}vpx_config.c "/*" " */"
+cat <<EOF >> ${BUILD_PFX}vpx_config.c
 static const char* const cfg = "$CONFIGURE_ARGS";
 const char *vpx_codec_build_config(void) {return cfg;}
 EOF
--- a/docs.mk
+++ b/docs.mk
@ -21,9 +21,6 @@ CODEC_DOX :=    mainpage.dox \
 		usage_dx.dox \

 # Other doxy files sourced in Markdown
-TXT_DOX-$(CONFIG_VP9)          += vp8_api1_migration.dox
-vp8_api1_migration.dox.DESC     = VP8 API 1.x Migration
-
 TXT_DOX = $(call enabled,TXT_DOX)

 %.dox: %.txt
--- a/examples.mk
+++ b/examples.mk
@ -16,7 +16,7 @@ UTILS-$(CONFIG_DECODERS)    += vpxdec.c
 vpxdec.SRCS                 += md5_utils.c md5_utils.h
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
-vpxdec.SRCS                 += args.c args.h vpx_ports/config.h
+vpxdec.SRCS                 += args.c args.h
 vpxdec.SRCS                 += tools_common.c tools_common.h
 vpxdec.SRCS                 += nestegg/halloc/halloc.h
 vpxdec.SRCS                 += nestegg/halloc/src/align.h
@ -30,13 +30,17 @@ vpxdec.DESCRIPTION           = Full featured decoder
 UTILS-$(CONFIG_ENCODERS)    += vpxenc.c
 vpxenc.SRCS                 += args.c args.h y4minput.c y4minput.h
 vpxenc.SRCS                 += tools_common.c tools_common.h
-vpxenc.SRCS                 += vpx_ports/config.h vpx_ports/mem_ops.h
+vpxenc.SRCS                 += vpx_ports/mem_ops.h
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
+vpxenc.SRCS                 += vpx_ports/vpx_timer.h
 vpxenc.SRCS                 += libmkv/EbmlIDs.h
 vpxenc.SRCS                 += libmkv/EbmlWriter.c
 vpxenc.SRCS                 += libmkv/EbmlWriter.h
 vpxenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
 vpxenc.DESCRIPTION           = Full featured encoder
+UTILS-$(CONFIG_ENCODERS)    += vp8_scalable_patterns.c
+vp8_scalable_patterns.GUID   = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
+vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder

 # Clean up old ivfenc, ivfdec binaries.
 ifeq ($(CONFIG_MSVS),yes)
@ -77,29 +81,44 @@ GEN_EXAMPLES-$(CONFIG_ENCODERS) += decode_with_drops.c
 endif
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
+ifeq ($(CONFIG_DECODERS),yes)
+GEN_EXAMPLES-$(CONFIG_ERROR_CONCEALMENT) += decode_with_partial_drops.c
+endif
+decode_with_partial_drops.GUID           = 61C2D026-5754-46AC-916F-1343ECC5537E
+decode_with_partial_drops.DESCRIPTION    = Drops parts of frames while decoding
 GEN_EXAMPLES-$(CONFIG_ENCODERS) += error_resilient.c
 error_resilient.GUID             = DF5837B9-4145-4F92-A031-44E4F832E00C
 error_resilient.DESCRIPTION      = Error Resiliency Feature

-GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8_scalable_patterns.c
-vp8_scalable_patterns.GUID          = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
-vp8_scalable_patterns.DESCRIPTION   = VP8 Scalable Bitstream Patterns
-GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8_set_maps.c
+GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_set_maps.c
 vp8_set_maps.GUID                   = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
 vp8_set_maps.DESCRIPTION            = VP8 set active and ROI maps
-GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8cx_set_ref.c
+GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c
 vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame

+# C file is provided, not generated automatically.
+UTILS-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c
+vp8_multi_resolution_encoder.SRCS  \
+                         += third_party/libyuv/include/libyuv/basic_types.h  \
+                            third_party/libyuv/include/libyuv/cpu_id.h  \
+                            third_party/libyuv/include/libyuv/scale.h  \
+                            third_party/libyuv/source/row.h \
+                            third_party/libyuv/source/scale.c  \
+                            third_party/libyuv/source/cpu_id.c
+vp8_multi_resolution_encoder.GUID         = 04f8738e-63c8-423b-90fa-7c2703a374de
+vp8_multi_resolution_encoder.DESCRIPTION  = VP8 Multiple-resolution Encoding

 # Handle extra library flags depending on codec configuration

 # We should not link to math library (libm) on RVCT
 # when building for bare-metal targets
 ifeq ($(CONFIG_OS_SUPPORT), yes)
+CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
 CODEC_EXTRA_LIBS-$(CONFIG_VP9)         += m
 else
    ifeq ($(CONFIG_GCC), yes)
+    CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
    CODEC_EXTRA_LIBS-$(CONFIG_VP9)         += m
    endif
 endif
@ -117,6 +136,8 @@ ifeq ($(HAVE_ALT_TREE_LAYOUT),yes)
    INC_PATH := $(SRC_PATH_BARE)/../include
 else
    LIB_PATH-yes                     += $(if $(BUILD_PFX),$(BUILD_PFX),.)
+    INC_PATH-$(CONFIG_VP8_DECODER)   += $(SRC_PATH_BARE)/vp8
+    INC_PATH-$(CONFIG_VP8_ENCODER)   += $(SRC_PATH_BARE)/vp8
    INC_PATH-$(CONFIG_VP9_DECODER)   += $(SRC_PATH_BARE)/vp9
    INC_PATH-$(CONFIG_VP9_ENCODER)   += $(SRC_PATH_BARE)/vp9
    LIB_PATH := $(call enabled,LIB_PATH)
@ -152,12 +173,12 @@ $(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_OBJS,BUILD_OBJS):=yes)
 # Create build/install dependencies for all examples. The common case
 # is handled here. The MSVS case is handled below.
 NOT_MSVS = $(if $(CONFIG_MSVS),,yes)
-DIST-BINS-$(NOT_MSVS)      += $(addprefix bin/,$(ALL_EXAMPLES:.c=))
-INSTALL-BINS-$(NOT_MSVS)   += $(addprefix bin/,$(UTILS:.c=))
+DIST-BINS-$(NOT_MSVS)      += $(addprefix bin/,$(ALL_EXAMPLES:.c=$(EXE_SFX)))
+INSTALL-BINS-$(NOT_MSVS)   += $(addprefix bin/,$(UTILS:.c=$(EXE_SFX)))
 DIST-SRCS-yes              += $(ALL_SRCS)
 INSTALL-SRCS-yes           += $(UTIL_SRCS)
 OBJS-$(NOT_MSVS)           += $(if $(BUILD_OBJS),$(call objs,$(ALL_SRCS)))
-BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=))
+BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=$(EXE_SFX)))


 # Instantiate linker template for all examples.
@ -168,7 +189,7 @@ $(foreach bin,$(BINS-yes),\
    $(if $(BUILD_OBJS),$(eval $(bin):\
        $(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF)))\
    $(if $(BUILD_OBJS),$(eval $(call linker_template,$(bin),\
-        $(call objs,$($(notdir $(bin)).SRCS)) \
+        $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
        -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\
        )))\
    $(if $(LIPO_OBJS),$(eval $(call lipo_bin_template,$(bin))))\
--- a/examples/decoder_tmpl.txt
+++ b/examples/decoder_tmpl.txt
@ -48,8 +48,8 @@ for(plane=0; plane < 3; plane++) {
    unsigned char *buf =img->planes[plane];

    for(y=0; y < (plane ? (img->d_h + 1) >> 1 : img->d_h); y++) {
-        if(fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w),
-           outfile));
+        (void) fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w),
+                      outfile);
        buf += img->stride[plane];
    }
 }
--- a/examples/encoder_tmpl.c
+++ b/examples/encoder_tmpl.c
@ -85,7 +85,7 @@ static void write_ivf_file_header(FILE *outfile,
    mem_put_le32(header+24, frame_cnt);           /* length */
    mem_put_le32(header+28, 0);                   /* unused */

-    if(fwrite(header, 1, 32, outfile));
+    (void) fwrite(header, 1, 32, outfile);
 }


@ -103,7 +103,7 @@ static void write_ivf_frame_header(FILE *outfile,
    mem_put_le32(header+4, pts&0xFFFFFFFF);
    mem_put_le32(header+8, pts >> 32);

-    if(fwrite(header, 1, 12, outfile));
+    (void) fwrite(header, 1, 12, outfile);
 }

 int main(int argc, char **argv) {
--- a/examples/encoder_tmpl.txt
+++ b/examples/encoder_tmpl.txt
@ -61,13 +61,14 @@ if(vpx_codec_encode(&codec, frame_avail? &raw : NULL, frame_cnt,
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS_FRAME
 case VPX_CODEC_CX_FRAME_PKT:
    write_ivf_frame_header(outfile, pkt);
-    if(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
-              outfile));
+    (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                  outfile);
    break;
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS_FRAME


 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY
+vpx_img_free(&raw);
 if(vpx_codec_destroy(&codec))
    die_codec(&codec, "Failed to destroy codec");
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY
--- a/examples/postproc.txt
+++ b/examples/postproc.txt
@ -58,7 +58,7 @@ if(frame_cnt%30 == 1) {
    if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
        die_codec(&codec, "Failed to turn off postproc");
 } else if(frame_cnt%30 == 16) {
-    vp8_postproc_cfg_t  pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK, 4, 0};
+    vp8_postproc_cfg_t  pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4, 0};

    if(vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
        die_codec(&codec, "Failed to turn on postproc");
--- a/examples/twopass_encoder.txt
+++ b/examples/twopass_encoder.txt
@ -71,5 +71,17 @@ Pass Progress Reporting
 It's sometimes helpful to see when each pass completes.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TWOPASS_LOOP_END
    printf("Pass %d complete.\n", pass+1);
+    if(vpx_codec_destroy(&codec))
+        die_codec(&codec, "Failed to destroy codec");
 }
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TWOPASS_LOOP_END
+
+
+Clean-up
+-----------------------------
+Destruction of the encoder instance must be done on each pass. The
+raw image should be destroyed at the end as usual.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY
+vpx_img_free(&raw);
+free(stats.buf);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY
--- a/examples/vp8_scalable_patterns.txt
+++ b/examples/vp8_scalable_patterns.txt
@ -1,143 +0,0 @@
-@TEMPLATE encoder_tmpl.c
-VP8 Scalable Frame Patterns
-===========================
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
-This is an example demonstrating how to control the VP8 encoder's
-reference frame selection and update mechanism for video applications
-that benefit from a scalable bitstream.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
-
-
-Configuration
-------------
-Scalable frame patterns are most useful in an error resilient context,
-so error resiliency mode is enabled, as in the `error_resilient.c`
-example. In addition, we want to disable automatic keyframe selection,
-so we force an interval of 1000 frames.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  ENC_SET_CFG2
-
-/* Enable error resilient mode */
-cfg.g_error_resilient = 1;
-cfg.g_lag_in_frames   = 0;
-cfg.kf_mode           = VPX_KF_FIXED;
-
-/* Disable automatic keyframe placement */
-cfg.kf_min_dist = cfg.kf_max_dist = 1000;
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_SET_CFG2
-
-This example uses the following frame pattern (L->last_frame,
-G->golden_frame, A->alt_ref_frame):
-
-*  Frame  0  Intra, use none,  update L&G&A
-*  Frame  1  Inter, use LGA,   update none
-*  Frame  2  Inter, use LGA,   update L
-*  Frame  3  Inter, use LGA,   update none
-*  Frame  4  Inter, use GA,    update L&G
-*  Frame  5  Inter, use LGA,   update none
-*  Frame  6  Inter, use LGA,   update L
-*  Frame  7  Inter, use LGA,   update none
-*  Frame  8  Inter, use A,     update L&G&A
-*  Frame  9  Inter, use LGA,   update none
-*  Frame 10  Inter, use LGA,   update L
-*  Frame 11  Inter, use LGA,   update none
-*  Frame 12  Inter, use GA,    update L&G
-*  Frame 13  Inter, use LGA,   update none
-*  Frame 14  Inter, use LGA,   update L
-*  Frame 15  Inter, use LGA,   update none
-*  ...Repeats the pattern from frame 0
-
-Change this variable to test the 3 decodable streams case.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TWOPASS_VARS
-int                  num_streams = 5;
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TWOPASS_VARS
-
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PER_FRAME_CFG
-flags = 0;
-if(num_streams == 5)
-{
-    switch(frame_cnt % 16) {
-        case 0:
-            flags |= VPX_EFLAG_FORCE_KF;
-            flags |= VP8_EFLAG_FORCE_GF;
-            flags |= VP8_EFLAG_FORCE_ARF;
-            break;
-        case 1:
-        case 3:
-        case 5:
-        case 7:
-        case 9:
-        case 11:
-        case 13:
-        case 15:
-            flags |= VP8_EFLAG_NO_UPD_LAST;
-            flags |= VP8_EFLAG_NO_UPD_GF;
-            flags |= VP8_EFLAG_NO_UPD_ARF;
-            break;
-        case 2:
-        case 6:
-        case 10:
-        case 14:
-            break;
-        case 4:
-            flags |= VP8_EFLAG_NO_REF_LAST;
-            flags |= VP8_EFLAG_FORCE_GF;
-            break;
-        case 8:
-            flags |= VP8_EFLAG_NO_REF_LAST;
-            flags |= VP8_EFLAG_NO_REF_GF;
-            flags |= VP8_EFLAG_FORCE_GF;
-            flags |= VP8_EFLAG_FORCE_ARF;
-            break;
-        case 12:
-            flags |= VP8_EFLAG_NO_REF_LAST;
-            flags |= VP8_EFLAG_FORCE_GF;
-            break;
-    }
-}
-else
-{
-    switch(frame_cnt % 9) {
-        case 0:
-            if(frame_cnt==0)
-            {
-                flags |= VPX_EFLAG_FORCE_KF;
-            }
-            else
-            {
-                cfg.rc_max_quantizer = 26;
-                cfg.rc_min_quantizer = 0;
-                cfg.rc_target_bitrate = 300;
-                flags |= VP8_EFLAG_NO_REF_LAST;
-                flags |= VP8_EFLAG_NO_REF_ARF;
-            }
-            flags |= VP8_EFLAG_FORCE_GF;
-            flags |= VP8_EFLAG_FORCE_ARF;
-            break;
-        case 1:
-        case 2:
-        case 4:
-        case 5:
-        case 7:
-        case 8:
-            cfg.rc_max_quantizer = 45;
-            cfg.rc_min_quantizer = 0;
-            cfg.rc_target_bitrate = 230;
-            break;
-        case 3:
-        case 6:
-            cfg.rc_max_quantizer = 45;
-            cfg.rc_min_quantizer = 0;
-            cfg.rc_target_bitrate = 215;
-            flags |= VP8_EFLAG_NO_REF_LAST;
-            flags |= VP8_EFLAG_FORCE_ARF;
-            break;
-    }
-}
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PER_FRAME_CFG
-
-Observing The Effects
---------------------
-Use the `decode_with_drops` example to decode with various dropped frame
-patterns. Good patterns to start with are 1/2, 3/4, 7/8, and 15/16
-drops.
--- a/libmkv/EbmlIDs.h
+++ b/libmkv/EbmlIDs.h
@ -1,16 +1,16 @@
-// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the LICENSE file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS.  All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-
-
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
 #ifndef MKV_DEFS_HPP
 #define MKV_DEFS_HPP 1

-// Commenting out values not available in webm, but available in matroska
+/* Commenting out values not available in webm, but available in matroska */

 enum mkv {
  EBML = 0x1A45DFA3,
@ -21,7 +21,7 @@ enum mkv {
  DocType = 0x4282,
  DocTypeVersion = 0x4287,
  DocTypeReadVersion = 0x4285,
-//  CRC_32 = 0xBF,
+/* CRC_32 = 0xBF, */
  Void = 0xEC,
  SignatureSlot = 0x1B538667,
  SignatureAlgo = 0x7E8A,
@ -31,61 +31,61 @@ enum mkv {
  SignatureElements = 0x7E5B,
  SignatureElementList = 0x7E7B,
  SignedElement = 0x6532,
-  // segment
+  /* segment */
  Segment = 0x18538067,
-  // Meta Seek Information
+  /* Meta Seek Information */
  SeekHead = 0x114D9B74,
  Seek = 0x4DBB,
  SeekID = 0x53AB,
  SeekPosition = 0x53AC,
-  // Segment Information
+  /* Segment Information */
  Info = 0x1549A966,
-//  SegmentUID = 0x73A4,
-//  SegmentFilename = 0x7384,
-//  PrevUID = 0x3CB923,
-//  PrevFilename = 0x3C83AB,
-//  NextUID = 0x3EB923,
-//  NextFilename = 0x3E83BB,
-//  SegmentFamily = 0x4444,
-//  ChapterTranslate = 0x6924,
-//  ChapterTranslateEditionUID = 0x69FC,
-//  ChapterTranslateCodec = 0x69BF,
-//  ChapterTranslateID = 0x69A5,
+/* SegmentUID = 0x73A4, */
+/* SegmentFilename = 0x7384, */
+/* PrevUID = 0x3CB923, */
+/* PrevFilename = 0x3C83AB, */
+/* NextUID = 0x3EB923, */
+/* NextFilename = 0x3E83BB, */
+/* SegmentFamily = 0x4444, */
+/* ChapterTranslate = 0x6924, */
+/* ChapterTranslateEditionUID = 0x69FC, */
+/* ChapterTranslateCodec = 0x69BF, */
+/* ChapterTranslateID = 0x69A5, */
  TimecodeScale = 0x2AD7B1,
  Segment_Duration = 0x4489,
  DateUTC = 0x4461,
-//  Title = 0x7BA9,
+/* Title = 0x7BA9, */
  MuxingApp = 0x4D80,
  WritingApp = 0x5741,
-  // Cluster
+  /* Cluster */
  Cluster = 0x1F43B675,
  Timecode = 0xE7,
-//  SilentTracks = 0x5854,
-//  SilentTrackNumber = 0x58D7,
-//  Position = 0xA7,
+/* SilentTracks = 0x5854, */
+/* SilentTrackNumber = 0x58D7, */
+/* Position = 0xA7, */
  PrevSize = 0xAB,
  BlockGroup = 0xA0,
  Block = 0xA1,
-//  BlockVirtual = 0xA2,
-//  BlockAdditions = 0x75A1,
-//  BlockMore = 0xA6,
-//  BlockAddID = 0xEE,
-//  BlockAdditional = 0xA5,
+/* BlockVirtual = 0xA2, */
+/* BlockAdditions = 0x75A1, */
+/* BlockMore = 0xA6, */
+/* BlockAddID = 0xEE, */
+/* BlockAdditional = 0xA5, */
  BlockDuration = 0x9B,
-//  ReferencePriority = 0xFA,
+/* ReferencePriority = 0xFA, */
  ReferenceBlock = 0xFB,
-//  ReferenceVirtual = 0xFD,
-//  CodecState = 0xA4,
-//  Slices = 0x8E,
-//  TimeSlice = 0xE8,
+/* ReferenceVirtual = 0xFD, */
+/* CodecState = 0xA4, */
+/* Slices = 0x8E, */
+/* TimeSlice = 0xE8, */
  LaceNumber = 0xCC,
-//  FrameNumber = 0xCD,
-//  BlockAdditionID = 0xCB,
-//  MkvDelay = 0xCE,
-//  Cluster_Duration = 0xCF,
+/* FrameNumber = 0xCD, */
+/* BlockAdditionID = 0xCB, */
+/* MkvDelay = 0xCE, */
+/* Cluster_Duration = 0xCF, */
  SimpleBlock = 0xA3,
-//  EncryptedBlock = 0xAF,
-  // Track
+/* EncryptedBlock = 0xAF, */
+  /* Track */
  Tracks = 0x1654AE6B,
  TrackEntry = 0xAE,
  TrackNumber = 0xD7,
@ -95,28 +95,28 @@ enum mkv {
  FlagDefault = 0x88,
  FlagForced = 0x55AA,
  FlagLacing = 0x9C,
-//  MinCache = 0x6DE7,
-//  MaxCache = 0x6DF8,
+/* MinCache = 0x6DE7, */
+/* MaxCache = 0x6DF8, */
  DefaultDuration = 0x23E383,
-//  TrackTimecodeScale = 0x23314F,
-//  TrackOffset = 0x537F,
-//  MaxBlockAdditionID = 0x55EE,
+/* TrackTimecodeScale = 0x23314F, */
+/* TrackOffset = 0x537F, */
+/* MaxBlockAdditionID = 0x55EE, */
  Name = 0x536E,
  Language = 0x22B59C,
  CodecID = 0x86,
  CodecPrivate = 0x63A2,
  CodecName = 0x258688,
-//  AttachmentLink = 0x7446,
-//  CodecSettings = 0x3A9697,
-//  CodecInfoURL = 0x3B4040,
-//  CodecDownloadURL = 0x26B240,
-//  CodecDecodeAll = 0xAA,
-//  TrackOverlay = 0x6FAB,
-//  TrackTranslate = 0x6624,
-//  TrackTranslateEditionUID = 0x66FC,
-//  TrackTranslateCodec = 0x66BF,
-//  TrackTranslateTrackID = 0x66A5,
-  // video
+/* AttachmentLink = 0x7446, */
+/* CodecSettings = 0x3A9697, */
+/* CodecInfoURL = 0x3B4040, */
+/* CodecDownloadURL = 0x26B240, */
+/* CodecDecodeAll = 0xAA, */
+/* TrackOverlay = 0x6FAB, */
+/* TrackTranslate = 0x6624, */
+/* TrackTranslateEditionUID = 0x66FC, */
+/* TrackTranslateCodec = 0x66BF, */
+/* TrackTranslateTrackID = 0x66A5, */
+  /* video */
  Video = 0xE0,
  FlagInterlaced = 0x9A,
  StereoMode = 0x53B8,
@ -130,101 +130,101 @@ enum mkv {
  DisplayHeight = 0x54BA,
  DisplayUnit = 0x54B2,
  AspectRatioType = 0x54B3,
-//  ColourSpace = 0x2EB524,
-//  GammaValue = 0x2FB523,
+/* ColourSpace = 0x2EB524, */
+/* GammaValue = 0x2FB523, */
  FrameRate = 0x2383E3,
-  // end video
-  // audio
+  /* end video */
+  /* audio */
  Audio = 0xE1,
  SamplingFrequency = 0xB5,
  OutputSamplingFrequency = 0x78B5,
  Channels = 0x9F,
-//  ChannelPositions = 0x7D7B,
+/* ChannelPositions = 0x7D7B, */
  BitDepth = 0x6264,
-  // end audio
-  // content encoding
-//  ContentEncodings = 0x6d80,
-//  ContentEncoding = 0x6240,
-//  ContentEncodingOrder = 0x5031,
-//  ContentEncodingScope = 0x5032,
-//  ContentEncodingType = 0x5033,
-//  ContentCompression = 0x5034,
-//  ContentCompAlgo = 0x4254,
-//  ContentCompSettings = 0x4255,
-//  ContentEncryption = 0x5035,
-//  ContentEncAlgo = 0x47e1,
-//  ContentEncKeyID = 0x47e2,
-//  ContentSignature = 0x47e3,
-//  ContentSigKeyID = 0x47e4,
-//  ContentSigAlgo = 0x47e5,
-//  ContentSigHashAlgo = 0x47e6,
-  // end content encoding
-  // Cueing Data
+  /* end audio */
+  /* content encoding */
+/* ContentEncodings = 0x6d80, */
+/* ContentEncoding = 0x6240, */
+/* ContentEncodingOrder = 0x5031, */
+/* ContentEncodingScope = 0x5032, */
+/* ContentEncodingType = 0x5033, */
+/* ContentCompression = 0x5034, */
+/* ContentCompAlgo = 0x4254, */
+/* ContentCompSettings = 0x4255, */
+/* ContentEncryption = 0x5035, */
+/* ContentEncAlgo = 0x47e1, */
+/* ContentEncKeyID = 0x47e2, */
+/* ContentSignature = 0x47e3, */
+/* ContentSigKeyID = 0x47e4, */
+/* ContentSigAlgo = 0x47e5, */
+/* ContentSigHashAlgo = 0x47e6, */
+  /* end content encoding */
+  /* Cueing Data */
  Cues = 0x1C53BB6B,
  CuePoint = 0xBB,
  CueTime = 0xB3,
  CueTrackPositions = 0xB7,
  CueTrack = 0xF7,
  CueClusterPosition = 0xF1,
-  CueBlockNumber = 0x5378,
-//  CueCodecState = 0xEA,
-//  CueReference = 0xDB,
-//  CueRefTime = 0x96,
-//  CueRefCluster = 0x97,
-//  CueRefNumber = 0x535F,
-//  CueRefCodecState = 0xEB,
-  // Attachment
-//  Attachments = 0x1941A469,
-//  AttachedFile = 0x61A7,
-//  FileDescription = 0x467E,
-//  FileName = 0x466E,
-//  FileMimeType = 0x4660,
-//  FileData = 0x465C,
-//  FileUID = 0x46AE,
-//  FileReferral = 0x4675,
-  // Chapters
-//  Chapters = 0x1043A770,
-//  EditionEntry = 0x45B9,
-//  EditionUID = 0x45BC,
-//  EditionFlagHidden = 0x45BD,
-//  EditionFlagDefault = 0x45DB,
-//  EditionFlagOrdered = 0x45DD,
-//  ChapterAtom = 0xB6,
-//  ChapterUID = 0x73C4,
-//  ChapterTimeStart = 0x91,
-//  ChapterTimeEnd = 0x92,
-//  ChapterFlagHidden = 0x98,
-//  ChapterFlagEnabled = 0x4598,
-//  ChapterSegmentUID = 0x6E67,
-//  ChapterSegmentEditionUID = 0x6EBC,
-//  ChapterPhysicalEquiv = 0x63C3,
-//  ChapterTrack = 0x8F,
-//  ChapterTrackNumber = 0x89,
-//  ChapterDisplay = 0x80,
-//  ChapString = 0x85,
-//  ChapLanguage = 0x437C,
-//  ChapCountry = 0x437E,
-//  ChapProcess = 0x6944,
-//  ChapProcessCodecID = 0x6955,
-//  ChapProcessPrivate = 0x450D,
-//  ChapProcessCommand = 0x6911,
-//  ChapProcessTime = 0x6922,
-//  ChapProcessData = 0x6933,
-  // Tagging
-//  Tags = 0x1254C367,
-//  Tag = 0x7373,
-//  Targets = 0x63C0,
-//  TargetTypeValue = 0x68CA,
-//  TargetType = 0x63CA,
-//  Tagging_TrackUID = 0x63C5,
-//  Tagging_EditionUID = 0x63C9,
-//  Tagging_ChapterUID = 0x63C4,
-//  AttachmentUID = 0x63C6,
-//  SimpleTag = 0x67C8,
-//  TagName = 0x45A3,
-//  TagLanguage = 0x447A,
-//  TagDefault = 0x4484,
-//  TagString = 0x4487,
-//  TagBinary = 0x4485,
+  CueBlockNumber = 0x5378
+/* CueCodecState = 0xEA, */
+/* CueReference = 0xDB, */
+/* CueRefTime = 0x96, */
+/* CueRefCluster = 0x97, */
+/* CueRefNumber = 0x535F, */
+/* CueRefCodecState = 0xEB, */
+  /* Attachment */
+/* Attachments = 0x1941A469, */
+/* AttachedFile = 0x61A7, */
+/* FileDescription = 0x467E, */
+/* FileName = 0x466E, */
+/* FileMimeType = 0x4660, */
+/* FileData = 0x465C, */
+/* FileUID = 0x46AE, */
+/* FileReferral = 0x4675, */
+  /* Chapters */
+/* Chapters = 0x1043A770, */
+/* EditionEntry = 0x45B9, */
+/* EditionUID = 0x45BC, */
+/* EditionFlagHidden = 0x45BD, */
+/* EditionFlagDefault = 0x45DB, */
+/* EditionFlagOrdered = 0x45DD, */
+/* ChapterAtom = 0xB6, */
+/* ChapterUID = 0x73C4, */
+/* ChapterTimeStart = 0x91, */
+/* ChapterTimeEnd = 0x92, */
+/* ChapterFlagHidden = 0x98, */
+/* ChapterFlagEnabled = 0x4598, */
+/* ChapterSegmentUID = 0x6E67, */
+/* ChapterSegmentEditionUID = 0x6EBC, */
+/* ChapterPhysicalEquiv = 0x63C3, */
+/* ChapterTrack = 0x8F, */
+/* ChapterTrackNumber = 0x89, */
+/* ChapterDisplay = 0x80, */
+/* ChapString = 0x85, */
+/* ChapLanguage = 0x437C, */
+/* ChapCountry = 0x437E, */
+/* ChapProcess = 0x6944, */
+/* ChapProcessCodecID = 0x6955, */
+/* ChapProcessPrivate = 0x450D, */
+/* ChapProcessCommand = 0x6911, */
+/* ChapProcessTime = 0x6922, */
+/* ChapProcessData = 0x6933, */
+  /* Tagging */
+/* Tags = 0x1254C367, */
+/* Tag = 0x7373, */
+/* Targets = 0x63C0, */
+/* TargetTypeValue = 0x68CA, */
+/* TargetType = 0x63CA, */
+/* Tagging_TrackUID = 0x63C5, */
+/* Tagging_EditionUID = 0x63C9, */
+/* Tagging_ChapterUID = 0x63C4, */
+/* AttachmentUID = 0x63C6, */
+/* SimpleTag = 0x67C8, */
+/* TagName = 0x45A3, */
+/* TagLanguage = 0x447A, */
+/* TagDefault = 0x4484, */
+/* TagString = 0x4487, */
+/* TagBinary = 0x4485, */
 };
 #endif
--- a/libmkv/EbmlWriter.c
+++ b/libmkv/EbmlWriter.c
@ -1,12 +1,12 @@
-// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the LICENSE file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS.  All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-
-
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
 #include "EbmlWriter.h"
 #include <stdlib.h>
 #include <wchar.h>
@ -18,10 +18,12 @@
 #define LITERALU64(n) n##LLU
 #endif

-void Ebml_WriteLen(EbmlGlobal *glob, long long val) {
-  // TODO check and make sure we are not > than 0x0100000000000000LLU
-  unsigned char size = 8; // size in bytes to output
-  unsigned long long minVal = LITERALU64(0x00000000000000ff); // mask to compare for byte size
+void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) {
+  /* TODO check and make sure we are not > than 0x0100000000000000LLU */
+  unsigned char size = 8; /* size in bytes to output */
+
+  /* mask to compare for byte size */
+  int64_t minVal = 0xff;

  for (size = 1; size < 8; size ++) {
    if (val < minVal)
@ -30,29 +32,31 @@ void Ebml_WriteLen(EbmlGlobal *glob, long long val) {
    minVal = (minVal << 7);
  }

-  val |= (LITERALU64(0x000000000000080) << ((size - 1) * 7));
+  val |= (((uint64_t)0x80) << ((size - 1) * 7));

  Ebml_Serialize(glob, (void *) &val, sizeof(val), size);
 }

 void Ebml_WriteString(EbmlGlobal *glob, const char *str) {
  const size_t size_ = strlen(str);
-  const unsigned long long  size = size_;
+  const uint64_t  size = size_;
  Ebml_WriteLen(glob, size);
-  // TODO: it's not clear from the spec whether the nul terminator
-  // should be serialized too.  For now we omit the null terminator.
-  Ebml_Write(glob, str, size);
+  /* TODO: it's not clear from the spec whether the nul terminator
+   * should be serialized too.  For now we omit the null terminator.
+   */
+  Ebml_Write(glob, str, (unsigned long)size);
 }

 void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) {
  const size_t strlen = wcslen(wstr);

-  // TODO: it's not clear from the spec whether the nul terminator
-  // should be serialized too.  For now we include it.
-  const unsigned long long  size = strlen;
+  /* TODO: it's not clear from the spec whether the nul terminator
+   * should be serialized too.  For now we include it.
+   */
+  const uint64_t  size = strlen;

  Ebml_WriteLen(glob, size);
-  Ebml_Write(glob, wstr, size);
+  Ebml_Write(glob, wstr, (unsigned long)size);
 }

 void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) {
@ -78,12 +82,12 @@ void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t
 }

 void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) {
-  unsigned char size = 8; // size in bytes to output
+  unsigned char size = 8; /* size in bytes to output */
  unsigned char sizeSerialized = 0;
  unsigned long minVal;

  Ebml_WriteID(glob, class_id);
-  minVal = 0x7fLU; // mask to compare for byte size
+  minVal = 0x7fLU; /* mask to compare for byte size */

  for (size = 1; size < 4; size ++) {
    if (ui < minVal) {
@ -97,7 +101,7 @@ void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned l
  Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
  Ebml_Serialize(glob, &ui, sizeof(ui), size);
 }
-// TODO: perhaps this is a poor name for this id serializer helper function
+/* TODO: perhaps this is a poor name for this id serializer helper function */
 void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) {
  int size;
  for (size = 4; size > 1; size--) {
@ -150,4 +154,4 @@ void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) {
  }
 }

-// TODO Serialize Date
+/* TODO Serialize Date */
--- a/libmkv/EbmlWriter.h
+++ b/libmkv/EbmlWriter.h
@ -1,26 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
 #ifndef EBMLWRITER_HPP
 #define EBMLWRITER_HPP
-
-// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the LICENSE file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS.  All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-
-// note: you must define write and serialize functions as well as your own EBML_GLOBAL
-// These functions MUST be implemented
 #include <stddef.h>
 #include "vpx/vpx_integer.h"

+/* note: you must define write and serialize functions as well as your own
+ * EBML_GLOBAL
+ *
+ * These functions MUST be implemented
+ */
+
 typedef struct EbmlGlobal EbmlGlobal;
 void  Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long);
 void  Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
-/////

+/*****/

-void Ebml_WriteLen(EbmlGlobal *glob, long long val);
+void Ebml_WriteLen(EbmlGlobal *glob, int64_t val);
 void Ebml_WriteString(EbmlGlobal *glob, const char *str);
 void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr);
 void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id);
@ -28,11 +32,11 @@ void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t
 void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
 void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
 void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d);
-// TODO make this more generic to signed
+/* TODO make this more generic to signed */
 void Ebml_WriteSigned16(EbmlGlobal *glob, short val);
 void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s);
 void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s);
 void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length);
 void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize);
-// TODO need date function
+/* TODO need date function */
 #endif
--- a/libs.mk
+++ b/libs.mk
@ -17,6 +17,34 @@ else
  ASM:=.asm
 endif

+
+#
+# Calculate platform- and compiler-specific offsets for hand coded assembly
+#
+ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
+OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU'
+define asm_offsets_template
+$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).S
+	@echo "    [CREATE] $$@"
+	$$(qexec)LC_ALL=C grep $$(OFFSET_PATTERN) $$< | tr -d '$$$$\#' $$(ADS2GAS) > $$@
+$$(BUILD_PFX)$(2).S: $(2)
+CLEAN-OBJS += $$(BUILD_PFX)$(1) $(2).S
+endef
+else
+  ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
+define asm_offsets_template
+$$(BUILD_PFX)$(1): obj_int_extract
+$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).o
+	@echo "    [CREATE] $$@"
+	$$(qexec)./obj_int_extract rvds $$< $$(ADS2GAS) > $$@
+OBJS-yes += $$(BUILD_PFX)$(2).o
+CLEAN-OBJS += $$(BUILD_PFX)$(1)
+$$(filter %$$(ASM).o,$$(OBJS-yes)): $$(BUILD_PFX)$(1)
+endef
+endif # rvct
+endif # !gcc
+
+
 CODEC_SRCS-yes += CHANGELOG
 CODEC_SRCS-yes += libs.mk

@ -29,15 +57,47 @@ CODEC_SRCS-yes += $(addprefix vpx_mem/,$(call enabled,MEM_SRCS))
 include $(SRC_PATH_BARE)/vpx_scale/vpx_scale.mk
 CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))

+ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
+  VP8_PREFIX=vp8/
+  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
+endif
+
+ifeq ($(CONFIG_VP8_ENCODER),yes)
+  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
+  CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h
+  CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp88cx_arm.mk
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
+  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
+  CODEC_DOC_SECTIONS += vp8 vp8_encoder
+endif
+
+ifeq ($(CONFIG_VP8_DECODER),yes)
+  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
+  CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
+  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
+  CODEC_DOC_SECTIONS += vp8 vp8_decoder
+endif
+
+ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
+  VP9_PREFIX=vp9/
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
+endif

 ifeq ($(CONFIG_VP9_ENCODER),yes)
  VP9_PREFIX=vp9/
  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx.mk
  CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
  CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
-  CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
+  CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
  CODEC_SRCS-$(ARCH_ARM) += $(VP9_PREFIX)vp98cx_arm.mk
-  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8e.h include/vpx/vp8cx.h
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
  CODEC_DOC_SECTIONS += vp9 vp9_encoder
@ -117,7 +177,6 @@ INSTALL-LIBS-yes += include/vpx/vpx_integer.h
 INSTALL-LIBS-yes += include/vpx/vpx_codec_impl_top.h
 INSTALL-LIBS-yes += include/vpx/vpx_codec_impl_bottom.h
 INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
-INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder_compat.h
 INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
@ -149,7 +208,7 @@ ifeq ($(CONFIG_MSVS),yes)
 obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c
 	@cp $(SRC_PATH_BARE)/build/x86-msvs/obj_int_extract.bat .
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
    --exe \
    --target=$(TOOLCHAIN) \
    --name=obj_int_extract \
@ -165,14 +224,14 @@ PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.bat

 vpx.def: $(call enabled,CODEC_EXPORTS)
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
            --name=vpx\
            --out=$@ $^
 CLEAN-OBJS += vpx.def

 vpx.vcproj: $(CODEC_SRCS) vpx.def
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
            $(if $(CONFIG_SHARED),--dll,--lib) \
            --target=$(TOOLCHAIN) \
            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
@ -264,6 +323,7 @@ vpx.pc: config.mk libs.mk
 	$(qexec)echo 'Requires:' >> $@
 	$(qexec)echo 'Conflicts:' >> $@
 	$(qexec)echo 'Libs: -L$${libdir} -lvpx -lm' >> $@
+	$(qexec)echo 'Libs.private: -lm -lpthread' >> $@
 	$(qexec)echo 'Cflags: -I$${includedir}' >> $@
 INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc
 INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
@ -298,57 +358,6 @@ endif
 $(filter %.s.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
 $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm

-#
-# Calculate platform- and compiler-specific offsets for hand coded assembly
-#
-
-OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU'
-
-ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
-    $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S
-	@echo "    [CREATE] $@"
-	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S: $(VP9_PREFIX)common/asm_com_offsets.c
-    CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S
-
-    $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S
-	@echo "    [CREATE] $@"
-	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S: $(VP9_PREFIX)encoder/asm_enc_offsets.c
-    CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S
-
-    $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S
-	@echo "    [CREATE] $@"
-	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S: $(VP9_PREFIX)decoder/asm_dec_offsets.c
-    CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S
-else
-  ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
-    asm_com_offsets.asm: obj_int_extract
-    asm_com_offsets.asm: $(VP9_PREFIX)common/asm_com_offsets.c.o
-	@echo "    [CREATE] $@"
-	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP9_PREFIX)common/asm_com_offsets.c.o
-    CLEAN-OBJS += asm_com_offsets.asm
-    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
-
-    asm_enc_offsets.asm: obj_int_extract
-    asm_enc_offsets.asm: $(VP9_PREFIX)encoder/asm_enc_offsets.c.o
-	@echo "    [CREATE] $@"
-	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP9_PREFIX)encoder/asm_enc_offsets.c.o
-    CLEAN-OBJS += asm_enc_offsets.asm
-    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
-
-    asm_dec_offsets.asm: obj_int_extract
-    asm_dec_offsets.asm: $(VP9_PREFIX)decoder/asm_dec_offsets.c.o
-	@echo "    [CREATE] $@"
-	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP9_PREFIX)decoder/asm_dec_offsets.c.o
-    CLEAN-OBJS += asm_dec_offsets.asm
-    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
-  endif
-endif

 $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
 CLEAN-OBJS += $(BUILD_PFX)vpx_version.h
@ -356,15 +365,15 @@ CLEAN-OBJS += $(BUILD_PFX)vpx_version.h
 #
 # Rule to generate runtime cpu detection files
 #
-$(OBJS-yes:.o=.d): vpx_rtcd.h
-vpx_rtcd.h: $(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS)))
+$(BUILD_PFX)vpx_rtcd.h: $(SRC_PATH_BARE)/$(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS)))
 	@echo "    [CREATE] $@"
 	$(qexec)$(SRC_PATH_BARE)/build/make/rtcd.sh --arch=$(TGT_ISA) \
-		  --sym=vpx_rtcd \
-		  --config=$(target)$(if $(FAT_ARCHS),,-$(TOOLCHAIN)).mk \
-		  $(RTCD_OPTIONS) $^ > $@
+          --sym=vpx_rtcd \
+          --config=$(target)$(if $(FAT_ARCHS),,-$(TOOLCHAIN)).mk \
+          $(RTCD_OPTIONS) $^ > $@
 CLEAN-OBJS += $(BUILD_PFX)vpx_rtcd.h

+
 CODEC_DOC_SRCS += vpx/vpx_codec.h \
                  vpx/vpx_decoder.h \
                  vpx/vpx_encoder.h \
@ -373,7 +382,6 @@ CODEC_DOC_SRCS += vpx/vpx_codec.h \
 ##
 ## libvpx test directives
 ##
-
 ifeq ($(CONFIG_UNIT_TESTS),yes)
 LIBVPX_TEST_DATA_PATH ?= .

@ -392,8 +400,12 @@ $(LIBVPX_TEST_DATA):
 testdata:: $(LIBVPX_TEST_DATA)
 	$(qexec)if [ -x "$$(which sha1sum)" ]; then\
            echo "Checking test data:";\
-            (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c)\
-                < $(SRC_PATH_BARE)/test/test-data.sha1; \
+            if [ -n "$(LIBVPX_TEST_DATA)" ]; then\
+                for f in $(call enabled,LIBVPX_TEST_DATA); do\
+                    grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
+                        (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c);\
+                done; \
+            fi; \
        else\
            echo "Skipping test data integrity check, sha1sum not found.";\
        fi
@ -403,7 +415,7 @@ ifeq ($(CONFIG_MSVS),yes)

 gtest.vcproj: $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
            --lib \
            --target=$(TOOLCHAIN) \
            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
@ -418,7 +430,7 @@ PROJECTS-$(CONFIG_MSVS) += gtest.vcproj

 test_libvpx.vcproj: $(LIBVPX_TEST_SRCS)
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
            --exe \
            --target=$(TOOLCHAIN) \
            --name=test_libvpx \
@ -428,28 +440,6 @@ test_libvpx.vcproj: $(LIBVPX_TEST_SRCS)
            --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
            -L. -l$(CODEC_LIB) -lwinmm -l$(GTEST_LIB) $^
-ifeq ($(CONFIG_STATIC_MSVCRT),--static-crt)
-lib_sfx=mt
-else
-lib_sfx=md
-endif
-
-define unit_test_vcproj_template
-$(notdir $(1:.cc=.vcproj)): $(SRC_PATH_BARE)/$(1)
-	@echo "    [vcproj] $$@"
-	$$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\
-            --exe\
-            --target=$$(TOOLCHAIN)\
-            --name=$(notdir $(1:.cc=))\
-            --ver=$$(CONFIG_VS_VERSION)\
-            $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
-            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
-            -L. -lvpxmd -lwinmm -lgtest$(lib_sfx) $$^
-endef
-
-$(foreach proj,$(LIBVPX_TEST_BINS),\
-    $(eval $(call unit_test_vcproj_template,$(proj))))

 PROJECTS-$(CONFIG_MSVS) += test_libvpx.vcproj

@ -461,24 +451,28 @@ else
 include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
 GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS))
 GTEST_OBJS=$(call objs,$(GTEST_SRCS))
-$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
-$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
+$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
+$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
 OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS)
 LIBS-$(BUILD_LIBVPX) += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a
 $(BUILD_PFX)libgtest_g.a: $(GTEST_OBJS)

 LIBVPX_TEST_OBJS=$(sort $(call objs,$(LIBVPX_TEST_SRCS)))
-$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
-$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
+$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
+$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
 OBJS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_OBJS)
+BINS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_BINS)

 # Install test sources only if codec source is included
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\
    $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f))
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBVPX_TEST_SRCS)

+CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx)
+CODEC_LIB_SUF=$(if $(CONFIG_SHARED),.so,.a)
 $(foreach bin,$(LIBVPX_TEST_BINS),\
-    $(if $(BUILD_LIBVPX),$(eval $(bin): libvpx.a libgtest.a ))\
+    $(if $(BUILD_LIBVPX),$(eval $(bin): \
+        lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\
    $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\
        $(LIBVPX_TEST_OBJS) \
        -L. -lvpx -lgtest -lpthread -lm)\
@ -503,3 +497,6 @@ libs.doxy: $(CODEC_DOC_SRCS)
 	@echo "PREDEFINED = VPX_CODEC_DISABLE_COMPAT" >> $@
 	@echo "INCLUDE_PATH += ." >> $@;
 	@echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@
+
+## Generate vpx_rtcd.h for all objects
+$(OBJS-yes:.o=.d): $(BUILD_PFX)vpx_rtcd.h
--- a/mainpage.dox
+++ b/mainpage.dox
@ -12,8 +12,12 @@

  This distribution of the WebM VP8 Codec SDK includes the following support:

-  \if vp8_encoder    - \ref vp8_encoder   \endif
-  \if vp8_decoder    - \ref vp8_decoder   \endif
+  \if vp8_encoder
+  - \ref vp8_encoder
+  \endif
+  \if vp8_decoder
+  - \ref vp8_decoder
+  \endif


  \section main_startpoints Starting Points
@ -24,8 +28,12 @@
  - Read the \ref samples "sample code" for examples of how to interact with the
    codec.
  - \ref codec reference
-    \if encoder - \ref encoder reference \endif
-    \if decoder - \ref decoder reference \endif
+    \if encoder
+    - \ref encoder reference
+    \endif
+    \if decoder
+    - \ref decoder reference
+    \endif

  \section main_support Support Options & FAQ
  The WebM project is an open source project supported by its community. For
--- a/nestegg/src/nestegg.c
+++ b/nestegg/src/nestegg.c
@ -1272,7 +1272,7 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac
  if (total > block_size)
    return -1;

-  entry = ne_find_track_entry(ctx, track - 1);
+  entry = ne_find_track_entry(ctx, (unsigned int)(track - 1));
  if (!entry)
    return -1;

@ -1291,7 +1291,7 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac

  pkt = ne_alloc(sizeof(*pkt));
  pkt->track = track - 1;
-  pkt->timecode = abs_timecode * tc_scale * track_scale;
+  pkt->timecode = (uint64_t)(abs_timecode * tc_scale * track_scale);

  ctx->log(ctx, NESTEGG_LOG_DEBUG, "%sblock t %lld pts %f f %llx frames: %llu",
           block_id == ID_BLOCK ? "" : "simple", pkt->track, pkt->timecode / 1e9, flags, frames);
@ -1774,35 +1774,35 @@ nestegg_track_video_params(nestegg * ctx, unsigned int track,

  if (ne_get_uint(entry->video.pixel_width, &value) != 0)
    return -1;
-  params->width = value;
+  params->width = (unsigned int)value;

  if (ne_get_uint(entry->video.pixel_height, &value) != 0)
    return -1;
-  params->height = value;
+  params->height = (unsigned int)value;

  value = 0;
  ne_get_uint(entry->video.pixel_crop_bottom, &value);
-  params->crop_bottom = value;
+  params->crop_bottom = (unsigned int)value;

  value = 0;
  ne_get_uint(entry->video.pixel_crop_top, &value);
-  params->crop_top = value;
+  params->crop_top = (unsigned int)value;

  value = 0;
  ne_get_uint(entry->video.pixel_crop_left, &value);
-  params->crop_left = value;
+  params->crop_left = (unsigned int)value;

  value = 0;
  ne_get_uint(entry->video.pixel_crop_right, &value);
-  params->crop_right = value;
+  params->crop_right = (unsigned int)value;

  value = params->width;
  ne_get_uint(entry->video.display_width, &value);
-  params->display_width = value;
+  params->display_width = (unsigned int)value;

  value = params->height;
  ne_get_uint(entry->video.display_height, &value);
-  params->display_height = value;
+  params->display_height = (unsigned int)value;

  return 0;
 }
@ -1828,11 +1828,11 @@ nestegg_track_audio_params(nestegg * ctx, unsigned int track,

  value = 1;
  ne_get_uint(entry->audio.channels, &value);
-  params->channels = value;
+  params->channels = (unsigned int)value;

  value = 16;
  ne_get_uint(entry->audio.bit_depth, &value);
-  params->depth = value;
+  params->depth = (unsigned int)value;

  return 0;
 }
@ -1888,7 +1888,7 @@ nestegg_free_packet(nestegg_packet * pkt)
 int
 nestegg_packet_track(nestegg_packet * pkt, unsigned int * track)
 {
-  *track = pkt->track;
+  *track = (unsigned int)pkt->track;
  return 0;
 }

--- a/solution.mk
+++ b/solution.mk
@ -8,18 +8,19 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##

+# libvpx reverse dependencies (targets that depend on libvpx)
+VPX_NONDEPS=$(addsuffix .vcproj,vpx gtest obj_int_extract)
+VPX_RDEPS=$(foreach vcp,\
+              $(filter-out $(VPX_NONDEPS),$^), --dep=$(vcp:.vcproj=):vpx)

 vpx.sln: $(wildcard *.vcproj)
 	@echo "    [CREATE] $@"
 	$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
-            $(if $(filter %vpx.vcproj,$^),\
-                $(foreach vcp,$(filter-out %vpx.vcproj %gtest.vcproj %obj_int_extract.vcproj,$^),\
-                  --dep=$(vcp:.vcproj=):vpx) \
-                $(foreach vcp,$(filter %_test.vcproj,$^),\
-                  --dep=$(vcp:.vcproj=):gtest)) \
-                  --dep=vpx:obj_int_extract \
-                  --ver=$(CONFIG_VS_VERSION)\
-                  --out=$@ $^
+            $(if $(filter vpx.vcproj,$^),$(VPX_RDEPS)) \
+            --dep=vpx:obj_int_extract \
+            --dep=test_libvpx:gtest \
+            --ver=$(CONFIG_VS_VERSION)\
+            --out=$@ $^
 vpx.sln.mk: vpx.sln
 	@true

--- a/test/acm_random.h
+++ b/test/acm_random.h
@ -19,6 +19,10 @@ namespace libvpx_test {

 class ACMRandom {
 public:
+  ACMRandom() {
+    Reset(DeterministicSeed());
+  }
+
  explicit ACMRandom(int seed) {
    Reset(seed);
  }
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+
+namespace {
+
+// lookahead range: [kLookAheadMin, kLookAheadMax).
+const int kLookAheadMin = 5;
+const int kLookAheadMax = 26;
+
+class AltRefTest : public libvpx_test::EncoderTest,
+    public ::testing::TestWithParam<int> {
+ protected:
+  AltRefTest() : altref_count_(0) {}
+  virtual ~AltRefTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(libvpx_test::kTwoPassGood);
+  }
+
+  virtual void BeginPassHook(unsigned int pass) {
+    altref_count_ = 0;
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_CPUUSED, 3);
+    }
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_;
+  }
+
+  int altref_count() const { return altref_count_; }
+
+ private:
+  int altref_count_;
+};
+
+TEST_P(AltRefTest, MonotonicTimestamps) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 1000;
+  cfg_.g_lag_in_frames = GetParam();
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 30);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(altref_count(), 1);
+}
+
+INSTANTIATE_TEST_CASE_P(NonZeroLag, AltRefTest,
+                        ::testing::Range(kLookAheadMin, kLookAheadMax));
+}  // namespace
--- a/test/boolcoder_test.cc
+++ b/test/boolcoder_test.cc
@ -8,26 +8,28 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
 extern "C" {
-#include "vp9/encoder/boolhuff.h"
-#include "vp9/decoder/dboolhuff.h"
+#include "vp8/encoder/boolhuff.h"
+#include "vp8/decoder/dboolhuff.h"
 }

-#include "acm_random.h"
-#include "vpx/vpx_integer.h"
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>

-using libvpx_test::ACMRandom;
+#include "test/acm_random.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_integer.h"

 namespace {
 const int num_tests = 10;
 }  // namespace

+using libvpx_test::ACMRandom;
+
 TEST(VP8, TestBitIO) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  for (int n = 0; n < num_tests; ++n) {
@ -38,15 +40,15 @@ TEST(VP8, TestBitIO) {
      for (int i = 0; i < bits_to_test; ++i) {
        const int parity = i & 1;
        probas[i] =
-          (method == 0) ? 0 : (method == 1) ? 255 :
-          (method == 2) ? 128 :
-          (method == 3) ? rnd.Rand8() :
-          (method == 4) ? (parity ? 0 : 255) :
+            (method == 0) ? 0 : (method == 1) ? 255 :
+            (method == 2) ? 128 :
+            (method == 3) ? rnd.Rand8() :
+            (method == 4) ? (parity ? 0 : 255) :
            // alternate between low and high proba:
            (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) :
            (method == 6) ?
-            (parity ? rnd(64) : 255 - rnd(64)) :
-            (parity ? rnd(32) : 255 - rnd(32));
+                (parity ? rnd(64) : 255 - rnd(64)) :
+                (parity ? rnd(32) : 255 - rnd(32));
      }
      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
        const int random_seed = 6432;
@ -54,7 +56,7 @@ TEST(VP8, TestBitIO) {
        ACMRandom bit_rnd(random_seed);
        BOOL_CODER bw;
        uint8_t bw_buffer[buffer_size];
-        vp8_start_encode(&bw, bw_buffer);
+        vp8_start_encode(&bw, bw_buffer, bw_buffer + buffer_size);

        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
        for (int i = 0; i < bits_to_test; ++i) {
@ -78,7 +80,7 @@ TEST(VP8, TestBitIO) {
            bit = bit_rnd(2);
          }
          GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit)
-              << "pos: " << i << " / " << bits_to_test
+              << "pos: "<< i << " / " << bits_to_test
              << " bit_method: " << bit_method
              << " method: " << method;
        }
--- a/test/config_test.cc
+++ b/test/config_test.cc
@ -0,0 +1,61 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/encode_test_driver.h"
+#include "test/video_source.h"
+
+namespace {
+
+class ConfigTest : public ::libvpx_test::EncoderTest,
+    public ::testing::TestWithParam<enum libvpx_test::TestMode> {
+ public:
+  ConfigTest() : frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GetParam());
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    frame_count_in_ = 0;
+    frame_count_out_ = 0;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource* /*video*/) {
+    ++frame_count_in_;
+    abort_ |= (frame_count_in_ >= frame_count_max_);
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {
+    ++frame_count_out_;
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  unsigned int frame_count_in_;
+  unsigned int frame_count_out_;
+  unsigned int frame_count_max_;
+};
+
+TEST_P(ConfigTest, LagIsDisabled) {
+  frame_count_max_ = 2;
+  cfg_.g_lag_in_frames = 15;
+
+  libvpx_test::DummyVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  EXPECT_EQ(frame_count_in_, frame_count_out_);
+}
+
+INSTANTIATE_TEST_CASE_P(OnePassModes, ConfigTest, ONE_PASS_TEST_MODES);
+}  // namespace
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@ -0,0 +1,106 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <cmath>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+
+// CQ level range: [kCQLevelMin, kCQLevelMax).
+const int kCQLevelMin = 4;
+const int kCQLevelMax = 63;
+const int kCQLevelStep = 8;
+const int kCQTargetBitrate = 2000;
+
+namespace {
+
+class CQTest : public libvpx_test::EncoderTest,
+    public ::testing::TestWithParam<int> {
+ protected:
+  CQTest() : cq_level_(GetParam()) { init_flags_ = VPX_CODEC_USE_PSNR; }
+  virtual ~CQTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(libvpx_test::kTwoPassGood);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    file_size_ = 0;
+    psnr_ = 0.0;
+    n_frames_ = 0;
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      if (cfg_.rc_end_usage == VPX_CQ) {
+        encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_);
+      }
+      encoder->Control(VP8E_SET_CPUUSED, 3);
+    }
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0);
+    n_frames_++;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    file_size_ += pkt->data.frame.sz;
+  }
+
+  double GetLinearPSNROverBitrate() const {
+    double avg_psnr = log10(psnr_ / n_frames_) * 10.0;
+    return pow(10.0, avg_psnr / 10.0) / file_size_;
+  }
+
+  int file_size() const { return file_size_; }
+  int n_frames() const { return n_frames_; }
+
+ private:
+  int cq_level_;
+  int file_size_;
+  double psnr_;
+  int n_frames_;
+};
+
+int prev_actual_bitrate = kCQTargetBitrate;
+TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = kCQTargetBitrate;
+  cfg_.g_lag_in_frames = 25;
+
+  cfg_.rc_end_usage = VPX_CQ;
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 30);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double cq_psnr_lin = GetLinearPSNROverBitrate();
+  const int cq_actual_bitrate = file_size() * 8 * 30 / (n_frames() * 1000);
+  EXPECT_LE(cq_actual_bitrate, kCQTargetBitrate);
+  EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate);
+  prev_actual_bitrate = cq_actual_bitrate;
+
+  // try targeting the approximate same bitrate with VBR mode
+  cfg_.rc_end_usage = VPX_VBR;
+  cfg_.rc_target_bitrate = cq_actual_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double vbr_psnr_lin = GetLinearPSNROverBitrate();
+  EXPECT_GE(cq_psnr_lin, vbr_psnr_lin);
+}
+
+INSTANTIATE_TEST_CASE_P(CQLevelRange, CQTest,
+                        ::testing::Range(kCQLevelMin, kCQLevelMax,
+                                         kCQLevelStep));
+}  // namespace
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@ -0,0 +1,169 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+namespace {
+
+class DatarateTest : public ::libvpx_test::EncoderTest,
+    public ::testing::TestWithParam<enum libvpx_test::TestMode> {
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GetParam());
+    ResetModel();
+  }
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    first_drop_ = 0;
+    bits_total_ = 0;
+    duration_ = 0.0;
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    // TODO(jimbankoski): Remove these lines when the issue:
+    // http://code.google.com/p/webm/issues/detail?id=496 is fixed.
+    // For now the codec assumes buffer starts at starting buffer rate
+    // plus one frame's time.
+    if (last_pts_ == 0)
+      duration = 1;
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += duration * timebase_ * cfg_.rc_target_bitrate
+        * 1000;
+
+    /* Test the buffer model here before subtracting the frame. Do so because
+     * the way the leaky bucket model works in libvpx is to allow the buffer to
+     * empty - and then stop showing frames until we've got enough bits to
+     * show one. */
+    ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
+        << pkt->data.frame.pts;
+
+    const int frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Subtract from the buffer the bits associated with a played back frame.
+    bits_in_buffer_model_ -= frame_size_in_bits;
+
+    // Update the running total of bits for end of test datarate checks.
+    bits_total_ += frame_size_in_bits ;
+
+    // If first drop not set and we have a drop set it to this time.
+    if (!first_drop_ && duration > 1)
+      first_drop_ = last_pts_ + 1;
+
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+
+    // We update this so that we can calculate the datarate minus the last
+    // frame encoded in the file.
+    bits_in_last_frame_ = frame_size_in_bits;
+
+    ++frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    if (bits_total_) {
+      const double file_size_in_kb = bits_total_ / 1000;  /* bits per kilobit */
+
+      duration_ = (last_pts_ + 1) * timebase_;
+
+      // Effective file datarate includes the time spent prebuffering.
+      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0
+          / (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
+
+      file_datarate_ = file_size_in_kb / duration_;
+    }
+  }
+
+  vpx_codec_pts_t last_pts_;
+  int bits_in_buffer_model_;
+  double timebase_;
+  int frame_number_;
+  vpx_codec_pts_t first_drop_;
+  int64_t bits_total_;
+  double duration_;
+  double file_datarate_;
+  double effective_datarate_;
+  int bits_in_last_frame_;
+};
+
+TEST_P(DatarateTest, BasicBufferModel) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // 2 pass cbr datarate control has a bug hidden by the small # of
+  // frames selected in this encode. The problem is that even if the buffer is
+  // negative we produce a keyframe on a cutscene. Ignoring datarate
+  // constraints
+  // TODO(jimbankoski): ( Fix when issue
+  // http://code.google.com/p/webm/issues/detail?id=495 is addressed. )
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+
+  for (int i = 70; i < 700; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_)
+        << " The datarate for the file exceeds the target!";
+
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+        << " The datarate for the file missed the target!";
+  }
+}
+
+TEST_P(DatarateTest, ChangingDropFrameThresh) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_max_quantizer = 36;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+
+  const int frame_count = 40;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, frame_count);
+
+  // Here we check that the first dropped frame gets earlier and earlier
+  // as the drop frame threshold is increased.
+
+  const int kDropFrameThreshTestStep = 30;
+  vpx_codec_pts_t last_drop = frame_count;
+  for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {
+    cfg_.rc_dropframe_thresh = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_LE(first_drop_, last_drop)
+        << " The first dropped frame for drop_thresh " << i
+        << " > first dropped frame for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    last_drop = first_drop_;
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(AllModes, DatarateTest, ALL_TEST_MODES);
+}  // namespace
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "test/decode_test_driver.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/video_source.h"
+
+namespace libvpx_test {
+#if CONFIG_VP8_DECODER
+void Decoder::DecodeFrame(const uint8_t *cxdata, int size) {
+  if (!decoder_.priv) {
+    const vpx_codec_err_t res_init = vpx_codec_dec_init(&decoder_,
+                                                        &vpx_codec_vp8_dx_algo,
+                                                        &cfg_, 0);
+    ASSERT_EQ(VPX_CODEC_OK, res_init) << DecodeError();
+  }
+
+  const vpx_codec_err_t res_dec = vpx_codec_decode(&decoder_,
+                                                   cxdata, size, NULL, 0);
+  ASSERT_EQ(VPX_CODEC_OK, res_dec) << DecodeError();
+}
+
+void DecoderTest::RunLoop(CompressedVideoSource *video) {
+  vpx_codec_dec_cfg_t dec_cfg = {0};
+  Decoder decoder(dec_cfg, 0);
+
+  // Decode frames.
+  for (video->Begin(); video->cxdata(); video->Next()) {
+    decoder.DecodeFrame(video->cxdata(), video->frame_size());
+
+    DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img = NULL;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next()))
+      DecompressedFrameHook(*img, video->frame_number());
+  }
+}
+#endif
+}  // namespace libvpx_test
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@ -0,0 +1,97 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_DECODE_TEST_DRIVER_H_
+#define TEST_DECODE_TEST_DRIVER_H_
+#include <cstring>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx_config.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx/vp8dx.h"
+
+namespace libvpx_test {
+
+class CompressedVideoSource;
+
+// Provides an object to handle decoding output
+class DxDataIterator {
+ public:
+  explicit DxDataIterator(vpx_codec_ctx_t *decoder)
+      : decoder_(decoder), iter_(NULL) {}
+
+  const vpx_image_t *Next() {
+    return vpx_codec_get_frame(decoder_, &iter_);
+  }
+
+ private:
+  vpx_codec_ctx_t  *decoder_;
+  vpx_codec_iter_t  iter_;
+};
+
+// Provides a simplified interface to manage one video decoding.
+//
+// TODO: similar to Encoder class, the exact services should be
+// added as more tests are added.
+class Decoder {
+ public:
+  Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
+      : cfg_(cfg), deadline_(deadline) {
+    memset(&decoder_, 0, sizeof(decoder_));
+  }
+
+  ~Decoder() {
+    vpx_codec_destroy(&decoder_);
+  }
+
+  void DecodeFrame(const uint8_t *cxdata, int size);
+
+  DxDataIterator GetDxData() {
+    return DxDataIterator(&decoder_);
+  }
+
+  void set_deadline(unsigned long deadline) {
+    deadline_ = deadline;
+  }
+
+  void Control(int ctrl_id, int arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
+  }
+
+ protected:
+  const char *DecodeError() {
+    const char *detail = vpx_codec_error_detail(&decoder_);
+    return detail ? detail : vpx_codec_error(&decoder_);
+  }
+
+  vpx_codec_ctx_t     decoder_;
+  vpx_codec_dec_cfg_t cfg_;
+  unsigned int        deadline_;
+};
+
+// Common test functionality for all Decoder tests.
+class DecoderTest {
+ public:
+  // Main loop.
+  virtual void RunLoop(CompressedVideoSource *video);
+
+  // Hook to be called on every decompressed frame.
+  virtual void DecompressedFrameHook(const vpx_image_t& img,
+                                     const unsigned int frame_number) {}
+
+ protected:
+  DecoderTest() {}
+
+  virtual ~DecoderTest() {}
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_DECODE_TEST_DRIVER_H_
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@ -0,0 +1,204 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vpx_config.h"
+#include "test/encode_test_driver.h"
+#if CONFIG_VP8_DECODER
+#include "test/decode_test_driver.h"
+#endif
+#include "test/video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace libvpx_test {
+void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
+  if (video->img())
+    EncodeFrameInternal(*video, frame_flags);
+  else
+    Flush();
+
+  // Handle twopass stats
+  CxDataIterator iter = GetCxData();
+
+  while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+    if (pkt->kind != VPX_CODEC_STATS_PKT)
+      continue;
+
+    stats_->Append(*pkt);
+  }
+}
+
+void Encoder::EncodeFrameInternal(const VideoSource &video,
+                                  const unsigned long frame_flags) {
+  vpx_codec_err_t res;
+  const vpx_image_t *img = video.img();
+
+  // Handle first frame initialization
+  if (!encoder_.priv) {
+    cfg_.g_w = img->d_w;
+    cfg_.g_h = img->d_h;
+    cfg_.g_timebase = video.timebase();
+    cfg_.rc_twopass_stats_in = stats_->buf();
+    res = vpx_codec_enc_init(&encoder_, &vpx_codec_vp8_cx_algo, &cfg_,
+                             init_flags_);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  // Handle frame resizing
+  if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) {
+    cfg_.g_w = img->d_w;
+    cfg_.g_h = img->d_h;
+    res = vpx_codec_enc_config_set(&encoder_, &cfg_);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  // Encode the frame
+  res = vpx_codec_encode(&encoder_,
+                         video.img(), video.pts(), video.duration(),
+                         frame_flags, deadline_);
+  ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+}
+
+void Encoder::Flush() {
+  const vpx_codec_err_t res = vpx_codec_encode(&encoder_, NULL, 0, 0, 0,
+                                               deadline_);
+  ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+}
+
+void EncoderTest::SetMode(TestMode mode) {
+  switch (mode) {
+    case kRealTime:
+      deadline_ = VPX_DL_REALTIME;
+      break;
+
+    case kOnePassGood:
+    case kTwoPassGood:
+      deadline_ = VPX_DL_GOOD_QUALITY;
+      break;
+
+    case kOnePassBest:
+    case kTwoPassBest:
+      deadline_ = VPX_DL_BEST_QUALITY;
+      break;
+
+    default:
+      ASSERT_TRUE(false) << "Unexpected mode " << mode;
+  }
+
+  if (mode == kTwoPassGood || mode == kTwoPassBest)
+    passes_ = 2;
+  else
+    passes_ = 1;
+}
+// The function should return "true" most of the time, therefore no early
+// break-out is implemented within the match checking process.
+static bool compare_img(const vpx_image_t *img1,
+                        const vpx_image_t *img2) {
+  bool match = (img1->fmt == img2->fmt) &&
+               (img1->d_w == img2->d_w) &&
+               (img1->d_h == img2->d_h);
+
+  const unsigned int width_y  = img1->d_w;
+  const unsigned int height_y = img1->d_h;
+  unsigned int i;
+  for (i = 0; i < height_y; ++i)
+    match = ( memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                     width_y) == 0) && match;
+  const unsigned int width_uv  = (img1->d_w + 1) >> 1;
+  const unsigned int height_uv = (img1->d_h + 1) >> 1;
+  for (i = 0; i <  height_uv; ++i)
+    match = ( memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                     width_uv) == 0) && match;
+  for (i = 0; i < height_uv; ++i)
+    match = ( memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                     width_uv) == 0) && match;
+  return match;
+}
+
+void EncoderTest::RunLoop(VideoSource *video) {
+#if CONFIG_VP8_DECODER
+  vpx_codec_dec_cfg_t dec_cfg = {0};
+#endif
+
+  stats_.Reset();
+
+  for (unsigned int pass = 0; pass < passes_; pass++) {
+    last_pts_ = 0;
+
+    if (passes_ == 1)
+      cfg_.g_pass = VPX_RC_ONE_PASS;
+    else if (pass == 0)
+      cfg_.g_pass = VPX_RC_FIRST_PASS;
+    else
+      cfg_.g_pass = VPX_RC_LAST_PASS;
+
+    BeginPassHook(pass);
+    Encoder encoder(cfg_, deadline_, init_flags_, &stats_);
+#if CONFIG_VP8_DECODER
+    Decoder decoder(dec_cfg, 0);
+    bool has_cxdata = false;
+#endif
+    bool again;
+    for (again = true, video->Begin(); again; video->Next()) {
+      again = video->img() != NULL;
+
+      PreEncodeFrameHook(video);
+      PreEncodeFrameHook(video, &encoder);
+      encoder.EncodeFrame(video, frame_flags_);
+
+      CxDataIterator iter = encoder.GetCxData();
+
+      while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+        again = true;
+
+        switch (pkt->kind) {
+          case VPX_CODEC_CX_FRAME_PKT:
+#if CONFIG_VP8_DECODER
+            has_cxdata = true;
+            decoder.DecodeFrame((const uint8_t*)pkt->data.frame.buf,
+                                pkt->data.frame.sz);
+#endif
+            ASSERT_GE(pkt->data.frame.pts, last_pts_);
+            last_pts_ = pkt->data.frame.pts;
+            FramePktHook(pkt);
+            break;
+
+          case VPX_CODEC_PSNR_PKT:
+            PSNRPktHook(pkt);
+            break;
+
+          default:
+            break;
+        }
+      }
+
+#if CONFIG_VP8_DECODER
+      if (has_cxdata) {
+        const vpx_image_t *img_enc = encoder.GetPreviewFrame();
+        DxDataIterator dec_iter = decoder.GetDxData();
+        const vpx_image_t *img_dec = dec_iter.Next();
+        if(img_enc && img_dec) {
+          const bool res = compare_img(img_enc, img_dec);
+          ASSERT_TRUE(res)<< "Encoder/Decoder mismatch found.";
+        }
+      }
+#endif
+      if (!Continue())
+        break;
+    }
+
+    EndPassHook();
+
+    if (!Continue())
+      break;
+  }
+}
+}  // namespace libvpx_test
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@ -0,0 +1,197 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_ENCODE_TEST_DRIVER_H_
+#define TEST_ENCODE_TEST_DRIVER_H_
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
+
+namespace libvpx_test {
+
+class VideoSource;
+
+enum TestMode {
+  kRealTime,
+  kOnePassGood,
+  kOnePassBest,
+  kTwoPassGood,
+  kTwoPassBest
+};
+#define ALL_TEST_MODES ::testing::Values(::libvpx_test::kRealTime, \
+                                         ::libvpx_test::kOnePassGood, \
+                                         ::libvpx_test::kOnePassBest, \
+                                         ::libvpx_test::kTwoPassGood, \
+                                         ::libvpx_test::kTwoPassBest)
+
+#define ONE_PASS_TEST_MODES ::testing::Values(::libvpx_test::kRealTime, \
+                                              ::libvpx_test::kOnePassGood, \
+                                              ::libvpx_test::kOnePassBest)
+
+
+// Provides an object to handle the libvpx get_cx_data() iteration pattern
+class CxDataIterator {
+ public:
+  explicit CxDataIterator(vpx_codec_ctx_t *encoder)
+    : encoder_(encoder), iter_(NULL) {}
+
+  const vpx_codec_cx_pkt_t *Next() {
+    return vpx_codec_get_cx_data(encoder_, &iter_);
+  }
+
+ private:
+  vpx_codec_ctx_t  *encoder_;
+  vpx_codec_iter_t  iter_;
+};
+
+// Implements an in-memory store for libvpx twopass statistics
+class TwopassStatsStore {
+ public:
+  void Append(const vpx_codec_cx_pkt_t &pkt) {
+    buffer_.append(reinterpret_cast<char *>(pkt.data.twopass_stats.buf),
+                   pkt.data.twopass_stats.sz);
+  }
+
+  vpx_fixed_buf_t buf() {
+    const vpx_fixed_buf_t buf = { &buffer_[0], buffer_.size() };
+    return buf;
+  }
+
+  void Reset() {
+    buffer_.clear();
+  }
+
+ protected:
+  std::string  buffer_;
+};
+
+
+// Provides a simplified interface to manage one video encoding pass, given
+// a configuration and video source.
+//
+// TODO(jkoleszar): The exact services it provides and the appropriate
+// level of abstraction will be fleshed out as more tests are written.
+class Encoder {
+ public:
+  Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+          const unsigned long init_flags, TwopassStatsStore *stats)
+    : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) {
+    memset(&encoder_, 0, sizeof(encoder_));
+  }
+
+  ~Encoder() {
+    vpx_codec_destroy(&encoder_);
+  }
+
+  CxDataIterator GetCxData() {
+    return CxDataIterator(&encoder_);
+  }
+
+  const vpx_image_t *GetPreviewFrame() {
+    return vpx_codec_get_preview_frame(&encoder_);
+  }
+  // This is a thin wrapper around vpx_codec_encode(), so refer to
+  // vpx_encoder.h for its semantics.
+  void EncodeFrame(VideoSource *video, const unsigned long frame_flags);
+
+  // Convenience wrapper for EncodeFrame()
+  void EncodeFrame(VideoSource *video) {
+    EncodeFrame(video, 0);
+  }
+
+  void Control(int ctrl_id, int arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  void set_deadline(unsigned long deadline) {
+    deadline_ = deadline;
+  }
+
+ protected:
+  const char *EncoderError() {
+    const char *detail = vpx_codec_error_detail(&encoder_);
+    return detail ? detail : vpx_codec_error(&encoder_);
+  }
+
+  // Encode an image
+  void EncodeFrameInternal(const VideoSource &video,
+                           const unsigned long frame_flags);
+
+  // Flush the encoder on EOS
+  void Flush();
+
+  vpx_codec_ctx_t      encoder_;
+  vpx_codec_enc_cfg_t  cfg_;
+  unsigned long        deadline_;
+  unsigned long        init_flags_;
+  TwopassStatsStore   *stats_;
+};
+
+// Common test functionality for all Encoder tests.
+//
+// This class is a mixin which provides the main loop common to all
+// encoder tests. It provides hooks which can be overridden by subclasses
+// to implement each test's specific behavior, while centralizing the bulk
+// of the boilerplate. Note that it doesn't inherit the gtest testing
+// classes directly, so that tests can be parameterized differently.
+class EncoderTest {
+ protected:
+  EncoderTest() : abort_(false), init_flags_(0), frame_flags_(0),
+                  last_pts_(0) {}
+
+  virtual ~EncoderTest() {}
+
+  // Initialize the cfg_ member with the default configuration.
+  void InitializeConfig() {
+    const vpx_codec_err_t res = vpx_codec_enc_config_default(
+                                    &vpx_codec_vp8_cx_algo, &cfg_, 0);
+    ASSERT_EQ(VPX_CODEC_OK, res);
+  }
+
+  // Map the TestMode enum to the deadline_ and passes_ variables.
+  void SetMode(TestMode mode);
+
+  // Main loop.
+  virtual void RunLoop(VideoSource *video);
+
+  // Hook to be called at the beginning of a pass.
+  virtual void BeginPassHook(unsigned int pass) {}
+
+  // Hook to be called at the end of a pass.
+  virtual void EndPassHook() {}
+
+  // Hook to be called before encoding a frame.
+  virtual void PreEncodeFrameHook(VideoSource *video) {}
+  virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {}
+
+  // Hook to be called on every compressed data packet.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {}
+
+  // Hook to be called on every PSNR packet.
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}
+
+  // Hook to determine whether the encode loop should continue.
+  virtual bool Continue() const { return !abort_; }
+
+  bool                 abort_;
+  vpx_codec_enc_cfg_t  cfg_;
+  unsigned int         passes_;
+  unsigned long        deadline_;
+  TwopassStatsStore    stats_;
+  unsigned long        init_flags_;
+  unsigned long        frame_flags_;
+  vpx_codec_pts_t      last_pts_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_ENCODE_TEST_DRIVER_H_
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@ -0,0 +1,90 @@
+/*
+  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+
+  Use of this source code is governed by a BSD-style license
+  that can be found in the LICENSE file in the root of the source
+  tree. An additional intellectual property rights grant can be found
+  in the file PATENTS.  All contributing project authors may
+  be found in the AUTHORS file in the root of the source tree.
+*/
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+
+namespace {
+
+class ErrorResilienceTest : public libvpx_test::EncoderTest,
+    public ::testing::TestWithParam<int> {
+ protected:
+  ErrorResilienceTest() {
+    psnr_ = 0.0;
+    nframes_ = 0;
+    encoding_mode_ = static_cast<libvpx_test::TestMode>(GetParam());
+  }
+  virtual ~ErrorResilienceTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_)
+      return psnr_ / nframes_;
+    return 0.0;
+  }
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  libvpx_test::TestMode encoding_mode_;
+};
+
+TEST_P(ErrorResilienceTest, OnVersusOff) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 30);
+
+  // Error resilient mode OFF.
+  cfg_.g_error_resilient = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_resilience_off = GetAveragePsnr();
+  EXPECT_GT(psnr_resilience_off, 25.0);
+
+  // Error resilient mode ON.
+  cfg_.g_error_resilient = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_resilience_on = GetAveragePsnr();
+  EXPECT_GT(psnr_resilience_on, 25.0);
+
+  // Test that turning on error resilient mode hurts by 10% at most.
+  if (psnr_resilience_off > 0.0) {
+    const double psnr_ratio = psnr_resilience_on / psnr_resilience_off;
+    EXPECT_GE(psnr_ratio, 0.9);
+    EXPECT_LE(psnr_ratio, 1.1);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(OnOffTest, ErrorResilienceTest,
+                        ONE_PASS_TEST_MODES);
+}  // namespace
--- a/test/i420_video_source.h
+++ b/test/i420_video_source.h
@ -0,0 +1,117 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_I420_VIDEO_SOURCE_H_
+#define TEST_I420_VIDEO_SOURCE_H_
+#include <cstdio>
+#include <cstdlib>
+
+#include "test/video_source.h"
+
+namespace libvpx_test {
+
+// This class extends VideoSource to allow parsing of raw yv12
+// so that we can do actual file encodes.
+class I420VideoSource : public VideoSource {
+ public:
+  I420VideoSource(const std::string &file_name,
+                  unsigned int width, unsigned int height,
+                  int rate_numerator, int rate_denominator,
+                  unsigned int start, int limit)
+      : file_name_(file_name),
+        input_file_(NULL),
+        img_(NULL),
+        start_(start),
+        limit_(limit),
+        frame_(0),
+        width_(0),
+        height_(0),
+        framerate_numerator_(rate_numerator),
+        framerate_denominator_(rate_denominator) {
+
+    // This initializes raw_sz_, width_, height_ and allocates an img.
+    SetSize(width, height);
+  }
+
+  virtual ~I420VideoSource() {
+    vpx_img_free(img_);
+    if (input_file_)
+      fclose(input_file_);
+  }
+
+  virtual void Begin() {
+    if (input_file_)
+      fclose(input_file_);
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+        << file_name_;
+    if (start_) {
+      fseek(input_file_, raw_sz_ * start_, SEEK_SET);
+    }
+
+    frame_ = start_;
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL;  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  void SetSize(unsigned int width, unsigned int height) {
+    if (width != width_ || height != height_) {
+      vpx_img_free(img_);
+      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 1);
+      ASSERT_TRUE(img_ != NULL);
+      width_ = width;
+      height_ = height;
+      raw_sz_ = width * height * 3 / 2;
+    }
+  }
+
+  virtual void FillFrame() {
+    // Read a frame from input_file.
+    if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) {
+      limit_ = frame_;
+    }
+  }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  vpx_image_t *img_;
+  size_t raw_sz_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+  unsigned int framerate_numerator_;
+  unsigned int framerate_denominator_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_I420_VIDEO_SOURCE_H_
--- a/test/idctllm_test.cc
+++ b/test/idctllm_test.cc
@ -0,0 +1,125 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+extern "C" {
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+}
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
+                          int pred_stride, unsigned char *dst_ptr,
+                          int dst_stride);
+namespace {
+class IDCTTest : public ::testing::TestWithParam<idct_fn_t>
+{
+  protected:
+    virtual void SetUp()
+    {
+        int i;
+
+        UUT = GetParam();
+        memset(input, 0, sizeof(input));
+        /* Set up guard blocks */
+        for(i=0; i<256; i++)
+            output[i] = ((i&0xF)<4&&(i<64))?0:-1;
+    }
+
+    idct_fn_t UUT;
+    short input[16];
+    unsigned char output[256];
+    unsigned char predict[256];
+};
+
+TEST_P(IDCTTest, TestGuardBlocks)
+{
+    int i;
+
+    for(i=0; i<256; i++)
+        if((i&0xF) < 4 && i<64)
+            EXPECT_EQ(0, output[i]) << i;
+        else
+            EXPECT_EQ(255, output[i]);
+}
+
+TEST_P(IDCTTest, TestAllZeros)
+{
+    int i;
+
+    UUT(input, output, 16, output, 16);
+
+    for(i=0; i<256; i++)
+        if((i&0xF) < 4 && i<64)
+            EXPECT_EQ(0, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestAllOnes)
+{
+    int i;
+
+    input[0] = 4;
+    UUT(input, output, 16, output, 16);
+
+    for(i=0; i<256; i++)
+        if((i&0xF) < 4 && i<64)
+            EXPECT_EQ(1, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestAddOne)
+{
+    int i;
+
+    for(i=0; i<256; i++)
+        predict[i] = i;
+
+    input[0] = 4;
+    UUT(input, predict, 16, output, 16);
+
+    for(i=0; i<256; i++)
+        if((i&0xF) < 4 && i<64)
+            EXPECT_EQ(i+1, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(255, output[i]) << "i==" << i;
+}
+
+TEST_P(IDCTTest, TestWithData)
+{
+    int i;
+
+    for(i=0; i<16; i++)
+        input[i] = i;
+
+    UUT(input, output, 16, output, 16);
+
+    for(i=0; i<256; i++)
+        if((i&0xF) > 3 || i>63)
+            EXPECT_EQ(255, output[i]) << "i==" << i;
+        else if(i == 0)
+            EXPECT_EQ(11, output[i]) << "i==" << i;
+        else if(i == 34)
+            EXPECT_EQ(1, output[i]) << "i==" << i;
+        else if(i == 2 || i == 17 || i == 32)
+            EXPECT_EQ(3, output[i]) << "i==" << i;
+        else
+            EXPECT_EQ(0, output[i]) << "i==" << i;
+}
+
+INSTANTIATE_TEST_CASE_P(C, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_c));
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_mmx));
+#endif
+}
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@ -0,0 +1,354 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+#include "test/acm_random.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+extern "C" {
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+namespace {
+
+using libvpx_test::ACMRandom;
+
+class IntraPredBase {
+ protected:
+  void SetupMacroblock(uint8_t *data, int block_size, int stride,
+                       int num_planes) {
+    memset(&mb_, 0, sizeof(mb_));
+    memset(&mi_, 0, sizeof(mi_));
+    mb_.up_available = 1;
+    mb_.left_available = 1;
+    mb_.mode_info_context = &mi_;
+    stride_ = stride;
+    block_size_ = block_size;
+    num_planes_ = num_planes;
+    for (int p = 0; p < num_planes; p++)
+      data_ptr_[p] = data + stride * (block_size + 1) * p +
+                     stride + block_size;
+  }
+
+  void FillRandom() {
+    // Fill edges with random data
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    for (int p = 0; p < num_planes_; p++) {
+      for (int x = -1 ; x <= block_size_; x++)
+        data_ptr_[p][x - stride_] = rnd.Rand8();
+      for (int y = 0; y < block_size_; y++)
+        data_ptr_[p][y * stride_ - 1] = rnd.Rand8();
+    }
+  }
+
+  virtual void Predict(MB_PREDICTION_MODE mode) = 0;
+
+  void SetLeftUnavailable() {
+    mb_.left_available = 0;
+    for (int p = 0; p < num_planes_; p++)
+      for (int i = -1; i < block_size_; ++i)
+        data_ptr_[p][stride_ * i - 1] = 129;
+  }
+
+  void SetTopUnavailable() {
+    mb_.up_available = 0;
+    for (int p = 0; p < num_planes_; p++)
+      memset(&data_ptr_[p][-1 - stride_], 127, block_size_ + 2);
+  }
+
+  void SetTopLeftUnavailable() {
+    SetLeftUnavailable();
+    SetTopUnavailable();
+  }
+
+  int BlockSizeLog2Min1() const {
+    switch (block_size_) {
+      case 16:
+        return 3;
+      case 8:
+        return 2;
+      default:
+        return 0;
+    }
+  }
+
+  // check DC prediction output against a reference
+  void CheckDCPrediction() const {
+    for (int p = 0; p < num_planes_; p++) {
+      // calculate expected DC
+      int expected;
+      if (mb_.up_available || mb_.left_available) {
+        int sum = 0, shift = BlockSizeLog2Min1() + mb_.up_available +
+                             mb_.left_available;
+        if (mb_.up_available)
+          for (int x = 0; x < block_size_; x++)
+            sum += data_ptr_[p][x - stride_];
+        if (mb_.left_available)
+          for (int y = 0; y < block_size_; y++)
+            sum += data_ptr_[p][y * stride_ - 1];
+        expected = (sum + (1 << (shift - 1))) >> shift;
+      } else
+        expected = 0x80;
+
+      // check that all subsequent lines are equal to the first
+      for (int y = 1; y < block_size_; ++y)
+        ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_],
+                            block_size_));
+      // within the first line, ensure that each pixel has the same value
+      for (int x = 1; x < block_size_; ++x)
+        ASSERT_EQ(data_ptr_[p][0], data_ptr_[p][x]);
+      // now ensure that that pixel has the expected (DC) value
+      ASSERT_EQ(expected, data_ptr_[p][0]);
+    }
+  }
+
+  // check V prediction output against a reference
+  void CheckVPrediction() const {
+    // check that all lines equal the top border
+    for (int p = 0; p < num_planes_; p++)
+      for (int y = 0; y < block_size_; y++)
+        ASSERT_EQ(0, memcmp(&data_ptr_[p][-stride_],
+                            &data_ptr_[p][y * stride_], block_size_));
+  }
+
+  // check H prediction output against a reference
+  void CheckHPrediction() const {
+    // for each line, ensure that each pixel is equal to the left border
+    for (int p = 0; p < num_planes_; p++)
+      for (int y = 0; y < block_size_; y++)
+        for (int x = 0; x < block_size_; x++)
+          ASSERT_EQ(data_ptr_[p][-1 + y * stride_],
+                    data_ptr_[p][x + y * stride_]);
+  }
+
+  static int ClipByte(int value) {
+    if (value > 255)
+      return 255;
+    else if (value < 0)
+      return 0;
+    return value;
+  }
+
+  // check TM prediction output against a reference
+  void CheckTMPrediction() const {
+    for (int p = 0; p < num_planes_; p++)
+      for (int y = 0; y < block_size_; y++)
+        for (int x = 0; x < block_size_; x++) {
+          const int expected = ClipByte(data_ptr_[p][x - stride_]
+                                      + data_ptr_[p][stride_ * y - 1]
+                                      - data_ptr_[p][-1 - stride_]);
+          ASSERT_EQ(expected, data_ptr_[p][y * stride_ + x]);
+       }
+  }
+
+  // Actual test
+  void RunTest() {
+    {
+      SCOPED_TRACE("DC_PRED");
+      FillRandom();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("DC_PRED LEFT");
+      FillRandom();
+      SetLeftUnavailable();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("DC_PRED TOP");
+      FillRandom();
+      SetTopUnavailable();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("DC_PRED TOP_LEFT");
+      FillRandom();
+      SetTopLeftUnavailable();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("H_PRED");
+      FillRandom();
+      Predict(H_PRED);
+      CheckHPrediction();
+    }
+    {
+      SCOPED_TRACE("V_PRED");
+      FillRandom();
+      Predict(V_PRED);
+      CheckVPrediction();
+    }
+    {
+      SCOPED_TRACE("TM_PRED");
+      FillRandom();
+      Predict(TM_PRED);
+      CheckTMPrediction();
+    }
+  }
+
+  MACROBLOCKD mb_;
+  MODE_INFO mi_;
+  uint8_t *data_ptr_[2];  // in the case of Y, only [0] is used
+  int stride_;
+  int block_size_;
+  int num_planes_;
+};
+
+typedef void (*intra_pred_y_fn_t)(MACROBLOCKD *x,
+                                  uint8_t *yabove_row,
+                                  uint8_t *yleft,
+                                  int left_stride,
+                                  uint8_t *ypred_ptr,
+                                  int y_stride);
+
+class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>,
+    protected IntraPredBase {
+ public:
+  static void SetUpTestCase() {
+    data_array_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(data_array_);
+    data_array_ = NULL;
+  }
+
+ protected:
+  static const int kBlockSize = 16;
+  static const int kDataAlignment = 16;
+  static const int kStride = kBlockSize * 3;
+  // We use 48 so that the data pointer of the first pixel in each row of
+  // each macroblock is 16-byte aligned, and this gives us access to the
+  // top-left and top-right corner pixels belonging to the top-left/right
+  // macroblocks.
+  // We use 17 lines so we have one line above us for top-prediction.
+  static const int kDataBufferSize = kStride * (kBlockSize + 1);
+
+  virtual void SetUp() {
+    pred_fn_ = GetParam();
+    SetupMacroblock(data_array_, kBlockSize, kStride, 1);
+  }
+
+  virtual void Predict(MB_PREDICTION_MODE mode) {
+    mb_.mode_info_context->mbmi.mode = mode;
+    pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[0] - 1, kStride,
+             data_ptr_[0], kStride);
+  }
+
+  intra_pred_y_fn_t pred_fn_;
+  static uint8_t* data_array_;
+};
+
+uint8_t* IntraPredYTest::data_array_ = NULL;
+
+TEST_P(IntraPredYTest, IntraPredTests) {
+  RunTest();
+}
+
+INSTANTIATE_TEST_CASE_P(C, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_c));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_sse2));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_ssse3));
+#endif
+
+typedef void (*intra_pred_uv_fn_t)(MACROBLOCKD *x,
+                                   uint8_t *uabove_row,
+                                   uint8_t *vabove_row,
+                                   uint8_t *uleft,
+                                   uint8_t *vleft,
+                                   int left_stride,
+                                   uint8_t *upred_ptr,
+                                   uint8_t *vpred_ptr,
+                                   int pred_stride);
+
+class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>,
+    protected IntraPredBase {
+ public:
+  static void SetUpTestCase() {
+    data_array_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(data_array_);
+    data_array_ = NULL;
+  }
+
+ protected:
+  static const int kBlockSize = 8;
+  static const int kDataAlignment = 8;
+  static const int kStride = kBlockSize * 3;
+  // We use 24 so that the data pointer of the first pixel in each row of
+  // each macroblock is 8-byte aligned, and this gives us access to the
+  // top-left and top-right corner pixels belonging to the top-left/right
+  // macroblocks.
+  // We use 9 lines so we have one line above us for top-prediction.
+  // [0] = U, [1] = V
+  static const int kDataBufferSize = 2 * kStride * (kBlockSize + 1);
+
+  virtual void SetUp() {
+    pred_fn_ = GetParam();
+    SetupMacroblock(data_array_, kBlockSize, kStride, 2);
+  }
+
+  virtual void Predict(MB_PREDICTION_MODE mode) {
+    mb_.mode_info_context->mbmi.uv_mode = mode;
+    pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[1] - kStride,
+             data_ptr_[0] - 1, data_ptr_[1] - 1, kStride,
+             data_ptr_[0], data_ptr_[1], kStride);
+  }
+
+  intra_pred_uv_fn_t pred_fn_;
+  // We use 24 so that the data pointer of the first pixel in each row of
+  // each macroblock is 8-byte aligned, and this gives us access to the
+  // top-left and top-right corner pixels belonging to the top-left/right
+  // macroblocks.
+  // We use 9 lines so we have one line above us for top-prediction.
+  // [0] = U, [1] = V
+  static uint8_t* data_array_;
+};
+
+uint8_t* IntraPredUVTest::data_array_ = NULL;
+
+TEST_P(IntraPredUVTest, IntraPredTests) {
+  RunTest();
+}
+
+INSTANTIATE_TEST_CASE_P(C, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_c));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_sse2));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_ssse3));
+#endif
+
+}  // namespace
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_IVF_VIDEO_SOURCE_H_
+#define TEST_IVF_VIDEO_SOURCE_H_
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <string>
+#include "test/video_source.h"
+
+namespace libvpx_test {
+const unsigned int kCodeBufferSize = 256 * 1024;
+const unsigned int kIvfFileHdrSize = 32;
+const unsigned int kIvfFrameHdrSize = 12;
+
+static unsigned int MemGetLe32(const uint8_t *mem) {
+  return (mem[3] << 24) | (mem[2] << 16) | (mem[1] << 8) | (mem[0]);
+}
+
+// This class extends VideoSource to allow parsing of ivf files,
+// so that we can do actual file decodes.
+class IVFVideoSource : public CompressedVideoSource {
+ public:
+  IVFVideoSource(const std::string &file_name)
+      : file_name_(file_name),
+        input_file_(NULL),
+        compressed_frame_buf_(NULL),
+        frame_sz_(0),
+        frame_(0),
+        end_of_file_(false) {
+  }
+
+  virtual ~IVFVideoSource() {
+    delete[] compressed_frame_buf_;
+
+    if (input_file_)
+      fclose(input_file_);
+  }
+
+  virtual void Init() {
+    // Allocate a buffer for read in the compressed video frame.
+    compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize];
+    ASSERT_TRUE(compressed_frame_buf_) << "Allocate frame buffer failed";
+  }
+
+  virtual void Begin() {
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+        << file_name_;
+
+    // Read file header
+    uint8_t file_hdr[kIvfFileHdrSize];
+    ASSERT_EQ(kIvfFileHdrSize, fread(file_hdr, 1, kIvfFileHdrSize, input_file_))
+        << "File header read failed.";
+    // Check file header
+    ASSERT_TRUE(file_hdr[0] == 'D' && file_hdr[1] == 'K' && file_hdr[2] == 'I'
+                && file_hdr[3] == 'F') << "Input is not an IVF file.";
+
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  void FillFrame() {
+    uint8_t frame_hdr[kIvfFrameHdrSize];
+    // Check frame header and read a frame from input_file.
+    if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_)
+        != kIvfFrameHdrSize) {
+      end_of_file_ = true;
+    } else {
+      end_of_file_ = false;
+
+      frame_sz_ = MemGetLe32(frame_hdr);
+      ASSERT_LE(frame_sz_, kCodeBufferSize)
+          << "Frame is too big for allocated code buffer";
+      ASSERT_EQ(frame_sz_,
+                fread(compressed_frame_buf_, 1, frame_sz_, input_file_))
+          << "Failed to read complete frame";
+    }
+  }
+
+  virtual const uint8_t *cxdata() const {
+    return end_of_file_ ? NULL : compressed_frame_buf_;
+  }
+  virtual const unsigned int frame_size() const { return frame_sz_; }
+  virtual const unsigned int frame_number() const { return frame_; }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  uint8_t *compressed_frame_buf_;
+  unsigned int frame_sz_;
+  unsigned int frame_;
+  bool end_of_file_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_IVF_VIDEO_SOURCE_H_
--- a/test/keyframe_test.cc
+++ b/test/keyframe_test.cc
@ -0,0 +1,145 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+class KeyframeTest : public ::libvpx_test::EncoderTest,
+    public ::testing::TestWithParam<enum libvpx_test::TestMode> {
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GetParam());
+    kf_count_ = 0;
+    kf_count_max_ = INT_MAX;
+    kf_do_force_kf_ = false;
+    set_cpu_used_ = 0;
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (kf_do_force_kf_)
+      frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF;
+    if (set_cpu_used_ && video->frame() == 1)
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
+      kf_pts_list_.push_back(pkt->data.frame.pts);
+      kf_count_++;
+      abort_ |= kf_count_ > kf_count_max_;
+    }
+  }
+
+  bool kf_do_force_kf_;
+  int kf_count_;
+  int kf_count_max_;
+  std::vector<vpx_codec_pts_t> kf_pts_list_;
+  int set_cpu_used_;
+};
+
+TEST_P(KeyframeTest, TestRandomVideoSource) {
+  // Validate that encoding the RandomVideoSource produces multiple keyframes.
+  // This validates the results of the TestDisableKeyframes test.
+  kf_count_max_ = 2;  // early exit successful tests.
+
+  ::libvpx_test::RandomVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // In realtime mode - auto placed keyframes are exceedingly rare,  don't
+  // bother with this check   if(GetParam() > 0)
+  if(GetParam() > 0)
+    EXPECT_GT(kf_count_, 1);
+}
+
+TEST_P(KeyframeTest, TestDisableKeyframes) {
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  kf_count_max_ = 1;  // early exit failed tests.
+
+  ::libvpx_test::RandomVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  EXPECT_EQ(1, kf_count_);
+}
+
+TEST_P(KeyframeTest, TestForceKeyframe) {
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  kf_do_force_kf_ = true;
+
+  ::libvpx_test::DummyVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // verify that every third frame is a keyframe.
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
+    ASSERT_EQ(0, *iter % 3) << "Unexpected keyframe at frame " << *iter;
+  }
+}
+
+TEST_P(KeyframeTest, TestKeyframeMaxDistance) {
+  cfg_.kf_max_dist = 25;
+
+  ::libvpx_test::DummyVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // verify that keyframe interval matches kf_max_dist
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
+    ASSERT_EQ(0, *iter % 25) << "Unexpected keyframe at frame " << *iter;
+  }
+}
+
+TEST_P(KeyframeTest, TestAutoKeyframe) {
+  cfg_.kf_mode = VPX_KF_AUTO;
+  kf_do_force_kf_ = false;
+
+  // Force a deterministic speed step in Real Time mode, as the faster modes
+  // may not produce a keyframe like we expect. This is necessary when running
+  // on very slow environments (like Valgrind). The step -11 was determined
+  // experimentally as the fastest mode that still throws the keyframe.
+  if (deadline_ == VPX_DL_REALTIME)
+    set_cpu_used_ = -11;
+
+  // This clip has a cut scene every 30 frames -> Frame 0, 30, 60, 90, 120.
+  // I check only the first 40 frames to make sure there's a keyframe at frame
+  // 0 and 30.
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // In realtime mode - auto placed keyframes are exceedingly rare,  don't
+  // bother with this check
+  if(GetParam() > 0)
+    EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes ";
+
+  // Verify that keyframes match the file keyframes in the file.
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
+
+    if (deadline_ == VPX_DL_REALTIME && *iter > 0)
+      EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
+        << *iter;
+    else
+      EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter;
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(AllModes, KeyframeTest, ALL_TEST_MODES);
+}  // namespace
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@ -0,0 +1,106 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+extern "C" {
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+typedef void (*post_proc_func_t)(unsigned char *src_ptr,
+                                 unsigned char *dst_ptr,
+                                 int src_pixels_per_line,
+                                 int dst_pixels_per_line,
+                                 int cols,
+                                 unsigned char *flimit,
+                                 int size);
+
+namespace {
+
+class Vp8PostProcessingFilterTest
+    : public ::testing::TestWithParam<post_proc_func_t> {};
+
+// Test routine for the VP8 post-processing function
+// vp8_post_proc_down_and_across_mb_row_c.
+
+TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
+  // Size of the underlying data block that will be filtered.
+  const int block_width  = 16;
+  const int block_height = 16;
+
+  // 5-tap filter needs 2 padding rows above and below the block in the input.
+  const int input_width = block_width;
+  const int input_height = block_height + 4;
+  const int input_stride = input_width;
+  const int input_size = input_width * input_height;
+
+  // Filter extends output block by 8 samples at left and right edges.
+  const int output_width = block_width + 16;
+  const int output_height = block_height;
+  const int output_stride = output_width;
+  const int output_size = output_width * output_height;
+
+  uint8_t *const src_image =
+      reinterpret_cast<uint8_t*>(vpx_calloc(input_size, 1));
+  uint8_t *const dst_image =
+      reinterpret_cast<uint8_t*>(vpx_calloc(output_size, 1));
+
+  // Pointers to top-left pixel of block in the input and output images.
+  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
+  uint8_t *const dst_image_ptr = dst_image + 8;
+  uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+  (void)vpx_memset(flimits, 255, block_width);
+
+  // Initialize pixels in the input:
+  //   block pixels to value 1,
+  //   border pixels to value 10.
+  (void)vpx_memset(src_image, 10, input_size);
+  uint8_t *pixel_ptr = src_image_ptr;
+  for (int i = 0; i < block_height; ++i) {
+    for (int j = 0; j < block_width; ++j) {
+      pixel_ptr[j] = 1;
+    }
+    pixel_ptr += input_stride;
+  }
+
+  // Initialize pixels in the output to 99.
+  (void)vpx_memset(dst_image, 99, output_size);
+
+  GetParam()(src_image_ptr, dst_image_ptr, input_stride,
+             output_stride, block_width, flimits, 16);
+
+  static const uint8_t expected_data[block_height] = {
+    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
+  };
+
+  pixel_ptr = dst_image_ptr;
+  for (int i = 0; i < block_height; ++i) {
+    for (int j = 0; j < block_width; ++j) {
+      EXPECT_EQ(expected_data[i], pixel_ptr[j])
+          << "Vp8PostProcessingFilterTest failed with invalid filter output";
+    }
+    pixel_ptr += output_stride;
+  }
+
+  vpx_free(src_image);
+  vpx_free(dst_image);
+  vpx_free(flimits);
+};
+
+INSTANTIATE_TEST_CASE_P(C, Vp8PostProcessingFilterTest,
+    ::testing::Values(vp8_post_proc_down_and_across_mb_row_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Vp8PostProcessingFilterTest,
+    ::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
+#endif
+
+}  // namespace
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "test/encode_test_driver.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+const unsigned int kInitialWidth = 320;
+const unsigned int kInitialHeight = 240;
+
+unsigned int ScaleForFrameNumber(unsigned int frame, unsigned int val) {
+  if (frame < 10)
+    return val;
+  if (frame < 20)
+    return val / 2;
+  if (frame < 30)
+    return val * 2 / 3;
+  if (frame < 40)
+    return val / 4;
+  if (frame < 50)
+    return val * 7 / 8;
+  return val;
+}
+
+class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
+ public:
+  ResizingVideoSource() {
+    SetSize(kInitialWidth, kInitialHeight);
+    limit_ = 60;
+  }
+
+ protected:
+  virtual void Next() {
+    ++frame_;
+    SetSize(ScaleForFrameNumber(frame_, kInitialWidth),
+            ScaleForFrameNumber(frame_, kInitialHeight));
+    FillFrame();
+  }
+};
+
+class ResizeTest : public ::libvpx_test::EncoderTest,
+  public ::testing::TestWithParam<enum libvpx_test::TestMode> {
+ protected:
+  struct FrameInfo {
+    FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
+        : pts(_pts), w(_w), h(_h) {}
+
+    vpx_codec_pts_t pts;
+    unsigned int    w;
+    unsigned int    h;
+  };
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GetParam());
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
+      const unsigned char *buf =
+          reinterpret_cast<const unsigned char *>(pkt->data.frame.buf);
+      const unsigned int w = (buf[6] | (buf[7] << 8)) & 0x3fff;
+      const unsigned int h = (buf[8] | (buf[9] << 8)) & 0x3fff;
+
+      frame_info_list_.push_back(FrameInfo(pkt->data.frame.pts, w, h));
+    }
+  }
+
+  std::vector< FrameInfo > frame_info_list_;
+};
+
+TEST_P(ResizeTest, TestExternalResizeWorks) {
+  ResizingVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  for (std::vector<FrameInfo>::iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const vpx_codec_pts_t pts = info->pts;
+    const unsigned int expected_w = ScaleForFrameNumber(pts, kInitialWidth);
+    const unsigned int expected_h = ScaleForFrameNumber(pts, kInitialHeight);
+
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << pts << "had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << pts << "had unexpected height";
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(OnePass, ResizeTest, ONE_PASS_TEST_MODES);
+}  // namespace
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@ -0,0 +1,250 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+
+extern "C" {
+#include "./vpx_config.h"
+#include "./vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+
+typedef unsigned int (*sad_m_by_n_fn_t)(const unsigned char *source_ptr,
+                                        int source_stride,
+                                        const unsigned char *reference_ptr,
+                                        int reference_stride,
+                                        unsigned int max_sad);
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) {
+ public:
+  static void SetUpTestCase() {
+    source_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(source_data_);
+    source_data_ = NULL;
+    vpx_free(reference_data_);
+    reference_data_ = NULL;
+  }
+
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kDataBufferSize = 16 * 32;
+
+  virtual void SetUp() {
+    sad_fn_ = GET_PARAM(2);
+    height_ = GET_PARAM(1);
+    width_ = GET_PARAM(0);
+    source_stride_ = width_ * 2;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  sad_m_by_n_fn_t sad_fn_;
+  virtual unsigned int SAD(unsigned int max_sad) {
+    return sad_fn_(source_data_, source_stride_,
+                   reference_data_, reference_stride_,
+                   max_sad);
+  }
+
+  // Sum of Absolute Differences. Given two blocks, calculate the absolute
+  // difference between two pixels in the same relative location; accumulate.
+  unsigned int ReferenceSAD(unsigned int max_sad) {
+    unsigned int sad = 0;
+
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        sad += abs(source_data_[h * source_stride_ + w]
+               - reference_data_[h * reference_stride_ + w]);
+      }
+      if (sad > max_sad) {
+        break;
+      }
+    }
+    return sad;
+  }
+
+  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) {
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        data[h * stride + w] = fill_constant;
+      }
+    }
+  }
+
+  void FillRandom(uint8_t *data, int stride) {
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        data[h * stride + w] = rnd_.Rand8();
+      }
+    }
+  }
+
+  void CheckSad(unsigned int max_sad) {
+    unsigned int reference_sad, exp_sad;
+
+    reference_sad = ReferenceSAD(max_sad);
+    exp_sad = SAD(max_sad);
+
+    if (reference_sad <= max_sad) {
+      ASSERT_EQ(exp_sad, reference_sad);
+    } else {
+      // Alternative implementations are not required to check max_sad
+      ASSERT_GE(exp_sad, reference_sad);
+    }
+  }
+
+  // Handle blocks up to 16x16 with stride up to 32
+  int height_, width_;
+  static uint8_t* source_data_;
+  int source_stride_;
+  static uint8_t* reference_data_;
+  int reference_stride_;
+
+  ACMRandom rnd_;
+};
+
+uint8_t* SADTest::source_data_ = NULL;
+uint8_t* SADTest::reference_data_ = NULL;
+
+TEST_P(SADTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, 255);
+  CheckSad(UINT_MAX);
+}
+
+TEST_P(SADTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSad(UINT_MAX);
+}
+
+TEST_P(SADTest, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSad(UINT_MAX);
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSad(UINT_MAX);
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSad(UINT_MAX);
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, MaxSAD) {
+  // Verify that, when max_sad is set, the implementation does not return a
+  // value lower than the reference.
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSad(128);
+}
+
+using std::tr1::make_tuple;
+
+const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c;
+const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c;
+const sad_m_by_n_fn_t sad_16x8_c = vp8_sad16x8_c;
+const sad_m_by_n_fn_t sad_8x8_c = vp8_sad8x8_c;
+const sad_m_by_n_fn_t sad_4x4_c = vp8_sad4x4_c;
+INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_c),
+                        make_tuple(8, 16, sad_8x16_c),
+                        make_tuple(16, 8, sad_16x8_c),
+                        make_tuple(8, 8, sad_8x8_c),
+                        make_tuple(4, 4, sad_4x4_c)));
+
+// ARM tests
+#if HAVE_MEDIA
+const sad_m_by_n_fn_t sad_16x16_armv6 = vp8_sad16x16_armv6;
+INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_armv6)));
+
+#endif
+#if HAVE_NEON
+const sad_m_by_n_fn_t sad_16x16_neon = vp8_sad16x16_neon;
+const sad_m_by_n_fn_t sad_8x16_neon = vp8_sad8x16_neon;
+const sad_m_by_n_fn_t sad_16x8_neon = vp8_sad16x8_neon;
+const sad_m_by_n_fn_t sad_8x8_neon = vp8_sad8x8_neon;
+const sad_m_by_n_fn_t sad_4x4_neon = vp8_sad4x4_neon;
+INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_neon),
+                        make_tuple(8, 16, sad_8x16_neon),
+                        make_tuple(16, 8, sad_16x8_neon),
+                        make_tuple(8, 8, sad_8x8_neon),
+                        make_tuple(4, 4, sad_4x4_neon)));
+#endif
+
+// X86 tests
+#if HAVE_MMX
+const sad_m_by_n_fn_t sad_16x16_mmx = vp8_sad16x16_mmx;
+const sad_m_by_n_fn_t sad_8x16_mmx = vp8_sad8x16_mmx;
+const sad_m_by_n_fn_t sad_16x8_mmx = vp8_sad16x8_mmx;
+const sad_m_by_n_fn_t sad_8x8_mmx = vp8_sad8x8_mmx;
+const sad_m_by_n_fn_t sad_4x4_mmx = vp8_sad4x4_mmx;
+INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_mmx),
+                        make_tuple(8, 16, sad_8x16_mmx),
+                        make_tuple(16, 8, sad_16x8_mmx),
+                        make_tuple(8, 8, sad_8x8_mmx),
+                        make_tuple(4, 4, sad_4x4_mmx)));
+#endif
+#if HAVE_SSE2
+const sad_m_by_n_fn_t sad_16x16_wmt = vp8_sad16x16_wmt;
+const sad_m_by_n_fn_t sad_8x16_wmt = vp8_sad8x16_wmt;
+const sad_m_by_n_fn_t sad_16x8_wmt = vp8_sad16x8_wmt;
+const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt;
+const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;
+INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_wmt),
+                        make_tuple(8, 16, sad_8x16_wmt),
+                        make_tuple(16, 8, sad_16x8_wmt),
+                        make_tuple(8, 8, sad_8x8_wmt),
+                        make_tuple(4, 4, sad_4x4_wmt)));
+#endif
+#if HAVE_SSSE3
+const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;
+INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_sse3)));
+#endif
+
+}  // namespace
--- a/test/set_roi.cc
+++ b/test/set_roi.cc
@ -0,0 +1,182 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+extern "C" {
+#include "vp8/encoder/onyx_int.h"
+}
+
+namespace {
+
+TEST(Vp8RoiMapTest, ParameterCheck) {
+  int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
+  int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
+  unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 };
+
+  const int internalq_trans[] = {
+    0,   1,  2,  3,  4,  5,  7,  8,
+    9,  10, 12, 13, 15, 17, 18, 19,
+    20,  21, 23, 24, 25, 26, 27, 28,
+    29,  30, 31, 33, 35, 37, 39, 41,
+    43,  45, 47, 49, 51, 53, 55, 57,
+    59,  61, 64, 67, 70, 73, 76, 79,
+    82,  85, 88, 91, 94, 97, 100, 103,
+    106, 109, 112, 115, 118, 121, 124, 127,
+  };
+
+  // Initialize elements of cpi with valid defaults.
+  VP8_COMP cpi;
+  cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA;
+  cpi.cyclic_refresh_mode_enabled = 0;
+  cpi.mb.e_mbd.segmentation_enabled = 0;
+  cpi.mb.e_mbd.update_mb_segmentation_map = 0;
+  cpi.mb.e_mbd.update_mb_segmentation_data = 0;
+  cpi.common.mb_rows = 240 >> 4;
+  cpi.common.mb_cols = 320 >> 4;
+  const int mbs = (cpi.common.mb_rows * cpi.common.mb_cols);
+  vpx_memset(cpi.segment_feature_data, 0, sizeof(cpi.segment_feature_data));
+
+  // Segment map
+  cpi.segmentation_map = reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
+
+  // Allocate memory for the source memory map.
+  unsigned char *roi_map =
+    reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
+  vpx_memset(&roi_map[mbs >> 2], 1, (mbs >> 2));
+  vpx_memset(&roi_map[mbs >> 1], 2, (mbs >> 2));
+  vpx_memset(&roi_map[mbs -(mbs >> 2)], 3, (mbs >> 2));
+
+  // Do a test call with valid parameters.
+  int roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, delta_q, delta_lf,
+                                  threshold);
+  EXPECT_EQ(0, roi_retval)
+        << "vp8_set_roimap roi failed with default test parameters";
+
+  // Check that the values in the cpi structure get set as expected.
+  if (roi_retval == 0) {
+    // Check that the segment map got set.
+    const int mapcompare = memcmp(roi_map, cpi.segmentation_map, mbs);
+    EXPECT_EQ(0, mapcompare) << "segment map error";
+
+    // Check the q deltas (note the need to translate into
+    // the interanl range of 0-127.
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      const int transq = internalq_trans[abs(delta_q[i])];
+      if (abs(cpi.segment_feature_data[MB_LVL_ALT_Q][i]) != transq) {
+          EXPECT_EQ(transq, cpi.segment_feature_data[MB_LVL_ALT_Q][i])
+                    << "segment delta_q  error";
+          break;
+      }
+    }
+
+    // Check the loop filter deltas
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      if (cpi.segment_feature_data[MB_LVL_ALT_LF][i] != delta_lf[i]) {
+        EXPECT_EQ(delta_lf[i], cpi.segment_feature_data[MB_LVL_ALT_LF][i])
+                  << "segment delta_lf error";
+        break;
+      }
+    }
+
+    // Check the breakout thresholds
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      unsigned int breakout =
+        static_cast<unsigned int>(cpi.segment_encode_breakout[i]);
+
+      if (threshold[i] != breakout) {
+        EXPECT_EQ(threshold[i], breakout)
+                  << "breakout threshold error";
+        break;
+      }
+    }
+
+    // Segmentation, and segmentation update flages should be set.
+    EXPECT_EQ(1, cpi.mb.e_mbd.segmentation_enabled)
+              << "segmentation_enabled error";
+    EXPECT_EQ(1, cpi.mb.e_mbd.update_mb_segmentation_map)
+              << "update_mb_segmentation_map error";
+    EXPECT_EQ(1, cpi.mb.e_mbd.update_mb_segmentation_data)
+              << "update_mb_segmentation_data error";
+
+
+    // Try a range of delta q and lf parameters (some legal, some not)
+    for (int i = 0; i < 1000; ++i) {
+      int rand_deltas[4];
+      int deltas_valid;
+      rand_deltas[0] = (rand() % 160) - 80;
+      rand_deltas[1] = (rand() % 160) - 80;
+      rand_deltas[2] = (rand() % 160) - 80;
+      rand_deltas[3] = (rand() % 160) - 80;
+
+      deltas_valid = ((abs(rand_deltas[0]) <= 63) &&
+                      (abs(rand_deltas[1]) <= 63) &&
+                      (abs(rand_deltas[2]) <= 63) &&
+                      (abs(rand_deltas[3]) <= 63)) ? 0 : -1;
+
+      // Test with random delta q values.
+      roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, rand_deltas,
+                                  delta_lf, threshold);
+      EXPECT_EQ(deltas_valid, roi_retval) << "dq range check error";
+
+      // One delta_q error shown at a time
+      if (deltas_valid != roi_retval)
+        break;
+
+      // Test with random loop filter values.
+      roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, delta_q,
+                                  rand_deltas, threshold);
+      EXPECT_EQ(deltas_valid, roi_retval) << "dlf range check error";
+
+      // One delta loop filter error shown at a time
+      if (deltas_valid != roi_retval)
+        break;
+    }
+
+    // Test that we report and error if cyclic refresh is enabled.
+    cpi.cyclic_refresh_mode_enabled = 1;
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                cpi.common.mb_cols, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error";
+    cpi.cyclic_refresh_mode_enabled = 0;
+
+    // Test invalid number of rows or colums.
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1,
+                                cpi.common.mb_cols, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "MB rows bounds check error";
+
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                cpi.common.mb_cols - 1, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "MB cols bounds check error";
+  }
+
+  // Free allocated memory
+  if (cpi.segmentation_map)
+    vpx_free(cpi.segmentation_map);
+  if (roi_map)
+    vpx_free(roi_map);
+};
+
+}  // namespace
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc
@ -0,0 +1,222 @@
+/*
+*  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+*
+*  Use of this source code is governed by a BSD-style license
+*  that can be found in the LICENSE file in the root of the source
+*  tree. An additional intellectual property rights grant can be found
+*  in the file PATENTS.  All contributing project authors may
+*  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+extern "C" {
+#include "./vpx_config.h"
+#include "./vpx_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+namespace {
+
+typedef void (*sixtap_predict_fn_t)(uint8_t *src_ptr,
+                                    int  src_pixels_per_line,
+                                    int  xoffset,
+                                    int  yoffset,
+                                    uint8_t *dst_ptr,
+                                    int  dst_pitch);
+
+class SixtapPredictTest : public PARAMS(int, int, sixtap_predict_fn_t) {
+ public:
+  static void SetUpTestCase() {
+    src_ = reinterpret_cast<uint8_t*>(vpx_memalign(kDataAlignment, kSrcSize));
+    dst_ = reinterpret_cast<uint8_t*>(vpx_memalign(kDataAlignment, kDstSize));
+    dst_c_ = reinterpret_cast<uint8_t*>(vpx_memalign(kDataAlignment, kDstSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(src_);
+    src_ = NULL;
+    vpx_free(dst_);
+    dst_ = NULL;
+    vpx_free(dst_c_);
+    dst_c_ = NULL;
+  }
+
+ protected:
+  // Make test arrays big enough for 16x16 functions. Six-tap filters
+  // need 5 extra pixels outside of the macroblock.
+  static const int kSrcStride = 21;
+  static const int kDstStride = 16;
+  static const int kDataAlignment = 16;
+  static const int kSrcSize = kSrcStride * kSrcStride + 1;
+  static const int kDstSize = kDstStride * kDstStride;
+
+  virtual void SetUp() {
+    width_ = GET_PARAM(0);
+    height_ = GET_PARAM(1);
+    sixtap_predict_ = GET_PARAM(2);
+    memset(src_, 0, sizeof(src_));
+    memset(dst_, 0, sizeof(dst_));
+    memset(dst_c_, 0, sizeof(dst_c_));
+  }
+
+  int width_;
+  int height_;
+  sixtap_predict_fn_t sixtap_predict_;
+  // The src stores the macroblock we will filter on, and makes it 1 byte larger
+  // in order to test unaligned access. The result is stored in dst and dst_c(c
+  // reference code result).
+  static uint8_t* src_;
+  static uint8_t* dst_;
+  static uint8_t* dst_c_;
+};
+
+uint8_t* SixtapPredictTest::src_ = NULL;
+uint8_t* SixtapPredictTest::dst_ = NULL;
+uint8_t* SixtapPredictTest::dst_c_ = NULL;
+
+TEST_P(SixtapPredictTest, TestWithPresetData) {
+  // Test input
+  static const uint8_t test_data[kSrcSize] = {
+    216, 184, 4, 191, 82, 92, 41, 0, 1, 226, 236, 172, 20, 182, 42, 226, 177,
+    79, 94, 77, 179, 203, 206, 198, 22, 192, 19, 75, 17, 192, 44, 233, 120,
+    48, 168, 203, 141, 210, 203, 143, 180, 184, 59, 201, 110, 102, 171, 32,
+    182, 10, 109, 105, 213, 60, 47, 236, 253, 67, 55, 14, 3, 99, 247, 124,
+    148, 159, 71, 34, 114, 19, 177, 38, 203, 237, 239, 58, 83, 155, 91, 10,
+    166, 201, 115, 124, 5, 163, 104, 2, 231, 160, 16, 234, 4, 8, 103, 153,
+    167, 174, 187, 26, 193, 109, 64, 141, 90, 48, 200, 174, 204, 36, 184,
+    114, 237, 43, 238, 242, 207, 86, 245, 182, 247, 6, 161, 251, 14, 8, 148,
+    182, 182, 79, 208, 120, 188, 17, 6, 23, 65, 206, 197, 13, 242, 126, 128,
+    224, 170, 110, 211, 121, 197, 200, 47, 188, 207, 208, 184, 221, 216, 76,
+    148, 143, 156, 100, 8, 89, 117, 14, 112, 183, 221, 54, 197, 208, 180, 69,
+    176, 94, 180, 131, 215, 121, 76, 7, 54, 28, 216, 238, 249, 176, 58, 142,
+    64, 215, 242, 72, 49, 104, 87, 161, 32, 52, 216, 230, 4, 141, 44, 181,
+    235, 224, 57, 195, 89, 134, 203, 144, 162, 163, 126, 156, 84, 185, 42,
+    148, 145, 29, 221, 194, 134, 52, 100, 166, 105, 60, 140, 110, 201, 184,
+    35, 181, 153, 93, 121, 243, 227, 68, 131, 134, 232, 2, 35, 60, 187, 77,
+    209, 76, 106, 174, 15, 241, 227, 115, 151, 77, 175, 36, 187, 121, 221,
+    223, 47, 118, 61, 168, 105, 32, 237, 236, 167, 213, 238, 202, 17, 170,
+    24, 226, 247, 131, 145, 6, 116, 117, 121, 11, 194, 41, 48, 126, 162, 13,
+    93, 209, 131, 154, 122, 237, 187, 103, 217, 99, 60, 200, 45, 78, 115, 69,
+    49, 106, 200, 194, 112, 60, 56, 234, 72, 251, 19, 120, 121, 182, 134, 215,
+    135, 10, 114, 2, 247, 46, 105, 209, 145, 165, 153, 191, 243, 12, 5, 36,
+    119, 206, 231, 231, 11, 32, 209, 83, 27, 229, 204, 149, 155, 83, 109, 35,
+    93, 223, 37, 84, 14, 142, 37, 160, 52, 191, 96, 40, 204, 101, 77, 67, 52,
+    53, 43, 63, 85, 253, 147, 113, 226, 96, 6, 125, 179, 115, 161, 17, 83,
+    198, 101, 98, 85, 139, 3, 137, 75, 99, 178, 23, 201, 255, 91, 253, 52,
+    134, 60, 138, 131, 208, 251, 101, 48, 2, 227, 228, 118, 132, 245, 202,
+    75, 91, 44, 160, 231, 47, 41, 50, 147, 220, 74, 92, 219, 165, 89, 16
+  };
+
+  // Expected result
+  static const uint8_t expected_dst[kDstSize] = {
+    117, 102, 74, 135, 42, 98, 175, 206, 70, 73, 222, 197, 50, 24, 39, 49, 38,
+    105, 90, 47, 169, 40, 171, 215, 200, 73, 109, 141, 53, 85, 177, 164, 79,
+    208, 124, 89, 212, 18, 81, 145, 151, 164, 217, 153, 91, 154, 102, 102,
+    159, 75, 164, 152, 136, 51, 213, 219, 186, 116, 193, 224, 186, 36, 231,
+    208, 84, 211, 155, 167, 35, 59, 42, 76, 216, 149, 73, 201, 78, 149, 184,
+    100, 96, 196, 189, 198, 188, 235, 195, 117, 129, 120, 129, 49, 25, 133,
+    113, 69, 221, 114, 70, 143, 99, 157, 108, 189, 140, 78, 6, 55, 65, 240,
+    255, 245, 184, 72, 90, 100, 116, 131, 39, 60, 234, 167, 33, 160, 88, 185,
+    200, 157, 159, 176, 127, 151, 138, 102, 168, 106, 170, 86, 82, 219, 189,
+    76, 33, 115, 197, 106, 96, 198, 136, 97, 141, 237, 151, 98, 137, 191,
+    185, 2, 57, 95, 142, 91, 255, 185, 97, 137, 76, 162, 94, 173, 131, 193,
+    161, 81, 106, 72, 135, 222, 234, 137, 66, 137, 106, 243, 210, 147, 95,
+    15, 137, 110, 85, 66, 16, 96, 167, 147, 150, 173, 203, 140, 118, 196,
+    84, 147, 160, 19, 95, 101, 123, 74, 132, 202, 82, 166, 12, 131, 166,
+    189, 170, 159, 85, 79, 66, 57, 152, 132, 203, 194, 0, 1, 56, 146, 180,
+    224, 156, 28, 83, 181, 79, 76, 80, 46, 160, 175, 59, 106, 43, 87, 75,
+    136, 85, 189, 46, 71, 200, 90
+  };
+
+  uint8_t *src = const_cast<uint8_t*>(test_data);
+
+  sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
+                  2, 2, dst_, kDstStride);
+
+  for (int i = 0; i < height_; ++i)
+    for (int j = 0; j < width_; ++j)
+      ASSERT_EQ(expected_dst[i * kDstStride + j], dst_[i * kDstStride + j])
+          << "i==" << (i * width_ + j);
+}
+
+using libvpx_test::ACMRandom;
+
+TEST_P(SixtapPredictTest, TestWithRandomData) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int i = 0; i < kSrcSize; ++i)
+    src_[i] = rnd.Rand8();
+
+  // Run tests for all possible offsets.
+  for (int xoffset = 0; xoffset < 8; ++xoffset) {
+    for (int yoffset = 0; yoffset < 8; ++yoffset) {
+      // Call c reference function.
+      // Move start point to next pixel to test if the function reads
+      // unaligned data correctly.
+      vp8_sixtap_predict16x16_c(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                                xoffset, yoffset, dst_c_, kDstStride);
+
+      // Run test.
+      sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                      xoffset, yoffset, dst_, kDstStride);
+
+      for (int i = 0; i < height_; ++i)
+        for (int j = 0; j < width_; ++j)
+          ASSERT_EQ(dst_c_[i * kDstStride + j], dst_[i * kDstStride + j])
+              << "i==" << (i * width_ + j);
+    }
+  }
+}
+
+using std::tr1::make_tuple;
+
+const sixtap_predict_fn_t sixtap_16x16_c = vp8_sixtap_predict16x16_c;
+const sixtap_predict_fn_t sixtap_8x8_c = vp8_sixtap_predict8x8_c;
+const sixtap_predict_fn_t sixtap_8x4_c = vp8_sixtap_predict8x4_c;
+const sixtap_predict_fn_t sixtap_4x4_c = vp8_sixtap_predict4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_c),
+        make_tuple(8, 8, sixtap_8x8_c),
+        make_tuple(8, 4, sixtap_8x4_c),
+        make_tuple(4, 4, sixtap_4x4_c)));
+#if HAVE_MMX
+const sixtap_predict_fn_t sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx;
+const sixtap_predict_fn_t sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx;
+const sixtap_predict_fn_t sixtap_8x4_mmx = vp8_sixtap_predict8x4_mmx;
+const sixtap_predict_fn_t sixtap_4x4_mmx = vp8_sixtap_predict4x4_mmx;
+INSTANTIATE_TEST_CASE_P(
+    MMX, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_mmx),
+        make_tuple(8, 8, sixtap_8x8_mmx),
+        make_tuple(8, 4, sixtap_8x4_mmx),
+        make_tuple(4, 4, sixtap_4x4_mmx)));
+#endif
+#if HAVE_SSE2
+const sixtap_predict_fn_t sixtap_16x16_sse2 = vp8_sixtap_predict16x16_sse2;
+const sixtap_predict_fn_t sixtap_8x8_sse2 = vp8_sixtap_predict8x8_sse2;
+const sixtap_predict_fn_t sixtap_8x4_sse2 = vp8_sixtap_predict8x4_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_sse2),
+        make_tuple(8, 8, sixtap_8x8_sse2),
+        make_tuple(8, 4, sixtap_8x4_sse2)));
+#endif
+#if HAVE_SSSE3
+const sixtap_predict_fn_t sixtap_16x16_ssse3 = vp8_sixtap_predict16x16_ssse3;
+const sixtap_predict_fn_t sixtap_8x8_ssse3 = vp8_sixtap_predict8x8_ssse3;
+const sixtap_predict_fn_t sixtap_8x4_ssse3 = vp8_sixtap_predict8x4_ssse3;
+const sixtap_predict_fn_t sixtap_4x4_ssse3 = vp8_sixtap_predict4x4_ssse3;
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_ssse3),
+        make_tuple(8, 8, sixtap_8x8_ssse3),
+        make_tuple(8, 4, sixtap_8x4_ssse3),
+        make_tuple(4, 4, sixtap_4x4_ssse3)));
+#endif
+}  // namespace
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@ -0,0 +1,113 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+extern "C" {
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vp8/encoder/block.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+typedef void (*subtract_b_fn_t)(BLOCK *be, BLOCKD *bd, int pitch);
+
+namespace {
+
+class SubtractBlockTest : public ::testing::TestWithParam<subtract_b_fn_t> {};
+
+using libvpx_test::ACMRandom;
+
+TEST_P(SubtractBlockTest, SimpleSubtract) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  BLOCK be;
+  BLOCKD bd;
+  // in libvpx, this stride is always 16
+  const int kDiffPredStride = 16;
+  const int kSrcStride[] = {32, 16, 8, 4, 0};
+  const int kBlockWidth = 4;
+  const int kBlockHeight = 4;
+
+  // Allocate... align to 16 for mmx/sse tests
+  uint8_t *source = reinterpret_cast<uint8_t*>(
+      vpx_memalign(16, kBlockHeight * kSrcStride[0] * sizeof(*source)));
+  be.src_diff = reinterpret_cast<int16_t*>(
+      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*be.src_diff)));
+  bd.predictor = reinterpret_cast<unsigned char*>(
+      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));
+
+  for(int i = 0; kSrcStride[i] > 0; ++i) {
+    // start at block0
+    be.src = 0;
+    be.base_src = &source;
+    be.src_stride = kSrcStride[i];
+
+    // set difference
+    int16_t *src_diff = be.src_diff;
+    for (int r = 0; r < kBlockHeight; ++r) {
+      for (int c = 0; c < kBlockWidth; ++c) {
+        src_diff[c] = 0xa5a5;
+      }
+      src_diff += kDiffPredStride;
+    }
+
+    // set destination
+    uint8_t *base_src = *be.base_src;
+    for (int r = 0; r < kBlockHeight; ++r) {
+      for (int c = 0; c < kBlockWidth; ++c) {
+        base_src[c] = rnd.Rand8();
+      }
+      base_src += be.src_stride;
+    }
+
+    // set predictor
+    uint8_t *predictor = bd.predictor;
+    for (int r = 0; r < kBlockHeight; ++r) {
+      for (int c = 0; c < kBlockWidth; ++c) {
+        predictor[c] = rnd.Rand8();
+      }
+      predictor += kDiffPredStride;
+    }
+
+    GetParam()(&be, &bd, kDiffPredStride);
+
+    base_src = *be.base_src;
+    src_diff = be.src_diff;
+    predictor = bd.predictor;
+    for (int r = 0; r < kBlockHeight; ++r) {
+      for (int c = 0; c < kBlockWidth; ++c) {
+        EXPECT_EQ(base_src[c], (src_diff[c] + predictor[c])) << "r = " << r
+                                                             << ", c = " << c;
+      }
+      src_diff += kDiffPredStride;
+      predictor += kDiffPredStride;
+      base_src += be.src_stride;
+    }
+  }
+  vpx_free(be.src_diff);
+  vpx_free(source);
+  vpx_free(bd.predictor);
+}
+
+INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
+                        ::testing::Values(vp8_subtract_b_c));
+
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(MMX, SubtractBlockTest,
+                        ::testing::Values(vp8_subtract_b_mmx));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, SubtractBlockTest,
+                        ::testing::Values(vp8_subtract_b_sse2));
+#endif
+
+}  // namespace
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@ -1 +1,123 @@
 d5dfb0151c9051f8c85999255645d7a23916d3c0  hantro_collage_w352h288.yuv
+5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
+65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
+906b4c1e99eb734504c504b3f1ad8052137ce672  vp80-00-comprehensive-003.ivf
+ec144b1af53af895db78355785650b96dd3f0ade  vp80-00-comprehensive-004.ivf
+afc7091785c62f1c121c4554a2830c30704587d9  vp80-00-comprehensive-005.ivf
+42ea9d55c818145d06a9b633b8e85c6a6164fd3e  vp80-00-comprehensive-006.ivf
+e5b3a73ab79fe024c14309d653d6bed92902ee3b  vp80-00-comprehensive-007.ivf
+f3c50a58875930adfb84525c0ef59d7e4c08540c  vp80-00-comprehensive-008.ivf
+4b2841fdb83db51ae322096ae468bbb9dc2c8362  vp80-00-comprehensive-009.ivf
+efbff736e3a91ab6a98c5bc2dce65d645944c7b1  vp80-00-comprehensive-010.ivf
+6b315102cae008d22a3d2c231be92cb704a222f8  vp80-00-comprehensive-011.ivf
+f3214a4fea14c2d5ec689936c1613f274c859ee8  vp80-00-comprehensive-012.ivf
+e4094e96d308c8a35b74c480a43d853c5294cd34  vp80-00-comprehensive-013.ivf
+5b0adfaf60a69e0aaf3ec021a39d0a68fc0e1b5a  vp80-00-comprehensive-014.ivf
+e8467688ddf26b5000664f904faf0d70506aa653  vp80-00-comprehensive-015.ivf
+aab55582337dfd2a39ff54fb2576a91910d49337  vp80-00-comprehensive-016.ivf
+1ba24724f80203c9bae4f1d0f99d534721980016  vp80-00-comprehensive-017.ivf
+143a15512b46f436280ddb4d0e6411eb4af434f2  vp80-00-comprehensive-018.ivf
+c5baeaf5714fdfb3a8bc960a8e33ac438e83b16b  vp80-01-intra-1400.ivf
+f383955229afe3408453e316d11553d923ca60d5  vp80-01-intra-1411.ivf
+84e1f4343f174c9f3c83f834bac3196fb325bf2c  vp80-01-intra-1416.ivf
+fb6e712a47dd57a28a3727d2ae2c97a8b7c7ca51  vp80-01-intra-1417.ivf
+71ea772d3e9d315b8cbecf41207b8a237c34853b  vp80-02-inter-1402.ivf
+d85dbc4271525dcd128c503f936fe69091d1f8d0  vp80-02-inter-1412.ivf
+d4e5d3ad56511867d025f93724d090f92ba6ec3d  vp80-02-inter-1418.ivf
+91791cbcc37c60f35dbd8090bacb54e5ec6dd4fa  vp80-02-inter-1424.ivf
+17fbfe2fea70f6e2f3fa6ca4efaae6c0b03b5f02  vp80-03-segmentation-01.ivf
+3c3600dbbcde08e20d54c66fe3b7eadd4f09bdbb  vp80-03-segmentation-02.ivf
+c156778d5340967d4b369c490848076e92f1f875  vp80-03-segmentation-03.ivf
+d25dcff6c60e87a1af70945b8911b6b4998533b0  vp80-03-segmentation-04.ivf
+362baba2ce454c9db21218f35e81c27a5ed0b730  vp80-03-segmentation-1401.ivf
+d223ae7ee748ce07e74c4679bfd219e84aa9f4b0  vp80-03-segmentation-1403.ivf
+033adf7f3a13836a3f1cffcb87c1972900f2b5c6  vp80-03-segmentation-1407.ivf
+4d51dfbf9f3e2c590ec99d1d6f59dd731d04375f  vp80-03-segmentation-1408.ivf
+f37a62b197c2600d75e0ccfbb31b60efdedac251  vp80-03-segmentation-1409.ivf
+eb25bd7bfba5b2f6935018a930f42d123b1e7fcd  vp80-03-segmentation-1410.ivf
+b9d5c436663a30c27cfff84b53a002e501258843  vp80-03-segmentation-1413.ivf
+6da92b9d1a180cc3a8afe348ab12258f5a37be1a  vp80-03-segmentation-1414.ivf
+a4f5842602886bd669f115f93d8a35c035cb0948  vp80-03-segmentation-1415.ivf
+f295dceb8ef278b77251b3f9df8aee22e161d547  vp80-03-segmentation-1425.ivf
+198dbf9f36f733200e432664cc8c5752d59779de  vp80-03-segmentation-1426.ivf
+7704804e32f5de976803929934a7fafe101ac7b0  vp80-03-segmentation-1427.ivf
+831ccd862ea95ca025d2f3bd8b88678752f5416d  vp80-03-segmentation-1432.ivf
+b3c11978529289f9109f2766fcaba3ebc40e11ef  vp80-03-segmentation-1435.ivf
+a835a731f5520ebfc1002c40121264d0020559ac  vp80-03-segmentation-1436.ivf
+1d1732942f773bb2a5775fcb9689b1579ce28eab  vp80-03-segmentation-1437.ivf
+db04799adfe089dfdf74dbd43cc05ede7161f99e  vp80-03-segmentation-1441.ivf
+7caf39b3f20cfd52b998210878062e52a5edf1e6  vp80-03-segmentation-1442.ivf
+3607f6bb4ee106c38fa1ea370dc4ff8b8cde2261  vp80-04-partitions-1404.ivf
+93cc323b6b6867f1b12dd48773424549c6960a6b  vp80-04-partitions-1405.ivf
+047eedb14b865bdac8a3538e63801054e0295e9c  vp80-04-partitions-1406.ivf
+0f1233bd2bc33f56ce5e495dbd455d122339f384  vp80-05-sharpness-1428.ivf
+51767fc136488a9535c2a4c38067c542ee2048df  vp80-05-sharpness-1429.ivf
+9805aa107672de25d6fb8c35e20d06deca5efe18  vp80-05-sharpness-1430.ivf
+61db6b965f9c27aebe71b85bf2d5877e58e4bbdf  vp80-05-sharpness-1431.ivf
+10420d266290d2923555f84af38eeb96edbd3ae8  vp80-05-sharpness-1433.ivf
+3ed24f9a80cddfdf75824ba95cdb4ff9286cb443  vp80-05-sharpness-1434.ivf
+c87599cbecd72d4cd4f7ace3313b7a6bc6eb8163  vp80-05-sharpness-1438.ivf
+aff51d865c2621b60510459244ea83e958e4baed  vp80-05-sharpness-1439.ivf
+da386e72b19b5485a6af199c5eb60ef25e510dd1  vp80-05-sharpness-1440.ivf
+6759a095203d96ccd267ce09b1b050b8cc4c2f1f  vp80-05-sharpness-1443.ivf
+db55ec7fd02c864ba996ff060b25b1e08611330b  vp80-00-comprehensive-001.ivf.md5
+29db0ad011cba1e45f856d5623cd38dac3e3bf19  vp80-00-comprehensive-002.ivf.md5
+e84f258f69e173e7d68f8f8c037a0a3766902182  vp80-00-comprehensive-003.ivf.md5
+eb7912eaf69559a16fd82bc3f5fb1524cf4a4466  vp80-00-comprehensive-004.ivf.md5
+4206f71c94894bd5b5b376f6c09b3817dbc65206  vp80-00-comprehensive-005.ivf.md5
+4f89b356f6f2fecb928f330a10f804f00f5325f5  vp80-00-comprehensive-006.ivf.md5
+2813236a32964dd8007e17648bcf035a20fcda6c  vp80-00-comprehensive-007.ivf.md5
+10746c72098f872803c900e17c5680e451f5f498  vp80-00-comprehensive-008.ivf.md5
+39a23d0692ce64421a7bb7cdf6ccec5928d37fff  vp80-00-comprehensive-009.ivf.md5
+f6e3de8931a0cc659bda8fbc14050346955e72d4  vp80-00-comprehensive-010.ivf.md5
+101683ec195b6e944f7cd1e468fc8921439363e6  vp80-00-comprehensive-011.ivf.md5
+1f592751ce46d8688998fa0fa4fbdcda0fd4058c  vp80-00-comprehensive-012.ivf.md5
+6066176f90ca790251e795fca1a5797d59999841  vp80-00-comprehensive-013.ivf.md5
+2656da94ba93691f23edc4d60b3a09e2be46c217  vp80-00-comprehensive-014.ivf.md5
+c6e0d5f5d61460c8ac8edfa4e701f10312c03133  vp80-00-comprehensive-015.ivf.md5
+ee60fee501d8493e34e8d6a1fe315b51ed09b24a  vp80-00-comprehensive-016.ivf.md5
+9f1914ceffcad4546c0a29de3ef591d8bea304dc  vp80-00-comprehensive-017.ivf.md5
+e0305178fe288a9fd8082b39e2d03181edb19054  vp80-00-comprehensive-018.ivf.md5
+612494da2fa799cc9d76dcdd835ae6c7cb2e5c05  vp80-01-intra-1400.ivf.md5
+48ea06097ac8269c5e8c2131d3d0639f431fcf0e  vp80-01-intra-1411.ivf.md5
+6e2ab4e7677ad0ba868083ca6bc387ee922b400c  vp80-01-intra-1416.ivf.md5
+eca0a90348959ce3854142f8d8641b13050e8349  vp80-01-intra-1417.ivf.md5
+920feea203145d5c2258a91c4e6991934a79a99e  vp80-02-inter-1402.ivf.md5
+f71d97909fe2b3dd65be7e1f56c72237f0cef200  vp80-02-inter-1412.ivf.md5
+e911254569a30bbb2a237ff8b79f69ed9da0672d  vp80-02-inter-1418.ivf.md5
+58c789c50c9bb9cc90580bed291164a0939d28ba  vp80-02-inter-1424.ivf.md5
+ff3e2f441327b9c20a0b37c524e0f5a48a36de7b  vp80-03-segmentation-01.ivf.md5
+0791f417f076a542ae66fbc3426ab4d94cbd6c75  vp80-03-segmentation-02.ivf.md5
+722e50f1a6a91c34302d68681faffc1c26d1cc57  vp80-03-segmentation-03.ivf.md5
+c701f1885bcfb27fb8e70cc65606b289172ef889  vp80-03-segmentation-04.ivf.md5
+f79bc9ec189a2b4807632a3d0c5bf04a178b5300  vp80-03-segmentation-1401.ivf.md5
+b9aa4c74c0219b639811c44760d0b24cd8bb436a  vp80-03-segmentation-1403.ivf.md5
+70d5a2207ca1891bcaebd5cf6dd88ce8d57b4334  vp80-03-segmentation-1407.ivf.md5
+265f962ee781531f9a93b9309461316fd32b2a1d  vp80-03-segmentation-1408.ivf.md5
+0c4ecbbd6dc042d30e626d951b65f460dd6cd563  vp80-03-segmentation-1409.ivf.md5
+cf779af36a937f06570a0fca9db64ba133451dee  vp80-03-segmentation-1410.ivf.md5
+0e6c5036d51ab078842f133934926c598a9cff02  vp80-03-segmentation-1413.ivf.md5
+eb3930aaf229116c80d507516c34759c3f6cdf69  vp80-03-segmentation-1414.ivf.md5
+123d6c0f72ee87911c4ae7538e87b7d163b22d6c  vp80-03-segmentation-1415.ivf.md5
+e70551d1a38920e097a5d8782390b79ecaeb7505  vp80-03-segmentation-1425.ivf.md5
+44e8f4117e46dbb302b2cfd81171cc1a1846e431  vp80-03-segmentation-1426.ivf.md5
+52636e54aee5f95bbace37021bd67de5db767e9a  vp80-03-segmentation-1427.ivf.md5
+b1ad3eff20215c28e295b15ef3636ed926d59cba  vp80-03-segmentation-1432.ivf.md5
+24c22a552fa28a90e5978f67f57181cc2d7546d7  vp80-03-segmentation-1435.ivf.md5
+96c49c390abfced18a7a8c9b9ea10af778e10edb  vp80-03-segmentation-1436.ivf.md5
+f95eb6214571434f1f73ab7833b9ccdf47588020  vp80-03-segmentation-1437.ivf.md5
+1c0700ca27c9b0090a7747a4b0b4dc21d1843181  vp80-03-segmentation-1441.ivf.md5
+81d4f23ca32667ee958bae579c8f5e97ba72eb97  vp80-03-segmentation-1442.ivf.md5
+272efcef07a3a30fbca51bfd566063d8258ec0be  vp80-04-partitions-1404.ivf.md5
+66ed219ab812ac801b256d35cf495d193d4cf478  vp80-04-partitions-1405.ivf.md5
+36083f37f56f502bd60ec5e07502ee9e6b8699b0  vp80-04-partitions-1406.ivf.md5
+6ca909bf168a64c09415626294665dc1be3d1973  vp80-05-sharpness-1428.ivf.md5
+1667d2ee2334e5fdea8a8a866f4ccf3cf76f033a  vp80-05-sharpness-1429.ivf.md5
+71bcbe5357d36a19df5b07fbe3e27bffa8893f0a  vp80-05-sharpness-1430.ivf.md5
+89a09b1dffce2d55770a89e58d9925c70ef79bf8  vp80-05-sharpness-1431.ivf.md5
+08444a18b4e6ba3450c0796dd728d48c399a2dc9  vp80-05-sharpness-1433.ivf.md5
+6d6223719a90c13e848aa2a8a6642098cdb5977a  vp80-05-sharpness-1434.ivf.md5
+41d70bb5fa45bc88da1604a0af466930b8dd77b5  vp80-05-sharpness-1438.ivf.md5
+086c56378df81b6cee264d7540a7b8f2b405c7a4  vp80-05-sharpness-1439.ivf.md5
+d32dc2c4165eb266ea4c23c14a45459b363def32  vp80-05-sharpness-1440.ivf.md5
+8c69dc3d8e563f56ffab5ad1e400d9e689dd23df  vp80-05-sharpness-1443.ivf.md5
--- a/test/test.mk
+++ b/test/test.mk
@ -1,10 +1,186 @@
-LIBVPX_TEST_SRCS-yes += test.mk
 LIBVPX_TEST_SRCS-yes += acm_random.h
-LIBVPX_TEST_SRCS-yes += boolcoder_test.cc
-LIBVPX_TEST_SRCS-yes += dct16x16_test.cc
-LIBVPX_TEST_SRCS-yes += fdct4x4_test.cc
-LIBVPX_TEST_SRCS-yes += fdct8x8_test.cc
-LIBVPX_TEST_SRCS-yes += idct8x8_test.cc
+LIBVPX_TEST_SRCS-yes += test.mk
 LIBVPX_TEST_SRCS-yes += test_libvpx.cc
+LIBVPX_TEST_SRCS-yes += util.h
+LIBVPX_TEST_SRCS-yes += video_source.h

-LIBVPX_TEST_DATA-yes += hantro_collage_w352h288.yuv
+##
+## BLACK BOX TESTS
+##
+## Black box tests only use the public API.
+##
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += datarate_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += error_resilience_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
+
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += decode_test_driver.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += decode_test_driver.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += ivf_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += test_vector_test.cc
+##
+## WHITE BOX TESTS
+##
+## Whitebox tests invoke functions not exposed via the public API. Certain
+## shared library builds don't make these functions accessible.
+##
+ifeq ($(CONFIG_SHARED),)
+
+# These tests require both the encoder and decoder to be built.
+ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
+LIBVPX_TEST_SRCS-yes                   += boolcoder_test.cc
+endif
+
+LIBVPX_TEST_SRCS-yes                   += idctllm_test.cc
+LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
+LIBVPX_TEST_SRCS-yes                   += sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
+LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
+
+# VP9 tests
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
+ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
+LIBVPX_TEST_SRCS-yes += idct8x8_test.cc
+endif
+
+endif
+
+
+##
+## TEST DATA
+##
+LIBVPX_TEST_DATA-$(CONFIG_VP8_ENCODER) += hantro_collage_w352h288.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@ -26,7 +26,7 @@ int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);

 #if ARCH_X86 || ARCH_X86_64
-  int simd_caps = x86_simd_caps();
+  const int simd_caps = x86_simd_caps();
  if (!(simd_caps & HAS_MMX))
    append_gtest_filter(":-MMX/*");
  if (!(simd_caps & HAS_SSE))
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@ -0,0 +1,144 @@
+/*
+ Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+
+ Use of this source code is governed by a BSD-style license
+ that can be found in the LICENSE file in the root of the source
+ tree. An additional intellectual property rights grant can be found
+ in the file PATENTS.  All contributing project authors may
+ be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+extern "C" {
+#include "./md5_utils.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+#if defined(_MSC_VER)
+#define snprintf sprintf_s
+#endif
+
+namespace {
+// There are 61 test vectors in total.
+const char *kTestVectors[] = {
+  "vp80-00-comprehensive-001.ivf",
+  "vp80-00-comprehensive-002.ivf", "vp80-00-comprehensive-003.ivf",
+  "vp80-00-comprehensive-004.ivf", "vp80-00-comprehensive-005.ivf",
+  "vp80-00-comprehensive-006.ivf", "vp80-00-comprehensive-007.ivf",
+  "vp80-00-comprehensive-008.ivf", "vp80-00-comprehensive-009.ivf",
+  "vp80-00-comprehensive-010.ivf", "vp80-00-comprehensive-011.ivf",
+  "vp80-00-comprehensive-012.ivf", "vp80-00-comprehensive-013.ivf",
+  "vp80-00-comprehensive-014.ivf", "vp80-00-comprehensive-015.ivf",
+  "vp80-00-comprehensive-016.ivf", "vp80-00-comprehensive-017.ivf",
+  "vp80-00-comprehensive-018.ivf", "vp80-01-intra-1400.ivf",
+  "vp80-01-intra-1411.ivf", "vp80-01-intra-1416.ivf",
+  "vp80-01-intra-1417.ivf", "vp80-02-inter-1402.ivf",
+  "vp80-02-inter-1412.ivf", "vp80-02-inter-1418.ivf",
+  "vp80-02-inter-1424.ivf", "vp80-03-segmentation-01.ivf",
+  "vp80-03-segmentation-02.ivf", "vp80-03-segmentation-03.ivf",
+  "vp80-03-segmentation-04.ivf", "vp80-03-segmentation-1401.ivf",
+  "vp80-03-segmentation-1403.ivf", "vp80-03-segmentation-1407.ivf",
+  "vp80-03-segmentation-1408.ivf", "vp80-03-segmentation-1409.ivf",
+  "vp80-03-segmentation-1410.ivf", "vp80-03-segmentation-1413.ivf",
+  "vp80-03-segmentation-1414.ivf", "vp80-03-segmentation-1415.ivf",
+  "vp80-03-segmentation-1425.ivf", "vp80-03-segmentation-1426.ivf",
+  "vp80-03-segmentation-1427.ivf", "vp80-03-segmentation-1432.ivf",
+  "vp80-03-segmentation-1435.ivf", "vp80-03-segmentation-1436.ivf",
+  "vp80-03-segmentation-1437.ivf", "vp80-03-segmentation-1441.ivf",
+  "vp80-03-segmentation-1442.ivf", "vp80-04-partitions-1404.ivf",
+  "vp80-04-partitions-1405.ivf", "vp80-04-partitions-1406.ivf",
+  "vp80-05-sharpness-1428.ivf", "vp80-05-sharpness-1429.ivf",
+  "vp80-05-sharpness-1430.ivf", "vp80-05-sharpness-1431.ivf",
+  "vp80-05-sharpness-1433.ivf", "vp80-05-sharpness-1434.ivf",
+  "vp80-05-sharpness-1438.ivf", "vp80-05-sharpness-1439.ivf",
+  "vp80-05-sharpness-1440.ivf", "vp80-05-sharpness-1443.ivf"
+};
+
+class TestVectorTest : public libvpx_test::DecoderTest,
+    public ::testing::TestWithParam<const char*> {
+ protected:
+  TestVectorTest() : md5_file_(NULL) {}
+
+  virtual ~TestVectorTest() {
+    if (md5_file_)
+      fclose(md5_file_);
+  }
+
+  void OpenMD5File(const std::string& md5_file_name_) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_) << "Md5 file open failed. Filename: "
+        << md5_file_name_;
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t& img,
+                                     const unsigned int frame_number) {
+    char expected_md5[33];
+    char junk[128];
+
+    // Read correct md5 checksums.
+    const int res = fscanf(md5_file_, "%s  %s", expected_md5, junk);
+    ASSERT_NE(res, EOF) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    MD5Context md5;
+    MD5Init(&md5);
+
+    // Compute and update md5 for each raw in decompressed data.
+    for (int plane = 0; plane < 3; ++plane) {
+      uint8_t *buf = img.planes[plane];
+
+      for (unsigned int y = 0; y < (plane ? (img.d_h + 1) >> 1 : img.d_h);
+           ++y) {
+        MD5Update(&md5, buf, (plane ? (img.d_w + 1) >> 1 : img.d_w));
+        buf += img.stride[plane];
+      }
+    }
+
+    uint8_t md5_sum[16];
+    MD5Final(md5_sum, &md5);
+
+    char actual_md5[33];
+    // Convert to get the actual md5.
+    for (int i = 0; i < 16; i++) {
+      snprintf(&actual_md5[i * 2], sizeof(actual_md5) - i * 2, "%02x",
+               md5_sum[i]);
+    }
+    actual_md5[32] = '\0';
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+  }
+
+ private:
+  FILE *md5_file_;
+};
+
+// This test runs through the whole set of test vectors, and decodes them.
+// The md5 checksums are computed for each frame in the video file. If md5
+// checksums match the correct md5 data, then the test is passed. Otherwise,
+// the test failed.
+TEST_P(TestVectorTest, MD5Match) {
+  const std::string filename = GetParam();
+  // Open compressed video file.
+  libvpx_test::IVFVideoSource video(filename);
+
+  video.Init();
+
+  // Construct md5 file name.
+  const std::string md5_filename = filename + ".md5";
+  OpenMD5File(md5_filename);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+INSTANTIATE_TEST_CASE_P(TestVectorSequence, TestVectorTest,
+                        ::testing::ValuesIn(kTestVectors));
+
+}  // namespace
--- a/test/util.h
+++ b/test/util.h
@ -0,0 +1,18 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_UTIL_H_
+#define TEST_UTIL_H_
+
+// Macros
+#define PARAMS(...) ::testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
+#define GET_PARAM(k) std::tr1::get< k >(GetParam())
+
+#endif  // TEST_UTIL_H_
--- a/test/video_source.h
+++ b/test/video_source.h
@ -0,0 +1,175 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_VIDEO_SOURCE_H_
+#define TEST_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "test/acm_random.h"
+#include "vpx/vpx_encoder.h"
+
+namespace libvpx_test {
+
+static FILE *OpenTestDataFile(const std::string& file_name) {
+  std::string path_to_source = file_name;
+  const char *kDataPath = getenv("LIBVPX_TEST_DATA_PATH");
+
+  if (kDataPath) {
+    path_to_source = kDataPath;
+    path_to_source += "/";
+    path_to_source += file_name;
+  }
+
+  return fopen(path_to_source.c_str(), "rb");
+}
+
+// Abstract base class for test video sources, which provide a stream of
+// vpx_image_t images with associated timestamps and duration.
+class VideoSource {
+ public:
+  virtual ~VideoSource() {}
+
+  // Prepare the stream for reading, rewind/open as necessary.
+  virtual void Begin() = 0;
+
+  // Advance the cursor to the next frame
+  virtual void Next() = 0;
+
+  // Get the current video frame, or NULL on End-Of-Stream.
+  virtual vpx_image_t *img() const = 0;
+
+  // Get the presentation timestamp of the current frame.
+  virtual vpx_codec_pts_t pts() const = 0;
+
+  // Get the current frame's duration
+  virtual unsigned long duration() const = 0;
+
+  // Get the timebase for the stream
+  virtual vpx_rational_t timebase() const = 0;
+
+  // Get the current frame counter, starting at 0.
+  virtual unsigned int frame() const = 0;
+
+  // Get the current file limit.
+  virtual unsigned int limit() const = 0;
+};
+
+
+class DummyVideoSource : public VideoSource {
+ public:
+  DummyVideoSource() : img_(NULL), limit_(100), width_(0), height_(0) {
+    SetSize(80, 64);
+  }
+
+  virtual ~DummyVideoSource() { vpx_img_free(img_); }
+
+  virtual void Begin() {
+    frame_ = 0;
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual vpx_image_t *img() const {
+    return (frame_ < limit_) ? img_ : NULL;
+  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = {1, 30};
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  void SetSize(unsigned int width, unsigned int height) {
+    if (width != width_ || height != height_) {
+      vpx_img_free(img_);
+      raw_sz_ = ((width + 31)&~31) * height * 3 / 2;
+      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 32);
+      width_ = width;
+      height_ = height;
+    }
+  }
+
+ protected:
+  virtual void FillFrame() { memset(img_->img_data, 0, raw_sz_); }
+
+  vpx_image_t *img_;
+  size_t       raw_sz_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+};
+
+
+class RandomVideoSource : public DummyVideoSource {
+ public:
+  RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
+      : rnd_(seed),
+        seed_(seed) { }
+
+ protected:
+  // Reset the RNG to get a matching stream for the second pass
+  virtual void Begin() {
+    frame_ = 0;
+    rnd_.Reset(seed_);
+    FillFrame();
+  }
+
+  // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
+  // than holding previous frames to encourage keyframes to be thrown.
+  virtual void FillFrame() {
+    if (frame_ % 30 < 15)
+      for (size_t i = 0; i < raw_sz_; ++i)
+        img_->img_data[i] = rnd_.Rand8();
+    else
+      memset(img_->img_data, 0, raw_sz_);
+  }
+
+  ACMRandom rnd_;
+  int seed_;
+};
+
+// Abstract base class for test video sources, which provide a stream of
+// decompressed images to the decoder.
+class CompressedVideoSource {
+ public:
+  virtual ~CompressedVideoSource() {}
+
+  virtual void Init() = 0;
+
+  // Prepare the stream for reading, rewind/open as necessary.
+  virtual void Begin() = 0;
+
+  // Advance the cursor to the next frame
+  virtual void Next() = 0;
+
+  virtual const uint8_t *cxdata() const = 0;
+
+  virtual const unsigned int frame_size() const = 0;
+
+  virtual const unsigned int frame_number() const = 0;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_VIDEO_SOURCE_H_
--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@ -0,0 +1,169 @@
+/*
+*  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+*
+*  Use of this source code is governed by a BSD-style license
+*  that can be found in the LICENSE file in the root of the source
+*  tree. An additional intellectual property rights grant can be found
+*  in the file PATENTS.  All contributing project authors may
+*  be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+
+extern "C" {
+#include "vpx_rtcd.h"
+}
+
+#include "test/acm_random.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_integer.h"
+
+
+namespace {
+
+const int cospi8sqrt2minus1 = 20091;
+const int sinpi8sqrt2 = 35468;
+
+void reference_idct4x4(const int16_t *input, int16_t *output) {
+  const int16_t *ip = input;
+  int16_t *op = output;
+
+  for (int i = 0; i < 4; ++i) {
+    const int a1 = ip[0] + ip[8];
+    const int b1 = ip[0] - ip[8];
+    const int temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+    const int temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+    const int c1 = temp1 - temp2;
+    const int temp3 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+    const int temp4 = (ip[12] * sinpi8sqrt2) >> 16;
+    const int d1 = temp3 + temp4;
+    op[0] = a1 + d1;
+    op[12] = a1 - d1;
+    op[4] = b1 + c1;
+    op[8] = b1 - c1;
+    ++ip;
+    ++op;
+  }
+  ip = output;
+  op = output;
+  for (int i = 0; i < 4; ++i) {
+    const int a1 = ip[0] + ip[2];
+    const int b1 = ip[0] - ip[2];
+    const int temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+    const int temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+    const int c1 = temp1 - temp2;
+    const int temp3 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+    const int temp4 = (ip[3] * sinpi8sqrt2) >> 16;
+    const int d1 = temp3 + temp4;
+    op[0] = (a1 + d1 + 4) >> 3;
+    op[3] = (a1 - d1 + 4) >> 3;
+    op[1] = (b1 + c1 + 4) >> 3;
+    op[2] = (b1 - c1 + 4) >> 3;
+    ip += 4;
+    op += 4;
+  }
+}
+
+using libvpx_test::ACMRandom;
+
+TEST(Vp8FdctTest, SignBiasCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int16_t test_input_block[16];
+  int16_t test_output_block[16];
+  const int pitch = 8;
+  int count_sign_block[16][2];
+  const int count_test_block = 1000000;
+
+  memset(count_sign_block, 0, sizeof(count_sign_block));
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 16; ++j)
+      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+
+    for (int j = 0; j < 16; ++j) {
+      if (test_output_block[j] < 0)
+        ++count_sign_block[j][0];
+      else if (test_output_block[j] > 0)
+        ++count_sign_block[j][1];
+    }
+  }
+
+  bool bias_acceptable = true;
+  for (int j = 0; j < 16; ++j)
+    bias_acceptable = bias_acceptable &&
+    (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000);
+
+  EXPECT_EQ(true, bias_acceptable)
+    << "Error: 4x4 FDCT has a sign bias > 1% for input range [-255, 255]";
+
+  memset(count_sign_block, 0, sizeof(count_sign_block));
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-15, 15].
+    for (int j = 0; j < 16; ++j)
+      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
+
+    vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+
+    for (int j = 0; j < 16; ++j) {
+      if (test_output_block[j] < 0)
+        ++count_sign_block[j][0];
+      else if (test_output_block[j] > 0)
+        ++count_sign_block[j][1];
+    }
+  }
+
+  bias_acceptable = true;
+  for (int j = 0; j < 16; ++j)
+    bias_acceptable = bias_acceptable &&
+    (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000);
+
+  EXPECT_EQ(true, bias_acceptable)
+    << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]";
+};
+
+TEST(Vp8FdctTest, RoundTripErrorCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int max_error = 0;
+  double total_error = 0;
+  const int count_test_block = 1000000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t test_input_block[16];
+    int16_t test_temp_block[16];
+    int16_t test_output_block[16];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 16; ++j)
+      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    const int pitch = 8;
+    vp8_short_fdct4x4_c(test_input_block, test_temp_block, pitch);
+    reference_idct4x4(test_temp_block, test_output_block);
+
+    for (int j = 0; j < 16; ++j) {
+      const int diff = test_input_block[j] - test_output_block[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  EXPECT_GE(1, max_error )
+    << "Error: FDCT/IDCT has an individual roundtrip error > 1";
+
+  EXPECT_GE(count_test_block, total_error)
+    << "Error: FDCT/IDCT has average roundtrip error > 1 per block";
+};
+
+}  // namespace
--- a/third_party/libyuv/source/scale.c
+++ b/third_party/libyuv/source/scale.c
@ -60,7 +60,7 @@ void SetUseReferenceImpl(int use) {

 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
 #define HAS_SCALEROWDOWN2_NEON
-void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
+void ScaleRowDown2_NEON(const uint8* src_ptr, int  src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    "1:                                        \n"
@ -102,7 +102,7 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
 }

 #define HAS_SCALEROWDOWN4_NEON
-static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
+static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1:                                        \n"
@ -160,7 +160,7 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
 // Down scale from 4 to 3 pixels.  Use the neon multilane read/write
 //  to load up the every 4th pixel into a 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */,
+static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1:                                        \n"
@ -284,7 +284,7 @@ const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };

 // 32 -> 12
-static void ScaleRowDown38_NEON(const uint8* src_ptr, int,
+static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u8      {q3}, [%3]                   \n"
--- a/tools/all_builds.py
+++ b/tools/all_builds.py
@ -5,7 +5,7 @@ import subprocess
 import sys

 LONG_OPTIONS = ["shard=", "shards="]
-BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental"
+BASE_COMMAND = "./configure --disable-vp8 --disable-unit-tests --enable-internal-stats --enable-experimental"

 def RunCommand(command):
  run = subprocess.Popen(command, shell=True)
--- a/usage.dox
+++ b/usage.dox
@ -1,6 +1,6 @@
 /*!\page usage Usage

-    The vpx Multi-Format codec SDK provides a unified interface amongst its
+    The vpx multi-format codec SDK provides a unified interface amongst its
    supported codecs. This abstraction allows applications using this SDK to
    easily support multiple video formats with minimal code duplication or
    "special casing." This section describes the interface common to all codecs.
@ -14,8 +14,12 @@

    Fore more information on decoder and encoder specific usage, see the
    following pages:
-    \if decoder - \subpage usage_decode \endif
-    \if decoder - \subpage usage_encode \endif
+    \if decoder
+    - \subpage usage_decode
+    \endif
+    \if decoder
+    - \subpage usage_encode
+    \endif

    \section usage_types Important Data Types
    There are two important data structures to consider in this interface.
@ -82,6 +86,7 @@

    The available initialization methods are:
    \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif
+    \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif
    \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif


--- a/usage_cx.dox
+++ b/usage_cx.dox
@ -1,6 +1,6 @@
 /*! \page usage_encode Encode

-    The vpx_codec_encode() function is at the core of the decode loop. It
+    The vpx_codec_encode() function is at the core of the encode loop. It
    processes raw images passed by the application, producing packets of
    compressed data. The <code>deadline</code> parameter controls the amount
    of time in microseconds the encoder should spend working on the frame. For
@ -10,5 +10,4 @@

    \ref samples

-
 */
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@ -0,0 +1,190 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "systemdependent.h"
+
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
+{
+    int i;
+    for (i = 0; i < NUM_YV12_BUFFERS; i++)
+        vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+
+    vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+#if CONFIG_POSTPROC
+    vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+    if (oci->post_proc_buffer_int_used)
+        vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
+
+    vpx_free(oci->pp_limits_buffer);
+    oci->pp_limits_buffer = NULL;
+#endif
+
+    vpx_free(oci->above_context);
+    vpx_free(oci->mip);
+#if CONFIG_ERROR_CONCEALMENT
+    vpx_free(oci->prev_mip);
+    oci->prev_mip = NULL;
+#endif
+
+    oci->above_context = NULL;
+    oci->mip = NULL;
+}
+
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
+{
+    int i;
+
+    vp8_de_alloc_frame_buffers(oci);
+
+    /* our internal buffers are always multiples of 16 */
+    if ((width & 0xf) != 0)
+        width += 16 - (width & 0xf);
+
+    if ((height & 0xf) != 0)
+        height += 16 - (height & 0xf);
+
+
+    for (i = 0; i < NUM_YV12_BUFFERS; i++)
+    {
+        oci->fb_idx_ref_cnt[i] = 0;
+        oci->yv12_fb[i].flags = 0;
+        if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0)
+            goto allocation_fail;
+    }
+
+    oci->new_fb_idx = 0;
+    oci->lst_fb_idx = 1;
+    oci->gld_fb_idx = 2;
+    oci->alt_fb_idx = 3;
+
+    oci->fb_idx_ref_cnt[0] = 1;
+    oci->fb_idx_ref_cnt[1] = 1;
+    oci->fb_idx_ref_cnt[2] = 1;
+    oci->fb_idx_ref_cnt[3] = 1;
+
+    if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0)
+        goto allocation_fail;
+
+    oci->mb_rows = height >> 4;
+    oci->mb_cols = width >> 4;
+    oci->MBs = oci->mb_rows * oci->mb_cols;
+    oci->mode_info_stride = oci->mb_cols + 1;
+    oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+    if (!oci->mip)
+        goto allocation_fail;
+
+    oci->mi = oci->mip + oci->mode_info_stride + 1;
+
+    /* Allocation of previous mode info will be done in vp8_decode_frame()
+     * as it is a decoder only data */
+
+    oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
+
+    if (!oci->above_context)
+        goto allocation_fail;
+
+#if CONFIG_POSTPROC
+    if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
+        goto allocation_fail;
+
+    oci->post_proc_buffer_int_used = 0;
+    vpx_memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+    vpx_memset(oci->post_proc_buffer.buffer_alloc, 128,
+               oci->post_proc_buffer.frame_size);
+
+    /* Allocate buffer to store post-processing filter coefficients.
+     *
+     * Note: Round up mb_cols to support SIMD reads
+     */
+    oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1));
+    if (!oci->pp_limits_buffer)
+        goto allocation_fail;
+#endif
+
+    return 0;
+
+allocation_fail:
+    vp8_de_alloc_frame_buffers(oci);
+    return 1;
+}
+
+void vp8_setup_version(VP8_COMMON *cm)
+{
+    switch (cm->version)
+    {
+    case 0:
+        cm->no_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 0;
+        cm->full_pixel = 0;
+        break;
+    case 1:
+        cm->no_lpf = 0;
+        cm->filter_type = SIMPLE_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 1;
+        cm->full_pixel = 0;
+        break;
+    case 2:
+        cm->no_lpf = 1;
+        cm->filter_type = NORMAL_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 1;
+        cm->full_pixel = 0;
+        break;
+    case 3:
+        cm->no_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 1;
+        cm->full_pixel = 1;
+        break;
+    default:
+        /*4,5,6,7 are reserved for future use*/
+        cm->no_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 0;
+        cm->full_pixel = 0;
+        break;
+    }
+}
+void vp8_create_common(VP8_COMMON *oci)
+{
+    vp8_machine_specific_config(oci);
+
+    vp8_init_mbmode_probs(oci);
+    vp8_default_bmode_probs(oci->fc.bmode_prob);
+
+    oci->mb_no_coeff_skip = 1;
+    oci->no_lpf = 0;
+    oci->filter_type = NORMAL_LOOPFILTER;
+    oci->use_bilinear_mc_filter = 0;
+    oci->full_pixel = 0;
+    oci->multi_token_partition = ONE_PARTITION;
+    oci->clr_type = REG_YUV;
+    oci->clamp_type = RECON_CLAMP_REQUIRED;
+
+    /* Initialize reference frame sign bias structure to defaults */
+    vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+    /* Default disable buffer to buffer copying */
+    oci->copy_buffer_to_gf = 0;
+    oci->copy_buffer_to_arf = 0;
+}
+
+void vp8_remove_common(VP8_COMMON *oci)
+{
+    vp8_de_alloc_frame_buffers(oci);
+}
--- a/vp8/common/alloccommon.h
+++ b/vp8/common/alloccommon.h
@ -0,0 +1,23 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ALLOCCOMMON_H
+#define __INC_ALLOCCOMMON_H
+
+#include "onyxc_int.h"
+
+void vp8_create_common(VP8_COMMON *oci);
+void vp8_remove_common(VP8_COMMON *oci);
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
+void vp8_setup_version(VP8_COMMON *oci);
+
+#endif
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm
@ -0,0 +1,237 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_filter_block2d_bil_first_pass_armv6|
+    EXPORT  |vp8_filter_block2d_bil_second_pass_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;-------------------------------------
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
+;-------------------------------------
+; The output is transposed stroed in output array to make it easy for second pass filtering.
+|vp8_filter_block2d_bil_first_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    mov     r12, r3                         ; outer-loop counter
+
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+
+    mov     r3, r3, lsl #1                  ; height*2
+    add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
+
+    mov     r11, r1                         ; save dst_ptr for each row
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+    ldrb    r6, [r0]                        ; load source data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    mov     lr, r4, lsr #2                  ; 4-in-parellel loop counter
+
+|bil_width_loop_1st_v6|
+    ldrb    r9, [r0, #3]
+    ldrb    r10, [r0, #4]
+
+    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
+    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
+
+    smuad   r6, r6, r5                      ; apply the filter
+    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
+    smuad   r7, r7, r5
+    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
+
+    smuad   r8, r8, r5
+    smuad   r9, r9, r5
+
+    add     r0, r0, #4
+    subs    lr, lr, #1
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #16, r6, asr #7
+    usat    r7, #16, r7, asr #7
+
+    strh    r6, [r1], r3                    ; result is transposed and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strh    r7, [r1], r3
+    add     r9, r9, #0x40
+    usat    r8, #16, r8, asr #7
+    usat    r9, #16, r9, asr #7
+
+    strh    r8, [r1], r3                    ; result is transposed and stored
+
+    ldrneb  r6, [r0]                        ; load source data
+    strh    r9, [r1], r3
+
+    ldrneb  r7, [r0, #1]
+    ldrneb  r8, [r0, #2]
+
+    bne     bil_width_loop_1st_v6
+
+    add     r0, r0, r2                      ; move to next input row
+    subs    r12, r12, #1
+
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row
+
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_1st_v6
+
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+    mov     lr, r4, lsr #2                  ; loop counter
+
+|bil_width_loop_null_1st|
+    ldrb    r6, [r0]                        ; load data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    ldrb    r9, [r0, #3]
+
+    strh    r6, [r1], r3                    ; store it to immediate buffer
+    add     r0, r0, #4
+    strh    r7, [r1], r3
+    subs    lr, lr, #1
+    strh    r8, [r1], r3
+    strh    r9, [r1], r3
+
+    bne     bil_width_loop_null_1st
+
+    subs    r12, r12, #1
+    add     r0, r0, r2                      ; move to next input line
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_null_1st
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP  ; |vp8_filter_block2d_bil_first_pass_armv6|
+
+
+;---------------------------------
+; r0    unsigned short *src_ptr,
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_bil_second_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
+    mov     r11, r1
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+    ldr     r6, [r0]                        ; load the data
+    ldr     r8, [r0, #4]
+    ldrh    r10, [r0, #8]
+    mov     lr, r3, lsr #2                  ; loop counter
+
+|bil_width_loop_2nd|
+    pkhtb   r7, r6, r8                      ; src[1] | src[2]
+    pkhtb   r9, r8, r10                     ; src[3] | src[4]
+
+    smuad   r6, r6, r5                      ; apply filter
+    smuad   r8, r8, r5                      ; apply filter
+
+    subs    lr, lr, #1
+
+    smuadx  r7, r7, r5                      ; apply filter
+    smuadx  r9, r9, r5                      ; apply filter
+
+    add     r0, r0, #8
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #8, r6, asr #7
+    usat    r7, #8, r7, asr #7
+    strb    r6, [r1], r2                    ; the result is transposed back and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strb    r7, [r1], r2
+    add     r9, r9, #0x40
+    usat    r8, #8, r8, asr #7
+    usat    r9, #8, r9, asr #7
+    strb    r8, [r1], r2                    ; the result is transposed back and stored
+
+    ldrne   r6, [r0]                        ; load data
+    strb    r9, [r1], r2
+    ldrne   r8, [r0, #4]
+    ldrneh  r10, [r0, #8]
+
+    bne     bil_width_loop_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_2nd
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+    mov     lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+    ldr     r6, [r0], #4                    ; load data
+    subs    lr, lr, #1
+    ldr     r8, [r0], #4
+
+    strb    r6, [r1], r2                    ; store data
+    mov     r7, r6, lsr #16
+    strb    r7, [r1], r2
+    mov     r9, r8, lsr #16
+    strb    r8, [r1], r2
+    strb    r9, [r1], r2
+
+    bne     bil_width_loop_null_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_null_2nd
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp8_filter_block2d_second_pass_armv6|
+
+    END
--- a/vp8/common/arm/armv6/copymem16x16_v6.asm
+++ b/vp8/common/arm/armv6/copymem16x16_v6.asm
@ -0,0 +1,186 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_copy_mem16x16_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem16x16_v6| PROC
+    stmdb       sp!, {r4 - r7}
+    ;push   {r4-r7}
+
+    ;preload
+    pld     [r0, #31]                ; preload for next 16x16 block
+
+    ands    r4, r0, #15
+    beq     copy_mem16x16_fast
+
+    ands    r4, r0, #7
+    beq     copy_mem16x16_8
+
+    ands    r4, r0, #3
+    beq     copy_mem16x16_4
+
+    ;copy one byte each time
+    ldrb    r4, [r0]
+    ldrb    r5, [r0, #1]
+    ldrb    r6, [r0, #2]
+    ldrb    r7, [r0, #3]
+
+    mov     r12, #16
+
+copy_mem16x16_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+    strb    r6, [r2, #2]
+    strb    r7, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+    ldrb    r6, [r0, #6]
+    ldrb    r7, [r0, #7]
+
+    subs    r12, r12, #1
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+    strb    r6, [r2, #6]
+    strb    r7, [r2, #7]
+
+    ldrb    r4, [r0, #8]
+    ldrb    r5, [r0, #9]
+    ldrb    r6, [r0, #10]
+    ldrb    r7, [r0, #11]
+
+    strb    r4, [r2, #8]
+    strb    r5, [r2, #9]
+    strb    r6, [r2, #10]
+    strb    r7, [r2, #11]
+
+    ldrb    r4, [r0, #12]
+    ldrb    r5, [r0, #13]
+    ldrb    r6, [r0, #14]
+    ldrb    r7, [r0, #15]
+
+    add     r0, r0, r1
+
+    strb    r4, [r2, #12]
+    strb    r5, [r2, #13]
+    strb    r6, [r2, #14]
+    strb    r7, [r2, #15]
+
+    add     r2, r2, r3
+
+    ldrneb  r4, [r0]
+    ldrneb  r5, [r0, #1]
+    ldrneb  r6, [r0, #2]
+    ldrneb  r7, [r0, #3]
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+
+    bne     copy_mem16x16_1_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+;copy 4 bytes each time
+copy_mem16x16_4
+    ldr     r4, [r0]
+    ldr     r5, [r0, #4]
+    ldr     r6, [r0, #8]
+    ldr     r7, [r0, #12]
+
+    mov     r12, #16
+
+copy_mem16x16_4_loop
+    subs    r12, r12, #1
+    add     r0, r0, r1
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+    str     r6, [r2, #8]
+    str     r7, [r2, #12]
+
+    add     r2, r2, r3
+
+    ldrne   r4, [r0]
+    ldrne   r5, [r0, #4]
+    ldrne   r6, [r0, #8]
+    ldrne   r7, [r0, #12]
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+
+    bne     copy_mem16x16_4_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+;copy 8 bytes each time
+copy_mem16x16_8
+    sub     r1, r1, #16
+    sub     r3, r3, #16
+
+    mov     r12, #16
+
+copy_mem16x16_8_loop
+    ldmia   r0!, {r4-r5}
+    ;ldm        r0, {r4-r5}
+    ldmia   r0!, {r6-r7}
+
+    add     r0, r0, r1
+
+    stmia   r2!, {r4-r5}
+    subs    r12, r12, #1
+    ;stm        r2, {r4-r5}
+    stmia   r2!, {r6-r7}
+
+    add     r2, r2, r3
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+    bne     copy_mem16x16_8_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+;copy 16 bytes each time
+copy_mem16x16_fast
+    ;sub        r1, r1, #16
+    ;sub        r3, r3, #16
+
+    mov     r12, #16
+
+copy_mem16x16_fast_loop
+    ldmia   r0, {r4-r7}
+    ;ldm        r0, {r4-r7}
+    add     r0, r0, r1
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r7}
+    ;stm        r2, {r4-r7}
+    add     r2, r2, r3
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+    bne     copy_mem16x16_fast_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+    ENDP  ; |vp8_copy_mem16x16_v6|
+
+    END
--- a/vp8/common/arm/armv6/copymem8x4_v6.asm
+++ b/vp8/common/arm/armv6/copymem8x4_v6.asm
@ -0,0 +1,128 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_copy_mem8x4_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x4_v6| PROC
+    ;push   {r4-r5}
+    stmdb  sp!, {r4-r5}
+
+    ;preload
+    pld     [r0]
+    pld     [r0, r1]
+    pld     [r0, r1, lsl #1]
+
+    ands    r4, r0, #7
+    beq     copy_mem8x4_fast
+
+    ands    r4, r0, #3
+    beq     copy_mem8x4_4
+
+    ;copy 1 byte each time
+    ldrb    r4, [r0]
+    ldrb    r5, [r0, #1]
+
+    mov     r12, #4
+
+copy_mem8x4_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+
+    ldrb    r4, [r0, #2]
+    ldrb    r5, [r0, #3]
+
+    subs    r12, r12, #1
+
+    strb    r4, [r2, #2]
+    strb    r5, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+
+    ldrb    r4, [r0, #6]
+    ldrb    r5, [r0, #7]
+
+    add     r0, r0, r1
+
+    strb    r4, [r2, #6]
+    strb    r5, [r2, #7]
+
+    add     r2, r2, r3
+
+    ldrneb  r4, [r0]
+    ldrneb  r5, [r0, #1]
+
+    bne     copy_mem8x4_1_loop
+
+    ldmia       sp!, {r4 - r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 4 bytes each time
+copy_mem8x4_4
+    ldr     r4, [r0]
+    ldr     r5, [r0, #4]
+
+    mov     r12, #4
+
+copy_mem8x4_4_loop
+    subs    r12, r12, #1
+    add     r0, r0, r1
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+
+    add     r2, r2, r3
+
+    ldrne   r4, [r0]
+    ldrne   r5, [r0, #4]
+
+    bne     copy_mem8x4_4_loop
+
+    ldmia  sp!, {r4-r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 8 bytes each time
+copy_mem8x4_fast
+    ;sub        r1, r1, #8
+    ;sub        r3, r3, #8
+
+    mov     r12, #4
+
+copy_mem8x4_fast_loop
+    ldmia   r0, {r4-r5}
+    ;ldm        r0, {r4-r5}
+    add     r0, r0, r1
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r5}
+    ;stm        r2, {r4-r5}
+    add     r2, r2, r3
+
+    bne     copy_mem8x4_fast_loop
+
+    ldmia  sp!, {r4-r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+    ENDP  ; |vp8_copy_mem8x4_v6|
+
+    END
--- a/vp8/common/arm/armv6/copymem8x8_v6.asm
+++ b/vp8/common/arm/armv6/copymem8x8_v6.asm
@ -0,0 +1,128 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_copy_mem8x8_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x8_v6| PROC
+    ;push   {r4-r5}
+    stmdb  sp!, {r4-r5}
+
+    ;preload
+    pld     [r0]
+    pld     [r0, r1]
+    pld     [r0, r1, lsl #1]
+
+    ands    r4, r0, #7
+    beq     copy_mem8x8_fast
+
+    ands    r4, r0, #3
+    beq     copy_mem8x8_4
+
+    ;copy 1 byte each time
+    ldrb    r4, [r0]
+    ldrb    r5, [r0, #1]
+
+    mov     r12, #8
+
+copy_mem8x8_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+
+    ldrb    r4, [r0, #2]
+    ldrb    r5, [r0, #3]
+
+    subs    r12, r12, #1
+
+    strb    r4, [r2, #2]
+    strb    r5, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+
+    ldrb    r4, [r0, #6]
+    ldrb    r5, [r0, #7]
+
+    add     r0, r0, r1
+
+    strb    r4, [r2, #6]
+    strb    r5, [r2, #7]
+
+    add     r2, r2, r3
+
+    ldrneb  r4, [r0]
+    ldrneb  r5, [r0, #1]
+
+    bne     copy_mem8x8_1_loop
+
+    ldmia       sp!, {r4 - r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 4 bytes each time
+copy_mem8x8_4
+    ldr     r4, [r0]
+    ldr     r5, [r0, #4]
+
+    mov     r12, #8
+
+copy_mem8x8_4_loop
+    subs    r12, r12, #1
+    add     r0, r0, r1
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+
+    add     r2, r2, r3
+
+    ldrne   r4, [r0]
+    ldrne   r5, [r0, #4]
+
+    bne     copy_mem8x8_4_loop
+
+    ldmia       sp!, {r4 - r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 8 bytes each time
+copy_mem8x8_fast
+    ;sub        r1, r1, #8
+    ;sub        r3, r3, #8
+
+    mov     r12, #8
+
+copy_mem8x8_fast_loop
+    ldmia   r0, {r4-r5}
+    ;ldm        r0, {r4-r5}
+    add     r0, r0, r1
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r5}
+    ;stm        r2, {r4-r5}
+    add     r2, r2, r3
+
+    bne     copy_mem8x8_fast_loop
+
+    ldmia  sp!, {r4-r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+    ENDP  ; |vp8_copy_mem8x8_v6|
+
+    END
--- a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
+++ b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
@ -0,0 +1,70 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT  |vp8_dc_only_idct_add_v6|
+
+    AREA    |.text|, CODE, READONLY
+
+;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+;                            int pred_stride, unsigned char *dst_ptr,
+;                            int dst_stride)
+; r0  input_dc
+; r1  pred_ptr
+; r2  pred_stride
+; r3  dst_ptr
+; sp  dst_stride
+
+|vp8_dc_only_idct_add_v6| PROC
+    stmdb       sp!, {r4 - r7}
+
+    add         r0, r0, #4                ; input_dc += 4
+    ldr         r12, c0x0000FFFF
+    ldr         r4, [r1], r2
+    and         r0, r12, r0, asr #3       ; input_dc >> 3 + mask
+    ldr         r6, [r1], r2
+    orr         r0, r0, r0, lsl #16       ; a1 | a1
+
+    ldr         r12, [sp, #16]            ; dst stride
+
+    uxtab16     r5, r0, r4                ; a1+2 | a1+0
+    uxtab16     r4, r0, r4, ror #8        ; a1+3 | a1+1
+    uxtab16     r7, r0, r6
+    uxtab16     r6, r0, r6, ror #8
+    usat16      r5, #8, r5
+    usat16      r4, #8, r4
+    usat16      r7, #8, r7
+    usat16      r6, #8, r6
+    orr         r5, r5, r4, lsl #8
+    orr         r7, r7, r6, lsl #8
+    ldr         r4, [r1], r2
+    str         r5, [r3], r12
+    ldr         r6, [r1]
+    str         r7, [r3], r12
+
+    uxtab16     r5, r0, r4
+    uxtab16     r4, r0, r4, ror #8
+    uxtab16     r7, r0, r6
+    uxtab16     r6, r0, r6, ror #8
+    usat16      r5, #8, r5
+    usat16      r4, #8, r4
+    usat16      r7, #8, r7
+    usat16      r6, #8, r6
+    orr         r5, r5, r4, lsl #8
+    orr         r7, r7, r6, lsl #8
+    str         r5, [r3], r12
+    str         r7, [r3]
+
+    ldmia       sp!, {r4 - r7}
+    bx          lr
+
+    ENDP  ; |vp8_dc_only_idct_add_v6|
+
+; Constant Pool
+c0x0000FFFF DCD 0x0000FFFF
+    END
--- a/vp8/common/arm/armv6/dequant_idct_v6.asm
+++ b/vp8/common/arm/armv6/dequant_idct_v6.asm
@ -0,0 +1,190 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT |vp8_dequant_idct_add_v6|
+
+    AREA |.text|, CODE, READONLY
+;void vp8_dequant_idct_v6(short *input, short *dq,
+;                         unsigned char *dest, int stride)
+; r0 = q
+; r1 = dq
+; r2 = dst
+; r3 = stride
+
+|vp8_dequant_idct_add_v6| PROC
+    stmdb   sp!, {r4-r11, lr}
+
+    ldr     r4, [r0]                ;input
+    ldr     r5, [r1], #4            ;dq
+
+    sub     sp, sp, #4
+    str     r3, [sp]
+
+    mov     r12, #4
+
+vp8_dequant_add_loop
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    subs    r12, r12, #1
+
+    ldrne   r4, [r0, #4]
+    ldrne   r5, [r1], #4
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    bne     vp8_dequant_add_loop
+
+    sub     r0, r0, #32
+    mov     r1, r0
+
+; short_idct4x4llm_v6_dual
+    ldr     r3, cospi8sqrt2minus1
+    ldr     r4, sinpi8sqrt2
+    ldr     r6, [r0, #8]
+    mov     r5, #2
+vp8_dequant_idct_loop1_v6
+    ldr     r12, [r0, #24]
+    ldr     r14, [r0, #16]
+    smulwt  r9, r3, r6
+    smulwb  r7, r3, r6
+    smulwt  r10, r4, r6
+    smulwb  r8, r4, r6
+    pkhbt   r7, r7, r9, lsl #16
+    smulwt  r11, r3, r12
+    pkhbt   r8, r8, r10, lsl #16
+    uadd16  r6, r6, r7
+    smulwt  r7, r4, r12
+    smulwb  r9, r3, r12
+    smulwb  r10, r4, r12
+    subs    r5, r5, #1
+    pkhbt   r9, r9, r11, lsl #16
+    ldr     r11, [r0], #4
+    pkhbt   r10, r10, r7, lsl #16
+    uadd16  r7, r12, r9
+    usub16  r7, r8, r7
+    uadd16  r6, r6, r10
+    uadd16  r10, r11, r14
+    usub16  r8, r11, r14
+    uadd16  r9, r10, r6
+    usub16  r10, r10, r6
+    uadd16  r6, r8, r7
+    usub16  r7, r8, r7
+    str     r6, [r1, #8]
+    ldrne   r6, [r0, #8]
+    str     r7, [r1, #16]
+    str     r10, [r1, #24]
+    str     r9, [r1], #4
+    bne     vp8_dequant_idct_loop1_v6
+
+    mov     r5, #2
+    sub     r0, r1, #8
+vp8_dequant_idct_loop2_v6
+    ldr     r6, [r0], #4
+    ldr     r7, [r0], #4
+    ldr     r8, [r0], #4
+    ldr     r9, [r0], #4
+    smulwt  r1, r3, r6
+    smulwt  r12, r4, r6
+    smulwt  lr, r3, r8
+    smulwt  r10, r4, r8
+    pkhbt   r11, r8, r6, lsl #16
+    pkhbt   r1, lr, r1, lsl #16
+    pkhbt   r12, r10, r12, lsl #16
+    pkhtb   r6, r6, r8, asr #16
+    uadd16  r6, r1, r6
+    pkhbt   lr, r9, r7, lsl #16
+    uadd16  r10, r11, lr
+    usub16  lr, r11, lr
+    pkhtb   r8, r7, r9, asr #16
+    subs    r5, r5, #1
+    smulwt  r1, r3, r8
+    smulwb  r7, r3, r8
+    smulwt  r11, r4, r8
+    smulwb  r9, r4, r8
+    pkhbt   r1, r7, r1, lsl #16
+    uadd16  r8, r1, r8
+    pkhbt   r11, r9, r11, lsl #16
+    usub16  r1, r12, r8
+    uadd16  r8, r11, r6
+    ldr     r9, c0x00040004
+    ldr     r12, [sp]               ; get stride from stack
+    uadd16  r6, r10, r8
+    usub16  r7, r10, r8
+    uadd16  r7, r7, r9
+    uadd16  r6, r6, r9
+    uadd16  r10, r14, r1
+    usub16  r1, r14, r1
+    uadd16  r10, r10, r9
+    uadd16  r1, r1, r9
+    ldr     r11, [r2]               ; load input from dst
+    mov     r8, r7, asr #3
+    pkhtb   r9, r8, r10, asr #19
+    mov     r8, r1, asr #3
+    pkhtb   r8, r8, r6, asr #19
+    uxtb16  lr, r11, ror #8
+    qadd16  r9, r9, lr
+    uxtb16  lr, r11
+    qadd16  r8, r8, lr
+    usat16  r9, #8, r9
+    usat16  r8, #8, r8
+    orr     r9, r8, r9, lsl #8
+    ldr     r11, [r2, r12]          ; load input from dst
+    mov     r7, r7, lsl #16
+    mov     r1, r1, lsl #16
+    mov     r10, r10, lsl #16
+    mov     r6, r6, lsl #16
+    mov     r7, r7, asr #3
+    pkhtb   r7, r7, r10, asr #19
+    mov     r1, r1, asr #3
+    pkhtb   r1, r1, r6, asr #19
+    uxtb16  r8, r11, ror #8
+    qadd16  r7, r7, r8
+    uxtb16  r8, r11
+    qadd16  r1, r1, r8
+    usat16  r7, #8, r7
+    usat16  r1, #8, r1
+    orr     r1, r1, r7, lsl #8
+    str     r9, [r2], r12           ; store output to dst
+    str     r1, [r2], r12           ; store output to dst
+    bne     vp8_dequant_idct_loop2_v6
+
+; vpx_memset
+    sub     r0, r0, #32
+    add     sp, sp, #4
+
+    mov     r12, #0
+    str     r12, [r0]
+    str     r12, [r0, #4]
+    str     r12, [r0, #8]
+    str     r12, [r0, #12]
+    str     r12, [r0, #16]
+    str     r12, [r0, #20]
+    str     r12, [r0, #24]
+    str     r12, [r0, #28]
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ; |vp8_dequant_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2       DCD 0x00008A8C
+c0x00040004       DCD 0x00040004
+
+    END
--- a/vp8/common/arm/armv6/dequantize_v6.asm
+++ b/vp8/common/arm/armv6/dequantize_v6.asm
@ -0,0 +1,69 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequantize_b_loop_v6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------
+;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+; r0    short *Q,
+; r1    short *DQC
+; r2    short *DQ
+|vp8_dequantize_b_loop_v6| PROC
+    stmdb   sp!, {r4-r9, lr}
+
+    ldr     r3, [r0]                ;load Q
+    ldr     r4, [r1]                ;load DQC
+    ldr     r5, [r0, #4]
+    ldr     r6, [r1, #4]
+
+    mov     r12, #2                 ;loop counter
+
+dequant_loop
+    smulbb  r7, r3, r4              ;multiply
+    smultt  r8, r3, r4
+    smulbb  r9, r5, r6
+    smultt  lr, r5, r6
+
+    ldr     r3, [r0, #8]
+    ldr     r4, [r1, #8]
+    ldr     r5, [r0, #12]
+    ldr     r6, [r1, #12]
+
+    strh    r7, [r2], #2            ;store result
+    smulbb  r7, r3, r4              ;multiply
+    strh    r8, [r2], #2
+    smultt  r8, r3, r4
+    strh    r9, [r2], #2
+    smulbb  r9, r5, r6
+    strh    lr, [r2], #2
+    smultt  lr, r5, r6
+
+    subs    r12, r12, #1
+
+    add     r0, r0, #16
+    add     r1, r1, #16
+
+    ldrne       r3, [r0]
+    strh    r7, [r2], #2            ;store result
+    ldrne       r4, [r1]
+    strh    r8, [r2], #2
+    ldrne       r5, [r0, #4]
+    strh    r9, [r2], #2
+    ldrne       r6, [r1, #4]
+    strh    lr, [r2], #2
+
+    bne     dequant_loop
+
+    ldmia   sp!, {r4-r9, pc}
+    ENDP    ;|vp8_dequantize_b_loop_v6|
+
+    END
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ b/vp8/common/arm/armv6/filter_v6.asm
@ -0,0 +1,624 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_filter_block2d_first_pass_armv6|
+    EXPORT  |vp8_filter_block2d_first_pass_16x16_armv6|
+    EXPORT  |vp8_filter_block2d_first_pass_8x8_armv6|
+    EXPORT  |vp8_filter_block2d_second_pass_armv6|
+    EXPORT  |vp8_filter4_block2d_second_pass_armv6|
+    EXPORT  |vp8_filter_block2d_first_pass_only_armv6|
+    EXPORT  |vp8_filter_block2d_second_pass_only_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------------
+; r0    unsigned char *src_ptr
+; r1    short         *output_ptr
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int output_width
+; stack unsigned int output_height
+; stack const short *vp8_filter
+;-------------------------------------
+; vp8_filter the input and put in the output array.  Apply the 6 tap FIR filter with
+; the output being a 2 byte value and the intput being a 1 byte value.
+|vp8_filter_block2d_first_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_16x16_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #18                     ; preload next low
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_16_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_16_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #34                    ; adding back block width(=16)
+    pld     [r0, r11]                       ; preload next low
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_16_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_8x8_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #10                     ; preload next low
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_8_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_8_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #18                    ; adding back block width(=8)
+    pld     [r0, r11]                       ; preload next low
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_8_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;---------------------------------
+; r0    short         *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int output_pitch,
+; r3    unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_second_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #36]                  ; vp8_filter address
+    sub     sp, sp, #4
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+    str     r1, [sp]                        ; push destination to stack
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    pkhbt   r12, r5, r4                     ; pack the filter differently
+    pkhbt   r11, r6, r5
+
+    sub     r0, r0, #4                      ; offset input buffer
+
+|height_loop_2nd|
+    ldr     r8, [r0]                        ; load the data
+    ldr     r9, [r0, #4]
+    orr     r7, r7, r3, lsr #1              ; loop counter
+
+|width_loop_2nd|
+    smuad   lr, r4, r8                      ; apply filter
+    sub     r7, r7, #1
+    smulbt  r8, r4, r8
+
+    ldr     r10, [r0, #8]
+
+    smlad   lr, r5, r9, lr
+    smladx  r8, r12, r9, r8
+
+    ldrh    r9, [r0, #12]
+
+    smlad   lr, r6, r10, lr
+    smladx  r8, r11, r10, r8
+
+    add     r0, r0, #4
+    smlatb  r10, r6, r9, r8
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ands    r8, r7, #0xff
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r1], r2                    ; the result is transposed back and stored
+    usat    r10, #8, r10, asr #7
+
+    ldrne   r8, [r0]                        ; load data for next loop
+    ldrne   r9, [r0, #4]
+    strb    r10, [r1], r2
+
+    bne     width_loop_2nd
+
+    ldr     r1, [sp]                        ; update dst for next loop
+    subs    r7, r7, #0x10000
+    add     r0, r0, #16                     ; updata src for next loop
+    add     r1, r1, #1
+    str     r1, [sp]
+
+    bne     height_loop_2nd
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;---------------------------------
+; r0    short         *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int output_pitch,
+; r3    unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter4_block2d_second_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #36]                  ; vp8_filter address
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    add     lr, r1, r3                      ; save final destination pointer
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    pkhbt   r12, r5, r4                     ; pack the filter differently
+    pkhbt   r11, r6, r5
+    mov     r4, #0x40                       ; rounding factor (for smlad{x})
+
+|height_loop_2nd_4|
+    ldrd    r8, [r0, #-4]                   ; load the data
+    orr     r7, r7, r3, lsr #1              ; loop counter
+
+|width_loop_2nd_4|
+    ldr     r10, [r0, #4]!
+    smladx  r6, r9, r12, r4                 ; apply filter
+    pkhbt   r8, r9, r8
+    smlad   r5, r8, r12, r4
+    pkhbt   r8, r10, r9
+    smladx  r6, r10, r11, r6
+    sub     r7, r7, #1
+    smlad   r5, r8, r11, r5
+
+    mov     r8, r9                          ; shift the data for the next loop
+    mov     r9, r10
+
+    usat    r6, #8, r6, asr #7              ; shift and clamp
+    usat    r5, #8, r5, asr #7
+
+    strb    r5, [r1], r2                    ; the result is transposed back and stored
+    tst     r7, #0xff
+    strb    r6, [r1], r2
+
+    bne     width_loop_2nd_4
+
+    subs    r7, r7, #0x10000
+    add     r0, r0, #16                     ; update src for next loop
+    sub     r1, lr, r7, lsr #16             ; update dst for next loop
+
+    bne     height_loop_2nd_4
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;------------------------------------
+; r0    unsigned char *src_ptr
+; r1    unsigned char *output_ptr,
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
+|vp8_filter_block2d_first_pass_only_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    add     r7, r2, r3                      ; preload next low
+    add     r7, r7, #2
+    pld     [r0, r7]
+
+    ldr     r4, [sp, #36]                   ; output pitch
+    ldr     r11, [sp, #40]                  ; HFilter address
+    sub     sp, sp, #8
+
+    mov     r7, r3
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    sub     r4, r4, r3
+    str     r4, [sp]                        ; save modified output pitch
+    str     r2, [sp, #4]
+
+    mov     r2, #0x40
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+; six tap filter
+|height_loop_1st_only_6|
+    ldrb    r8, [r0, #-2]                   ; load data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+
+    mov     r12, r3, lsr #1                 ; loop counter
+
+|width_loop_1st_only_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+;;  smuad   lr, lr, r4
+    smlad   lr, lr, r4, r2
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+;;  smuad   r8, r8, r4
+    smlad   r8, r8, r4, r2
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    subs    r12, r12, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r10, r10, r6, r8
+
+;;  add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+;;  add     r10, r10, #0x40
+    strb    lr, [r1], #1                    ; store the result
+    usat    r10, #8, r10, asr #7
+
+    ldrneb  r9, [r0, #-1]
+    strb    r10, [r1], #1
+    ldrneb  r10, [r0], #2
+
+    bne     width_loop_1st_only_6
+
+    ldr     lr, [sp]                        ; load back output pitch
+    ldr     r12, [sp, #4]                   ; load back output pitch
+    subs    r7, r7, #1
+    add     r0, r0, r12                     ; updata src for next loop
+
+    add     r11, r12, r3                    ; preload next low
+    add     r11, r11, #2
+    pld     [r0, r11]
+
+    add     r1, r1, lr                      ; update dst for next loop
+
+    bne     height_loop_1st_only_6
+
+    add     sp, sp, #8
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp8_filter_block2d_first_pass_only_armv6|
+
+
+;------------------------------------
+; r0    unsigned char *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
+|vp8_filter_block2d_second_pass_only_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; VFilter address
+    ldr     r12, [sp, #36]                  ; output pitch
+
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+    sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after
+
+    sub     sp, sp, #8
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r0, [sp]                        ; save r0 to stack
+    str     r1, [sp, #4]                    ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+    ldrb    r8, [r0], r2                    ; load data
+    orr     r7, r7, r3                      ; loop counter
+    ldrb    r9, [r0], r2
+    ldrb    r10, [r0], r2
+
+|height_loop_2nd_only_6|
+    ; filter first column in this inner loop, than, move to next colum.
+    ldrb    r11, [r0], r2
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0], r2
+
+    smuad   lr, lr, r4
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0], r2
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0]
+
+    sub     r7, r7, #2
+    sub     r0, r0, r2, lsl #2
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r10, r10, r6, r8
+
+    ands    r9, r7, #0xff
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0], r2                    ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r1], r12                   ; store the result for the column
+    usat    r10, #8, r10, asr #7
+
+    ldrneb  r9, [r0], r2
+    strb    r10, [r1], r12
+    ldrneb  r10, [r0], r2
+
+    bne     height_loop_2nd_only_6
+
+    ldr     r0, [sp]
+    ldr     r1, [sp, #4]
+    subs    r7, r7, #0x10000
+    add     r0, r0, #1                      ; move to filter next column
+    str     r0, [sp]
+    add     r1, r1, #1
+    str     r1, [sp, #4]
+
+    bne     width_loop_2nd_only_6
+
+    add     sp, sp, #8
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp8_filter_block2d_second_pass_only_armv6|
+
+    END
--- a/vp8/common/arm/armv6/idct_blk_v6.c
+++ b/vp8/common/arm/armv6/idct_blk_v6.c
@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+
+void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
+                                     unsigned char *dst,
+                                     int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_v6 (q, dq, dst, stride);
+        else if (eobs[0] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride);
+            ((int *)q)[0] = 0;
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride);
+        else if (eobs[1] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride);
+            ((int *)(q+16))[0] = 0;
+        }
+
+        if (eobs[2] > 1)
+            vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride);
+        else if (eobs[2] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride);
+            ((int *)(q+32))[0] = 0;
+        }
+
+        if (eobs[3] > 1)
+            vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride);
+        else if (eobs[3] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride);
+            ((int *)(q+48))[0] = 0;
+        }
+
+        q    += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq,
+                                      unsigned char *dstu,
+                                      unsigned char *dstv,
+                                      int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 2; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_v6 (q, dq, dstu, stride);
+        else if (eobs[0] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride);
+            ((int *)q)[0] = 0;
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride);
+        else if (eobs[1] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride,
+                                                  dstu+4, stride);
+            ((int *)(q+16))[0] = 0;
+        }
+
+        q    += 32;
+        dstu += 4*stride;
+        eobs += 2;
+    }
+
+    for (i = 0; i < 2; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_v6 (q, dq, dstv, stride);
+        else if (eobs[0] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride);
+            ((int *)q)[0] = 0;
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride);
+        else if (eobs[1] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride,
+                                                  dstv+4, stride);
+            ((int *)(q+16))[0] = 0;
+        }
+
+        q    += 32;
+        dstv += 4*stride;
+        eobs += 2;
+    }
+}
--- a/vp8/common/arm/armv6/idct_v6.asm
+++ b/vp8/common/arm/armv6/idct_v6.asm
@ -0,0 +1,202 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_v6_dual|
+
+    AREA    |.text|, CODE, READONLY
+
+
+; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
+;                             unsigned char *dst, int stride)
+; r0    short* input
+; r1    unsigned char* pred
+; r2    int pitch
+; r3    unsigned char* dst
+; sp    int stride
+
+|vp8_short_idct4x4llm_v6_dual| PROC
+    stmdb   sp!, {r4-r11, lr}
+
+    sub     sp, sp, #4
+
+    mov     r4, #0x00008A00         ; sin
+    orr     r4, r4, #0x0000008C     ; sinpi8sqrt2
+
+    mov     r5, #0x00004E00         ; cos
+    orr     r5, r5, #0x0000007B     ; cospi8sqrt2minus1
+    orr     r5, r5, #1<<31          ; loop counter on top bit
+
+loop1_dual
+    ldr     r6, [r0, #(4*2)]        ; i5 | i4
+    ldr     r12, [r0, #(12*2)]      ; i13|i12
+    ldr     r14, [r0, #(8*2)]       ; i9 | i8
+
+    smulbt  r9, r5, r6              ; (ip[5] * cospi8sqrt2minus1) >> 16
+    smulbb  r7, r5, r6              ; (ip[4] * cospi8sqrt2minus1) >> 16
+    smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16
+    smulwb  r8, r4, r6              ; (ip[4] * sinpi8sqrt2) >> 16
+
+    smulbt  r11, r5, r12            ; (ip[13] * cospi8sqrt2minus1) >> 16
+    pkhtb   r7, r9, r7, asr #16     ; 5c | 4c
+    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
+    uadd16  r6, r6, r7              ; 5c+5 | 4c+4
+
+    smulwt  r7, r4, r12             ; (ip[13] * sinpi8sqrt2) >> 16
+    smulbb  r9, r5, r12             ; (ip[12] * cospi8sqrt2minus1) >> 16
+    smulwb  r10, r4, r12            ; (ip[12] * sinpi8sqrt2) >> 16
+
+    subs    r5, r5, #1<<31          ; i--
+
+    pkhtb   r9, r11, r9, asr #16    ; 13c | 12c
+    ldr     r11, [r0]               ; i1 | i0
+    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
+    uadd16  r7, r12, r9             ; 13c+13 | 12c+12
+
+    usub16  r7, r8, r7              ; c
+    uadd16  r6, r6, r10             ; d
+    uadd16  r10, r11, r14           ; a
+    usub16  r8, r11, r14            ; b
+
+    uadd16  r9, r10, r6             ; a+d
+    usub16  r10, r10, r6            ; a-d
+    uadd16  r6, r8, r7              ; b+c
+    usub16  r7, r8, r7              ; b-c
+
+    ; use input buffer to store intermediate results
+    str      r6, [r0, #(4*2)]       ; o5 | o4
+    str      r7, [r0, #(8*2)]       ; o9 | o8
+    str      r10,[r0, #(12*2)]      ; o13|o12
+    str      r9, [r0], #4           ; o1 | o0
+
+    bcs loop1_dual
+
+    sub     r0, r0, #8              ; reset input/output
+    str     r0, [sp]
+
+loop2_dual
+
+    ldr     r6, [r0, #(4*2)]        ; i5 | i4
+    ldr     r12,[r0, #(2*2)]        ; i3 | i2
+    ldr     r14,[r0, #(6*2)]        ; i7 | i6
+    ldr     r0, [r0, #(0*2)]        ; i1 | i0
+
+    smulbt  r9, r5, r6              ; (ip[5] * cospi8sqrt2minus1) >> 16
+    smulbt  r7, r5, r0              ; (ip[1] * cospi8sqrt2minus1) >> 16
+    smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16
+    smulwt  r8, r4, r0              ; (ip[1] * sinpi8sqrt2) >> 16
+
+    pkhbt   r11, r6, r0, lsl #16    ; i0 | i4
+    pkhtb   r7, r7, r9, asr #16     ; 1c | 5c
+    pkhtb   r0, r0, r6, asr #16     ; i1 | i5
+    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1
+
+    uadd16  r0, r7, r0              ; 1c+1 | 5c+5 = temp2
+    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6
+    uadd16  r10, r11, r9            ; a
+    usub16  r9, r11, r9             ; b
+    pkhtb   r6, r12, r14, asr #16   ; i3 | i7
+
+    subs    r5, r5, #1<<31          ; i--
+
+    smulbt  r7, r5, r6              ; (ip[3] * cospi8sqrt2minus1) >> 16
+    smulwt  r11, r4, r6             ; (ip[3] * sinpi8sqrt2) >> 16
+    smulbb  r12, r5, r6             ; (ip[7] * cospi8sqrt2minus1) >> 16
+    smulwb  r14, r4, r6             ; (ip[7] * sinpi8sqrt2) >> 16
+
+    pkhtb   r7, r7, r12, asr #16    ; 3c | 7c
+    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1
+
+    uadd16  r6, r7, r6              ; 3c+3 | 7c+7 = temp2
+    usub16  r12, r8, r6             ; c (o1 | o5)
+    uadd16  r6, r11, r0             ; d (o3 | o7)
+    uadd16  r7, r10, r6             ; a+d
+
+    mov     r8, #4                  ; set up 4's
+    orr     r8, r8, #0x40000        ; 4|4
+
+    usub16  r6, r10, r6             ; a-d
+    uadd16  r6, r6, r8              ; a-d+4, 3|7
+    uadd16  r7, r7, r8              ; a+d+4, 0|4
+    uadd16  r10, r9, r12            ; b+c
+    usub16  r0, r9, r12             ; b-c
+    uadd16  r10, r10, r8            ; b+c+4, 1|5
+    uadd16  r8, r0, r8              ; b-c+4, 2|6
+
+    ldr     lr, [sp, #40]           ; dst stride
+
+    ldrb    r0, [r1]                ; pred p0
+    ldrb    r11, [r1, #1]           ; pred p1
+    ldrb    r12, [r1, #2]           ; pred p2
+
+    add     r0, r0, r7, asr #19     ; p0 + o0
+    add     r11, r11, r10, asr #19  ; p1 + o1
+    add     r12, r12, r8, asr #19   ; p2 + o2
+
+    usat    r0, #8, r0              ; d0 = clip8(p0 + o0)
+    usat    r11, #8, r11            ; d1 = clip8(p1 + o1)
+    usat    r12, #8, r12            ; d2 = clip8(p2 + o2)
+
+    add     r0, r0, r11, lsl #8     ; |--|--|d1|d0|
+
+    ldrb    r11, [r1, #3]           ; pred p3
+
+    add     r0, r0, r12, lsl #16    ; |--|d2|d1|d0|
+
+    add     r11, r11, r6, asr #19   ; p3 + o3
+
+    sxth    r7, r7                  ;
+    sxth    r10, r10                ;
+
+    usat    r11, #8, r11            ; d3 = clip8(p3 + o3)
+
+    sxth    r8, r8                  ;
+    sxth    r6, r6                  ;
+
+    add     r0, r0, r11, lsl #24    ; |d3|d2|d1|d0|
+
+    ldrb    r12, [r1, r2]!          ; pred p4
+    str     r0, [r3], lr
+    ldrb    r11, [r1, #1]           ; pred p5
+
+    add     r12, r12, r7, asr #3    ; p4 + o4
+    add     r11, r11, r10, asr #3   ; p5 + o5
+
+    usat    r12, #8, r12            ; d4 = clip8(p4 + o4)
+    usat    r11, #8, r11            ; d5 = clip8(p5 + o5)
+
+    ldrb    r7, [r1, #2]            ; pred p6
+    ldrb    r10, [r1, #3]           ; pred p6
+
+    add     r12, r12, r11, lsl #8   ; |--|--|d5|d4|
+
+    add     r7, r7, r8, asr #3      ; p6 + o6
+    add     r10, r10, r6, asr #3    ; p7 + o7
+
+    ldr     r0, [sp]                ; load input pointer
+
+    usat    r7, #8, r7              ; d6 = clip8(p6 + o6)
+    usat    r10, #8, r10            ; d7 = clip8(p7 + o7)
+
+    add     r12, r12, r7, lsl #16   ; |--|d6|d5|d4|
+    add     r12, r12, r10, lsl #24  ; |d7|d6|d5|d4|
+
+    str     r12, [r3], lr
+    add     r0, r0, #16
+    add     r1, r1, r2              ; pred + pitch
+
+    bcs loop2_dual
+
+    add     sp, sp, #4              ; idct_output buffer
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+    END
--- a/vp8/common/arm/armv6/intra4x4_predict_v6.asm
+++ b/vp8/common/arm/armv6/intra4x4_predict_v6.asm
@ -0,0 +1,611 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_intra4x4_predict_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+
+;void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft,
+;                                B_PREDICTION_MODE left_stride, int b_mode,
+;                                unsigned char *dst, int dst_stride,
+;                                unsigned char top_left)
+
+; r0: *Above
+; r1: *yleft
+; r2: left_stride
+; r3: b_mode
+; sp + #40: dst
+; sp + #44: dst_stride
+; sp + #48: top_left
+|vp8_intra4x4_predict_armv6| PROC
+    push        {r4-r12, lr}
+
+    cmp         r3, #10
+    addlt       pc, pc, r3, lsl #2       ; position independent switch
+    pop         {r4-r12, pc}             ; default
+    b           b_dc_pred
+    b           b_tm_pred
+    b           b_ve_pred
+    b           b_he_pred
+    b           b_ld_pred
+    b           b_rd_pred
+    b           b_vr_pred
+    b           b_vl_pred
+    b           b_hd_pred
+    b           b_hu_pred
+
+b_dc_pred
+    ; load values
+    ldr         r8, [r0]                 ; Above
+    ldrb        r4, [r1], r2             ; Left[0]
+    mov         r9, #0
+    ldrb        r5, [r1], r2             ; Left[1]
+    ldrb        r6, [r1], r2             ; Left[2]
+    usad8       r12, r8, r9
+    ldrb        r7, [r1]                 ; Left[3]
+
+    ; calculate dc
+    add         r4, r4, r5
+    add         r4, r4, r6
+    add         r4, r4, r7
+    add         r4, r4, r12
+    add         r4, r4, #4
+    ldr         r0, [sp, #44]           ; dst_stride
+    mov         r12, r4, asr #3         ; (expected_dc + 4) >> 3
+
+    add         r12, r12, r12, lsl #8
+    ldr         r3, [sp, #40]           ; dst
+    add         r12, r12, r12, lsl #16
+
+    ; store values
+    str         r12, [r3], r0
+    str         r12, [r3], r0
+    str         r12, [r3], r0
+    str         r12, [r3]
+
+    pop        {r4-r12, pc}
+
+b_tm_pred
+    ldr         r8, [r0]                ; Above
+    ldrb        r9, [sp, #48]           ; top_left
+    ldrb        r4, [r1], r2            ; Left[0]
+    ldrb        r5, [r1], r2            ; Left[1]
+    ldrb        r6, [r1], r2            ; Left[2]
+    ldrb        r7, [r1]                ; Left[3]
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
+
+    add         r9, r9, r9, lsl #16     ; [tl|tl]
+    uxtb16      r10, r8                 ; a[2|0]
+    uxtb16      r11, r8, ror #8         ; a[3|1]
+    ssub16      r10, r10, r9            ; a[2|0] - [tl|tl]
+    ssub16      r11, r11, r9            ; a[3|1] - [tl|tl]
+
+    add         r4, r4, r4, lsl #16     ; l[0|0]
+    add         r5, r5, r5, lsl #16     ; l[1|1]
+    add         r6, r6, r6, lsl #16     ; l[2|2]
+    add         r7, r7, r7, lsl #16     ; l[3|3]
+
+    sadd16      r1, r4, r10             ; l[0|0] + a[2|0] - [tl|tl]
+    sadd16      r2, r4, r11             ; l[0|0] + a[3|1] - [tl|tl]
+    usat16      r1, #8, r1
+    usat16      r2, #8, r2
+
+    sadd16      r4, r5, r10             ; l[1|1] + a[2|0] - [tl|tl]
+    sadd16      r5, r5, r11             ; l[1|1] + a[3|1] - [tl|tl]
+
+    add         r12, r1, r2, lsl #8     ; [3|2|1|0]
+    str         r12, [r3], r0
+
+    usat16      r4, #8, r4
+    usat16      r5, #8, r5
+
+    sadd16      r1, r6, r10             ; l[2|2] + a[2|0] - [tl|tl]
+    sadd16      r2, r6, r11             ; l[2|2] + a[3|1] - [tl|tl]
+
+    add         r12, r4, r5, lsl #8     ; [3|2|1|0]
+    str         r12, [r3], r0
+
+    usat16      r1, #8, r1
+    usat16      r2, #8, r2
+
+    sadd16      r4, r7, r10             ; l[3|3] + a[2|0] - [tl|tl]
+    sadd16      r5, r7, r11             ; l[3|3] + a[3|1] - [tl|tl]
+
+    add         r12, r1, r2, lsl #8     ; [3|2|1|0]
+
+    usat16      r4, #8, r4
+    usat16      r5, #8, r5
+
+    str         r12, [r3], r0
+
+    add         r12, r4, r5, lsl #8     ; [3|2|1|0]
+    str         r12, [r3]
+
+    pop        {r4-r12, pc}
+
+b_ve_pred
+    ldr         r8, [r0]                ; a[3|2|1|0]
+    ldr         r11, c00FF00FF
+    ldrb        r9, [sp, #48]           ; top_left
+    ldrb        r10, [r0, #4]           ; a[4]
+
+    ldr         r0, c00020002
+
+    uxtb16      r4, r8                  ; a[2|0]
+    uxtb16      r5, r8, ror #8          ; a[3|1]
+    ldr         r2, [sp, #44]           ; dst_stride
+    pkhbt       r9, r9, r5, lsl #16     ; a[1|-1]
+
+    add         r9, r9, r4, lsl #1      ;[a[1]+2*a[2]       | tl+2*a[0]       ]
+    uxtab16     r9, r9, r5              ;[a[1]+2*a[2]+a[3]  | tl+2*a[0]+a[1]  ]
+    ldr         r3, [sp, #40]           ; dst
+    uxtab16     r9, r9, r0              ;[a[1]+2*a[2]+a[3]+2| tl+2*a[0]+a[1]+2]
+
+    add         r0, r0, r10, lsl #16    ;[a[4]+2            |                 2]
+    add         r0, r0, r4, asr #16     ;[a[4]+2            |            a[2]+2]
+    add         r0, r0, r5, lsl #1      ;[a[4]+2*a[3]+2     |     a[2]+2*a[1]+2]
+    uadd16      r4, r4, r0              ;[a[4]+2*a[3]+a[2]+2|a[2]+2*a[1]+a[0]+2]
+
+    and         r9, r11, r9, asr #2
+    and         r4, r11, r4, asr #2
+    add         r9, r9, r4, lsl #8
+
+    ; store values
+    str         r9, [r3], r2
+    str         r9, [r3], r2
+    str         r9, [r3], r2
+    str         r9, [r3]
+
+    pop        {r4-r12, pc}
+
+
+b_he_pred
+    ldrb        r4, [r1], r2            ; Left[0]
+    ldrb        r8, [sp, #48]           ; top_left
+    ldrb        r5, [r1], r2            ; Left[1]
+    ldrb        r6, [r1], r2            ; Left[2]
+    ldrb        r7, [r1]                ; Left[3]
+
+    add         r8, r8, r4              ; tl   + l[0]
+    add         r9, r4, r5              ; l[0] + l[1]
+    add         r10, r5, r6             ; l[1] + l[2]
+    add         r11, r6, r7             ; l[2] + l[3]
+
+    mov         r0, #2<<14
+
+    add         r8, r8, r9              ; tl + 2*l[0] + l[1]
+    add         r4, r9, r10             ; l[0] + 2*l[1] + l[2]
+    add         r5, r10, r11            ; l[1] + 2*l[2] + l[3]
+    add         r6, r11, r7, lsl #1     ; l[2] + 2*l[3] + l[3]
+
+
+    add         r8, r0, r8, lsl #14     ; (tl + 2*l[0] + l[1])>>2 in top half
+    add         r9, r0, r4, lsl #14     ; (l[0] + 2*l[1] + l[2])>>2 in top half
+    add         r10,r0, r5, lsl #14     ; (l[1] + 2*l[2] + l[3])>>2 in top half
+    add         r11,r0, r6, lsl #14     ; (l[2] + 2*l[3] + l[3])>>2 in top half
+
+    pkhtb       r8, r8, r8, asr #16     ; l[-|0|-|0]
+    pkhtb       r9, r9, r9, asr #16     ; l[-|1|-|1]
+    pkhtb       r10, r10, r10, asr #16  ; l[-|2|-|2]
+    pkhtb       r11, r11, r11, asr #16  ; l[-|3|-|3]
+
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
+
+    add         r8, r8, r8, lsl #8      ; l[0|0|0|0]
+    add         r9, r9, r9, lsl #8      ; l[1|1|1|1]
+    add         r10, r10, r10, lsl #8   ; l[2|2|2|2]
+    add         r11, r11, r11, lsl #8   ; l[3|3|3|3]
+
+    ; store values
+    str         r8, [r3], r0
+    str         r9, [r3], r0
+    str         r10, [r3], r0
+    str         r11, [r3]
+
+    pop        {r4-r12, pc}
+
+b_ld_pred
+    ldr         r4, [r0]                ; Above[0-3]
+    ldr         r12, c00020002
+    ldr         r5, [r0, #4]            ; Above[4-7]
+    ldr         lr,  c00FF00FF
+
+    uxtb16      r6, r4                  ; a[2|0]
+    uxtb16      r7, r4, ror #8          ; a[3|1]
+    uxtb16      r8, r5                  ; a[6|4]
+    uxtb16      r9, r5, ror #8          ; a[7|5]
+    pkhtb       r10, r6, r8             ; a[2|4]
+    pkhtb       r11, r7, r9             ; a[3|5]
+
+    add         r4, r6, r7, lsl #1      ; [a2+2*a3      |      a0+2*a1]
+    add         r4, r4, r10, ror #16    ; [a2+2*a3+a4   |   a0+2*a1+a2]
+    uxtab16     r4, r4, r12             ; [a2+2*a3+a4+2 | a0+2*a1+a2+2]
+
+    add         r5, r7, r10, ror #15    ; [a3+2*a4      |      a1+2*a2]
+    add         r5, r5, r11, ror #16    ; [a3+2*a4+a5   |   a1+2*a2+a3]
+    uxtab16     r5, r5, r12             ; [a3+2*a4+a5+2 | a1+2*a2+a3+2]
+
+    pkhtb       r7, r9, r8, asr #16
+    add         r6, r8, r9, lsl #1      ; [a6+2*a7      |      a4+2*a5]
+    uadd16      r6, r6, r7              ; [a6+2*a7+a7   |   a4+2*a5+a6]
+    uxtab16     r6, r6, r12             ; [a6+2*a7+a7+2 | a4+2*a5+a6+2]
+
+    uxth        r7, r9                  ; [                         a5]
+    add         r7, r7, r8, asr #15     ; [                    a5+2*a6]
+    add         r7, r7, r9, asr #16     ; [                 a5+2*a6+a7]
+    uxtah       r7, r7, r12             ; [               a5+2*a6+a7+2]
+
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
+
+    ; scale down
+    and         r4, lr, r4, asr #2
+    and         r5, lr, r5, asr #2
+    and         r6, lr, r6, asr #2
+    mov         r7, r7, asr #2
+
+    add         r8, r4, r5, lsl #8      ; [3|2|1|0]
+    str         r8, [r3], r0
+
+    mov         r9, r8, lsr #8
+    add         r9, r9, r6, lsl #24     ; [4|3|2|1]
+    str         r9, [r3], r0
+
+    mov         r10, r9, lsr #8
+    add         r10, r10, r7, lsl #24   ; [5|4|3|2]
+    str         r10, [r3], r0
+
+    mov         r6, r6, lsr #16
+    mov         r11, r10, lsr #8
+    add         r11, r11, r6, lsl #24   ; [6|5|4|3]
+    str         r11, [r3]
+
+    pop        {r4-r12, pc}
+
+b_rd_pred
+    ldrb        r7, [r1], r2            ; l[0] = pp[3]
+    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
+    ldrb        r8, [sp, #48]           ; tl   = pp[4]
+    ldrb        r6, [r1], r2            ; l[1] = pp[2]
+    ldrb        r5, [r1], r2            ; l[2] = pp[1]
+    ldrb        r4, [r1], r2            ; l[3] = pp[0]
+
+
+    uxtb16      r9, lr                  ; p[7|5]
+    uxtb16      r10, lr, ror #8         ; p[8|6]
+    add         r4, r4, r6, lsl #16     ; p[2|0]
+    add         r5, r5, r7, lsl #16     ; p[3|1]
+    add         r6, r6, r8, lsl #16     ; p[4|2]
+    pkhbt       r7, r7, r9, lsl #16     ; p[5|3]
+    pkhbt       r8, r8, r10, lsl #16    ; p[6|4]
+
+    ldr         r12, c00020002
+    ldr         lr,  c00FF00FF
+
+    add         r4, r4, r5, lsl #1      ; [p2+2*p3      |      p0+2*p1]
+    add         r4, r4, r6              ; [p2+2*p3+p4   |   p0+2*p1+p2]
+    uxtab16     r4, r4, r12             ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
+
+    add         r5, r5, r6, lsl #1      ; [p3+2*p4      |      p1+2*p2]
+    add         r5, r5, r7              ; [p3+2*p4+p5   |   p1+2*p2+p3]
+    uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
+
+    add         r6, r7, r8, lsl #1      ; [p5+2*p6      |      p3+2*p4]
+    add         r6, r6, r9              ; [p5+2*p6+p7   |   p3+2*p4+p5]
+    uxtab16     r6, r6, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
+
+    add         r7, r8, r9, lsl #1      ; [p6+2*p7      |      p4+2*p5]
+    add         r7, r7, r10             ; [p6+2*p7+p8   |   p4+2*p5+p6]
+    uxtab16     r7, r7, r12             ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
+
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
+
+    ; scale down
+    and         r7, lr, r7, asr #2
+    and         r6, lr, r6, asr #2
+    and         r5, lr, r5, asr #2
+    and         r4, lr, r4, asr #2
+
+    add         r8, r6, r7, lsl #8      ; [6|5|4|3]
+    str         r8, [r3], r0
+
+    mov         r9, r8, lsl #8          ; [5|4|3|-]
+    uxtab       r9, r9, r4, ror #16     ; [5|4|3|2]
+    str         r9, [r3], r0
+
+    mov         r10, r9, lsl #8         ; [4|3|2|-]
+    uxtab       r10, r10, r5            ; [4|3|2|1]
+    str         r10, [r3], r0
+
+    mov         r11, r10, lsl #8        ; [3|2|1|-]
+    uxtab       r11, r11, r4            ; [3|2|1|0]
+    str         r11, [r3]
+
+    pop        {r4-r12, pc}
+
+b_vr_pred
+    ldrb        r7, [r1], r2            ; l[0] = pp[3]
+    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
+    ldrb        r8, [sp, #48]           ; tl   = pp[4]
+    ldrb        r6, [r1], r2            ; l[1] = pp[2]
+    ldrb        r5, [r1], r2            ; l[2] = pp[1]
+    ldrb        r4, [r1]                ; l[3] = pp[0]
+
+    add         r5, r5, r7, lsl #16     ; p[3|1]
+    add         r6, r6, r8, lsl #16     ; p[4|2]
+    uxtb16      r9, lr                  ; p[7|5]
+    uxtb16      r10, lr, ror #8         ; p[8|6]
+    pkhbt       r7, r7, r9, lsl #16     ; p[5|3]
+    pkhbt       r8, r8, r10, lsl #16    ; p[6|4]
+
+    ldr         r4,  c00010001
+    ldr         r12, c00020002
+    ldr         lr,  c00FF00FF
+
+    add         r5, r5, r6, lsl #1      ; [p3+2*p4      |      p1+2*p2]
+    add         r5, r5, r7              ; [p3+2*p4+p5   |   p1+2*p2+p3]
+    uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
+
+    add         r6, r6, r7, lsl #1      ; [p4+2*p5      |      p2+2*p3]
+    add         r6, r6, r8              ; [p4+2*p5+p6   |   p2+2*p3+p4]
+    uxtab16     r6, r6, r12             ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
+
+    uadd16      r11, r8, r9             ; [p6+p7        |        p4+p5]
+    uhadd16     r11, r11, r4            ; [(p6+p7+1)>>1 | (p4+p5+1)>>1]
+                                        ; [F|E]
+
+    add         r7, r7, r8, lsl #1      ; [p5+2*p6      |      p3+2*p4]
+    add         r7, r7, r9              ; [p5+2*p6+p7   |   p3+2*p4+p5]
+    uxtab16     r7, r7, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
+
+    uadd16      r2, r9, r10             ; [p7+p8        |        p5+p6]
+    uhadd16     r2, r2, r4              ; [(p7+p8+1)>>1 | (p5+p6+1)>>1]
+                                        ; [J|I]
+
+    add         r8, r8, r9, lsl #1      ; [p6+2*p7      |      p4+2*p5]
+    add         r8, r8, r10             ; [p6+2*p7+p8   |   p4+2*p5+p6]
+    uxtab16     r8, r8, r12             ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
+
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
+
+    ; scale down
+    and         r5, lr, r5, asr #2      ; [B|A]
+    and         r6, lr, r6, asr #2      ; [D|C]
+    and         r7, lr, r7, asr #2      ; [H|G]
+    and         r8, lr, r8, asr #2      ; [L|K]
+
+    add         r12, r11, r2, lsl #8    ; [J|F|I|E]
+    str         r12, [r3], r0
+
+    add         r12, r7, r8, lsl #8     ; [L|H|K|G]
+    str         r12, [r3], r0
+
+    pkhbt       r2, r6, r2, lsl #16     ; [-|I|-|C]
+    add         r2, r2, r11, lsl #8     ; [F|I|E|C]
+
+    pkhtb       r12, r6, r5             ; [-|D|-|A]
+    pkhtb       r10, r7, r5, asr #16    ; [-|H|-|B]
+    str         r2, [r3], r0
+    add         r12, r12, r10, lsl #8   ; [H|D|B|A]
+    str         r12, [r3]
+
+    pop        {r4-r12, pc}
+
+b_vl_pred
+    ldr         r4, [r0]                ; [3|2|1|0] = Above[0-3]
+    ldr         r12, c00020002
+    ldr         r5, [r0, #4]            ; [7|6|5|4] = Above[4-7]
+    ldr         lr,  c00FF00FF
+    ldr         r2,  c00010001
+
+    mov         r0, r4, lsr #16         ; [-|-|3|2]
+    add         r0, r0, r5, lsl #16     ; [5|4|3|2]
+    uxtb16      r6, r4                  ; [2|0]
+    uxtb16      r7, r4, ror #8          ; [3|1]
+    uxtb16      r8, r0                  ; [4|2]
+    uxtb16      r9, r0, ror #8          ; [5|3]
+    uxtb16      r10, r5                 ; [6|4]
+    uxtb16      r11, r5, ror #8         ; [7|5]
+
+    uadd16      r4, r6, r7              ; [p2+p3        |        p0+p1]
+    uhadd16     r4, r4, r2              ; [(p2+p3+1)>>1 | (p0+p1+1)>>1]
+                                        ; [B|A]
+
+    add         r5, r6, r7, lsl #1      ; [p2+2*p3      |      p0+2*p1]
+    add         r5, r5, r8              ; [p2+2*p3+p4   |   p0+2*p1+p2]
+    uxtab16     r5, r5, r12             ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
+
+    uadd16      r6, r7, r8              ; [p3+p4        |        p1+p2]
+    uhadd16     r6, r6, r2              ; [(p3+p4+1)>>1 | (p1+p2+1)>>1]
+                                        ; [F|E]
+
+    add         r7, r7, r8, lsl #1      ; [p3+2*p4      |      p1+2*p2]
+    add         r7, r7, r9              ; [p3+2*p4+p5   |   p1+2*p2+p3]
+    uxtab16     r7, r7, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
+
+    add         r8, r8, r9, lsl #1      ; [p4+2*p5      |      p2+2*p3]
+    add         r8, r8, r10             ; [p4+2*p5+p6   |   p2+2*p3+p4]
+    uxtab16     r8, r8, r12             ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
+
+    add         r9, r9, r10, lsl #1     ; [p5+2*p6      |      p3+2*p4]
+    add         r9, r9, r11             ; [p5+2*p6+p7   |   p3+2*p4+p5]
+    uxtab16     r9, r9, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
+
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
+
+    ; scale down
+    and         r5, lr, r5, asr #2      ; [D|C]
+    and         r7, lr, r7, asr #2      ; [H|G]
+    and         r8, lr, r8, asr #2      ; [I|D]
+    and         r9, lr, r9, asr #2      ; [J|H]
+
+    add         r10, r4, r6, lsl #8     ; [F|B|E|A]
+    str         r10, [r3], r0
+
+    add         r5, r5, r7, lsl #8      ; [H|C|G|D]
+    str         r5, [r3], r0
+
+    pkhtb       r12, r8, r4, asr #16    ; [-|I|-|B]
+    pkhtb       r10, r9, r8             ; [-|J|-|D]
+
+    add         r12, r6, r12, lsl #8    ; [I|F|B|E]
+    str         r12, [r3], r0
+
+    add         r10, r7, r10, lsl #8    ; [J|H|D|G]
+    str         r10, [r3]
+
+    pop        {r4-r12, pc}
+
+b_hd_pred
+    ldrb        r7, [r1], r2            ; l[0] = pp[3]
+    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
+    ldrb        r8, [sp, #48]           ; tl   = pp[4]
+    ldrb        r6, [r1], r2            ; l[1] = pp[2]
+    ldrb        r5, [r1], r2            ; l[2] = pp[1]
+    ldrb        r4, [r1]                ; l[3] = pp[0]
+
+    uxtb16      r9, lr                  ; p[7|5]
+    uxtb16      r10, lr, ror #8         ; p[8|6]
+
+    add         r4, r4, r5, lsl #16     ; p[1|0]
+    add         r5, r5, r6, lsl #16     ; p[2|1]
+    add         r6, r6, r7, lsl #16     ; p[3|2]
+    add         r7, r7, r8, lsl #16     ; p[4|3]
+
+    ldr         r12, c00020002
+    ldr         lr,  c00FF00FF
+    ldr         r2,  c00010001
+
+    pkhtb       r8, r7, r9              ; p[4|5]
+    pkhtb       r1, r9, r10             ; p[7|6]
+    pkhbt       r10, r8, r10, lsl #16   ; p[6|5]
+
+    uadd16      r11, r4, r5             ; [p1+p2        |        p0+p1]
+    uhadd16     r11, r11, r2            ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
+                                        ; [B|A]
+
+    add         r4, r4, r5, lsl #1      ; [p1+2*p2      |      p0+2*p1]
+    add         r4, r4, r6              ; [p1+2*p2+p3   |   p0+2*p1+p2]
+    uxtab16     r4, r4, r12             ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
+
+    uadd16      r0, r6, r7              ; [p3+p4        |        p2+p3]
+    uhadd16     r0, r0, r2              ; [(p3+p4+1)>>1 | (p2+p3+1)>>1]
+                                        ; [F|E]
+
+    add         r5, r6, r7, lsl #1      ; [p3+2*p4      |      p2+2*p3]
+    add         r5, r5, r8, ror #16     ; [p3+2*p4+p5   |   p2+2*p3+p4]
+    uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p2+2*p3+p4+2]
+
+    add         r6, r12, r8, ror #16    ; [p5+2         |         p4+2]
+    add         r6, r6, r10, lsl #1     ; [p5+2+2*p6    |    p4+2+2*p5]
+    uxtab16     r6, r6, r1              ; [p5+2+2*p6+p7 | p4+2+2*p5+p6]
+
+    ; scale down
+    and         r4, lr, r4, asr #2      ; [D|C]
+    and         r5, lr, r5, asr #2      ; [H|G]
+    and         r6, lr, r6, asr #2      ; [J|I]
+
+    ldr         lr, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
+
+    pkhtb       r2, r0, r6              ; [-|F|-|I]
+    pkhtb       r12, r6, r5, asr #16    ; [-|J|-|H]
+    add         r12, r12, r2, lsl #8    ; [F|J|I|H]
+    add         r2, r0, r5, lsl #8      ; [H|F|G|E]
+    mov         r12, r12, ror #24       ; [J|I|H|F]
+    str         r12, [r3], lr
+
+    mov         r7, r11, asr #16        ; [-|-|-|B]
+    str         r2, [r3], lr
+    add         r7, r7, r0, lsl #16     ; [-|E|-|B]
+    add         r7, r7, r4, asr #8      ; [-|E|D|B]
+    add         r7, r7, r5, lsl #24     ; [G|E|D|B]
+    str         r7, [r3], lr
+
+    add         r5, r11, r4, lsl #8     ; [D|B|C|A]
+    str         r5, [r3]
+
+    pop        {r4-r12, pc}
+
+
+
+b_hu_pred
+    ldrb        r4, [r1], r2            ; Left[0]
+    ldr         r12, c00020002
+    ldrb        r5, [r1], r2            ; Left[1]
+    ldr         lr,  c00FF00FF
+    ldrb        r6, [r1], r2            ; Left[2]
+    ldr         r2,  c00010001
+    ldrb        r7, [r1]                ; Left[3]
+
+    add         r4, r4, r5, lsl #16     ; [1|0]
+    add         r5, r5, r6, lsl #16     ; [2|1]
+    add         r9, r6, r7, lsl #16     ; [3|2]
+
+    uadd16      r8, r4, r5              ; [p1+p2        |        p0+p1]
+    uhadd16     r8, r8, r2              ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
+                                        ; [B|A]
+
+    add         r4, r4, r5, lsl #1      ; [p1+2*p2      |      p0+2*p1]
+    add         r4, r4, r9              ; [p1+2*p2+p3   |   p0+2*p1+p2]
+    uxtab16     r4, r4, r12             ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
+    ldr         r2, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
+    and         r4, lr, r4, asr #2      ; [D|C]
+
+    add         r10, r6, r7             ; [p2+p3]
+    add         r11, r10, r7, lsl #1    ; [p2+3*p3]
+    add         r10, r10, #1
+    add         r11, r11, #2
+    mov         r10, r10, asr #1        ; [E]
+    mov         r11, r11, asr #2        ; [F]
+
+    add         r9, r7, r9, asr #8      ; [-|-|G|G]
+    add         r0, r8, r4, lsl #8      ; [D|B|C|A]
+    add         r7, r9, r9, lsl #16     ; [G|G|G|G]
+
+    str         r0, [r3], r2
+
+    mov         r1, r8, asr #16         ; [-|-|-|B]
+    add         r1, r1, r4, asr #8      ; [-|-|D|B]
+    add         r1, r1, r10, lsl #16    ; [-|E|D|B]
+    add         r1, r1, r11, lsl #24    ; [F|E|D|B]
+    str         r1, [r3], r2
+
+    add         r10, r11, lsl #8        ; [-|-|F|E]
+    add         r10, r10, r9, lsl #16   ; [G|G|F|E]
+    str         r10, [r3], r2
+
+    str         r7, [r3]
+
+    pop        {r4-r12, pc}
+
+    ENDP
+
+; constants
+c00010001
+    DCD         0x00010001
+c00020002
+    DCD         0x00020002
+c00FF00FF
+    DCD         0x00FF00FF
+
+    END
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ b/vp8/common/arm/armv6/iwalsh_v6.asm
@ -0,0 +1,136 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_inv_walsh4x4_v6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
+|vp8_short_inv_walsh4x4_v6| PROC
+
+    stmdb       sp!, {r4 - r12, lr}
+
+    ldr         r2, [r0, #0]         ; [1  |  0]
+    ldr         r3, [r0, #4]         ; [3  |  2]
+    ldr         r4, [r0, #8]         ; [5  |  4]
+    ldr         r5, [r0, #12]        ; [7  |  6]
+    ldr         r6, [r0, #16]        ; [9  |  8]
+    ldr         r7, [r0, #20]        ; [11 | 10]
+    ldr         r8, [r0, #24]        ; [13 | 12]
+    ldr         r9, [r0, #28]        ; [15 | 14]
+
+    qadd16      r10, r2, r8          ; a1 [1+13  |  0+12]
+    qadd16      r11, r4, r6          ; b1 [5+9   |  4+8]
+    qsub16      r12, r4, r6          ; c1 [5-9   |  4-8]
+    qsub16      lr, r2, r8           ; d1 [1-13  |  0-12]
+
+    qadd16      r2, r10, r11         ; a1 + b1 [1  |  0]
+    qadd16      r4, r12, lr          ; c1 + d1 [5  |  4]
+    qsub16      r6, r10, r11         ; a1 - b1 [9  |  8]
+    qsub16      r8, lr, r12          ; d1 - c1 [13 | 12]
+
+    qadd16      r10, r3, r9          ; a1 [3+15  |  2+14]
+    qadd16      r11, r5, r7          ; b1 [7+11  |  6+10]
+    qsub16      r12, r5, r7          ; c1 [7-11  |  6-10]
+    qsub16      lr, r3, r9           ; d1 [3-15  |  2-14]
+
+    qadd16      r3, r10, r11         ; a1 + b1 [3  |  2]
+    qadd16      r5, r12, lr          ; c1 + d1 [7  |  6]
+    qsub16      r7, r10, r11         ; a1 - b1 [11 | 10]
+    qsub16      r9, lr, r12          ; d1 - c1 [15 | 14]
+
+    ; first transform complete
+
+    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
+    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
+    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
+    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
+
+    qaddsubx    r2, r10, r11         ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r3, r11, r10         ; [a2|d2] [b1+a1 | d1-c1]
+    ldr         r10, c0x00030003
+    qaddsubx    r4, r12, lr          ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r5, lr, r12          ; [a2|d2] [b1+a1 | d1-c1]
+
+    qadd16      r2, r2, r10          ; [b2+3|c2+3]
+    qadd16      r3, r3, r10          ; [a2+3|d2+3]
+    qadd16      r4, r4, r10          ; [b2+3|c2+3]
+    qadd16      r5, r5, r10          ; [a2+3|d2+3]
+
+    asr         r12, r3, #19         ; [0]
+    strh        r12, [r1], #32
+    asr         lr, r2, #19          ; [1]
+    strh        lr, [r1], #32
+    sxth        r2, r2
+    sxth        r3, r3
+    asr         r2, r2, #3           ; [2]
+    strh        r2, [r1], #32
+    asr         r3, r3, #3           ; [3]
+    strh        r3, [r1], #32
+
+    asr         r12, r5, #19         ; [4]
+    strh        r12, [r1], #32
+    asr         lr, r4, #19          ; [5]
+    strh        lr, [r1], #32
+    sxth        r4, r4
+    sxth        r5, r5
+    asr         r4, r4, #3           ; [6]
+    strh        r4, [r1], #32
+    asr         r5, r5, #3           ; [7]
+    strh        r5, [r1], #32
+
+    qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11]
+    qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11]
+    qsubaddx    r4, r8, r9           ; [c1|a1] [13-14 | 12+15]
+    qaddsubx    r5, r8, r9           ; [b1|d1] [13+14 | 12-15]
+
+    qaddsubx    r6, r2, r3           ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r7, r3, r2           ; [a2|d2] [b1+a1 | d1-c1]
+    qaddsubx    r8, r4, r5           ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r9, r5, r4           ; [a2|d2] [b1+a1 | d1-c1]
+
+    qadd16      r6, r6, r10          ; [b2+3|c2+3]
+    qadd16      r7, r7, r10          ; [a2+3|d2+3]
+    qadd16      r8, r8, r10          ; [b2+3|c2+3]
+    qadd16      r9, r9, r10          ; [a2+3|d2+3]
+
+    asr         r12, r7, #19         ; [8]
+    strh        r12, [r1], #32
+    asr         lr, r6, #19          ; [9]
+    strh        lr, [r1], #32
+    sxth        r6, r6
+    sxth        r7, r7
+    asr         r6, r6, #3           ; [10]
+    strh        r6, [r1], #32
+    asr         r7, r7, #3           ; [11]
+    strh        r7, [r1], #32
+
+    asr         r12, r9, #19         ; [12]
+    strh        r12, [r1], #32
+    asr         lr, r8, #19          ; [13]
+    strh        lr, [r1], #32
+    sxth        r8, r8
+    sxth        r9, r9
+    asr         r8, r8, #3           ; [14]
+    strh        r8, [r1], #32
+    asr         r9, r9, #3           ; [15]
+    strh        r9, [r1], #32
+
+    ldmia       sp!, {r4 - r12, pc}
+    ENDP        ; |vp8_short_inv_walsh4x4_v6|
+
+
+; Constant Pool
+c0x00030003 DCD 0x00030003
+    END
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ b/vp8/common/arm/armv6/loopfilter_v6.asm
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@ -0,0 +1,286 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
+    EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+    MACRO
+    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+    ; a0: 03 02 01 00
+    ; a1: 13 12 11 10
+    ; a2: 23 22 21 20
+    ; a3: 33 32 31 30
+    ;     b3 b2 b1 b0
+
+    uxtb16      $b1, $a1                    ; xx 12 xx 10
+    uxtb16      $b0, $a0                    ; xx 02 xx 00
+    uxtb16      $b3, $a3                    ; xx 32 xx 30
+    uxtb16      $b2, $a2                    ; xx 22 xx 20
+    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
+    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
+
+    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
+    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
+    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
+    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
+    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
+    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
+
+    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
+    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
+
+    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
+    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
+    MEND
+
+
+
+src         RN  r0
+pstep       RN  r1
+
+;r0     unsigned char *src_ptr,
+;r1     int src_pixel_step,
+;r2     const char *blimit
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrb        r12, [r2]                   ; blimit
+    ldr         r3, [src, -pstep, lsl #1]   ; p1
+    ldr         r4, [src, -pstep]           ; p0
+    ldr         r5, [src]                   ; q0
+    ldr         r6, [src, pstep]            ; q1
+    orr         r12, r12, r12, lsl #8       ; blimit
+    ldr         r2, c0x80808080
+    orr         r12, r12, r12, lsl #16      ; blimit
+    mov         r9, #4                      ; double the count. we're doing 4 at a time
+    mov         lr, #0                      ; need 0 in a couple places
+
+|simple_hnext8|
+    ; vp8_simple_filter_mask()
+
+    uqsub8      r7, r3, r6                  ; p1 - q1
+    uqsub8      r8, r6, r3                  ; q1 - p1
+    uqsub8      r10, r4, r5                 ; p0 - q0
+    uqsub8      r11, r5, r4                 ; q0 - p0
+    orr         r8, r8, r7                  ; abs(p1 - q1)
+    orr         r10, r10, r11               ; abs(p0 - q0)
+    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
+    uhadd8      r8, r8, lr                  ; abs(p1 - q2) >> 1
+    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn         r8, #0
+    usub8       r10, r12, r10               ; compare to flimit. usub8 sets GE flags
+    sel         r10, r8, lr                 ; filter mask: F or 0
+    cmp         r10, #0
+    beq         simple_hskip_filter         ; skip filtering if all masks are 0x00
+
+    ;vp8_simple_filter()
+
+    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
+    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
+    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
+    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
+
+    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
+    qsub8       r6, r5, r4                  ; q0 - p0
+    qadd8       r3, r3, r6                  ; += q0 - p0
+    ldr         r7, c0x04040404
+    qadd8       r3, r3, r6                  ; += q0 - p0
+    ldr         r8, c0x03030303
+    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
+    ;STALL
+    and         r3, r3, r10                 ; vp8_filter &= mask
+
+    qadd8       r7 , r3 , r7                ; Filter1 = vp8_filter + 4
+    qadd8       r8 , r3 , r8                ; Filter2 = vp8_filter + 3
+
+    shadd8      r7 , r7 , lr
+    shadd8      r8 , r8 , lr
+    shadd8      r7 , r7 , lr
+    shadd8      r8 , r8 , lr
+    shadd8      r7 , r7 , lr                ; Filter1 >>= 3
+    shadd8      r8 , r8 , lr                ; Filter2 >>= 3
+
+    qsub8       r5 ,r5, r7                  ; u = q0 - Filter1
+    qadd8       r4, r4, r8                  ; u = p0 + Filter2
+    eor         r5, r5, r2                  ; *oq0 = u^0x80
+    str         r5, [src]                   ; store oq0 result
+    eor         r4, r4, r2                  ; *op0 = u^0x80
+    str         r4, [src, -pstep]           ; store op0 result
+
+|simple_hskip_filter|
+    subs        r9, r9, #1
+    addne       src, src, #4                ; next row
+
+    ldrne       r3, [src, -pstep, lsl #1]   ; p1
+    ldrne       r4, [src, -pstep]           ; p0
+    ldrne       r5, [src]                   ; q0
+    ldrne       r6, [src, pstep]            ; q1
+
+    bne         simple_hnext8
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrb        r12, [r2]                   ; r12: blimit
+    ldr         r2, c0x80808080
+    orr         r12, r12, r12, lsl #8
+
+    ; load soure data to r7, r8, r9, r10
+    ldrh        r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
+    ldrh        r4, [src], pstep
+    orr         r12, r12, r12, lsl #16
+
+    ldrh        r5, [src, #-2]
+    pld         [src, #23]
+    ldrh        r6, [src], pstep
+
+    pkhbt       r7, r3, r4, lsl #16
+
+    ldrh        r3, [src, #-2]
+    pld         [src, #23]
+    ldrh        r4, [src], pstep
+
+    pkhbt       r8, r5, r6, lsl #16
+
+    ldrh        r5, [src, #-2]
+    pld         [src, #23]
+    ldrh        r6, [src], pstep
+    mov         r11, #4                     ; double the count. we're doing 4 at a time
+
+|simple_vnext8|
+    ; vp8_simple_filter_mask() function
+    pkhbt       r9, r3, r4, lsl #16
+    pkhbt       r10, r5, r6, lsl #16
+
+    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+    uqsub8      r7, r3, r6                  ; p1 - q1
+    uqsub8      r8, r6, r3                  ; q1 - p1
+    uqsub8      r9, r4, r5                  ; p0 - q0
+    uqsub8      r10, r5, r4                 ; q0 - p0
+    orr         r7, r7, r8                  ; abs(p1 - q1)
+    orr         r9, r9, r10                 ; abs(p0 - q0)
+    mov         r8, #0
+    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
+    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
+    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn         r10, #0                     ; r10 == -1
+
+    usub8       r7, r12, r7                 ; compare to flimit
+    sel         lr, r10, r8                 ; filter mask
+
+    cmp         lr, #0
+    beq         simple_vskip_filter         ; skip filtering
+
+    ;vp8_simple_filter() function
+    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
+    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
+    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
+    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
+
+    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
+    qsub8       r6, r5, r4                  ; q0 - p0
+
+    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
+    ldr         r9, c0x03030303             ; r9 = 3
+
+    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
+    ldr         r7, c0x04040404
+
+    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
+    ;STALL
+    and         r3, r3, lr                  ; vp8_filter &= mask
+
+    qadd8       r9 , r3 , r9                ; Filter2 = vp8_filter + 3
+    qadd8       r3 , r3 , r7                ; Filter1 = vp8_filter + 4
+
+    shadd8      r9 , r9 , r8
+    shadd8      r3 , r3 , r8
+    shadd8      r9 , r9 , r8
+    shadd8      r3 , r3 , r8
+    shadd8      r9 , r9 , r8                ; Filter2 >>= 3
+    shadd8      r3 , r3 , r8                ; Filter1 >>= 3
+
+    ;calculate output
+    sub         src, src, pstep, lsl #2
+
+    qadd8       r4, r4, r9                  ; u = p0 + Filter2
+    qsub8       r5, r5, r3                  ; u = q0 - Filter1
+    eor         r4, r4, r2                  ; *op0 = u^0x80
+    eor         r5, r5, r2                  ; *oq0 = u^0x80
+
+    strb        r4, [src, #-1]              ; store the result
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    strb        r5, [src], pstep
+
+|simple_vskip_filter|
+    subs        r11, r11, #1
+
+    ; load soure data to r7, r8, r9, r10
+    ldrneh      r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
+    ldrneh      r4, [src], pstep
+
+    ldrneh      r5, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r6, [src], pstep
+
+    pkhbt       r7, r3, r4, lsl #16
+
+    ldrneh      r3, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r4, [src], pstep
+
+    pkhbt       r8, r5, r6, lsl #16
+
+    ldrneh      r5, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r6, [src], pstep
+
+    bne         simple_vnext8
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD     0x80808080
+c0x03030303 DCD     0x03030303
+c0x04040404 DCD     0x04040404
+
+    END
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@ -0,0 +1,273 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict8x4_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------------
+; r0    unsigned char *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack unsigned char *dst_ptr,
+; stack int  dst_pitch
+;-------------------------------------
+;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184.
+;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack,
+;and the result is stored in transpose.
+|vp8_sixtap_predict8x4_armv6| PROC
+    stmdb       sp!, {r4 - r11, lr}
+    str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset
+
+    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
+    add         lr, sp, #4                  ;point to temporary buffer
+    beq         skip_firstpass_filter
+
+;first-pass filter
+    adr         r12, filter8_coeff
+    sub         r0, r0, r1, lsl #1
+
+    add         r3, r1, #10                 ; preload next low
+    pld         [r0, r3]
+
+    add         r2, r12, r2, lsl #4         ;calculate filter location
+    add         r0, r0, #3                  ;adjust src only for loading convinience
+
+    ldr         r3, [r2]                    ; load up packed filter coefficients
+    ldr         r4, [r2, #4]
+    ldr         r5, [r2, #8]
+
+    mov         r2, #0x90000                ; height=9 is top part of counter
+
+    sub         r1, r1, #8
+
+|first_pass_hloop_v6|
+    ldrb        r6, [r0, #-5]               ; load source data
+    ldrb        r7, [r0, #-4]
+    ldrb        r8, [r0, #-3]
+    ldrb        r9, [r0, #-2]
+    ldrb        r10, [r0, #-1]
+
+    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2
+
+    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
+    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
+
+    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
+    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
+
+|first_pass_wloop_v6|
+    smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1]
+    smuad       r12, r7, r3
+
+    ldrb        r6, [r0], #1
+
+    smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3]
+    ldrb        r7, [r0], #1
+    smlad       r12, r9, r4, r12
+
+    pkhbt       r10, r10, r6, lsl #16       ; r10 | r9
+    pkhbt       r6, r6, r7, lsl #16         ; r11 | r10
+    smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5]
+    smlad       r12, r6, r5, r12
+
+    sub         r2, r2, #1
+
+    add         r11, r11, #0x40             ; round_shift_and_clamp
+    tst         r2, #0xff                   ; test loop counter
+    usat        r11, #8, r11, asr #7
+    add         r12, r12, #0x40
+    strh        r11, [lr], #20              ; result is transposed and stored, which
+    usat        r12, #8, r12, asr #7
+
+    strh        r12, [lr], #20
+
+    movne       r11, r6
+    movne       r12, r7
+
+    movne       r6, r8
+    movne       r7, r9
+    movne       r8, r10
+    movne       r9, r11
+    movne       r10, r12
+
+    bne         first_pass_wloop_v6
+
+    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
+    ;;IF ARCHITECTURE=6
+    ;pld        [src, ppl]
+    ;;pld       [src, r9]
+    ;;ENDIF
+
+    subs        r2, r2, #0x10000
+
+    sub         lr, lr, #158
+
+    add         r0, r0, r1                  ; move to next input line
+
+    add         r11, r1, #18                ; preload next low. adding back block width(=8), which is subtracted earlier
+    pld         [r0, r11]
+
+    bne         first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+    ldr         r3, [sp], #4                ; load back yoffset
+    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
+    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
+
+    cmp         r3, #0
+    beq         skip_secondpass_filter
+
+    adr         r12, filter8_coeff
+    add         lr, r12, r3, lsl #4         ;calculate filter location
+
+    mov         r2, #0x00080000
+
+    ldr         r3, [lr]                    ; load up packed filter coefficients
+    ldr         r4, [lr, #4]
+    ldr         r5, [lr, #8]
+
+    pkhbt       r12, r4, r3                 ; pack the filter differently
+    pkhbt       r11, r5, r4
+
+second_pass_hloop_v6
+    ldr         r6, [sp]                    ; load the data
+    ldr         r7, [sp, #4]
+
+    orr         r2, r2, #2                  ; loop counter
+
+second_pass_wloop_v6
+    smuad       lr, r3, r6                  ; apply filter
+    smulbt      r10, r3, r6
+
+    ldr         r8, [sp, #8]
+
+    smlad       lr, r4, r7, lr
+    smladx      r10, r12, r7, r10
+
+    ldrh        r9, [sp, #12]
+
+    smlad       lr, r5, r8, lr
+    smladx      r10, r11, r8, r10
+
+    add         sp, sp, #4
+    smlatb      r10, r5, r9, r10
+
+    sub         r2, r2, #1
+
+    add         lr, lr, #0x40               ; round_shift_and_clamp
+    tst         r2, #0xff
+    usat        lr, #8, lr, asr #7
+    add         r10, r10, #0x40
+    strb        lr, [r0], r1                ; the result is transposed back and stored
+    usat        r10, #8, r10, asr #7
+
+    strb        r10, [r0],r1
+
+    movne       r6, r7
+    movne       r7, r8
+
+    bne         second_pass_wloop_v6
+
+    subs        r2, r2, #0x10000
+    add         sp, sp, #12                 ; updata src for next loop (20-8)
+    sub         r0, r0, r1, lsl #2
+    add         r0, r0, #1
+
+    bne         second_pass_hloop_v6
+
+    add         sp, sp, #20
+    ldmia       sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+    sub         r0, r0, r1, lsl #1
+    sub         r1, r1, #8
+    mov         r2, #9
+
+skip_firstpass_hloop
+    ldrb        r4, [r0], #1                ; load data
+    subs        r2, r2, #1
+    ldrb        r5, [r0], #1
+    strh        r4, [lr], #20               ; store it to immediate buffer
+    ldrb        r6, [r0], #1                ; load data
+    strh        r5, [lr], #20
+    ldrb        r7, [r0], #1
+    strh        r6, [lr], #20
+    ldrb        r8, [r0], #1
+    strh        r7, [lr], #20
+    ldrb        r9, [r0], #1
+    strh        r8, [lr], #20
+    ldrb        r10, [r0], #1
+    strh        r9, [lr], #20
+    ldrb        r11, [r0], #1
+    strh        r10, [lr], #20
+    add         r0, r0, r1                  ; move to next input line
+    strh        r11, [lr], #20
+
+    sub         lr, lr, #158                ; move over to next column
+    bne         skip_firstpass_hloop
+
+    b           secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+    mov         r2, #8
+    add         sp, sp, #4                  ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+    ldr         r6, [sp], #4
+    subs        r2, r2, #1
+    ldr         r8, [sp], #4
+
+    mov         r7, r6, lsr #16             ; unpack
+    strb        r6, [r0], r1
+    mov         r9, r8, lsr #16
+    strb        r7, [r0], r1
+    add         sp, sp, #12                 ; 20-8
+    strb        r8, [r0], r1
+    strb        r9, [r0], r1
+
+    sub         r0, r0, r1, lsl #2
+    add         r0, r0, #1
+
+    bne         skip_secondpass_hloop
+
+    add         sp, sp, #16                 ; 180 - (160 +4)
+
+    ldmia       sp!, {r4 - r11, pc}
+
+    ENDP
+
+;-----------------
+;One word each is reserved. Label filter_coeff can be used to access the data.
+;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+filter8_coeff
+    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
+    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
+    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
+    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
+    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
+    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
+    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
+    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000
+
+    ;DCD        0,  0,  128,    0,   0,  0
+    ;DCD        0, -6,  123,   12,  -1,  0
+    ;DCD        2, -11, 108,   36,  -8,  1
+    ;DCD        0, -9,   93,   50,  -6,  0
+    ;DCD        3, -16,  77,   77, -16,  3
+    ;DCD        0, -6,   50,   93,  -9,  0
+    ;DCD        1, -8,   36,  108, -11,  2
+    ;DCD        0, -1,   12,  123,  -6,  0
+
+    END
--- a/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
@ -0,0 +1,96 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    const unsigned char *src_ptr
+; r1    int  src_stride
+; r2    const unsigned char *ref_ptr
+; r3    int  ref_stride
+; stack max_sad (not used)
+|vp8_sad16x16_armv6| PROC
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    mov     r4, #0              ; sad = 0;
+    mov     r5, #8              ; loop count
+
+loop
+    ; 1st row
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A)
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B)
+
+    usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set dst pointer to next row
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A)
+    add     r4, r4, r8          ; add partial sad values
+
+    ; 2nd row
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B)
+
+    usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set dst pointer to next row
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    subs    r5, r5, #1          ; decrement loop counter
+    add     r4, r4, r8          ; add partial sad values
+
+    bne     loop
+
+    mov     r0, r4              ; return sad
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+
--- a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
@ -0,0 +1,154 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+
--- a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
@ -0,0 +1,101 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance8x8_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_armv6| PROC
+
+    push    {r4-r10, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add    r4, r4, r6           ; add positive differences to sum
+    sub    r4, r4, r7           ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}
+
+    ENDP
+
+    END
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@ -0,0 +1,182 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_h_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_h_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@ -0,0 +1,222 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_hv_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load source pixels a, row N
+    ldr     r6, [r0, #1]        ; load source pixels b, row N
+    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
+    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load source pixels a, row N
+    ldr     r6, [r0, #5]        ; load source pixels b, row N
+    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load source pixels a, row N
+    ldr     r6, [r0, #9]        ; load source pixels b, row N
+    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load source pixels a, row N
+    ldr     r6, [r0, #13]       ; load source pixels b, row N
+    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@ -0,0 +1,184 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_v_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_v_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; set src pointer to next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@ -0,0 +1,113 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include <math.h>
+#include "vp8/common/filter.h"
+#include "bilinearfilter_arm.h"
+
+void vp8_filter_block2d_bil_armv6
+(
+    unsigned char *src_ptr,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
+    unsigned int   dst_pitch,
+    const short   *HFilter,
+    const short   *VFilter,
+    int            Width,
+    int            Height
+)
+{
+    unsigned short FData[36*16]; /* Temp data buffer used in filtering */
+
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+    /* then 1-D vertically... */
+    vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+void vp8_bilinear_predict4x4_armv6
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp8_bilinear_predict8x8_armv6
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp8_bilinear_predict8x4_armv6
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+}
+
+void vp8_bilinear_predict16x16_armv6
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
--- a/vp8/common/arm/bilinearfilter_arm.h
+++ b/vp8/common/arm/bilinearfilter_arm.h
@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+    const unsigned char  *src_ptr,
+    unsigned short       *dst_ptr,
+    unsigned int          src_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+    const unsigned short *src_ptr,
+    unsigned char        *dst_ptr,
+    int                   dst_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short         *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
--- a/vp8/common/arm/dequantize_arm.c
+++ b/vp8/common/arm/dequantize_arm.c
@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/blockd.h"
+
+#if HAVE_NEON
+extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_MEDIA
+extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_NEON
+
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
+{
+    short *DQ  = d->dqcoeff;
+    short *Q   = d->qcoeff;
+
+    vp8_dequantize_b_loop_neon(Q, DQC, DQ);
+}
+#endif
+
+#if HAVE_MEDIA
+void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
+{
+    short *DQ  = d->dqcoeff;
+    short *Q   = d->qcoeff;
+
+    vp8_dequantize_b_loop_v6(Q, DQC, DQ);
+}
+#endif
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@ -0,0 +1,221 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include <math.h>
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+
+extern void vp8_filter_block2d_first_pass_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    const short *vp8_filter
+);
+
+// 8x8
+extern void vp8_filter_block2d_first_pass_8x8_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    const short *vp8_filter
+);
+
+// 16x16
+extern void vp8_filter_block2d_first_pass_16x16_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_second_pass_armv6
+(
+    short         *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int output_pitch,
+    unsigned int cnt,
+    const short *vp8_filter
+);
+
+extern void vp8_filter4_block2d_second_pass_armv6
+(
+    short         *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int output_pitch,
+    unsigned int cnt,
+    const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_first_pass_only_armv6
+(
+    unsigned char *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int cnt,
+    unsigned int output_pitch,
+    const short *vp8_filter
+);
+
+
+extern void vp8_filter_block2d_second_pass_only_armv6
+(
+    unsigned char *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int cnt,
+    unsigned int output_pitch,
+    const short *vp8_filter
+);
+
+#if HAVE_MEDIA
+void vp8_sixtap_predict4x4_armv6
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
+
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    /* Vfilter is null. First pass only */
+    if (xoffset && !yoffset)
+    {
+        /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
+        vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
+
+        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
+    }
+    /* Hfilter is null. Second pass only */
+    else if (!xoffset && yoffset)
+    {
+        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
+    }
+    else
+    {
+        /* Vfilter is a 4 tap filter */
+        if (yoffset & 0x1)
+        {
+            vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
+            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+        }
+        /* Vfilter is 6 tap filter */
+        else
+        {
+            vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
+            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+        }
+    }
+}
+
+void vp8_sixtap_predict8x8_armv6
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    if (xoffset && !yoffset)
+    {
+        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
+    }
+    /* Hfilter is null. Second pass only */
+    else if (!xoffset && yoffset)
+    {
+        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
+    }
+    else
+    {
+        if (yoffset & 0x1)
+        {
+            vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+        }
+        else
+        {
+            vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
+            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+        }
+    }
+}
+
+
+void vp8_sixtap_predict16x16_armv6
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data buffer used in filtering */
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    if (xoffset && !yoffset)
+    {
+        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
+    }
+    /* Hfilter is null. Second pass only */
+    else if (!xoffset && yoffset)
+    {
+        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
+    }
+    else
+    {
+        if (yoffset & 0x1)
+        {
+            vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+        }
+        else
+        {
+            vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
+            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+        }
+    }
+
+}
+#endif
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@ -0,0 +1,181 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+
+#define prototype_loopfilter(sym) \
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh, int count)
+
+#if HAVE_MEDIA
+extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
+#endif
+
+#if HAVE_NEON
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh,
+        unsigned char *v);
+
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+
+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif
+
+#if HAVE_MEDIA
+/* ARMV6/MEDIA loopfilter functions*/
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+#if HAVE_NEON
+/* NEON loopfilter functions */
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned char mblim = *lfi->mblim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+    if (u_ptr)
+        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned char mblim = *lfi->mblim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+    if (u_ptr)
+        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned char blim = *lfi->blim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned char blim = *lfi->blim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
+}
+#endif
--- a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
@ -0,0 +1,357 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict16x16_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
+
+|vp8_bilinear_predict16x16_neon| PROC
+    push            {r4-r5, lr}
+
+    adr             r12, bifilter16_coeff
+    ldr             r4, [sp, #12]           ;load parameters from stack
+    ldr             r5, [sp, #16]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16_only
+
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {d31}, [r2]             ;load first_pass filter
+
+    beq             firstpass_bfilter16x16_only
+
+    sub             sp, sp, #272            ;reserve space on stack for temporary storage
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    mov             lr, sp
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    mov             r2, #3                  ;loop counter
+    vld1.u8         {d8, d9, d10}, [r0], r1
+
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    vdup.8          d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+filt_blk2d_fp16x16_loop_neon
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * vp8_filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * vp8_filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vqrshrn.u16    d21, q14, #7
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vst1.u8         {d18, d19, d20, d21}, [lr]!
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    bne             filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for rest 5 lines
+    vld1.u8         {d14, d15, d16}, [r0], r1
+
+    vmull.u8        q9, d2, d0              ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8        q10, d3, d0
+    vmull.u8        q11, d5, d0
+    vmull.u8        q12, d6, d0
+    vmull.u8        q13, d8, d0
+    vmull.u8        q14, d9, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+
+    vmlal.u8        q9, d2, d1              ;(src_ptr[0] * vp8_filter[1])
+    vmlal.u8        q11, d5, d1
+    vmlal.u8        q13, d8, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+
+    vmlal.u8        q10, d3, d1             ;(src_ptr[0] * vp8_filter[1])
+    vmlal.u8        q12, d6, d1
+    vmlal.u8        q14, d9, d1
+
+    vmull.u8        q1, d11, d0
+    vmull.u8        q2, d12, d0
+    vmull.u8        q3, d14, d0
+    vmull.u8        q4, d15, d0
+
+    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
+    vext.8          d14, d14, d15, #1
+
+    vmlal.u8        q1, d11, d1             ;(src_ptr[0] * vp8_filter[1])
+    vmlal.u8        q3, d14, d1
+
+    vext.8          d12, d12, d13, #1
+    vext.8          d15, d15, d16, #1
+
+    vmlal.u8        q2, d12, d1             ;(src_ptr[0] * vp8_filter[1])
+    vmlal.u8        q4, d15, d1
+
+    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d11, q10, #7
+    vqrshrn.u16    d12, q11, #7
+    vqrshrn.u16    d13, q12, #7
+    vqrshrn.u16    d14, q13, #7
+    vqrshrn.u16    d15, q14, #7
+    vqrshrn.u16    d16, q1, #7
+    vqrshrn.u16    d17, q2, #7
+    vqrshrn.u16    d18, q3, #7
+    vqrshrn.u16    d19, q4, #7
+
+    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
+    vst1.u8         {d14, d15, d16, d17}, [lr]!
+    vst1.u8         {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+    add             r3, r12, r3, lsl #3
+    sub             lr, lr, #272
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    vld1.u8         {d22, d23}, [lr]!       ;load src data
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r12, #4                 ;loop counter
+
+filt_blk2d_sp16x16_loop_neon
+    vld1.u8         {d24, d25}, [lr]!
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
+    vld1.u8         {d26, d27}, [lr]!
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [lr]!
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [lr]!
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp8_filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    subs            r12, r12, #1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r4], r5      ;store result
+    vst1.u8         {d4, d5}, [r4], r5
+    vst1.u8         {d6, d7}, [r4], r5
+    vmov            q11, q15
+    vst1.u8         {d8, d9}, [r4], r5
+
+    bne             filt_blk2d_sp16x16_loop_neon
+
+    add             sp, sp, #272
+
+    pop             {r4-r5,pc}
+
+;--------------------
+firstpass_bfilter16x16_only
+    mov             r2, #4                      ;loop counter
+    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
+    vdup.8          d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vld1.u8         {d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * vp8_filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * vp8_filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+    vst1.u8         {d14, d15}, [r4], r5        ;store result
+    vqrshrn.u16    d21, q14, #7
+
+    vst1.u8         {d16, d17}, [r4], r5
+    vst1.u8         {d18, d19}, [r4], r5
+    vst1.u8         {d20, d21}, [r4], r5
+
+    bne             filt_blk2d_fpo16x16_loop_neon
+    pop             {r4-r5,pc}
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+    add             r3, r12, r3, lsl #3
+    mov             r12, #4                     ;loop counter
+    vld1.u32        {d31}, [r3]                 ;load second_pass filter
+    vld1.u8         {d22, d23}, [r0], r1        ;load src data
+
+    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+filt_blk2d_spo16x16_loop_neon
+    vld1.u8         {d24, d25}, [r0], r1
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
+    vld1.u8         {d26, d27}, [r0], r1
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [r0], r1
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [r0], r1
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp8_filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r4], r5      ;store result
+    subs            r12, r12, #1
+    vst1.u8         {d4, d5}, [r4], r5
+    vmov            q11, q15
+    vst1.u8         {d6, d7}, [r4], r5
+    vst1.u8         {d8, d9}, [r4], r5
+
+    bne             filt_blk2d_spo16x16_loop_neon
+    pop             {r4-r5,pc}
+
+    ENDP
+
+;-----------------
+
+bifilter16_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
@ -0,0 +1,130 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict4x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict4x4_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, bifilter4_coeff
+    ldr             r4, [sp, #8]            ;load parameters from stack
+    ldr             lr, [sp, #12]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x4)
+    vld1.u8         {d2}, [r0], r1          ;load src data
+    add             r2, r12, r2, lsl #3     ;calculate Hfilter location (2coeffsx4bytes=8bytes)
+
+    vld1.u8         {d3}, [r0], r1
+    vld1.u32        {d31}, [r2]             ;first_pass filter
+
+    vld1.u8         {d4}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0-d1)
+    vld1.u8         {d5}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {d6}, [r0], r1
+
+    vshr.u64        q4, q1, #8              ;construct src_ptr[1]
+    vshr.u64        q5, q2, #8
+    vshr.u64        d12, d6, #8
+
+    vzip.32         d2, d3                  ;put 2-line data in 1 register (src_ptr[0])
+    vzip.32         d4, d5
+    vzip.32         d8, d9                  ;put 2-line data in 1 register (src_ptr[1])
+    vzip.32         d10, d11
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8        q8, d4, d0
+    vmull.u8        q9, d6, d0
+
+    vmlal.u8        q7, d8, d1              ;(src_ptr[1] * vp8_filter[1])
+    vmlal.u8        q8, d10, d1
+    vmlal.u8        q9, d12, d1
+
+    vqrshrn.u16    d28, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d29, q8, #7
+    vqrshrn.u16    d30, q9, #7
+
+;Second pass: 4x4
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             skip_secondpass_filter
+
+    add             r3, r12, r3, lsl #3 ;calculate Vfilter location
+    vld1.u32        {d31}, [r3]         ;load second_pass filter
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d28, d0
+    vmull.u8        q2, d29, d0
+
+    vext.8          d26, d28, d29, #4       ;construct src_ptr[pixel_step]
+    vext.8          d27, d29, d30, #4
+
+    vmlal.u8        q1, d26, d1
+    vmlal.u8        q2, d27, d1
+
+    add             r0, r4, lr
+    add             r1, r0, lr
+    add             r2, r1, lr
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+
+    vst1.32         {d2[0]}, [r4]           ;store result
+    vst1.32         {d2[1]}, [r0]
+    vst1.32         {d3[0]}, [r1]
+    vst1.32         {d3[1]}, [r2]
+
+    pop             {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+
+    vld1.32         {d28[0]}, [r0], r1      ;load src data
+    vld1.32         {d28[1]}, [r0], r1
+    vld1.32         {d29[0]}, [r0], r1
+    vld1.32         {d29[1]}, [r0], r1
+    vld1.32         {d30[0]}, [r0], r1
+
+    b               secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+    vst1.32         {d28[0]}, [r4], lr      ;store result
+    vst1.32         {d28[1]}, [r4], lr
+    vst1.32         {d29[0]}, [r4], lr
+    vst1.32         {d29[1]}, [r4], lr
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter4_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
@ -0,0 +1,135 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict8x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict8x4_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, bifilter8x4_coeff
+    ldr             r4, [sp, #8]            ;load parameters from stack
+    ldr             lr, [sp, #12]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x8)
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vld1.u32        {d31}, [r2]             ;load first_pass filter
+    vld1.u8         {q2}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {q3}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {q4}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
+    vld1.u8         {q5}, [r0], r1
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[-1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+    vext.8          d11, d10, d11, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+    vmlal.u8        q10, d11, d1
+
+    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d23, q7, #7
+    vqrshrn.u16    d24, q8, #7
+    vqrshrn.u16    d25, q9, #7
+    vqrshrn.u16    d26, q10, #7
+
+;Second pass: 4x8
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             skip_secondpass_filter
+
+    add             r3, r12, r3, lsl #3
+    add             r0, r4, lr
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+    add             r1, r0, lr
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8        q2, d23, d0
+    vmull.u8        q3, d24, d0
+    vmull.u8        q4, d25, d0
+
+    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp8_filter[1])
+    vmlal.u8        q2, d24, d1
+    vmlal.u8        q3, d25, d1
+    vmlal.u8        q4, d26, d1
+
+    add             r2, r1, lr
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+
+    vst1.u8         {d2}, [r4]              ;store result
+    vst1.u8         {d3}, [r0]
+    vst1.u8         {d4}, [r1]
+    vst1.u8         {d5}, [r2]
+
+    pop             {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8         {d22}, [r0], r1         ;load src data
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+
+    b               secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+    vst1.u8         {d22}, [r4], lr         ;store result
+    vst1.u8         {d23}, [r4], lr
+    vst1.u8         {d24}, [r4], lr
+    vst1.u8         {d25}, [r4], lr
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter8x4_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
@ -0,0 +1,183 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict8x8_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict8x8_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, bifilter8_coeff
+    ldr             r4, [sp, #8]            ;load parameters from stack
+    ldr             lr, [sp, #12]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vld1.u32        {d31}, [r2]             ;load first_pass filter
+    vld1.u8         {q2}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {q3}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {q4}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[-1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
+    vld1.u8         {q2}, [r0], r1
+    vqrshrn.u16    d23, q7, #7
+    vld1.u8         {q3}, [r0], r1
+    vqrshrn.u16    d24, q8, #7
+    vld1.u8         {q4}, [r0], r1
+    vqrshrn.u16    d25, q9, #7
+
+    ;first_pass filtering on the rest 5-line data
+    vld1.u8         {q5}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[-1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+    vext.8          d11, d10, d11, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+    vmlal.u8        q10, d11, d1
+
+    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d27, q7, #7
+    vqrshrn.u16    d28, q8, #7
+    vqrshrn.u16    d29, q9, #7
+    vqrshrn.u16    d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             skip_secondpass_filter
+
+    add             r3, r12, r3, lsl #3
+    add             r0, r4, lr
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+    add             r1, r0, lr
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
+    vmull.u8        q2, d23, d0
+    vmull.u8        q3, d24, d0
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp8_filter[1])
+    vmlal.u8        q2, d24, d1
+    vmlal.u8        q3, d25, d1
+    vmlal.u8        q4, d26, d1
+    vmlal.u8        q5, d27, d1
+    vmlal.u8        q6, d28, d1
+    vmlal.u8        q7, d29, d1
+    vmlal.u8        q8, d30, d1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2}, [r4]              ;store result
+    vst1.u8         {d3}, [r0]
+    vst1.u8         {d4}, [r1], lr
+    vst1.u8         {d5}, [r1], lr
+    vst1.u8         {d6}, [r1], lr
+    vst1.u8         {d7}, [r1], lr
+    vst1.u8         {d8}, [r1], lr
+    vst1.u8         {d9}, [r1], lr
+
+    pop             {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8         {d22}, [r0], r1         ;load src data
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+    vld1.u8         {d27}, [r0], r1
+    vld1.u8         {d28}, [r0], r1
+    vld1.u8         {d29}, [r0], r1
+    vld1.u8         {d30}, [r0], r1
+
+    b               secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+    vst1.u8         {d22}, [r4], lr         ;store result
+    vst1.u8         {d23}, [r4], lr
+    vst1.u8         {d24}, [r4], lr
+    vst1.u8         {d25}, [r4], lr
+    vst1.u8         {d26}, [r4], lr
+    vst1.u8         {d27}, [r4], lr
+    vst1.u8         {d28}, [r4], lr
+    vst1.u8         {d29}, [r4], lr
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter8_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
@ -0,0 +1,584 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
+    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char *y_buffer
+; r1    unsigned char *ypred_ptr
+; r2    int y_stride
+; r3    int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_neon_func| PROC
+    push            {r4-r8, lr}
+
+    cmp             r3, #0
+    beq             case_dc_pred
+    cmp             r3, #1
+    beq             case_v_pred
+    cmp             r3, #2
+    beq             case_h_pred
+    cmp             r3, #3
+    beq             case_tm_pred
+
+case_dc_pred
+    ldr             r4, [sp, #24]       ; Up
+    ldr             r5, [sp, #28]       ; Left
+
+    ; Default the DC average to 128
+    mov             r12, #128
+    vdup.u8         q0, r12
+
+    ; Zero out running sum
+    mov             r12, #0
+
+    ; compute shift and jump
+    adds            r7, r4, r5
+    beq             skip_dc_pred_up_left
+
+    ; Load above row, if it exists
+    cmp             r4, #0
+    beq             skip_dc_pred_up
+
+    sub             r6, r0, r2
+    vld1.8          {q1}, [r6]
+    vpaddl.u8       q2, q1
+    vpaddl.u16      q3, q2
+    vpaddl.u32      q4, q3
+
+    vmov.32         r4, d8[0]
+    vmov.32         r6, d9[0]
+
+    add             r12, r4, r6
+
+    ; Move back to interger registers
+
+skip_dc_pred_up
+
+    cmp             r5, #0
+    beq             skip_dc_pred_left
+
+    sub             r0, r0, #1
+
+    ; Load left row, if it exists
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0]
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+skip_dc_pred_left
+    add             r7, r7, #3          ; Shift
+    sub             r4, r7, #1
+    mov             r5, #1
+    add             r12, r12, r5, lsl r4
+    mov             r5, r12, lsr r7     ; expected_dc
+
+    vdup.u8         q0, r5
+
+skip_dc_pred_up_left
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+
+    pop             {r4-r8,pc}
+case_v_pred
+    ; Copy down above row
+    sub             r6, r0, r2
+    vld1.8          {q0}, [r6]
+
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    pop             {r4-r8,pc}
+
+case_h_pred
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    pop             {r4-r8,pc}
+
+case_tm_pred
+    ; Load yabove_row
+    sub             r3, r0, r2
+    vld1.8          {q8}, [r3]
+
+    ; Load ytop_left
+    sub             r3, r3, #1
+    ldrb            r7, [r3]
+
+    vdup.u16        q7, r7
+
+    ; Compute yabove_row - ytop_left
+    mov             r3, #1
+    vdup.u8         q0, r3
+
+    vmull.u8        q4, d16, d0
+    vmull.u8        q5, d17, d0
+
+    vsub.s16        q4, q4, q7
+    vsub.s16        q5, q5, q7
+
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+    mov             r12, #4
+
+case_tm_pred_loop
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u16        q0, r3
+    vdup.u16        q1, r4
+    vdup.u16        q2, r5
+    vdup.u16        q3, r6
+
+    vqadd.s16       q8, q0, q4
+    vqadd.s16       q9, q0, q5
+
+    vqadd.s16       q10, q1, q4
+    vqadd.s16       q11, q1, q5
+
+    vqadd.s16       q12, q2, q4
+    vqadd.s16       q13, q2, q5
+
+    vqadd.s16       q14, q3, q4
+    vqadd.s16       q15, q3, q5
+
+    vqshrun.s16     d0, q8, #0
+    vqshrun.s16     d1, q9, #0
+
+    vqshrun.s16     d2, q10, #0
+    vqshrun.s16     d3, q11, #0
+
+    vqshrun.s16     d4, q12, #0
+    vqshrun.s16     d5, q13, #0
+
+    vqshrun.s16     d6, q14, #0
+    vqshrun.s16     d7, q15, #0
+
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    subs            r12, r12, #1
+    bne             case_tm_pred_loop
+
+    pop             {r4-r8,pc}
+
+    ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; r0    unsigned char *y_buffer
+; r1    unsigned char *ypred_ptr
+; r2    int y_stride
+; r3    int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_s_neon_func| PROC
+    push            {r4-r8, lr}
+
+    mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
+
+    cmp             r3, #0
+    beq             case_dc_pred_s
+    cmp             r3, #1
+    beq             case_v_pred_s
+    cmp             r3, #2
+    beq             case_h_pred_s
+    cmp             r3, #3
+    beq             case_tm_pred_s
+
+case_dc_pred_s
+    ldr             r4, [sp, #24]       ; Up
+    ldr             r5, [sp, #28]       ; Left
+
+    ; Default the DC average to 128
+    mov             r12, #128
+    vdup.u8         q0, r12
+
+    ; Zero out running sum
+    mov             r12, #0
+
+    ; compute shift and jump
+    adds            r7, r4, r5
+    beq             skip_dc_pred_up_left_s
+
+    ; Load above row, if it exists
+    cmp             r4, #0
+    beq             skip_dc_pred_up_s
+
+    sub             r6, r0, r2
+    vld1.8          {q1}, [r6]
+    vpaddl.u8       q2, q1
+    vpaddl.u16      q3, q2
+    vpaddl.u32      q4, q3
+
+    vmov.32         r4, d8[0]
+    vmov.32         r6, d9[0]
+
+    add             r12, r4, r6
+
+    ; Move back to interger registers
+
+skip_dc_pred_up_s
+
+    cmp             r5, #0
+    beq             skip_dc_pred_left_s
+
+    sub             r0, r0, #1
+
+    ; Load left row, if it exists
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0]
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+skip_dc_pred_left_s
+    add             r7, r7, #3          ; Shift
+    sub             r4, r7, #1
+    mov             r5, #1
+    add             r12, r12, r5, lsl r4
+    mov             r5, r12, lsr r7     ; expected_dc
+
+    vdup.u8         q0, r5
+
+skip_dc_pred_up_left_s
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+
+    pop             {r4-r8,pc}
+case_v_pred_s
+    ; Copy down above row
+    sub             r6, r0, r2
+    vld1.8          {q0}, [r6]
+
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    pop             {r4-r8,pc}
+
+case_h_pred_s
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    pop             {r4-r8,pc}
+
+case_tm_pred_s
+    ; Load yabove_row
+    sub             r3, r0, r2
+    vld1.8          {q8}, [r3]
+
+    ; Load ytop_left
+    sub             r3, r3, #1
+    ldrb            r7, [r3]
+
+    vdup.u16        q7, r7
+
+    ; Compute yabove_row - ytop_left
+    mov             r3, #1
+    vdup.u8         q0, r3
+
+    vmull.u8        q4, d16, d0
+    vmull.u8        q5, d17, d0
+
+    vsub.s16        q4, q4, q7
+    vsub.s16        q5, q5, q7
+
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+    mov             r12, #4
+
+case_tm_pred_loop_s
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u16        q0, r3
+    vdup.u16        q1, r4
+    vdup.u16        q2, r5
+    vdup.u16        q3, r6
+
+    vqadd.s16       q8, q0, q4
+    vqadd.s16       q9, q0, q5
+
+    vqadd.s16       q10, q1, q4
+    vqadd.s16       q11, q1, q5
+
+    vqadd.s16       q12, q2, q4
+    vqadd.s16       q13, q2, q5
+
+    vqadd.s16       q14, q3, q4
+    vqadd.s16       q15, q3, q5
+
+    vqshrun.s16     d0, q8, #0
+    vqshrun.s16     d1, q9, #0
+
+    vqshrun.s16     d2, q10, #0
+    vqshrun.s16     d3, q11, #0
+
+    vqshrun.s16     d4, q12, #0
+    vqshrun.s16     d5, q13, #0
+
+    vqshrun.s16     d6, q14, #0
+    vqshrun.s16     d7, q15, #0
+
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    subs            r12, r12, #1
+    bne             case_tm_pred_loop_s
+
+    pop             {r4-r8,pc}
+
+    ENDP
+
+
+    END
--- a/vp8/common/arm/neon/copymem16x16_neon.asm
+++ b/vp8/common/arm/neon/copymem16x16_neon.asm
@ -0,0 +1,59 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_copy_mem16x16_neon|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem16x16_neon| PROC
+
+    vld1.u8     {q0}, [r0], r1
+    vld1.u8     {q1}, [r0], r1
+    vld1.u8     {q2}, [r0], r1
+    vst1.u8     {q0}, [r2], r3
+    vld1.u8     {q3}, [r0], r1
+    vst1.u8     {q1}, [r2], r3
+    vld1.u8     {q4}, [r0], r1
+    vst1.u8     {q2}, [r2], r3
+    vld1.u8     {q5}, [r0], r1
+    vst1.u8     {q3}, [r2], r3
+    vld1.u8     {q6}, [r0], r1
+    vst1.u8     {q4}, [r2], r3
+    vld1.u8     {q7}, [r0], r1
+    vst1.u8     {q5}, [r2], r3
+    vld1.u8     {q8}, [r0], r1
+    vst1.u8     {q6}, [r2], r3
+    vld1.u8     {q9}, [r0], r1
+    vst1.u8     {q7}, [r2], r3
+    vld1.u8     {q10}, [r0], r1
+    vst1.u8     {q8}, [r2], r3
+    vld1.u8     {q11}, [r0], r1
+    vst1.u8     {q9}, [r2], r3
+    vld1.u8     {q12}, [r0], r1
+    vst1.u8     {q10}, [r2], r3
+    vld1.u8     {q13}, [r0], r1
+    vst1.u8     {q11}, [r2], r3
+    vld1.u8     {q14}, [r0], r1
+    vst1.u8     {q12}, [r2], r3
+    vld1.u8     {q15}, [r0], r1
+    vst1.u8     {q13}, [r2], r3
+    vst1.u8     {q14}, [r2], r3
+    vst1.u8     {q15}, [r2], r3
+
+    mov     pc, lr
+
+    ENDP  ; |vp8_copy_mem16x16_neon|
+
+    END
--- a/vp8/common/arm/neon/copymem8x4_neon.asm
+++ b/vp8/common/arm/neon/copymem8x4_neon.asm
@ -0,0 +1,34 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_copy_mem8x4_neon|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x4_neon| PROC
+    vld1.u8     {d0}, [r0], r1
+    vld1.u8     {d1}, [r0], r1
+    vst1.u8     {d0}, [r2], r3
+    vld1.u8     {d2}, [r0], r1
+    vst1.u8     {d1}, [r2], r3
+    vld1.u8     {d3}, [r0], r1
+    vst1.u8     {d2}, [r2], r3
+    vst1.u8     {d3}, [r2], r3
+
+    mov     pc, lr
+
+    ENDP  ; |vp8_copy_mem8x4_neon|
+
+    END
--- a/vp8/common/arm/neon/copymem8x8_neon.asm
+++ b/vp8/common/arm/neon/copymem8x8_neon.asm
@ -0,0 +1,43 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_copy_mem8x8_neon|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x8_neon| PROC
+
+    vld1.u8     {d0}, [r0], r1
+    vld1.u8     {d1}, [r0], r1
+    vst1.u8     {d0}, [r2], r3
+    vld1.u8     {d2}, [r0], r1
+    vst1.u8     {d1}, [r2], r3
+    vld1.u8     {d3}, [r0], r1
+    vst1.u8     {d2}, [r2], r3
+    vld1.u8     {d4}, [r0], r1
+    vst1.u8     {d3}, [r2], r3
+    vld1.u8     {d5}, [r0], r1
+    vst1.u8     {d4}, [r2], r3
+    vld1.u8     {d6}, [r0], r1
+    vst1.u8     {d5}, [r2], r3
+    vld1.u8     {d7}, [r0], r1
+    vst1.u8     {d6}, [r2], r3
+    vst1.u8     {d7}, [r2], r3
+
+    mov     pc, lr
+
+    ENDP  ; |vp8_copy_mem8x8_neon|
+
+    END
--- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm
+++ b/vp8/common/arm/neon/dc_only_idct_add_neon.asm
@ -0,0 +1,54 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dc_only_idct_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+;                            int pred_stride, unsigned char *dst_ptr,
+;                            int dst_stride)
+
+; r0  input_dc
+; r1  pred_ptr
+; r2  pred_stride
+; r3  dst_ptr
+; sp  dst_stride
+
+|vp8_dc_only_idct_add_neon| PROC
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    ldr             r12, [sp]
+    vdup.16         q0, r0
+
+    vld1.32         {d2[0]}, [r1], r2
+    vld1.32         {d2[1]}, [r1], r2
+    vld1.32         {d4[0]}, [r1], r2
+    vld1.32         {d4[1]}, [r1]
+
+    vaddw.u8        q1, q0, d2
+    vaddw.u8        q2, q0, d4
+
+    vqmovun.s16     d2, q1
+    vqmovun.s16     d4, q2
+
+    vst1.32         {d2[0]}, [r3], r12
+    vst1.32         {d2[1]}, [r3], r12
+    vst1.32         {d4[0]}, [r3], r12
+    vst1.32         {d4[1]}, [r3]
+
+    bx              lr
+
+    ENDP
+
+    END
--- a/vp8/common/arm/neon/dequant_idct_neon.asm
+++ b/vp8/common/arm/neon/dequant_idct_neon.asm
@ -0,0 +1,131 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequant_idct_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dequant_idct_add_neon(short *input, short *dq,
+;                           unsigned char *dest, int stride)
+; r0    short *input,
+; r1    short *dq,
+; r2    unsigned char *dest
+; r3    int stride
+
+|vp8_dequant_idct_add_neon| PROC
+    vld1.16         {q3, q4}, [r0]
+    vld1.16         {q5, q6}, [r1]
+
+    add             r1, r2, r3              ; r1 = dest + stride
+    lsl             r3, #1                  ; 2x stride
+
+    vld1.32         {d14[0]}, [r2], r3
+    vld1.32         {d14[1]}, [r1], r3
+    vld1.32         {d15[0]}, [r2]
+    vld1.32         {d15[1]}, [r1]
+
+    adr             r12, cospi8sqrt2minus1  ; pointer to the first constant
+
+    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
+    vmul.i16        q2, q4, q6
+
+;|short_idct4x4llm_neon| PROC
+    vld1.16         {d0}, [r12]
+    vswp            d3, d4                  ;q2(vp[4] vp[12])
+
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3             ;a1
+    vqsub.s16       d13, d2, d3             ;b1
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2
+    vqadd.s16       q4, q4, q2
+
+    vqsub.s16       d10, d6, d9             ;c1
+    vqadd.s16       d11, d7, d8             ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+; memset(input, 0, 32) -- 32bytes
+    vmov.i16        q14, #0
+
+    vswp            d3, d4
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3             ;a1
+    vqsub.s16       d13, d2, d3             ;b1
+
+    vmov            q15, q14
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2
+    vqadd.s16       q4, q4, q2
+
+    vqsub.s16       d10, d6, d9             ;c1
+    vqadd.s16       d11, d7, d8             ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vst1.16         {q14, q15}, [r0]
+
+    vrshr.s16       d2, d2, #3
+    vrshr.s16       d3, d3, #3
+    vrshr.s16       d4, d4, #3
+    vrshr.s16       d5, d5, #3
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vaddw.u8        q1, q1, d14
+    vaddw.u8        q2, q2, d15
+
+    sub             r2, r2, r3
+    sub             r1, r1, r3
+
+    vqmovun.s16     d0, q1
+    vqmovun.s16     d1, q2
+
+    vst1.32         {d0[0]}, [r2], r3
+    vst1.32         {d0[1]}, [r1], r3
+    vst1.32         {d1[0]}, [r2]
+    vst1.32         {d1[1]}, [r1]
+
+    bx             lr
+
+    ENDP           ; |vp8_dequant_idct_add_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2       DCD 0x8a8c8a8c
+
+    END
--- a/vp8/common/arm/neon/dequantizeb_neon.asm
+++ b/vp8/common/arm/neon/dequantizeb_neon.asm
@ -0,0 +1,34 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequantize_b_loop_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    short *Q,
+; r1    short *DQC
+; r2    short *DQ
+|vp8_dequantize_b_loop_neon| PROC
+    vld1.16         {q0, q1}, [r0]
+    vld1.16         {q2, q3}, [r1]
+
+    vmul.i16        q4, q0, q2
+    vmul.i16        q5, q1, q3
+
+    vst1.16         {q4, q5}, [r2]
+
+    bx             lr
+
+    ENDP
+
+    END
--- a/vp8/common/arm/neon/idct_blk_neon.c
+++ b/vp8/common/arm/neon/idct_blk_neon.c
@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_full_2x_neon(short *q, short *dq,
+                               unsigned char *dst, int stride);
+void idct_dequant_0_2x_neon(short *q, short dq,
+                            unsigned char *dst, int stride);
+
+
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
+                                       unsigned char *dst,
+                                       int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (((short *)(eobs))[0])
+        {
+            if (((short *)eobs)[0] & 0xfefe)
+                idct_dequant_full_2x_neon (q, dq, dst, stride);
+            else
+                idct_dequant_0_2x_neon (q, dq[0], dst, stride);
+        }
+
+        if (((short *)(eobs))[1])
+        {
+            if (((short *)eobs)[1] & 0xfefe)
+                idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
+            else
+                idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
+        }
+        q    += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
+                                        unsigned char *dstu,
+                                        unsigned char *dstv,
+                                        int stride, char *eobs)
+{
+    if (((short *)(eobs))[0])
+    {
+        if (((short *)eobs)[0] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, dstu, stride);
+        else
+            idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+    }
+
+    q    += 32;
+    dstu += 4*stride;
+
+    if (((short *)(eobs))[1])
+    {
+        if (((short *)eobs)[1] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, dstu, stride);
+        else
+            idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+    }
+
+    q += 32;
+
+    if (((short *)(eobs))[2])
+    {
+        if (((short *)eobs)[2] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, dstv, stride);
+        else
+            idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+    }
+
+    q    += 32;
+    dstv += 4*stride;
+
+    if (((short *)(eobs))[3])
+    {
+        if (((short *)eobs)[3] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, dstv, stride);
+        else
+            idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+    }
+}
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
@ -0,0 +1,79 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq,
+;                            unsigned char *dst, int stride);
+; r0   *q
+; r1   dq
+; r2   *dst
+; r3   stride
+|idct_dequant_0_2x_neon| PROC
+    push            {r4, r5}
+
+    add             r12, r2, #4
+    vld1.32         {d2[0]}, [r2], r3
+    vld1.32         {d8[0]}, [r12], r3
+    vld1.32         {d2[1]}, [r2], r3
+    vld1.32         {d8[1]}, [r12], r3
+    vld1.32         {d4[0]}, [r2], r3
+    vld1.32         {d10[0]}, [r12], r3
+    vld1.32         {d4[1]}, [r2], r3
+    vld1.32         {d10[1]}, [r12], r3
+
+    ldrh            r12, [r0]               ; lo q
+    ldrh            r4, [r0, #32]           ; hi q
+    mov             r5, #0
+    strh            r5, [r0]
+    strh            r5, [r0, #32]
+
+    sxth            r12, r12                ; lo
+    mul             r0, r12, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q0, r0
+    sxth            r4, r4                  ; hi
+    mul             r0, r4, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    sub             r2, r2, r3, lsl #2      ; dst - 4*stride
+    add             r0, r2, #4
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d10[1]}, [r0]
+
+    pop             {r4, r5}
+    bx              lr
+
+    ENDP            ; |idct_dequant_0_2x_neon|
+    END
--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
@ -0,0 +1,196 @@
+;
+;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq,
+;                               unsigned char *dst, int stride);
+; r0    *q,
+; r1    *dq,
+; r2    *dst
+; r3    stride
+|idct_dequant_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2],  r3     ; l pre
+    vld1.32         {d28[1]}, [r12], r3     ; r pre
+    vld1.32         {d29[0]}, [r2],  r3
+    vld1.32         {d29[1]}, [r12], r3
+    vld1.32         {d30[0]}, [r2],  r3
+    vld1.32         {d30[1]}, [r12], r3
+    vld1.32         {d31[0]}, [r2],  r3
+    vld1.32         {d31[1]}, [r12]
+
+    adr             r1, cospi8sqrt2minus1   ; pointer to the first constant
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    vld1.16         {d0}, [r1]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can shift the constant without losing precision. this avoids
+    ; shift again afterward, but also avoids the sign issue. win win!
+    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+    ; pre-shift it
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    sub             r2, r2, r3, lsl #2      ; dst - 4*stride
+    add             r1, r2, #4              ; hi
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    vst1.32         {d0[0]}, [r2], r3       ; lo
+    vst1.32         {d0[1]}, [r1], r3       ; hi
+    vst1.32         {d1[0]}, [r2], r3
+    vst1.32         {d1[1]}, [r1], r3
+    vst1.32         {d2[0]}, [r2], r3
+    vst1.32         {d2[1]}, [r1], r3
+    vst1.32         {d3[0]}, [r2]
+    vst1.32         {d3[1]}, [r1]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ b/vp8/common/arm/neon/iwalsh_neon.asm
@ -0,0 +1,87 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+    EXPORT  |vp8_short_inv_walsh4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
+|vp8_short_inv_walsh4x4_neon| PROC
+
+    ; read in all four lines of values: d0->d3
+    vld1.i16 {q0-q1}, [r0@128]
+
+    ; first for loop
+    vadd.s16 d4, d0, d3 ;a = [0] + [12]
+    vadd.s16 d6, d1, d2 ;b = [4] + [8]
+    vsub.s16 d5, d0, d3 ;d = [0] - [12]
+    vsub.s16 d7, d1, d2 ;c = [4] - [8]
+
+    vadd.s16 q0, q2, q3 ; a+b d+c
+    vsub.s16 q1, q2, q3 ; a-b d-c
+
+    vtrn.32 d0, d2 ;d0:  0  1  8  9
+                   ;d2:  2  3 10 11
+    vtrn.32 d1, d3 ;d1:  4  5 12 13
+                   ;d3:  6  7 14 15
+
+    vtrn.16 d0, d1 ;d0:  0  4  8 12
+                   ;d1:  1  5  9 13
+    vtrn.16 d2, d3 ;d2:  2  6 10 14
+                   ;d3:  3  7 11 15
+
+    ; second for loop
+
+    vadd.s16 d4, d0, d3 ;a = [0] + [3]
+    vadd.s16 d6, d1, d2 ;b = [1] + [2]
+    vsub.s16 d5, d0, d3 ;d = [0] - [3]
+    vsub.s16 d7, d1, d2 ;c = [1] - [2]
+
+    vmov.i16 q8, #3
+
+    vadd.s16 q0, q2, q3 ; a+b d+c
+    vsub.s16 q1, q2, q3 ; a-b d-c
+
+    vadd.i16 q0, q0, q8 ;e/f += 3
+    vadd.i16 q1, q1, q8 ;g/h += 3
+
+    vshr.s16 q0, q0, #3 ;e/f >> 3
+    vshr.s16 q1, q1, #3 ;g/h >> 3
+
+    mov      r2, #64
+    add      r3, r1, #32
+
+    vst1.i16 d0[0], [r1],r2
+    vst1.i16 d1[0], [r3],r2
+    vst1.i16 d2[0], [r1],r2
+    vst1.i16 d3[0], [r3],r2
+
+    vst1.i16 d0[1], [r1],r2
+    vst1.i16 d1[1], [r3],r2
+    vst1.i16 d2[1], [r1],r2
+    vst1.i16 d3[1], [r3],r2
+
+    vst1.i16 d0[2], [r1],r2
+    vst1.i16 d1[2], [r3],r2
+    vst1.i16 d2[2], [r1],r2
+    vst1.i16 d3[2], [r3],r2
+
+    vst1.i16 d0[3], [r1],r2
+    vst1.i16 d1[3], [r3],r2
+    vst1.i16 d2[3], [r1]
+    vst1.i16 d3[3], [r3]
+
+    bx lr
+    ENDP    ; |vp8_short_inv_walsh4x4_neon|
+
+    END
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@ -0,0 +1,397 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_loop_filter_horizontal_edge_y_neon|
+    EXPORT  |vp8_loop_filter_horizontal_edge_uv_neon|
+    EXPORT  |vp8_loop_filter_vertical_edge_y_neon|
+    EXPORT  |vp8_loop_filter_vertical_edge_uv_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src
+; r1    int pitch
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+|vp8_loop_filter_horizontal_edge_y_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
+    sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1
+    add         r1, r1, r1
+
+    vdup.u8     q2, r3                     ; duplicate thresh
+
+    vld1.u8     {q3}, [r2@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r2@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r2@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r2@128]                  ; q2
+    vld1.u8     {q10}, [r12@128]                ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r12, r12, r1, lsl #1
+
+    bl          vp8_loop_filter_neon
+
+    vst1.u8     {q5}, [r2@128], r1              ; store op1
+    vst1.u8     {q6}, [r12@128], r1             ; store op0
+    vst1.u8     {q7}, [r2@128], r1              ; store oq0
+    vst1.u8     {q8}, [r12@128], r1             ; store oq1
+
+    pop         {pc}
+    ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
+
+
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+; sp+4  unsigned char *v
+|vp8_loop_filter_horizontal_edge_uv_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    vdup.u8     q1, r3                      ; duplicate limit
+    ldr         r12, [sp, #4]               ; load thresh
+    ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q2, r12                     ; duplicate thresh
+
+    sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
+    sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
+
+    vld1.u8     {d6}, [r3@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1             ; p3
+    vld1.u8     {d8}, [r3@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1             ; p2
+    vld1.u8     {d10}, [r3@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1            ; p1
+    vld1.u8     {d12}, [r3@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1            ; p0
+    vld1.u8     {d14}, [r3@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1            ; q0
+    vld1.u8     {d16}, [r3@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1            ; q1
+    vld1.u8     {d18}, [r3@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1            ; q2
+    vld1.u8     {d20}, [r3@64]                 ; q3
+    vld1.u8     {d21}, [r12@64]                ; q3
+
+    bl          vp8_loop_filter_neon
+
+    sub         r0, r0, r1, lsl #1
+    sub         r2, r2, r1, lsl #1
+
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r2@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r2@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r2@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64]                 ; store u oq1
+    vst1.u8     {d17}, [r2@64]                 ; store v oq1
+
+    pop         {pc}
+    ENDP        ; |vp8_loop_filter_horizontal_edge_uv_neon|
+
+; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+;                                           const signed char *flimit,
+;                                           const signed char *limit,
+;                                           const signed char *thresh,
+;                                           int count)
+; r0    unsigned char *src
+; r1    int pitch
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+
+|vp8_loop_filter_vertical_edge_y_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
+    sub         r2, r0, #4                 ; src ptr down by 4 columns
+    add         r1, r1, r1
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1, asr #1
+
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d8}, [r12], r1
+    vld1.u8     {d10}, [r2], r1
+    vld1.u8     {d12}, [r12], r1
+    vld1.u8     {d14}, [r2], r1
+    vld1.u8     {d16}, [r12], r1
+    vld1.u8     {d18}, [r2], r1
+    vld1.u8     {d20}, [r12], r1
+
+    vld1.u8     {d7}, [r2], r1              ; load second 8-line src data
+    vld1.u8     {d9}, [r12], r1
+    vld1.u8     {d11}, [r2], r1
+    vld1.u8     {d13}, [r12], r1
+    vld1.u8     {d15}, [r2], r1
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d19}, [r2]
+    vld1.u8     {d21}, [r12]
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vdup.u8     q2, r3                     ; duplicate thresh
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    bl          vp8_loop_filter_neon
+
+    vswp        d12, d11
+    vswp        d16, d13
+
+    sub         r0, r0, #2                 ; dst ptr
+
+    vswp        d14, d12
+    vswp        d16, d15
+
+    add         r12, r0, r1, asr #1
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
+    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
+    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
+    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
+
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
+
+    pop         {pc}
+    ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
+
+; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
+;                                            const signed char *flimit,
+;                                            const signed char *limit,
+;                                            const signed char *thresh,
+;                                            unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+; sp+4  unsigned char *v
+|vp8_loop_filter_vertical_edge_uv_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    sub         r12, r0, #4                 ; move u pointer down by 4 columns
+    ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q1, r3                      ; duplicate limit
+    sub         r3, r2, #4                  ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r12], r1             ;load u data
+    vld1.u8     {d7}, [r3], r1              ;load v data
+    vld1.u8     {d8}, [r12], r1
+    vld1.u8     {d9}, [r3], r1
+    vld1.u8     {d10}, [r12], r1
+    vld1.u8     {d11}, [r3], r1
+    vld1.u8     {d12}, [r12], r1
+    vld1.u8     {d13}, [r3], r1
+    vld1.u8     {d14}, [r12], r1
+    vld1.u8     {d15}, [r3], r1
+    vld1.u8     {d16}, [r12], r1
+    vld1.u8     {d17}, [r3], r1
+    vld1.u8     {d18}, [r12], r1
+    vld1.u8     {d19}, [r3], r1
+    vld1.u8     {d20}, [r12]
+    vld1.u8     {d21}, [r3]
+
+    ldr        r12, [sp, #4]               ; load thresh
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vdup.u8     q2, r12                     ; duplicate thresh
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    bl          vp8_loop_filter_neon
+
+    vswp        d12, d11
+    vswp        d16, d13
+    vswp        d14, d12
+    vswp        d16, d15
+
+    sub         r0, r0, #2
+    sub         r2, r2, #2
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
+    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
+    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
+    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
+
+    pop         {pc}
+    ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
+
+; void vp8_loop_filter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store.
+
+; r0-r3 PRESERVE
+; q0    flimit
+; q1    limit
+; q2    thresh
+; q3    p3
+; q4    p2
+; q5    p1
+; q6    p0
+; q7    q0
+; q8    q1
+; q9    q2
+; q10   q3
+|vp8_loop_filter_neon| PROC
+
+    ; vp8_filter_mask
+    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
+    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
+
+    vmax.u8     q11, q11, q12
+    vmax.u8     q12, q13, q14
+    vmax.u8     q3, q3, q4
+    vmax.u8     q15, q11, q12
+
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
+    ; vp8_hevmask
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     q15, q15, q3
+
+    vmov.u8     q10, #0x80                   ; 0x80
+
+    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
+    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
+
+    vcge.u8     q15, q1, q15
+
+    ; vp8_filter() function
+    ; convert to signed
+    veor        q7, q7, q10                 ; qs0
+    vshr.u8     q2, q2, #1                  ; a = a / 2
+    veor        q6, q6, q10                 ; ps0
+
+    veor        q5, q5, q10                 ; ps1
+    vqadd.u8    q9, q9, q2                  ; a = b + a
+
+    veor        q8, q8, q10                 ; qs1
+
+    vmov.u8     q10, #3                     ; #3
+
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q11, d15, d13
+
+    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
+
+    vmovl.u8    q4, d20
+
+    vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
+    vorr        q14, q13, q14               ; vp8_hevmask
+
+    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
+    vmul.i16    q11, q11, q4
+
+    vand        q1, q1, q14                 ; vp8_filter &= hev
+    vand        q15, q15, q9                ; vp8_filter_mask
+
+    vaddw.s8    q2, q2, d2
+    vaddw.s8    q11, q11, d3
+
+    vmov.u8     q9, #4                      ; #4
+
+    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d2, q2
+    vqmovn.s16  d3, q11
+    vand        q1, q1, q15                 ; vp8_filter &= mask
+
+    vqadd.s8    q2, q1, q10                 ; Filter2 = clamp(vp8_filter+3)
+    vqadd.s8    q1, q1, q9                  ; Filter1 = clamp(vp8_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
+
+
+    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)
+
+    ; outer tap adjustments: ++vp8_filter >> 1
+    vrshr.s8    q1, q1, #1
+    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
+    vmov.u8     q0, #0x80                   ; 0x80
+    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp8_filter)
+    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp8_filter)
+
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q5, q13, q0                 ; *op1 = u^0x80
+    veor        q8, q12, q0                 ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
+
+;-----------------
+
+    END
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@ -0,0 +1,117 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ;EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
+    EXPORT  |vp8_loop_filter_bhs_neon|
+    EXPORT  |vp8_loop_filter_mbhs_neon|
+    ARM
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE
+
+|vp8_loop_filter_simple_horizontal_edge_neon| PROC
+
+    sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
+
+    vld1.u8     {q7}, [r0@128], r1          ; q0
+    vld1.u8     {q5}, [r3@128], r1          ; p0
+    vld1.u8     {q8}, [r0@128]              ; q1
+    vld1.u8     {q6}, [r3@128]              ; p1
+
+    vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
+    vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
+
+    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
+    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q13, #3
+    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
+    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
+    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
+    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
+
+    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
+
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q3, d15, d13
+
+    vqsub.s8    q4, q5, q8                  ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+    vmul.s16    q2, q2, q13                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q3, q3, q13
+
+    vmov.u8     q10, #0x03                  ; 0x03
+    vmov.u8     q9, #0x04                   ; 0x04
+
+    vaddw.s8    q2, q2, d8                  ; vp8_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q3, q3, d9
+
+    vqmovn.s16  d8, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d9, q3
+
+    vand        q14, q4, q15                ; vp8_filter &= mask
+
+    vqadd.s8    q2, q14, q10                ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q3, q14, q9                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q4, q3, #3                  ; Filter1 >>= 3
+
+    sub         r0, r0, r1
+
+    ;calculate output
+    vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q4                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+
+    vst1.u8     {q6}, [r3@128]              ; store op0
+    vst1.u8     {q7}, [r0@128]              ; store oq0
+
+    bx          lr
+    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_bhs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                    ; load blim from mem
+    vdup.s8     q1, r3                      ; duplicate blim
+
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 4 * y_stride
+    bl          vp8_loop_filter_simple_horizontal_edge_neon
+    ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 8* y_stride
+    bl          vp8_loop_filter_simple_horizontal_edge_neon
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 12 * y_stride
+    pop         {r4, lr}
+    b           vp8_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp8_loop_filter_bhs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_mbhs_neon| PROC
+    ldrb        r3, [r2]                   ; load blim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp8_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp8_loop_filter_bhs_neon|
+
+    END
--- a/Показать больше
+++ b/Показать больше