Merge "Merge branch 'master' into nextgenv2" into nextgenv2

2016-07-15 04:45:52 +00:00 · 2016-07-15 04:45:52 +00:00 · 06c297bd1c
--- a/1
+++ b/1
@ -47,7 +47,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
  --help output of the configure script. As of this writing, the list of
  available targets is:

-    armv6-darwin-gcc
    armv6-linux-rvct
    armv6-linux-gcc
    armv6-none-rvct
--- a/build/make/Makefile
+++ b/build/make/Makefile
@ -119,29 +119,25 @@ utiltest:
 test-no-data-check::
 exampletest-no-data-check utiltest-no-data-check:

-# Add compiler flags for intrinsic files
+# Force to realign stack always on OS/2
 ifeq ($(TOOLCHAIN), x86-os2-gcc)
-STACKREALIGN=-mstackrealign
-else
-STACKREALIGN=
+CFLAGS += -mstackrealign
 endif

 $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
 $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
-$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN)
-$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN)
-$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN)
-$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN)
-$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN)
-$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN)
-$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN)
-$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN)
-$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN)
-$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN)
-$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN)
-$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN)
-$(BUILD_PFX)%vp9_reconintra.c.d: CFLAGS += $(STACKREALIGN)
-$(BUILD_PFX)%vp9_reconintra.c.o: CFLAGS += $(STACKREALIGN)
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2

 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@ -185,6 +185,25 @@ add_extralibs() {
 #
 # Boolean Manipulation Functions
 #
+
+enable_codec(){
+  enabled $1 || echo "  enabling $1"
+  set_all yes $1
+
+  is_in $1 vp8 vp9 vp10 && \
+    set_all yes $1_encoder && \
+    set_all yes $1_decoder
+}
+
+disable_codec(){
+  disabled $1 || echo "  disabling $1"
+  set_all no $1
+
+  is_in $1 vp8 vp9 vp10 && \
+    set_all no $1_encoder && \
+    set_all no $1_decoder
+}
+
 enable_feature(){
  set_all yes $*
 }
@ -521,22 +540,20 @@ process_common_cmdline() {
        ;;
      --enable-?*|--disable-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${ARCH_EXT_LIST}; then
          [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
        elif [ $action = "disable" ] && ! disabled $option ; then
-          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
-            die_unknown $opt
+          is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
          log_echo "  disabling $option"
        elif [ $action = "enable" ] && ! enabled $option ; then
-          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
-            die_unknown $opt
+          is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
          log_echo "  enabling $option"
        fi
        ${action}_feature $option
        ;;
      --require-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${ARCH_EXT_LIST}" none | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${ARCH_EXT_LIST}; then
            RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
        else
            die_unknown $opt
@ -638,6 +655,26 @@ show_darwin_sdk_major_version() {
  xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1
 }

+# Print the Xcode version.
+show_xcode_version() {
+  xcodebuild -version | head -n1 | cut -d' ' -f2
+}
+
+# Fails when Xcode version is less than 6.3.
+check_xcode_minimum_version() {
+  xcode_major=$(show_xcode_version | cut -f1 -d.)
+  xcode_minor=$(show_xcode_version | cut -f2 -d.)
+  xcode_min_major=6
+  xcode_min_minor=3
+  if [ ${xcode_major} -lt ${xcode_min_major} ]; then
+    return 1
+  fi
+  if [ ${xcode_major} -eq ${xcode_min_major} ] \
+    && [ ${xcode_minor} -lt ${xcode_min_minor} ]; then
+    return 1
+  fi
+}
+
 process_common_toolchain() {
  if [ -z "$toolchain" ]; then
    gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
@ -751,7 +788,14 @@ process_common_toolchain() {
  enabled shared && soft_enable pic

  # Minimum iOS version for all target platforms (darwin and iphonesimulator).
-  IOS_VERSION_MIN="6.0"
+  # Shared library framework builds are only possible on iOS 8 and later.
+  if enabled shared; then
+    IOS_VERSION_OPTIONS="--enable-shared"
+    IOS_VERSION_MIN="8.0"
+  else
+    IOS_VERSION_OPTIONS=""
+    IOS_VERSION_MIN="6.0"
+  fi

  # Handle darwin variants. Newer SDKs allow targeting older
  # platforms, so use the newest one available.
@ -1018,18 +1062,7 @@ EOF
          NM="$(${XCRUN_FIND} nm)"
          RANLIB="$(${XCRUN_FIND} ranlib)"
          AS_SFX=.s
-
-          # Special handling of ld for armv6 because libclang_rt.ios.a does
-          # not contain armv6 support in Apple's clang package:
-          #   Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
-          # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
-          # renders support for armv6 unnecessary because the 3GS and up
-          # support neon.
-          if [ "${tgt_isa}" = "armv6" ]; then
-            LD="$(${XCRUN_FIND} ld)"
-          else
-            LD="${CXX:-$(${XCRUN_FIND} ld)}"
-          fi
+          LD="${CXX:-$(${XCRUN_FIND} ld)}"

          # ASFLAGS is written here instead of using check_add_asflags
          # because we need to overwrite all of ASFLAGS and purge the
@ -1055,6 +1088,19 @@ EOF
            [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
          done

+          case ${tgt_isa} in
+            armv7|armv7s|armv8|arm64)
+              if enabled neon && ! check_xcode_minimum_version; then
+                soft_disable neon
+                log_echo "  neon disabled: upgrade Xcode (need v6.3+)."
+                if enabled neon_asm; then
+                  soft_disable neon_asm
+                  log_echo "  neon_asm disabled: upgrade Xcode (need v6.3+)."
+                fi
+              fi
+              ;;
+          esac
+
          asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"

          if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then
@ -1069,7 +1115,7 @@ EOF
          if enabled rvct; then
            # Check if we have CodeSourcery GCC in PATH. Needed for
            # libraries
-            hash arm-none-linux-gnueabi-gcc 2>&- || \
+            which arm-none-linux-gnueabi-gcc 2>&- || \
              die "Couldn't find CodeSourcery GCC from PATH"

            # Use armcc as a linker to enable translation of
@ -1110,7 +1156,7 @@ EOF
            check_add_ldflags -mfp64
            ;;
          i6400)
-            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight 
+            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight
            check_add_cflags  -mload-store-pairs -mhard-float -mfp64
            check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
            check_add_ldflags -mips64r6 -mabi=64 -mfp64
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@ -211,7 +211,7 @@ for opt in "$@"; do
 done

 # Make one call to fix_path for file_list to improve performance.
-fix_file_list
+fix_file_list file_list

 outfile=${outfile:-/dev/stdout}
 guid=${guid:-`generate_uuid`}
--- a/build/make/ios-Info.plist
+++ b/build/make/ios-Info.plist
@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleExecutable</key>
+	<string>VPX</string>
+	<key>CFBundleIdentifier</key>
+	<string>org.webmproject.VPX</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>VPX</string>
+	<key>CFBundlePackageType</key>
+	<string>FMWK</string>
+	<key>CFBundleShortVersionString</key>
+	<string>${VERSION}</string>
+	<key>CFBundleSignature</key>
+	<string>????</string>
+	<key>CFBundleSupportedPlatforms</key>
+	<array>
+		<string>iPhoneOS</string>
+	</array>
+	<key>CFBundleVersion</key>
+	<string>${VERSION}</string>
+	<key>MinimumOSVersion</key>
+	<string>${IOS_VERSION_MIN}</string>
+	<key>UIDeviceFamily</key>
+	<array>
+		<integer>1</integer>
+		<integer>2</integer>
+	</array>
+	<key>VPXFullVersion</key>
+	<string>${FULLVERSION}</string>
+</dict>
+</plist>
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@ -24,6 +24,7 @@ CONFIGURE_ARGS="--disable-docs
                --disable-unit-tests"
 DIST_DIR="_dist"
 FRAMEWORK_DIR="VPX.framework"
+FRAMEWORK_LIB="VPX.framework/VPX"
 HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
 SCRIPT_DIR=$(dirname "$0")
 LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
@ -137,6 +138,44 @@ create_vpx_framework_config_shim() {
  printf "#endif  // ${include_guard}" >> "${config_file}"
 }

+# Verifies that $FRAMEWORK_LIB fat library contains requested builds.
+verify_framework_targets() {
+  local requested_cpus=""
+  local cpu=""
+
+  # Extract CPU from full target name.
+  for target; do
+    cpu="${target%%-*}"
+    if [ "${cpu}" = "x86" ]; then
+      # lipo -info outputs i386 for libvpx x86 targets.
+      cpu="i386"
+    fi
+    requested_cpus="${requested_cpus}${cpu} "
+  done
+
+  # Get target CPUs present in framework library.
+  local targets_built=$(${LIPO} -info ${FRAMEWORK_LIB})
+
+  # $LIPO -info outputs a string like the following:
+  #   Architectures in the fat file: $FRAMEWORK_LIB <architectures>
+  # Capture only the architecture strings.
+  targets_built=${targets_built##*: }
+
+  # Sort CPU strings to make the next step a simple string compare.
+  local actual=$(echo ${targets_built} | tr " " "\n" | sort | tr "\n" " ")
+  local requested=$(echo ${requested_cpus} | tr " " "\n" | sort | tr "\n" " ")
+
+  vlog "Requested ${FRAMEWORK_LIB} CPUs: ${requested}"
+  vlog "Actual ${FRAMEWORK_LIB} CPUs: ${actual}"
+
+  if [ "${requested}" != "${actual}" ]; then
+    elog "Actual ${FRAMEWORK_LIB} targets do not match requested target list."
+    elog "  Requested target CPUs: ${requested}"
+    elog "  Actual target CPUs: ${actual}"
+    return 1
+  fi
+}
+
 # Configures and builds each target specified by $1, and then builds
 # VPX.framework.
 build_framework() {
@ -157,7 +196,12 @@ build_framework() {
  for target in ${targets}; do
    build_target "${target}"
    target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}"
-    lib_list="${lib_list} ${target_dist_dir}/lib/libvpx.a"
+    if [ "${ENABLE_SHARED}" = "yes" ]; then
+      local suffix="dylib"
+    else
+      local suffix="a"
+    fi
+    lib_list="${lib_list} ${target_dist_dir}/lib/libvpx.${suffix}"
  done

  cd "${ORIG_PWD}"
@ -176,13 +220,25 @@ build_framework() {
  # Copy in vpx_version.h.
  cp -p "${BUILD_ROOT}/${target}/vpx_version.h" "${HEADER_DIR}"

-  vlog "Created fat library ${FRAMEWORK_DIR}/VPX containing:"
+  if [ "${ENABLE_SHARED}" = "yes" ]; then
+    # Adjust the dylib's name so dynamic linking in apps works as expected.
+    install_name_tool -id '@rpath/VPX.framework/VPX' ${FRAMEWORK_DIR}/VPX
+
+    # Copy in Info.plist.
+    cat "${SCRIPT_DIR}/ios-Info.plist" \
+      | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \
+      | sed "s/\${VERSION}/${VERSION}/g" \
+      | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \
+      > "${FRAMEWORK_DIR}/Info.plist"
+  fi
+
+  # Confirm VPX.framework/VPX contains the targets requested.
+  verify_framework_targets ${targets}
+
+  vlog "Created fat library ${FRAMEWORK_LIB} containing:"
  for lib in ${lib_list}; do
    vlog "  $(echo ${lib} | awk -F / '{print $2, $NF}')"
  done
-
-  # TODO(tomfinegan): Verify that expected targets are included within
-  # VPX.framework/VPX via lipo -info.
 }

 # Trap function. Cleans up the subtree used to build all targets contained in
@ -213,6 +269,7 @@ iosbuild_usage() {
 cat << EOF
  Usage: ${0##*/} [arguments]
    --help: Display this message and exit.
+    --enable-shared: Build a dynamic framework for use on iOS 8 or later.
    --extra-configure-args <args>: Extra args to pass when configuring libvpx.
    --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
              and x86_64. Allows linking to framework when builds target MacOSX
@ -251,6 +308,9 @@ while [ -n "$1" ]; do
      iosbuild_usage
      exit
      ;;
+    --enable-shared)
+      ENABLE_SHARED=yes
+      ;;
    --preserve-build-output)
      PRESERVE_BUILD_OUTPUT=yes
      ;;
@ -278,6 +338,21 @@ while [ -n "$1" ]; do
  shift
 done

+if [ "${ENABLE_SHARED}" = "yes" ]; then
+  CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}"
+fi
+
+FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBVPX_SOURCE_DIR}")
+VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/')
+
+if [ "$ENABLE_SHARED" = "yes" ]; then
+  IOS_VERSION_OPTIONS="--enable-shared"
+  IOS_VERSION_MIN="8.0"
+else
+  IOS_VERSION_OPTIONS=""
+  IOS_VERSION_MIN="6.0"
+fi
+
 if [ "${VERBOSE}" = "yes" ]; then
 cat << EOF
  BUILD_ROOT=${BUILD_ROOT}
@ -285,6 +360,7 @@ cat << EOF
  CONFIGURE_ARGS=${CONFIGURE_ARGS}
  EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
  FRAMEWORK_DIR=${FRAMEWORK_DIR}
+  FRAMEWORK_LIB=${FRAMEWORK_LIB}
  HEADER_DIR=${HEADER_DIR}
  LIBVPX_SOURCE_DIR=${LIBVPX_SOURCE_DIR}
  LIPO=${LIPO}
@ -292,8 +368,13 @@ cat << EOF
  ORIG_PWD=${ORIG_PWD}
  PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
  TARGETS="$(print_list "" ${TARGETS})"
+  ENABLE_SHARED=${ENABLE_SHARED}
  OSX_TARGETS="${OSX_TARGETS}"
  SIM_TARGETS="${SIM_TARGETS}"
+  SCRIPT_DIR="${SCRIPT_DIR}"
+  FULLVERSION="${FULLVERSION}"
+  VERSION="${VERSION}"
+  IOS_VERSION_MIN="${IOS_VERSION_MIN}"
 EOF
 fi

--- a/build/make/msvs_common.sh
+++ b/build/make/msvs_common.sh
@ -39,11 +39,12 @@ fix_path() {
 }

 # Corrects the paths in file_list in one pass for efficiency.
+# $1 is the name of the array to be modified.
 fix_file_list() {
-    # TODO(jzern): this could be more generic and take the array as a param.
-    files=$(fix_path "${file_list[@]}")
+    declare -n array_ref=$1
+    files=$(fix_path "${array_ref[@]}")
    local IFS=$'\n'
-    file_list=($files)
+    array_ref=($files)
 }

 generate_uuid() {
--- a/build/make/version.sh
+++ b/build/make/version.sh
@ -24,8 +24,9 @@ out_file=${2}
 id=${3:-VERSION_STRING}

 git_version_id=""
-if [ -d "${source_path}/.git" ]; then
+if [ -e "${source_path}/.git" ]; then
    # Source Path is a git working copy. Check for local modifications.
+    # Note that git submodules may have a file as .git, not a directory.
    export GIT_DIR="${source_path}/.git"
    git_version_id=`git describe --match=v[0-9]* 2>/dev/null`
 fi
--- a/36
+++ b/36
@ -98,7 +98,6 @@ EOF

 # all_platforms is a list of all supported target platforms. Maintain
 # alphabetically by architecture, generic-gnu last.
-all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
@ -191,12 +190,12 @@ if [ ${doxy_major:-0} -ge 1 ]; then
 fi

 # disable codecs when their source directory does not exist
-[ -d "${source_path}/vp8" ] || disable_feature vp8
-[ -d "${source_path}/vp9" ] || disable_feature vp9
-[ -d "${source_path}/vp10" ] || disable_feature vp10
+[ -d "${source_path}/vp8" ] || disable_codec vp8
+[ -d "${source_path}/vp9" ] || disable_codec vp9
+[ -d "${source_path}/vp10" ] || disable_codec vp10

 # disable vp10 codec by default
-disable_feature vp10
+disable_codec vp10

 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
@ -406,15 +405,19 @@ process_cmdline() {
    for opt do
        optval="${opt#*=}"
        case "$opt" in
-        --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
+        --disable-codecs)
+          for c in ${CODEC_FAMILIES}; do disable_codec $c; done
+          ;;
        --enable-?*|--disable-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${EXPERIMENT_LIST}; then
            if enabled experimental; then
                ${action}_feature $option
            else
                log_echo "Ignoring $opt -- not in experimental mode."
            fi
+        elif is_in ${option} "${CODECS} ${CODEC_FAMILIES}"; then
+            ${action}_codec ${option}
        else
            process_common_cmdline $opt
        fi
@ -428,14 +431,6 @@ process_cmdline() {
 post_process_cmdline() {
    c=""

-    # If the codec family is disabled, disable all components of that family.
-    # If the codec family is enabled, enable all components of that family.
-    log_echo "Configuring selected codecs"
-    for c in ${CODECS}; do
-        disabled ${c%%_*} && disable_feature ${c}
-        enabled ${c%%_*} && enable_feature ${c}
-    done
-
    # Enable all detected codecs, if they haven't been disabled
    for c in ${CODECS}; do soft_enable $c; done

@ -530,13 +525,18 @@ process_detect() {
        # Can only build shared libs on a subset of platforms. Doing this check
        # here rather than at option parse time because the target auto-detect
        # magic happens after the command line has been parsed.
-        if ! enabled linux && ! enabled os2; then
+        case "${tgt_os}" in
+        linux|os2|darwin*|iphonesimulator*)
+            # Supported platforms
+            ;;
+        *)
            if enabled gnu; then
                echo "--enable-shared is only supported on ELF; assuming this is OK"
            else
-                die "--enable-shared only supported on ELF and OS/2 for now"
+                die "--enable-shared only supported on ELF, OS/2, and Darwin for now"
            fi
-        fi
+            ;;
+        esac
    fi
    if [ -z "$CC" ] || enabled external_build; then
        echo "Bypassing toolchain for environment detection."
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@ -109,8 +109,8 @@ static const char *exec_name;
 void usage_exit(void) {
  fprintf(stderr,
          "Usage: %s <codec> <width> <height> <infile> <outfile> "
-              "<keyframe-interval> [<error-resilient>]\nSee comments in "
-              "simple_encoder.c for more information.\n",
+              "<keyframe-interval> <error-resilient> <frames to encode>\n"
+              "See comments in simple_encoder.c for more information.\n",
          exec_name);
  exit(EXIT_FAILURE);
 }
@ -147,6 +147,7 @@ static int encode_frame(vpx_codec_ctx_t *codec,
  return got_pkts;
 }

+// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps.
 int main(int argc, char **argv) {
  FILE *infile = NULL;
  vpx_codec_ctx_t codec;
@ -157,12 +158,11 @@ int main(int argc, char **argv) {
  VpxVideoInfo info = {0};
  VpxVideoWriter *writer = NULL;
  const VpxInterface *encoder = NULL;
-  const int fps = 30;        // TODO(dkovalev) add command line argument
-  const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
+  const int fps = 30;
+  const int bitrate = 200;
  int keyframe_interval = 0;
-
-  // TODO(dkovalev): Add some simple command line parsing code to make the
-  // command line more flexible.
+  int max_frames = 0;
+  int frames_encoded = 0;
  const char *codec_arg = NULL;
  const char *width_arg = NULL;
  const char *height_arg = NULL;
@ -172,7 +172,7 @@ int main(int argc, char **argv) {

  exec_name = argv[0];

-  if (argc < 7)
+  if (argc != 9)
    die("Invalid number of arguments");

  codec_arg = argv[1];
@ -181,6 +181,7 @@ int main(int argc, char **argv) {
  infile_arg = argv[4];
  outfile_arg = argv[5];
  keyframe_interval_arg = argv[6];
+  max_frames = strtol(argv[8], NULL, 0);

  encoder = get_vpx_encoder_by_name(codec_arg);
  if (!encoder)
@ -219,7 +220,7 @@ int main(int argc, char **argv) {
  cfg.g_timebase.num = info.time_base.numerator;
  cfg.g_timebase.den = info.time_base.denominator;
  cfg.rc_target_bitrate = bitrate;
-  cfg.g_error_resilient = argc > 7 ? strtol(argv[7], NULL, 0) : 0;
+  cfg.g_error_resilient = strtol(argv[7], NULL, 0);

  writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
  if (!writer)
@ -237,6 +238,9 @@ int main(int argc, char **argv) {
    if (keyframe_interval > 0 && frame_count % keyframe_interval == 0)
      flags |= VPX_EFLAG_FORCE_KF;
    encode_frame(&codec, &raw, frame_count++, flags, writer);
+    frames_encoded++;
+    if (max_frames > 0 && frames_encoded >= max_frames)
+      break;
  }

  // Flush encoder.
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@ -59,7 +59,9 @@
 static const char *exec_name;

 void usage_exit(void) {
-  fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> <infile> <outfile> "
+              "<frame limit>\n",
          exec_name);
  exit(EXIT_FAILURE);
 }
@ -129,7 +131,8 @@ static int encode_frame(vpx_codec_ctx_t *ctx,
 static vpx_fixed_buf_t pass0(vpx_image_t *raw,
                             FILE *infile,
                             const VpxInterface *encoder,
-                             const vpx_codec_enc_cfg_t *cfg) {
+                             const vpx_codec_enc_cfg_t *cfg,
+                             int max_frames) {
  vpx_codec_ctx_t codec;
  int frame_count = 0;
  vpx_fixed_buf_t stats = {NULL, 0};
@ -142,6 +145,8 @@ static vpx_fixed_buf_t pass0(vpx_image_t *raw,
    ++frame_count;
    get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY,
                    &stats);
+    if (max_frames > 0 && frame_count >= max_frames)
+      break;
  }

  // Flush encoder.
@ -159,7 +164,8 @@ static void pass1(vpx_image_t *raw,
                  FILE *infile,
                  const char *outfile_name,
                  const VpxInterface *encoder,
-                  const vpx_codec_enc_cfg_t *cfg) {
+                  const vpx_codec_enc_cfg_t *cfg,
+                  int max_frames) {
  VpxVideoInfo info = {
    encoder->fourcc,
    cfg->g_w,
@ -181,6 +187,9 @@ static void pass1(vpx_image_t *raw,
  while (vpx_img_read(raw, infile)) {
    ++frame_count;
    encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer);
+
+    if (max_frames > 0 && frame_count >= max_frames)
+      break;
  }

  // Flush encoder.
@ -213,11 +222,14 @@ int main(int argc, char **argv) {
  const char *const height_arg = argv[3];
  const char *const infile_arg = argv[4];
  const char *const outfile_arg = argv[5];
+  int max_frames = 0;
  exec_name = argv[0];

-  if (argc != 6)
+  if (argc != 7)
    die("Invalid number of arguments.");

+  max_frames = strtol(argv[6], NULL, 0);
+
  encoder = get_vpx_encoder_by_name(codec_arg);
  if (!encoder)
    die("Unsupported codec.");
@ -249,13 +261,13 @@ int main(int argc, char **argv) {

  // Pass 0
  cfg.g_pass = VPX_RC_FIRST_PASS;
-  stats = pass0(&raw, infile, encoder, &cfg);
+  stats = pass0(&raw, infile, encoder, &cfg, max_frames);

  // Pass 1
  rewind(infile);
  cfg.g_pass = VPX_RC_LAST_PASS;
  cfg.rc_twopass_stats_in = stats;
-  pass1(&raw, infile, outfile_arg, encoder, &cfg);
+  pass1(&raw, infile, outfile_arg, encoder, &cfg, max_frames);
  free(stats.buf);

  vpx_img_free(&raw);
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@ -715,7 +715,7 @@ int main(int argc, char **argv) {
    vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
    vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
-    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
+    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff);
    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
    vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
--- a/ivfdec.c
+++ b/ivfdec.c
@ -23,7 +23,7 @@ static void fix_framerate(int *num, int *den) {
  // we can guess the framerate using only the timebase in this
  // case. Other files would require reading ahead to guess the
  // timebase, like we do for webm.
-  if (*num < 1000) {
+  if (*den > 0 && *den < 1000000000 && *num > 0 && *num < 1000) {
    // Correct for the factor of 2 applied to the timebase in the encoder.
    if (*num & 1)
      *den *= 2;
--- a/libs.mk
+++ b/libs.mk
@ -183,6 +183,9 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
 endif
 CODEC_EXPORTS-yes += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
+ifeq ($(CONFIG_SPATIAL_SVC),yes)
+CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_spatial_svc
+endif
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec

 INSTALL-LIBS-yes += include/vpx/vpx_codec.h
@ -270,6 +273,12 @@ EXPORT_FILE             := libvpx.syms
 LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
                             libvpx.dylib  )
 else
+ifeq ($(filter iphonesimulator%,$(TGT_OS)),$(TGT_OS))
+LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
+SHARED_LIB_SUF          := .dylib
+EXPORT_FILE             := libvpx.syms
+LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, libvpx.dylib)
+else
 ifeq ($(filter os2%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO               := libvpx$(SO_VERSION_MAJOR).dll
 SHARED_LIB_SUF          := _dll.a
@ -285,6 +294,7 @@ LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
                             libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
 endif
 endif
+endif

 LIBS-$(CONFIG_SHARED) += $(BUILD_PFX)$(LIBVPX_SO)\
                           $(notdir $(LIBVPX_SO_SYMLINKS)) \
--- a/md5_utils.c
+++ b/md5_utils.c
@ -150,12 +150,23 @@ MD5Final(md5byte digest[16], struct MD5Context *ctx) {
 #define MD5STEP(f,w,x,y,z,in,s) \
  (w += f(x,y,z) + in, w = (w<<s | w>>(32-s)) + x)

+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#endif
+
 /*
 * The core of the MD5 algorithm, this alters an existing MD5 hash to
 * reflect the addition of 16 longwords of new data.  MD5Update blocks
 * the data and converts bytes into longwords for this routine.
 */
-void
+VPX_NO_UNSIGNED_OVERFLOW_CHECK void
 MD5Transform(UWORD32 buf[4], UWORD32 const in[16]) {
  register UWORD32 a, b, c, d;

@ -238,4 +249,6 @@ MD5Transform(UWORD32 buf[4], UWORD32 const in[16]) {
  buf[3] += d;
 }

+#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+
 #endif
--- a/test/acm_random.h
+++ b/test/acm_random.h
@ -32,6 +32,12 @@ class ACMRandom {
    return (value >> 15) & 0xffff;
  }

+  int16_t Rand9Signed(void) {
+    // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
+    const uint32_t value = random_.Generate(512);
+    return static_cast<int16_t>(value) - 256;
+  }
+
  uint8_t Rand8(void) {
    const uint32_t value =
        random_.Generate(testing::internal::Random::kMaxRange);
--- a/test/add_noise_test.cc
+++ b/test/add_noise_test.cc
@ -0,0 +1,197 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <math.h>
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+// TODO(jimbankoski): make width and height integers not unsigned.
+typedef void (*AddNoiseFunc)(unsigned char *start, char *noise,
+                             char blackclamp[16], char whiteclamp[16],
+                             char bothclamp[16], unsigned int width,
+                             unsigned int height, int pitch);
+
+class AddNoiseTest
+    : public ::testing::TestWithParam<AddNoiseFunc> {
+ public:
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+  virtual ~AddNoiseTest() {}
+};
+
+double stddev6(char a, char b, char c, char d, char e, char f) {
+  const double n = (a + b + c + d + e + f) / 6.0;
+  const double v = ((a - n) * (a - n) + (b - n) * (b - n) + (c - n) * (c - n) +
+                    (d - n) * (d - n) + (e - n) * (e - n) + (f - n) * (f - n)) /
+                   6.0;
+  return sqrt(v);
+}
+
+// TODO(jimbankoski): The following 2 functions are duplicated in each codec.
+// For now the vp9 one has been copied into the test as is. We should normalize
+// these in vpx_dsp and not have 3 copies of these unless there is different
+// noise we add for each codec.
+
+double gaussian(double sigma, double mu, double x) {
+  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+int setup_noise(int size_noise, char *noise) {
+  char char_dist[300];
+  const int ai = 4;
+  const int qi = 24;
+  const double sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+  /* set up a lookup table of 256 entries that matches
+   * a gaussian distribution with sigma determined by q.
+   */
+  int next = 0;
+
+  for (int i = -32; i < 32; i++) {
+    int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i));
+
+    if (a_i) {
+      for (int j = 0; j < a_i; j++) {
+        char_dist[next + j] = (char)(i);
+      }
+
+      next = next + a_i;
+    }
+  }
+
+  for (; next < 256; next++)
+    char_dist[next] = 0;
+
+  for (int i = 0; i < size_noise; i++) {
+    noise[i] = char_dist[rand() & 0xff];  // NOLINT
+  }
+
+  // Returns the most negative value in distribution.
+  return char_dist[0];
+}
+
+TEST_P(AddNoiseTest, CheckNoiseAdded) {
+  DECLARE_ALIGNED(16, char, blackclamp[16]);
+  DECLARE_ALIGNED(16, char, whiteclamp[16]);
+  DECLARE_ALIGNED(16, char, bothclamp[16]);
+  const int width  = 64;
+  const int height = 64;
+  const int image_size = width * height;
+  char noise[3072];
+
+  const int clamp = setup_noise(3072, noise);
+  for (int i = 0; i < 16; i++) {
+    blackclamp[i] = -clamp;
+    whiteclamp[i] = -clamp;
+    bothclamp[i] = -2 * clamp;
+  }
+
+  uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
+  memset(s, 99, image_size);
+
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+
+  // Check to make sure we don't end up having either the same or no added
+  // noise either vertically or horizontally.
+  for (int i = 0; i < image_size - 6 * width - 6; ++i) {
+    const double hd = stddev6(s[i] - 99, s[i + 1] - 99, s[i + 2] - 99,
+                              s[i + 3] - 99, s[i + 4] - 99, s[i + 5] - 99);
+    const double vd = stddev6(s[i] - 99, s[i + width] - 99,
+                              s[i + 2 * width] - 99, s[i + 3 * width] - 99,
+                              s[i + 4 * width] - 99, s[i + 5 * width] - 99);
+
+    EXPECT_NE(hd, 0);
+    EXPECT_NE(vd, 0);
+  }
+
+  // Initialize pixels in the image to 255 and check for roll over.
+  memset(s, 255, image_size);
+
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+
+  // Check to make sure don't roll over.
+  for (int i = 0; i < image_size; ++i) {
+    EXPECT_GT((int)s[i], 10) << "i = " << i;
+  }
+
+  // Initialize pixels in the image to 0 and check for roll under.
+  memset(s, 0, image_size);
+
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+
+  // Check to make sure don't roll under.
+  for (int i = 0; i < image_size; ++i) {
+    EXPECT_LT((int)s[i], 245) << "i = " << i;
+  }
+
+  vpx_free(s);
+}
+
+TEST_P(AddNoiseTest, CheckCvsAssembly) {
+  DECLARE_ALIGNED(16, char, blackclamp[16]);
+  DECLARE_ALIGNED(16, char, whiteclamp[16]);
+  DECLARE_ALIGNED(16, char, bothclamp[16]);
+  const int width  = 64;
+  const int height = 64;
+  const int image_size = width * height;
+  char noise[3072];
+
+  const int clamp = setup_noise(3072, noise);
+  for (int i = 0; i < 16; i++) {
+    blackclamp[i] = -clamp;
+    whiteclamp[i] = -clamp;
+    bothclamp[i] = -2 * clamp;
+  }
+
+  uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
+  uint8_t *const d = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
+
+  memset(s, 99, image_size);
+  memset(d, 99, image_size);
+
+  srand(0);
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+  srand(0);
+  ASM_REGISTER_STATE_CHECK(vpx_plane_add_noise_c(d, noise, blackclamp,
+                                                 whiteclamp, bothclamp,
+                                                 width, height, width));
+
+  for (int i = 0; i < image_size; ++i) {
+    EXPECT_EQ((int)s[i], (int)d[i]) << "i = " << i;
+  }
+
+  vpx_free(d);
+  vpx_free(s);
+}
+
+INSTANTIATE_TEST_CASE_P(C, AddNoiseTest,
+                        ::testing::Values(vpx_plane_add_noise_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest,
+                        ::testing::Values(vpx_plane_add_noise_sse2));
+#endif
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, AddNoiseTest,
+                        ::testing::Values(vpx_plane_add_noise_msa));
+#endif
+}  // namespace
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@ -138,7 +138,8 @@ void filter_block2d_8_c(const uint8_t *src_ptr,
  // and filter_max_width          = 16
  //
  uint8_t intermediate_buffer[(kMaxDimension+8) * kMaxDimension];
-  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+  const int intermediate_next_stride =
+      1 - static_cast<int>(intermediate_height * output_width);

  // Horizontal pass (src -> transposed intermediate).
  uint8_t *output_ptr = intermediate_buffer;
@ -250,7 +251,8 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
   * and filter_max_width = 16
   */
  uint16_t intermediate_buffer[(kMaxDimension+8) * kMaxDimension];
-  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+  const int intermediate_next_stride =
+      1 - static_cast<int>(intermediate_height * output_width);

  // Horizontal pass (src -> transposed intermediate).
  {
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@ -90,7 +90,7 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
          << pkt->data.frame.pts;
    }

-    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    const int64_t frame_size_in_bits = pkt->data.frame.sz * 8;

    // Subtract from the buffer the bits associated with a played back frame.
    bits_in_buffer_model_ -= frame_size_in_bits;
@ -450,7 +450,28 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
  int denoiser_offon_period_;
 };

-// Check basic rate targeting,
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestVP9Large, BasicRateTargetingVBR) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  for (int i = 400; i <= 800; i += 400) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+        << " The datarate for the file is greater than target by too much!";
+  }
+}
+
+// Check basic rate targeting for CBR,
 TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
  cfg_.rc_buf_initial_sz = 500;
  cfg_.rc_buf_optimal_sz = 500;
@ -474,7 +495,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
  }
 }

-// Check basic rate targeting,
+// Check basic rate targeting for CBR.
 TEST_P(DatarateTestVP9Large, BasicRateTargeting444) {
  ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);

--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@ -365,10 +365,10 @@ class Trans16x16TestBase {

      for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int32_t diff =
            bit_depth_ == VPX_BITS_8 ?  dst[j] - src[j] : dst16[j] - src16[j];
 #else
-        const uint32_t diff = dst[j] - src[j];
+        const int32_t diff = dst[j] - src[j];
 #endif
        const uint32_t error = diff * diff;
        if (max_error < error)
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@ -147,10 +147,10 @@ TEST_P(Trans32x32Test, AccuracyCheck) {

    for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-      const uint32_t diff =
+      const int32_t diff =
          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
-      const uint32_t diff = dst[j] - src[j];
+      const int32_t diff = dst[j] - src[j];
 #endif
      const uint32_t error = diff * diff;
      if (max_error < error)
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@ -302,22 +302,12 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8, 16)));
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

-#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
-    !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
-    MMX, Trans4x4WHT,
-    ::testing::Values(
-        make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0,
-                   VPX_BITS_8, 16)));
-#endif
-
-#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
-    !CONFIG_EMULATE_HARDWARE
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
    SSE2, Trans4x4WHT,
    ::testing::Values(
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0,
-                   VPX_BITS_8, 16)));
+        make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8, 16),
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8, 16)));
 #endif

 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@ -425,10 +425,10 @@ class FwdTrans8x8TestBase {

      for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
-        const uint32_t diff = dst[j] - src[j];
+        const int diff = dst[j] - src[j];
 #endif
        const uint32_t error = diff * diff;
        EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
@ -458,7 +458,7 @@ class FwdTrans8x8TestBase {
        coeff_r[j] = static_cast<tran_low_t>(round(out_r[j]));

      for (int j = 0; j < kNumCoeffs; ++j) {
-        const uint32_t diff = coeff[j] - coeff_r[j];
+        const int32_t diff = coeff[j] - coeff_r[j];
        const uint32_t error = diff * diff;
        EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
            << "Error: 8x8 DCT has error " << error
@ -511,10 +511,10 @@ void CompareInvReference(IdctFunc ref_txfm, int thresh) {

      for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
 #else
-        const uint32_t diff = dst[j] - ref[j];
+        const int diff = dst[j] - ref[j];
 #endif
        const uint32_t error = diff * diff;
        EXPECT_EQ(0u, error)
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@ -0,0 +1,220 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <algorithm>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+
+typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b);
+
+void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
+  int16_t b[8];
+  for (int i = 0; i < 8; i += 2) {
+    b[i + 0] = a[i * a_stride] + a[(i + 1) * a_stride];
+    b[i + 1] = a[i * a_stride] - a[(i + 1) * a_stride];
+  }
+  int16_t c[8];
+  for (int i = 0; i < 8; i += 4) {
+    c[i + 0] = b[i + 0] + b[i + 2];
+    c[i + 1] = b[i + 1] + b[i + 3];
+    c[i + 2] = b[i + 0] - b[i + 2];
+    c[i + 3] = b[i + 1] - b[i + 3];
+  }
+  out[0] = c[0] + c[4];
+  out[7] = c[1] + c[5];
+  out[3] = c[2] + c[6];
+  out[4] = c[3] + c[7];
+  out[2] = c[0] - c[4];
+  out[6] = c[1] - c[5];
+  out[1] = c[2] - c[6];
+  out[5] = c[3] - c[7];
+}
+
+void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) {
+  int16_t buf[64];
+  for (int i = 0; i < 8; ++i) {
+    hadamard_loop(a + i, a_stride, buf + i * 8);
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    hadamard_loop(buf + i, 8, b + i * 8);
+  }
+}
+
+void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
+  /* The source is a 16x16 block. The destination is rearranged to 8x32.
+   * Input is 9 bit. */
+  reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
+  reference_hadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64);
+  reference_hadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128);
+  reference_hadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192);
+
+  /* Overlay the 8x8 blocks and combine. */
+  for (int i = 0; i < 64; ++i) {
+    /* 8x8 steps the range up to 15 bits. */
+    const int16_t a0 = b[0];
+    const int16_t a1 = b[64];
+    const int16_t a2 = b[128];
+    const int16_t a3 = b[192];
+
+    /* Prevent the result from escaping int16_t. */
+    const int16_t b0 = (a0 + a1) >> 1;
+    const int16_t b1 = (a0 - a1) >> 1;
+    const int16_t b2 = (a2 + a3) >> 1;
+    const int16_t b3 = (a2 - a3) >> 1;
+
+    /* Store a 16 bit value. */
+    b[  0] = b0 + b2;
+    b[ 64] = b1 + b3;
+    b[128] = b0 - b2;
+    b[192] = b1 - b3;
+
+    ++b;
+  }
+}
+
+class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
+ public:
+  virtual void SetUp() {
+    h_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  HadamardFunc h_func_;
+  ACMRandom rnd_;
+};
+
+class Hadamard8x8Test : public HadamardTestBase {};
+
+TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
+  DECLARE_ALIGNED(16, int16_t, a[64]);
+  DECLARE_ALIGNED(16, int16_t, b[64]);
+  int16_t b_ref[64];
+  for (int i = 0; i < 64; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+  memset(b, 0, sizeof(b));
+  memset(b_ref, 0, sizeof(b_ref));
+
+  reference_hadamard8x8(a, 8, b_ref);
+  ASM_REGISTER_STATE_CHECK(h_func_(a, 8, b));
+
+  // The order of the output is not important. Sort before checking.
+  std::sort(b, b + 64);
+  std::sort(b_ref, b_ref + 64);
+  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+}
+
+TEST_P(Hadamard8x8Test, VaryStride) {
+  DECLARE_ALIGNED(16, int16_t, a[64 * 8]);
+  DECLARE_ALIGNED(16, int16_t, b[64]);
+  int16_t b_ref[64];
+  for (int i = 0; i < 64 * 8; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+
+  for (int i = 8; i < 64; i += 8) {
+    memset(b, 0, sizeof(b));
+    memset(b_ref, 0, sizeof(b_ref));
+
+    reference_hadamard8x8(a, i, b_ref);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + 64);
+    std::sort(b_ref, b_ref + 64);
+    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_ssse3));
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_neon));
+#endif  // HAVE_NEON
+
+class Hadamard16x16Test : public HadamardTestBase {};
+
+TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
+  DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
+  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
+  int16_t b_ref[16 * 16];
+  for (int i = 0; i < 16 * 16; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+  memset(b, 0, sizeof(b));
+  memset(b_ref, 0, sizeof(b_ref));
+
+  reference_hadamard16x16(a, 16, b_ref);
+  ASM_REGISTER_STATE_CHECK(h_func_(a, 16, b));
+
+  // The order of the output is not important. Sort before checking.
+  std::sort(b, b + 16 * 16);
+  std::sort(b_ref, b_ref + 16 * 16);
+  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+}
+
+TEST_P(Hadamard16x16Test, VaryStride) {
+  DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]);
+  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
+  int16_t b_ref[16 * 16];
+  for (int i = 0; i < 16 * 16 * 8; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+
+  for (int i = 8; i < 64; i += 8) {
+    memset(b, 0, sizeof(b));
+    memset(b_ref, 0, sizeof(b_ref));
+
+    reference_hadamard16x16(a, i, b_ref);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + 16 * 16);
+    std::sort(b_ref, b_ref + 16 * 16);
+    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_neon));
+#endif  // HAVE_NEON
+}  // namespace
--- a/test/level_test.cc
+++ b/test/level_test.cc
@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+class LevelTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  LevelTest()
+     : EncoderTest(GET_PARAM(0)),
+       encoding_mode_(GET_PARAM(1)),
+       cpu_used_(GET_PARAM(2)),
+       min_gf_internal_(24),
+       target_level_(0),
+       level_(0) {}
+  virtual ~LevelTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+    }
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    cfg_.rc_target_bitrate = 400;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_);
+      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, min_gf_internal_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+    encoder->Control(VP9E_GET_LEVEL, &level_);
+    ASSERT_LE(level_, 51);
+    ASSERT_GE(level_, 0);
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int min_gf_internal_;
+  int target_level_;
+  int level_;
+};
+
+// Test for keeping level stats only
+TEST_P(LevelTest, TestTargetLevel0) {
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+  target_level_ = 0;
+  min_gf_internal_ = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(11, level_);
+
+  cfg_.rc_target_bitrate = 1600;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(20, level_);
+}
+
+// Test for level control being turned off
+TEST_P(LevelTest, TestTargetLevel255) {
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       30);
+  target_level_ = 255;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(LevelTest, TestTargetLevelApi) {
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1);
+  static const vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo;
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0));
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, codec, &cfg, 0));
+  for (int level = 0; level <= 256; ++level) {
+    if (level == 10 || level == 11 || level == 20 || level == 21 ||
+        level == 30 || level == 31 || level == 40 || level == 41 ||
+        level == 50 || level == 51 || level == 52 || level == 60 ||
+        level == 61 || level == 62 || level == 0 || level == 255)
+      EXPECT_EQ(VPX_CODEC_OK,
+                vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level));
+    else
+      EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+                vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level));
+  }
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc));
+}
+
+VP9_INSTANTIATE_TEST_CASE(LevelTest,
+                          ::testing::Values(::libvpx_test::kTwoPassGood,
+                                            ::libvpx_test::kOnePassGood),
+                          ::testing::Range(0, 9));
+}  // namespace
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@ -430,16 +430,6 @@ TEST_P(Loop8Test9Param, ValueCheck) {

 using std::tr1::make_tuple;

-#if HAVE_MMX && CONFIG_USE_X86INC && !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    MMX, Loop8Test6Param,
-    ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_4_mmx,
-                   &vpx_lpf_horizontal_4_c, 8),
-        make_tuple(&vpx_lpf_vertical_4_mmx,
-                   &vpx_lpf_vertical_4_c, 8)));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@ -497,12 +487,16 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
    SSE2, Loop8Test6Param,
    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_sse2,
+                   &vpx_lpf_horizontal_4_c, 8),
        make_tuple(&vpx_lpf_horizontal_8_sse2,
                   &vpx_lpf_horizontal_8_c, 8),
        make_tuple(&vpx_lpf_horizontal_edge_8_sse2,
                   &vpx_lpf_horizontal_edge_8_c, 8),
        make_tuple(&vpx_lpf_horizontal_edge_16_sse2,
                   &vpx_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_sse2,
+                   &vpx_lpf_vertical_4_c, 8),
        make_tuple(&vpx_lpf_vertical_8_sse2,
                   &vpx_lpf_vertical_8_c, 8),
        make_tuple(&vpx_lpf_vertical_16_sse2,
--- a/test/minmax_test.cc
+++ b/test/minmax_test.cc
@ -0,0 +1,132 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+
+typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride,
+                           const uint8_t *b, int b_stride,
+                           int *min, int *max);
+
+class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
+ public:
+  virtual void SetUp() {
+    mm_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  MinMaxFunc mm_func_;
+  ACMRandom rnd_;
+};
+
+void reference_minmax(const uint8_t *a, int a_stride,
+                      const uint8_t *b, int b_stride,
+                      int *min_ret, int *max_ret) {
+  int min = 255;
+  int max = 0;
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      const int diff = abs(a[i * a_stride + j] - b[i * b_stride + j]);
+      if (min > diff) min = diff;
+      if (max < diff) max = diff;
+    }
+  }
+
+  *min_ret = min;
+  *max_ret = max;
+}
+
+TEST_P(MinMaxTest, MinValue) {
+  for (int i = 0; i < 64; i++) {
+    uint8_t a[64], b[64];
+    memset(a, 0, sizeof(a));
+    memset(b, 255, sizeof(b));
+    b[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(255, max);
+    EXPECT_EQ(i, min);
+  }
+}
+
+TEST_P(MinMaxTest, MaxValue) {
+  for (int i = 0; i < 64; i++) {
+    uint8_t a[64], b[64];
+    memset(a, 0, sizeof(a));
+    memset(b, 0, sizeof(b));
+    b[i] = i;  // Set a maximum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(i, max);
+    EXPECT_EQ(0, min);
+  }
+}
+
+TEST_P(MinMaxTest, CompareReference) {
+  uint8_t a[64], b[64];
+  for (int j = 0; j < 64; j++) {
+    a[j] = rnd_.Rand8();
+    b[j] = rnd_.Rand8();
+  }
+
+  int min_ref, max_ref, min, max;
+  reference_minmax(a, 8, b, 8, &min_ref, &max_ref);
+  ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+  EXPECT_EQ(max_ref, max);
+  EXPECT_EQ(min_ref, min);
+}
+
+TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
+  uint8_t a[8 * 64], b[8 * 64];
+  for (int i = 0; i < 8 * 64; i++) {
+    a[i] = rnd_.Rand8();
+    b[i] = rnd_.Rand8();
+  }
+  for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
+    for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
+      int min_ref, max_ref, min, max;
+      reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
+      ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
+      EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride
+                              << " and b_stride = " << b_stride;;
+      EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride
+                              << " and b_stride = " << b_stride;;
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, MinMaxTest,
+                        ::testing::Values(&vpx_minmax_8x8_sse2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest,
+                        ::testing::Values(&vpx_minmax_8x8_neon));
+#endif
+
+}  // namespace
--- a/test/realtime_test.cc
+++ b/test/realtime_test.cc
@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+const int kVideoSourceWidth = 320;
+const int kVideoSourceHeight = 240;
+const int kFramesToEncode = 2;
+
+class RealtimeTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  RealtimeTest()
+      : EncoderTest(GET_PARAM(0)), frame_packets_(0) {}
+  virtual ~RealtimeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    cfg_.g_lag_in_frames = 0;
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    // TODO(tomfinegan): We're changing the pass value here to make sure
+    // we get frames when real time mode is combined with |g_pass| set to
+    // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets
+    // the pass value based on the mode passed into EncoderTest::SetMode(),
+    // which overrides the one specified in SetUp() above.
+    cfg_.g_pass = VPX_RC_FIRST_PASS;
+  }
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
+    frame_packets_++;
+  }
+
+  int frame_packets_;
+};
+
+TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) {
+  ::libvpx_test::RandomVideoSource video;
+  video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
+  video.set_limit(kFramesToEncode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_EQ(kFramesToEncode, frame_packets_);
+}
+
+VP8_INSTANTIATE_TEST_CASE(RealtimeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_CASE(RealtimeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
+
+}  // namespace
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@ -36,16 +36,10 @@
 #include <windows.h>
 #include <winnt.h>

-namespace testing {
-namespace internal {
-
 inline bool operator==(const M128A& lhs, const M128A& rhs) {
  return (lhs.Low == rhs.Low && lhs.High == rhs.High);
 }

-}  // namespace internal
-}  // namespace testing
-
 namespace libvpx_test {

 // Compares the state of xmm[6-15] at construction with their state at
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@ -7,6 +7,8 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
+#include <stdio.h>
+
 #include <climits>
 #include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
@ -558,9 +560,13 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
    }
  }

+#if CONFIG_VP9_DECODER
  // Verify that we get 1 resize down event in this test.
  ASSERT_EQ(1, resize_count) << "Resizing should occur.";
  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+  printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+#endif
 }

 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
@ -602,9 +608,13 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
    }
  }

+#if CONFIG_VP9_DECODER
  // Verify that we get 2 resize events in this test.
  ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+  printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+#endif
 }

 vpx_img_fmt_t CspForFrameNumber(int frame) {
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@ -749,17 +749,6 @@ INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));

 //------------------------------------------------------------------------------
 // x86 functions
-#if HAVE_MMX
-const SadMxNParam mmx_tests[] = {
-  make_tuple(16, 16, &vpx_sad16x16_mmx, -1),
-  make_tuple(16, 8, &vpx_sad16x8_mmx, -1),
-  make_tuple(8, 16, &vpx_sad8x16_mmx, -1),
-  make_tuple(8, 8, &vpx_sad8x8_mmx, -1),
-  make_tuple(4, 4, &vpx_sad4x4_mmx, -1),
-};
-INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
 const SadMxNParam sse2_tests[] = {
--- a/test/simple_encoder.sh
+++ b/test/simple_encoder.sh
@ -23,7 +23,7 @@ simple_encoder_verify_environment() {
  fi
 }

-# Runs simple_encoder using the codec specified by $1.
+# Runs simple_encoder using the codec specified by $1 with a frame limit of 100.
 simple_encoder() {
  local encoder="${LIBVPX_BIN_PATH}/simple_encoder${VPX_TEST_EXE_SUFFIX}"
  local codec="$1"
@ -35,7 +35,7 @@ simple_encoder() {
  fi

  eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
-      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 100 \
      ${devnull}

  [ -e "${output_file}" ] || return 1
@ -47,16 +47,13 @@ simple_encoder_vp8() {
  fi
 }

-# TODO(tomfinegan): Add a frame limit param to simple_encoder and enable this
-# test. VP9 is just too slow right now: This test takes 4m30s+ on a fast
-# machine.
-DISABLED_simple_encoder_vp9() {
+simple_encoder_vp9() {
  if [ "$(vp9_encode_available)" = "yes" ]; then
    simple_encoder vp9 || return 1
  fi
 }

 simple_encoder_tests="simple_encoder_vp8
-                      DISABLED_simple_encoder_vp9"
+                      simple_encoder_vp9"

 run_tests simple_encoder_verify_environment "${simple_encoder_tests}"
--- a/test/test.mk
+++ b/test/test.mk
@ -25,6 +25,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += datarate_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_api_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += realtime_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += yuv_video_source.h
@ -43,6 +44,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc

 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
@ -108,6 +110,7 @@ LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp8_fragments_test.cc
 endif

+LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += add_noise_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
@ -148,6 +151,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@ -191,14 +191,15 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
 INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
                vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
                vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
-                vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
+                NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
                vpx_tm_predictor_4x4_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC

 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
-                NULL, vpx_d45_predictor_4x4_ssse3, NULL, NULL,
-                vpx_d153_predictor_4x4_ssse3, vpx_d207_predictor_4x4_ssse3,
+                NULL, NULL, NULL, NULL,
+                vpx_d153_predictor_4x4_ssse3, NULL,
                vpx_d63_predictor_4x4_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC

@ -240,13 +241,13 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
 INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
                vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
                vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
-                vpx_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL,
-                NULL, vpx_tm_predictor_8x8_sse2)
+                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC

 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
-                NULL, vpx_d45_predictor_8x8_ssse3, NULL, NULL,
+                NULL, NULL, NULL, NULL,
                vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
                vpx_d63_predictor_8x8_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
--- a/test/twopass_encoder.sh
+++ b/test/twopass_encoder.sh
@ -23,7 +23,8 @@ twopass_encoder_verify_environment() {
  fi
 }

-# Runs twopass_encoder using the codec specified by $1.
+# Runs twopass_encoder using the codec specified by $1 with a frame limit of
+# 100.
 twopass_encoder() {
  local encoder="${LIBVPX_BIN_PATH}/twopass_encoder${VPX_TEST_EXE_SUFFIX}"
  local codec="$1"
@ -35,7 +36,7 @@ twopass_encoder() {
  fi

  eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
-      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 100 \
      ${devnull}

  [ -e "${output_file}" ] || return 1
@ -47,16 +48,13 @@ twopass_encoder_vp8() {
  fi
 }

-# TODO(tomfinegan): Add a frame limit param to twopass_encoder and enable this
-# test. VP9 is just too slow right now: This test takes 31m16s+ on a fast
-# machine.
-DISABLED_twopass_encoder_vp9() {
+twopass_encoder_vp9() {
  if [ "$(vp9_encode_available)" = "yes" ]; then
    twopass_encoder vp9 || return 1
  fi
 }

 twopass_encoder_tests="twopass_encoder_vp8
-                       DISABLED_twopass_encoder_vp9"
+                       twopass_encoder_vp9"

 run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@ -1062,30 +1062,6 @@ INSTANTIATE_TEST_CASE_P(
    ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
 #endif  // CONFIG_VP9_HIGHBITDEPTH

-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_mmx)));
-
-INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
-                        ::testing::Values(vpx_get_mb_ss_mmx));
-
-INSTANTIATE_TEST_CASE_P(
-    MMX, VpxVarianceTest,
-    ::testing::Values(make_tuple(4, 4, &vpx_variance16x16_mmx, 0),
-                      make_tuple(4, 3, &vpx_variance16x8_mmx, 0),
-                      make_tuple(3, 4, &vpx_variance8x16_mmx, 0),
-                      make_tuple(3, 3, &vpx_variance8x8_mmx, 0),
-                      make_tuple(2, 2, &vpx_variance4x4_mmx, 0)));
-
-INSTANTIATE_TEST_CASE_P(
-    MMX, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_mmx, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_mmx, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_mmx, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_mmx, 0)));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
                        ::testing::Values(vpx_get_mb_ss_sse2));
@ -1126,8 +1102,8 @@ INSTANTIATE_TEST_CASE_P(
                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse, 0)));
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));

 INSTANTIATE_TEST_CASE_P(
    SSE2, VpxSubpelAvgVarianceTest,
@ -1143,8 +1119,8 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
-        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse, 0),
-        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
+        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
+        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
 #endif  // CONFIG_USE_X86INC

 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
--- a/test/vp9_denoiser_sse2_test.cc
+++ b/test/vp9_denoiser_sse2_test.cc
@ -94,8 +94,7 @@ TEST_P(VP9DenoiserTest, BitexactCheck) {
 // Test for all block size.
 INSTANTIATE_TEST_CASE_P(
    SSE2, VP9DenoiserTest,
-    ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
-                      BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32,
-                      BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
-                      BLOCK_64X64));
+    ::testing::Values(BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+                      BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64,
+                      BLOCK_64X32, BLOCK_64X64));
 }  // namespace
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@ -62,7 +62,7 @@ class WebMVideoSource : public CompressedVideoSource {

  void FillFrame() {
    ASSERT_TRUE(vpx_ctx_->file != NULL);
-    const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
+    const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
    ASSERT_GE(status, 0) << "webm_read_frame failed";
    if (status == 1) {
      end_of_file_ = true;
@ -72,7 +72,7 @@ class WebMVideoSource : public CompressedVideoSource {
  void SeekToNextKeyFrame() {
    ASSERT_TRUE(vpx_ctx_->file != NULL);
    do {
-      const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
+      const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
      ASSERT_GE(status, 0) << "webm_read_frame failed";
      ++frame_;
      if (status == 1) {
--- a/vp10/common/postproc.c
+++ b/vp10/common/postproc.c
@ -13,6 +13,7 @@
 #include <stdio.h>

 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "./vp10_rtcd.h"

@ -587,32 +588,6 @@ static void fillrd(struct postproc_state *state, int q, int a) {
  state->last_noise = a;
 }

-void vp10_plane_add_noise_c(uint8_t *start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int width, unsigned int height, int pitch) {
-  unsigned int i, j;
-
-  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
-  // fix..
-  (void) bothclamp;
-  for (i = 0; i < height; i++) {
-    uint8_t *pos = start + i * pitch;
-    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
-
-    for (j = 0; j < width; j++) {
-      if (pos[j] < blackclamp[0])
-        pos[j] = blackclamp[0];
-
-      if (pos[j] > 255 + whiteclamp[0])
-        pos[j] = 255 + whiteclamp[0];
-
-      pos[j] += ref[j];
-    }
-  }
-}
-
 static void swap_mi_and_prev_mi(VP10_COMMON *cm) {
  // Current mip will be the prev_mip for the next frame.
  MODE_INFO *temp = cm->postproc_state.prev_mip;
@ -727,7 +702,7 @@ int vp10_post_proc_frame(struct VP10Common *cm,
      fillrd(ppstate, 63 - q, noise_level);
    }

-    vp10_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+    vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
                        ppstate->whiteclamp, ppstate->bothclamp,
                        ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
  }
--- a/vp10/common/vp10_inv_txfm.c
+++ b/vp10/common/vp10_inv_txfm.c
--- a/vp10/common/vp10_inv_txfm.h
+++ b/vp10/common/vp10_inv_txfm.h
@ -15,13 +15,14 @@

 #include "./vpx_config.h"
 #include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

-static INLINE tran_low_t check_range(tran_high_t input) {
+static INLINE tran_high_t check_range(tran_high_t input) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
  // For valid VP9 input streams, intermediate stage coefficients should always
  // stay within the range of a signed 16 bit integer. Coefficients can go out
@ -32,17 +33,17 @@ static INLINE tran_low_t check_range(tran_high_t input) {
  assert(INT16_MIN <= input);
  assert(input <= INT16_MAX);
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  return (tran_low_t)input;
+  return input;
 }

-static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return check_range(rv);
+  return rv;
 }

 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE tran_low_t highbd_check_range(tran_high_t input,
-                                            int bd) {
+static INLINE tran_high_t highbd_check_range(tran_high_t input,
+                                             int bd) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
  // stay within the ranges:
@ -56,13 +57,12 @@ static INLINE tran_low_t highbd_check_range(tran_high_t input,
  (void) int_min;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
  (void) bd;
-  return (tran_low_t)input;
+  return input;
 }

-static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
-                                                      int bd) {
+static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return highbd_check_range(rv, bd);
+  return rv;
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH

@ -83,9 +83,21 @@ static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
 // bd of 10 uses trans_low with 18bits, need to remove 14bits
 // bd of 12 uses trans_low with 20bits, need to remove 12bits
 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) ((int32_t)(x))
+
+#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+        ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#else   // CONFIG_EMULATE_HARDWARE
+
+#define WRAPLOW(x) ((int32_t)check_range(x))
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+        ((int32_t)highbd_check_range((x), bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // CONFIG_EMULATE_HARDWARE

 void vp10_idct4_c(const tran_low_t *input, tran_low_t *output);
@ -107,14 +119,14 @@ void vp10_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);

 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                             int bd) {
-  trans = WRAPLOW(trans, bd);
-  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+  trans = HIGHBD_WRAPLOW(trans, bd);
+  return clip_pixel_highbd(dest + trans, bd);
 }
 #endif

 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
-  trans = WRAPLOW(trans, 8);
-  return clip_pixel(WRAPLOW(dest + trans, 8));
+  trans = WRAPLOW(trans);
+  return clip_pixel(dest + trans);
 }
 #ifdef __cplusplus
 }  // extern "C"
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@ -73,10 +73,6 @@ add_proto qw/void vp10_post_proc_down_and_across/, "const uint8_t *src_ptr, uint
 specialize qw/vp10_post_proc_down_and_across sse2/;
 $vp10_post_proc_down_and_across_sse2=vp10_post_proc_down_and_across_xmm;

-add_proto qw/void vp10_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp10_plane_add_noise sse2/;
-$vp10_plane_add_noise_sse2=vp10_plane_add_noise_wmt;
-
 add_proto qw/void vp10_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp10_filter_by_weight16x16 sse2 msa/;

@ -365,9 +361,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

    add_proto qw/void vp10_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
    specialize qw/vp10_highbd_post_proc_down_and_across/;
-
-    add_proto qw/void vp10_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-    specialize qw/vp10_highbd_plane_add_noise/;
  }

  #
@ -447,7 +440,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  specialize qw/vp10_fht32x32/;

  add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
+  specialize qw/vp10_fwht4x4/, "$sse2_x86inc";
 } else {
  add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/vp10_fht4x4 sse2/;
@ -468,7 +461,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  specialize qw/vp10_fht32x32/;

  add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
+  specialize qw/vp10_fwht4x4 msa/, "$sse2_x86inc";
 }

 add_proto qw/void vp10_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
--- a/vp10/common/x86/postproc_sse2.asm
+++ b/vp10/common/x86/postproc_sse2.asm
@ -624,68 +624,6 @@ sym(vp10_mbpost_proc_across_ip_xmm):
 %undef flimit4


-;void vp10_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp10_plane_add_noise_wmt) PRIVATE
-sym(vp10_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 rd42:
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@ -2805,6 +2805,8 @@ void vp10_remove_compressor(VP10_COMP *cpi) {
      const double dr =
          (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
      const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+      const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+      const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);

      if (cpi->b_calculate_psnr) {
        const double total_psnr =
@ -2844,8 +2846,9 @@ void vp10_remove_compressor(VP10_COMP *cpi) {
          SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
        }

-        fprintf(f, "%s\t    Time\n", headings);
-        fprintf(f, "%s\t%8.0f\n", results, total_encode_time);
+        fprintf(f, "%s\t    Time  Rc-Err Abs Err\n", headings);
+        fprintf(f, "%s\t%8.0f %7.2f %7.2f\n", results,
+                total_encode_time, rate_err, fabs(rate_err));
      }

      fclose(f);
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@ -139,7 +139,7 @@ typedef struct VP10EncoderConfig {
  int height;  // height of data passed to the compressor
  unsigned int input_bit_depth;  // Input bit depth.
  double init_framerate;  // set to passed in framerate
-  int64_t target_bandwidth;  // bandwidth to be used in kilobits per second
+  int64_t target_bandwidth;  // bandwidth to be used in bits per second

  int noise_sensitivity;  // pre processing blur: recommendation 0
  int sharpness;  // sharpening output: recommendation 0:
--- a/vp10/encoder/firstpass.c
+++ b/vp10/encoder/firstpass.c
@ -45,7 +45,6 @@

 #define BOOST_BREAKOUT      12.5
 #define BOOST_FACTOR        12.5
-#define ERR_DIVISOR         128.0
 #define FACTOR_PT_LOW       0.70
 #define FACTOR_PT_HIGH      0.90
 #define FIRST_PASS_Q        10.0
@ -230,6 +229,13 @@ static void subtract_stats(FIRSTPASS_STATS *section,
  section->duration   -= frame->duration;
 }

+// Calculate the linear size relative to a baseline of 1080P
+#define BASE_SIZE 2073600.0  // 1920x1080
+static double get_linear_size_factor(const VP10_COMP *cpi) {
+  const double this_area = cpi->initial_width * cpi->initial_height;
+  return pow(this_area / BASE_SIZE, 0.5);
+}
+
 // Calculate an active area of the image that discounts formatting
 // bars and partially discounts other 0 energy areas.
 #define MIN_ACTIVE_AREA 0.5
@ -1121,11 +1127,7 @@ static double calc_correction_factor(double err_per_mb,
  return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }

-// Larger image formats are expected to be a little harder to code relatively
-// given the same prediction error score. This in part at least relates to the
-// increased size and hence coding cost of motion vectors.
-#define EDIV_SIZE_FACTOR 800
-
+#define ERR_DIVISOR         100.0
 static int get_twopass_worst_quality(const VP10_COMP *cpi,
                                     const double section_err,
                                     double inactive_zone,
@ -1144,12 +1146,22 @@ static int get_twopass_worst_quality(const VP10_COMP *cpi,
    const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
    const double av_err_per_mb = section_err / active_mbs;
    const double speed_term = 1.0 + 0.04 * oxcf->speed;
-    const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
+    double ediv_size_correction;
    const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
                                         BPER_MB_NORMBITS) / active_mbs;
-
    int q;

+    // Larger image formats are expected to be a little harder to code
+    // relatively given the same prediction error score. This in part at
+    // least relates to the increased size and hence coding overheads of
+    // motion vectors. Some account of this is made through adjustment of
+    // the error divisor.
+    ediv_size_correction =
+        VPXMAX(0.2, VPXMIN(5.0, get_linear_size_factor(cpi)));
+    if (ediv_size_correction < 1.0)
+      ediv_size_correction = -(1.0 / ediv_size_correction);
+    ediv_size_correction *= 4.0;
+
    // Try and pick a max Q that will be high enough to encode the
    // content at the given rate.
    for (q = rc->best_quality; q < rc->worst_quality; ++q) {
--- a/vp10/encoder/lookahead.c
+++ b/vp10/encoder/lookahead.c
@ -20,8 +20,8 @@

 /* Return the buffer at the given absolute index and increment the index */
 static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
-                                   unsigned int *idx) {
-  unsigned int index = *idx;
+                                   int *idx) {
+  int index = *idx;
  struct lookahead_entry *buf = ctx->buf + index;

  assert(index < ctx->max_sz);
@ -35,7 +35,7 @@ static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
 void vp10_lookahead_destroy(struct lookahead_ctx *ctx) {
  if (ctx) {
    if (ctx->buf) {
-      unsigned int i;
+      int i;

      for (i = 0; i < ctx->max_sz; i++)
        vpx_free_frame_buffer(&ctx->buf[i].img);
@ -221,9 +221,9 @@ struct lookahead_entry *vp10_lookahead_peek(struct lookahead_ctx *ctx,

  if (index >= 0) {
    // Forward peek
-    if (index < (int)ctx->sz) {
+    if (index < ctx->sz) {
      index += ctx->read_idx;
-      if (index >= (int)ctx->max_sz)
+      if (index >= ctx->max_sz)
        index -= ctx->max_sz;
      buf = ctx->buf + index;
    }
--- a/vp10/encoder/lookahead.h
+++ b/vp10/encoder/lookahead.h
@ -31,10 +31,10 @@ struct lookahead_entry {
 #define MAX_PRE_FRAMES 1

 struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
+  int max_sz;                  /* Absolute size of the queue */
+  int sz;                      /* Number of buffers currently in the queue */
+  int read_idx;                /* Read index */
+  int write_idx;               /* Write index */
  struct lookahead_entry *buf; /* Buffer list */
 };

--- a/vp10/encoder/quantize.c
+++ b/vp10/encoder/quantize.c
@ -1158,12 +1158,12 @@ void vp10_highbd_quantize_dc(const tran_low_t *coeff_ptr,

 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
  unsigned t;
-  int l;
+  int l, m;
  t = d;
  for (l = 0; t > 1; l++)
    t >>= 1;
-  t = 1 + (1 << (16 + l)) / d;
-  *quant = (int16_t)(t - (1 << 16));
+  m = 1 + (1 << (16 + l)) / d;
+  *quant = (int16_t)(m - (1 << 16));
  *shift = 1 << (16 - l);
 }

--- a/vp10/encoder/x86/dct_intrin_sse2.c
+++ b/vp10/encoder/x86/dct_intrin_sse2.c
--- a/vp10/encoder/x86/dct_mmx.asm
+++ b/vp10/encoder/x86/dct_mmx.asm
@ -1,104 +0,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp10
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro TRANSFORM_COLS 0
-  paddw           m0,        m1
-  movq            m4,        m0
-  psubw           m3,        m2
-  psubw           m4,        m3
-  psraw           m4,        1
-  movq            m5,        m4
-  psubw           m5,        m1 ;b1
-  psubw           m4,        m2 ;c1
-  psubw           m0,        m4
-  paddw           m3,        m5
-                                ; m0 a0
-  SWAP            1,         4  ; m1 c1
-  SWAP            2,         3  ; m2 d1
-  SWAP            3,         5  ; m3 b1
-%endmacro
-
-%macro TRANSPOSE_4X4 0
-  movq            m4,        m0
-  movq            m5,        m2
-  punpcklwd       m4,        m1
-  punpckhwd       m0,        m1
-  punpcklwd       m5,        m3
-  punpckhwd       m2,        m3
-  movq            m1,        m4
-  movq            m3,        m0
-  punpckldq       m1,        m5
-  punpckhdq       m4,        m5
-  punpckldq       m3,        m2
-  punpckhdq       m0,        m2
-  SWAP            2, 3, 0, 1, 4
-%endmacro
-
-INIT_MMX mmx
-cglobal fwht4x4, 3, 4, 8, input, output, stride
-  lea             r3q,       [inputq + strideq*4]
-  movq            m0,        [inputq] ;a1
-  movq            m1,        [inputq + strideq*2] ;b1
-  movq            m2,        [r3q] ;c1
-  movq            m3,        [r3q + strideq*2] ;d1
-
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-
-  psllw           m0,        2
-  psllw           m1,        2
-  psllw           m2,        2
-  psllw           m3,        2
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m0
-  pcmpgtw         m5,             m1
-  movq            m6,             m0
-  movq            m7,             m1
-  punpcklwd       m0,             m4
-  punpcklwd       m1,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m6
-  movq            [outputq + 16], m1
-  movq            [outputq + 24], m7
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m2
-  pcmpgtw         m5,             m3
-  movq            m6,             m2
-  movq            m7,             m3
-  punpcklwd       m2,             m4
-  punpcklwd       m3,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq + 32], m2
-  movq            [outputq + 40], m6
-  movq            [outputq + 48], m3
-  movq            [outputq + 56], m7
-%else
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m1
-  movq            [outputq + 16], m2
-  movq            [outputq + 24], m3
-%endif
-
-  RET
--- a/vp10/encoder/x86/dct_sse2.asm
+++ b/vp10/encoder/x86/dct_sse2.asm
@ -0,0 +1,86 @@
+;
+;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp10
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1
+  movq            m4,        m0
+  psubw           m3,        m2
+  psubw           m4,        m3
+  psraw           m4,        1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m4
+  paddw           m3,        m5
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+                                ; 00 01 02 03
+                                ; 10 11 12 13
+                                ; 20 21 22 23
+                                ; 30 31 32 33
+  punpcklwd       m0,        m1 ; 00 10 01 11  02 12 03 13
+  punpcklwd       m2,        m3 ; 20 30 21 31  22 32 23 33
+  mova            m1,        m0
+  punpckldq       m0,        m2 ; 00 10 20 30  01 11 21 31
+  punpckhdq       m1,        m2 ; 02 12 22 32  03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q,       [inputq + strideq*4]
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  SWAP            1,         2
+  psrldq          m1,        m0, 8
+  psrldq          m3,        m2, 8
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0,        2
+  psllw           m1,        2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; sign extension
+  mova            m2,             m0
+  mova            m3,             m1
+  punpcklwd       m0,             m0
+  punpcklwd       m1,             m1
+  punpckhwd       m2,             m2
+  punpckhwd       m3,             m3
+  psrad           m0,             16
+  psrad           m1,             16
+  psrad           m2,             16
+  psrad           m3,             16
+  mova            [outputq],      m0
+  mova            [outputq + 16], m2
+  mova            [outputq + 32], m1
+  mova            [outputq + 48], m3
+%else
+  mova            [outputq],      m0
+  mova            [outputq + 16], m1
+%endif
+
+  RET
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@ -104,7 +104,7 @@ VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c
 endif

 ifeq ($(CONFIG_USE_X86INC),yes)
-VP10_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
 endif

@ -114,7 +114,7 @@ VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3_x86_64.asm
 endif
 endif

-VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
 VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
--- a/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/vp8/common/arm/neon/bilinearpredict_neon.c
@ -21,114 +21,6 @@ static const uint8_t bifilter4_coeff[8][2] = {
    { 16, 112}
 };

-void vp8_bilinear_predict4x4_neon(
-        unsigned char *src_ptr,
-        int src_pixels_per_line,
-        int xoffset,
-        int yoffset,
-        unsigned char *dst_ptr,
-        int dst_pitch) {
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
-    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
-    uint8x16_t q1u8, q2u8;
-    uint16x8_t q1u16, q2u16;
-    uint16x8_t q7u16, q8u16, q9u16;
-    uint64x2_t q4u64, q5u64;
-    uint64x1_t d12u64;
-    uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;
-
-    if (xoffset == 0) {  // skip_1stpass_filter
-        uint32x2_t d28u32 = vdup_n_u32(0);
-        uint32x2_t d29u32 = vdup_n_u32(0);
-        uint32x2_t d30u32 = vdup_n_u32(0);
-
-        d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
-        src_ptr += src_pixels_per_line;
-        d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
-        src_ptr += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0);
-        src_ptr += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1);
-        src_ptr += src_pixels_per_line;
-        d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0);
-        d28u8 = vreinterpret_u8_u32(d28u32);
-        d29u8 = vreinterpret_u8_u32(d29u32);
-        d30u8 = vreinterpret_u8_u32(d30u32);
-    } else {
-        d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d6u8 = vld1_u8(src_ptr);
-
-        q1u8 = vcombine_u8(d2u8, d3u8);
-        q2u8 = vcombine_u8(d4u8, d5u8);
-
-        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
-
-        q4u64  = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
-        q5u64  = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
-        d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8);
-
-        d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)),
-                           vreinterpret_u32_u8(vget_high_u8(q1u8)));
-        d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)),
-                           vreinterpret_u32_u8(vget_high_u8(q2u8)));
-        d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)),
-                           vreinterpret_u32_u64(vget_high_u64(q4u64)));
-        d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),
-                           vreinterpret_u32_u64(vget_high_u64(q5u64)));
-
-        q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
-        q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
-        q9u16 = vmull_u8(d6u8, d0u8);
-
-        q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8);
-        q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8);
-        q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8);
-
-        d28u8 = vqrshrn_n_u16(q7u16, 7);
-        d29u8 = vqrshrn_n_u16(q8u16, 7);
-        d30u8 = vqrshrn_n_u16(q9u16, 7);
-    }
-
-    // secondpass_filter
-    if (yoffset == 0) {  // skip_2ndpass_filter
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1);
-    } else {
-        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
-        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
-
-        q1u16 = vmull_u8(d28u8, d0u8);
-        q2u16 = vmull_u8(d29u8, d0u8);
-
-        d26u8 = vext_u8(d28u8, d29u8, 4);
-        d27u8 = vext_u8(d29u8, d30u8, 4);
-
-        q1u16 = vmlal_u8(q1u16, d26u8, d1u8);
-        q2u16 = vmlal_u8(q2u16, d27u8, d1u8);
-
-        d2u8 = vqrshrn_n_u16(q1u16, 7);
-        d3u8 = vqrshrn_n_u16(q2u16, 7);
-
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
-    }
-    return;
-}
-
 void vp8_bilinear_predict8x4_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
--- a/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/vp8/common/arm/neon/sixtappredict_neon.c
@ -22,383 +22,6 @@ static const int8_t vp8_sub_pel_filters[8][8] = {
    {0, -1,   12, 123,  -6, 0, 0, 0},
 };

-void vp8_sixtap_predict4x4_neon(
-        unsigned char *src_ptr,
-        int src_pixels_per_line,
-        int xoffset,
-        int yoffset,
-        unsigned char *dst_ptr,
-        int dst_pitch) {
-    unsigned char *src;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
-    uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
-    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
-    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
-    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
-    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
-    uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
-    uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
-    uint32x2x2_t d0u32x2, d1u32x2;
-
-    if (xoffset == 0) {  // secondpass_filter4x4_only
-        uint32x2_t d27u32 = vdup_n_u32(0);
-        uint32x2_t d28u32 = vdup_n_u32(0);
-        uint32x2_t d29u32 = vdup_n_u32(0);
-        uint32x2_t d30u32 = vdup_n_u32(0);
-        uint32x2_t d31u32 = vdup_n_u32(0);
-
-        // load second_pass filter
-        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
-        d0s8 = vdup_lane_s8(dtmps8, 0);
-        d1s8 = vdup_lane_s8(dtmps8, 1);
-        d2s8 = vdup_lane_s8(dtmps8, 2);
-        d3s8 = vdup_lane_s8(dtmps8, 3);
-        d4s8 = vdup_lane_s8(dtmps8, 4);
-        d5s8 = vdup_lane_s8(dtmps8, 5);
-        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
-        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
-        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
-        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
-        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
-        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
-
-        // load src data
-        src = src_ptr - src_pixels_per_line * 2;
-        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
-        src += src_pixels_per_line;
-        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
-        src += src_pixels_per_line;
-        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
-        src += src_pixels_per_line;
-        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
-        src += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
-        src += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
-        src += src_pixels_per_line;
-        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
-        src += src_pixels_per_line;
-        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
-        src += src_pixels_per_line;
-        d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);
-
-        d27u8 = vreinterpret_u8_u32(d27u32);
-        d28u8 = vreinterpret_u8_u32(d28u32);
-        d29u8 = vreinterpret_u8_u32(d29u32);
-        d30u8 = vreinterpret_u8_u32(d30u32);
-        d31u8 = vreinterpret_u8_u32(d31u32);
-
-        d23u8 = vext_u8(d27u8, d28u8, 4);
-        d24u8 = vext_u8(d28u8, d29u8, 4);
-        d25u8 = vext_u8(d29u8, d30u8, 4);
-        d26u8 = vext_u8(d30u8, d31u8, 4);
-
-        q3u16 = vmull_u8(d27u8, d0u8);
-        q4u16 = vmull_u8(d28u8, d0u8);
-        q5u16 = vmull_u8(d25u8, d5u8);
-        q6u16 = vmull_u8(d26u8, d5u8);
-
-        q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
-        q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
-        q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
-        q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
-
-        q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
-        q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
-        q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
-        q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
-
-        q3s16 = vreinterpretq_s16_u16(q3u16);
-        q4s16 = vreinterpretq_s16_u16(q4u16);
-        q5s16 = vreinterpretq_s16_u16(q5u16);
-        q6s16 = vreinterpretq_s16_u16(q6u16);
-
-        q5s16 = vqaddq_s16(q5s16, q3s16);
-        q6s16 = vqaddq_s16(q6s16, q4s16);
-
-        d3u8 = vqrshrun_n_s16(q5s16, 7);
-        d4u8 = vqrshrun_n_s16(q6s16, 7);
-
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
-        return;
-    }
-
-    // load first_pass filter
-    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
-    d0s8 = vdup_lane_s8(dtmps8, 0);
-    d1s8 = vdup_lane_s8(dtmps8, 1);
-    d2s8 = vdup_lane_s8(dtmps8, 2);
-    d3s8 = vdup_lane_s8(dtmps8, 3);
-    d4s8 = vdup_lane_s8(dtmps8, 4);
-    d5s8 = vdup_lane_s8(dtmps8, 5);
-    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
-    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
-    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
-    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
-    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
-    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
-
-    // First pass: output_height lines x output_width columns (9x4)
-
-    if (yoffset == 0)  // firstpass_filter4x4_only
-        src = src_ptr - 2;
-    else
-        src = src_ptr - 2 - (src_pixels_per_line * 2);
-
-    q3u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q4u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q5u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q6u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-
-    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
-    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
-    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
-    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
-
-    // vswp here
-    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
-    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
-                       vreinterpret_u32_u8(d19u8));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
-                       vreinterpret_u32_u8(d21u8));
-    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
-    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
-
-    // keep original src data in q4 q6
-    q4u64 = vreinterpretq_u64_u8(q3u8);
-    q6u64 = vreinterpretq_u64_u8(q5u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
-                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
-                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
-    q9u64 = vshrq_n_u64(q4u64, 8);
-    q10u64 = vshrq_n_u64(q6u64, 8);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 32);
-    q5u64 = vshrq_n_u64(q6u64, 32);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    q9u64 = vshrq_n_u64(q4u64, 16);
-    q10u64 = vshrq_n_u64(q6u64, 16);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 24);
-    q5u64 = vshrq_n_u64(q6u64, 24);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
-    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
-
-    q7s16 = vreinterpretq_s16_u16(q7u16);
-    q8s16 = vreinterpretq_s16_u16(q8u16);
-    q9s16 = vreinterpretq_s16_u16(q9u16);
-    q10s16 = vreinterpretq_s16_u16(q10u16);
-    q7s16 = vqaddq_s16(q7s16, q9s16);
-    q8s16 = vqaddq_s16(q8s16, q10s16);
-
-    d27u8 = vqrshrun_n_s16(q7s16, 7);
-    d28u8 = vqrshrun_n_s16(q8s16, 7);
-
-    if (yoffset == 0) {  // firstpass_filter4x4_only
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
-        return;
-    }
-
-    // First Pass on rest 5-line data
-    q3u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q4u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q5u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q6u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q11u8 = vld1q_u8(src);
-
-    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
-    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
-    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
-    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
-
-    // vswp here
-    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
-    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
-                       vreinterpret_u32_u8(d19u8));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
-                       vreinterpret_u32_u8(d21u8));
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);
-    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
-    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
-    q12u16 = vmull_u8(d31u8, d5u8);
-
-    q4u64 = vreinterpretq_u64_u8(q3u8);
-    q6u64 = vreinterpretq_u64_u8(q5u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
-                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
-                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
-    q9u64 = vshrq_n_u64(q4u64, 8);
-    q10u64 = vshrq_n_u64(q6u64, 8);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
-    q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 32);
-    q5u64 = vshrq_n_u64(q6u64, 32);
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
-    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    q9u64 = vshrq_n_u64(q4u64, 16);
-    q10u64 = vshrq_n_u64(q6u64, 16);
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
-    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 24);
-    q5u64 = vshrq_n_u64(q6u64, 24);
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
-    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);
-    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
-    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
-    q11u16 = vmull_u8(d31u8, d3u8);
-
-    q7s16 = vreinterpretq_s16_u16(q7u16);
-    q8s16 = vreinterpretq_s16_u16(q8u16);
-    q9s16 = vreinterpretq_s16_u16(q9u16);
-    q10s16 = vreinterpretq_s16_u16(q10u16);
-    q11s16 = vreinterpretq_s16_u16(q11u16);
-    q12s16 = vreinterpretq_s16_u16(q12u16);
-    q7s16 = vqaddq_s16(q7s16, q9s16);
-    q8s16 = vqaddq_s16(q8s16, q10s16);
-    q12s16 = vqaddq_s16(q12s16, q11s16);
-
-    d29u8 = vqrshrun_n_s16(q7s16, 7);
-    d30u8 = vqrshrun_n_s16(q8s16, 7);
-    d31u8 = vqrshrun_n_s16(q12s16, 7);
-
-    // Second pass: 4x4
-    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
-    d0s8 = vdup_lane_s8(dtmps8, 0);
-    d1s8 = vdup_lane_s8(dtmps8, 1);
-    d2s8 = vdup_lane_s8(dtmps8, 2);
-    d3s8 = vdup_lane_s8(dtmps8, 3);
-    d4s8 = vdup_lane_s8(dtmps8, 4);
-    d5s8 = vdup_lane_s8(dtmps8, 5);
-    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
-    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
-    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
-    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
-    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
-    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
-
-    d23u8 = vext_u8(d27u8, d28u8, 4);
-    d24u8 = vext_u8(d28u8, d29u8, 4);
-    d25u8 = vext_u8(d29u8, d30u8, 4);
-    d26u8 = vext_u8(d30u8, d31u8, 4);
-
-    q3u16 = vmull_u8(d27u8, d0u8);
-    q4u16 = vmull_u8(d28u8, d0u8);
-    q5u16 = vmull_u8(d25u8, d5u8);
-    q6u16 = vmull_u8(d26u8, d5u8);
-
-    q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
-    q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
-    q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
-    q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
-
-    q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
-    q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
-    q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
-    q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
-
-    q3s16 = vreinterpretq_s16_u16(q3u16);
-    q4s16 = vreinterpretq_s16_u16(q4u16);
-    q5s16 = vreinterpretq_s16_u16(q5u16);
-    q6s16 = vreinterpretq_s16_u16(q6u16);
-
-    q5s16 = vqaddq_s16(q5s16, q3s16);
-    q6s16 = vqaddq_s16(q6s16, q4s16);
-
-    d3u8 = vqrshrun_n_s16(q5s16, 7);
-    d4u8 = vqrshrun_n_s16(q6s16, 7);
-
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
-    dst_ptr += dst_pitch;
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
-    dst_ptr += dst_pitch;
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
-    dst_ptr += dst_pitch;
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
-    return;
-}
-
 void vp8_sixtap_predict8x4_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@ -104,7 +104,7 @@ vp8_prob *vp8_mv_ref_probs(
 extern const unsigned char vp8_mbsplit_offset[4][16];


-static INLINE int left_block_mv(const MODE_INFO *cur_mb, int b)
+static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b)
 {
    if (!(b & 3))
    {
@ -119,7 +119,8 @@ static INLINE int left_block_mv(const MODE_INFO *cur_mb, int b)
    return (cur_mb->bmi + b - 1)->mv.as_int;
 }

-static INLINE int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
+static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b,
+                                      int mi_stride)
 {
    if (!(b >> 2))
    {
--- a/vp8/common/mips/msa/postproc_msa.c
+++ b/vp8/common/mips/msa/postproc_msa.c
@ -10,6 +10,7 @@

 #include <stdlib.h>
 #include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"

 static const int16_t vp8_rv_msa[] =
@ -798,54 +799,3 @@ void vp8_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
        }
    }
 }
-
-void vp8_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
-                             char blackclamp[16], char whiteclamp[16],
-                             char bothclamp[16],
-                             uint32_t width, uint32_t height,
-                             int32_t pitch)
-{
-    uint32_t i, j;
-
-    for (i = 0; i < height / 2; ++i)
-    {
-        uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
-        int8_t *ref0_ptr = (int8_t *) (noise + (rand() & 0xff));
-        uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
-        int8_t *ref1_ptr = (int8_t *) (noise + (rand() & 0xff));
-        for (j = width / 16; j--;)
-        {
-            v16i8 temp00_s, temp01_s;
-            v16u8 temp00, temp01, black_clamp, white_clamp;
-            v16u8 pos0, ref0, pos1, ref1;
-            v16i8 const127 = __msa_ldi_b(127);
-
-            pos0 = LD_UB(pos0_ptr);
-            ref0 = LD_UB(ref0_ptr);
-            pos1 = LD_UB(pos1_ptr);
-            ref1 = LD_UB(ref1_ptr);
-            black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
-            white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
-            temp00 = (pos0 < black_clamp);
-            pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
-            temp01 = (pos1 < black_clamp);
-            pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
-            XORI_B2_128_UB(pos0, pos1);
-            temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-            temp00 = (v16u8)(temp00_s < pos0);
-            pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
-            temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-            temp01 = (temp01_s < pos1);
-            pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
-            XORI_B2_128_UB(pos0, pos1);
-            pos0 += ref0;
-            ST_UB(pos0, pos0_ptr);
-            pos1 += ref1;
-            ST_UB(pos1, pos1_ptr);
-            pos0_ptr += 16;
-            pos1_ptr += 16;
-            ref0_ptr += 16;
-            ref1_ptr += 16;
-        }
-    }
-}
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@ -10,6 +10,7 @@


 #include "vpx_config.h"
+#include "vpx_dsp_rtcd.h"
 #include "vp8_rtcd.h"
 #include "vpx_scale_rtcd.h"
 #include "vpx_scale/yv12config.h"
@ -490,54 +491,6 @@ static void fillrd(struct postproc_state *state, int q, int a)
    state->last_noise = a;
 }

-/****************************************************************************
- *
- *  ROUTINE       : plane_add_noise_c
- *
- *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
- *                                  noise to
- *                  unsigned int Width    width of plane
- *                  unsigned int Height   height of plane
- *                  int  Pitch    distance between subsequent lines of frame
- *                  int  q        quantizer used to determine amount of noise
- *                                  to add
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void.
- *
- *  FUNCTION      : adds gaussian noise to a plane of pixels
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int Width, unsigned int Height, int Pitch)
-{
-    unsigned int i, j;
-    (void)bothclamp;
-
-    for (i = 0; i < Height; i++)
-    {
-        unsigned char *Pos = Start + i * Pitch;
-        char  *Ref = (char *)(noise + (rand() & 0xff));
-
-        for (j = 0; j < Width; j++)
-        {
-            if (Pos[j] < blackclamp[0])
-                Pos[j] = blackclamp[0];
-
-            if (Pos[j] > 255 + whiteclamp[0])
-                Pos[j] = 255 + whiteclamp[0];
-
-            Pos[j] += Ref[j];
-        }
-    }
-}
-
 /* Blend the macro block with a solid colored square.  Leave the
 * edges unblended to give distinction to macro blocks in areas
 * filled with the same color block.
@ -828,7 +781,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
            fillrd(&oci->postproc_state, 63 - q, noise_level);
        }

-        vp8_plane_add_noise
+        vpx_plane_add_noise
        (oci->post_proc_buffer.y_buffer,
         oci->postproc_state.noise,
         oci->postproc_state.blackclamp,
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@ -167,10 +167,6 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") {
    add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
    specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/;

-    add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch";
-    specialize qw/vp8_plane_add_noise mmx sse2 msa/;
-    $vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt;
-
    add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
    # no asm yet

@ -209,7 +205,6 @@ $vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6;
 $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;

 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=817
 specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2 msa/;
 $vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
 $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
@ -227,7 +222,6 @@ specialize qw/vp8_bilinear_predict8x4 mmx media neon msa/;
 $vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6;

 add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=892
 specialize qw/vp8_bilinear_predict4x4 mmx media msa/;
 $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;

--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@ -44,8 +44,8 @@ extern "C" {
 #include <os2.h>

 #include <stdlib.h>
-#define THREAD_FUNCTION void
-#define THREAD_FUNCTION_RETURN void
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
 #define THREAD_SPECIFIC_INDEX PULONG
 #define pthread_t TID
 #define pthread_attr_t ULONG
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@ -241,68 +241,6 @@ sym(vp8_mbpost_proc_down_mmx):
 %undef flimit2


-;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-global sym(vp8_plane_add_noise_mmx) PRIVATE
-sym(vp8_plane_add_noise_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movq        mm1,[rsi+rax]         ; get the source
-
-            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     mm1, [rdx+32] ;bothclamp
-            psubusb     mm1, [rdx+16] ;whiteclamp
-
-            movq        mm2,[rdi+rax]         ; get the noise for this line
-            paddb       mm1,mm2              ; add it in
-            movq        [rsi+rax],mm1         ; store the result
-
-            add         rax,8                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 Blur:
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@ -655,68 +655,6 @@ sym(vp8_mbpost_proc_across_ip_xmm):
 %undef flimit4


-;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-global sym(vp8_plane_add_noise_wmt) PRIVATE
-sym(vp8_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 four8s:
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c
@ -44,7 +44,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
    int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
    size_t bytes_left = br->user_buffer_end - bufptr;
    size_t bits_left = bytes_left * CHAR_BIT;
-    int x = (int)(shift + CHAR_BIT - bits_left);
+    int x = shift + CHAR_BIT - (int)bits_left;
    int loop_end = 0;
    unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];

--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@ -83,7 +83,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
    }

    {
-        register unsigned int shift = vp8_norm[range];
+        register int shift = vp8_norm[range];
        range <<= shift;
        value <<= shift;
        count -= shift;
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@ -986,7 +986,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
    VP8_COMMON *const pc = &pbi->common;
    MACROBLOCKD *const xd  = &pbi->mb;
    const unsigned char *data = pbi->fragments.ptrs[0];
-    const unsigned char *data_end =  data + pbi->fragments.sizes[0];
+    const unsigned int data_sz = pbi->fragments.sizes[0];
+    const unsigned char *data_end = data + data_sz;
    ptrdiff_t first_partition_length_in_bytes;

    int i, j, k, l;
@ -1022,7 +1023,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
        const unsigned char *clear = data;
        if (pbi->decrypt_cb)
        {
-            int n = (int)VPXMIN(sizeof(clear_buffer), data_end - data);
+            int n = (int)VPXMIN(sizeof(clear_buffer), data_sz);
            pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
            clear = clear_buffer;
        }
--- a/vp8/decoder/error_concealment.c
+++ b/vp8/decoder/error_concealment.c
@ -194,7 +194,7 @@ void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul,
        return;
    }

-    if (new_row <= (-4 << 3) || new_col <= (-4 << 3))
+    if (new_row <= -32 || new_col <= -32)
    {
        /* outside the frame */
        return;
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@ -163,7 +163,7 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount)
 {
    const TOKENEXTRA *stop = p + xcount;
    unsigned int split;
-    unsigned int shift;
+    int shift;
    int count = w->count;
    unsigned int range = w->range;
    unsigned int lowvalue = w->lowvalue;
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@ -65,7 +65,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability)
    int count = br->count;
    unsigned int range = br->range;
    unsigned int lowvalue = br->lowvalue;
-    register unsigned int shift;
+    register int shift;

 #ifdef VP8_ENTROPY_STATS
 #if defined(SECTIONBITS_OUTPUT)
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@ -529,7 +529,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
        // Bias on zero motion vector sse.
        const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
        zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
-        sse_diff = zero_mv_sse - best_sse;
+        sse_diff = (int)zero_mv_sse - (int)best_sse;

        saved_mbmi = *mbmi;

--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@ -18,8 +18,8 @@
 extern "C" {
 #endif

-#define SUM_DIFF_THRESHOLD 448
-#define SUM_DIFF_THRESHOLD_HIGH 512
+#define SUM_DIFF_THRESHOLD 512
+#define SUM_DIFF_THRESHOLD_HIGH 600
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)

 #define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@ -18,6 +18,7 @@
 #include "onyx_int.h"
 #include "vpx_dsp/variance.h"
 #include "encodeintra.h"
+#include "vp8/common/common.h"
 #include "vp8/common/setupintrarecon.h"
 #include "vp8/common/systemdependent.h"
 #include "mcomp.h"
@ -2417,7 +2418,7 @@ void vp8_second_pass(VP8_COMP *cpi)
    int tmp_q;
    int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);

-    FIRSTPASS_STATS this_frame = {0};
+    FIRSTPASS_STATS this_frame;
    FIRSTPASS_STATS this_frame_copy;

    double this_frame_intra_error;
@ -2425,6 +2426,8 @@ void vp8_second_pass(VP8_COMP *cpi)

    int overhead_bits;

+    vp8_zero(this_frame);
+
    if (!cpi->twopass.stats_in)
    {
        return ;
@ -2808,7 +2811,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
             * static scene.
             */
            if ( detect_transition_to_still( cpi, i,
-                                             (cpi->key_frame_frequency-i),
+                                             ((int)(cpi->key_frame_frequency) -
+                                              (int)i),
                                             loop_decay_rate,
                                             decay_accumulator ) )
            {
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@ -1591,7 +1591,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
    int col_min = ref_col - distance;
    int col_max = ref_col + distance;

-    // TODO(johannkoenig): check if this alignment is necessary.
    DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
    unsigned int sad_array[3];

--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@ -1523,7 +1523,8 @@ static void update_layer_contexts (VP8_COMP *cpi)
 void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
 {
    VP8_COMMON *cm = &cpi->common;
-    int last_w, last_h, prev_number_of_layers;
+    int last_w, last_h;
+    unsigned int prev_number_of_layers;

    if (!cpi)
        return;
@ -1786,10 +1787,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
    if (last_w != cpi->oxcf.Width || last_h != cpi->oxcf.Height)
        cpi->force_next_frame_intra = 1;

-    if (((cm->Width + 15) & 0xfffffff0) !=
-          cm->yv12_fb[cm->lst_fb_idx].y_width ||
-        ((cm->Height + 15) & 0xfffffff0) !=
-          cm->yv12_fb[cm->lst_fb_idx].y_height ||
+    if (((cm->Width + 15) & ~15) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
+        ((cm->Height + 15) & ~15) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
        cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
    {
        dealloc_raw_frame_buffers(cpi);
@ -2247,6 +2246,8 @@ void vp8_remove_compressor(VP8_COMP **ptr)
            double total_encode_time = (cpi->time_receive_data +
                                            cpi->time_compress_data) / 1000.000;
            double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
+            const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+            const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);

            if (cpi->b_calculate_psnr)
            {
@ -2292,12 +2293,14 @@ void vp8_remove_compressor(VP8_COMP **ptr)
                                                      cpi->summed_weights, 8.0);

                    fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
-                               "GLPsnrP\tVPXSSIM\t  Time(us)\n");
+                               "GLPsnrP\tVPXSSIM\t  Time(us)  Rc-Err "
+                               "Abs Err\n");
                    fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                               "%7.3f\t%8.0f\n",
+                               "%7.3f\t%8.0f %7.2f %7.2f\n",
                               dr, cpi->total / cpi->count, total_psnr,
                               cpi->totalp / cpi->count, total_psnr2,
-                               total_ssim, total_encode_time);
+                               total_ssim, total_encode_time,
+                               rate_err, fabs(rate_err));
                }
            }
            fclose(f);
@ -5168,7 +5171,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
        vp8_second_pass(cpi);

    encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags);
-    cpi->twopass.bits_left -= 8 * *size;
+    cpi->twopass.bits_left -= 8 * (int)(*size);

    if (!cpi->common.refresh_alt_ref_frame)
    {
@ -5772,7 +5775,7 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigne
        return -1;

    // Check number of rows and columns match
-    if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+    if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols)
        return -1;

    // Range check the delta Q values and convert the external Q range values
@ -5828,7 +5831,7 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigne

 int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols)
 {
-    if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols)
+    if ((int)rows == cpi->common.mb_rows && (int)cols == cpi->common.mb_cols)
    {
        if (map)
        {
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@ -371,7 +371,7 @@ typedef struct VP8_COMP
    double key_frame_rate_correction_factor;
    double gf_rate_correction_factor;

-    unsigned int frames_since_golden;
+    int frames_since_golden;
    /* Count down till next GF */
    int frames_till_gf_update_due;

--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@ -90,7 +90,7 @@ static int is_skin_color(int y, int cb, int cr, int consec_zeromv)
    {
      int i = 0;
      // No skin if block has been zero motion for long consecutive time.
-      if (consec_zeromv > 80)
+      if (consec_zeromv > 60)
        return 0;
      // Exit on grey.
       if (cb == 128 && cr == 128)
@ -103,7 +103,7 @@ static int is_skin_color(int y, int cb, int cr, int consec_zeromv)
         if (skin_color_diff < skin_threshold[i + 1]) {
            if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
              return 0;
-            else if (consec_zeromv > 30 &&
+            else if (consec_zeromv > 25 &&
                     skin_color_diff > (skin_threshold[i + 1] >> 1))
              return 0;
            else
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@ -1899,7 +1899,8 @@ static int calculate_final_rd_costs(int this_rd,
                    int prob_skip_cost;

                    prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
-                    prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
+                    prob_skip_cost -=
+                        (int)vp8_cost_bit(cpi->prob_skip_false, 0);
                    rd->rate2 += prob_skip_cost;
                    *other_cost += prob_skip_cost;
                }
--- a/vp8/encoder/vp8_quantize.c
+++ b/vp8/encoder/vp8_quantize.c
@ -227,12 +227,12 @@ static void invert_quant(int improved_quant, short *quant,
    if(improved_quant)
    {
        unsigned t;
-        int l;
+        int l, m;
        t = d;
        for(l = 0; t > 1; l++)
            t>>=1;
-        t = 1 + (1<<(16+l))/d;
-        *quant = (short)(t - (1<<16));
+        m = 1 + (1<<(16+l))/d;
+        *quant = (short)(m - (1<<16));
        *shift = l;
        /* use multiplication and constant shift by 16 */
        *shift = 1 << (16 - *shift);
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@ -22,6 +22,7 @@
 #include "vpx/vp8cx.h"
 #include "vp8/encoder/firstpass.h"
 #include "vp8/common/onyx.h"
+#include "vp8/common/common.h"
 #include <stdlib.h>
 #include <string.h>

@ -760,7 +761,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
                                    unsigned long          duration,
                                    unsigned long          deadline)
 {
-    unsigned int new_qc;
+    int new_qc;

 #if !(CONFIG_REALTIME_ONLY)
    /* Use best quality mode if no deadline is given. */
@ -785,7 +786,9 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
    new_qc = MODE_REALTIME;
 #endif

-    if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
+    if (deadline == VPX_DL_REALTIME)
+        new_qc = MODE_REALTIME;
+    else if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
        new_qc = MODE_FIRSTPASS;
    else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
        new_qc = (new_qc == MODE_BESTQUALITY)
@ -1116,7 +1119,8 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
 {

    YV12_BUFFER_CONFIG sd;
-    vp8_ppflags_t flags = {0};
+    vp8_ppflags_t flags;
+    vp8_zero(flags);

    if (ctx->preview_ppcfg.post_proc_flag)
    {
@ -1305,8 +1309,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
        30,                 /* rc_resize_up_thresold */

        VPX_VBR,            /* rc_end_usage */
-        {0},                /* rc_twopass_stats_in */
-        {0},                /* rc_firstpass_mb_stats_in */
+        {NULL, 0},          /* rc_twopass_stats_in */
+        {NULL, 0},          /* rc_firstpass_mb_stats_in */
        256,                /* rc_target_bandwidth */
        4,                  /* rc_min_quantizer */
        63,                 /* rc_max_quantizer */
@ -1334,6 +1338,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
        {0},                /* ts_rate_decimator */
        0,                  /* ts_periodicity */
        {0},                /* ts_layer_id */
+        {0},                /* layer_target_bitrate */
+        0                   /* temporal_layering_mode */
    }},
 };

--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@ -522,7 +522,8 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t  *ctx,
    {
        YV12_BUFFER_CONFIG sd;
        int64_t time_stamp = 0, time_end_stamp = 0;
-        vp8_ppflags_t flags = {0};
+        vp8_ppflags_t flags;
+        vp8_zero(flags);

        if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
        {
@ -816,11 +817,12 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) =
    },
    { /* encoder functions */
        0,
-        NULL,
-        NULL,
-        NULL,
-        NULL,
-        NULL,
-        NULL
+        NULL,  /* vpx_codec_enc_cfg_map_t */
+        NULL,  /* vpx_codec_encode_fn_t */
+        NULL,  /* vpx_codec_get_cx_data_fn_t */
+        NULL,  /* vpx_codec_enc_config_set_fn_t */
+        NULL,  /* vpx_codec_get_global_headers_fn_t */
+        NULL,  /* vpx_codec_get_preview_frame_fn_t */
+        NULL   /* vpx_codec_enc_mr_get_mem_loc_fn_t */
    }
 };
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@ -67,7 +67,6 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {

 #define VP9_FRAME_MARKER 0x2

-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@ -159,3 +159,18 @@ const struct {
  {0,  8 },  // 64X32 - {0b0000, 0b1000}
  {0,  0 },  // 64X64 - {0b0000, 0b0000}
 };
+
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+const uint8_t need_top_left[INTRA_MODES] = {
+    0,  // DC_PRED
+    0,  // V_PRED
+    0,  // H_PRED
+    0,  // D45_PRED
+    1,  // D135_PRED
+    1,  // D117_PRED
+    1,  // D153_PRED
+    0,  // D207_PRED
+    0,  // D63_PRED
+    1,  // TM_PRED
+};
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@ -33,6 +33,9 @@ extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
 extern const BLOCK_SIZE txsize_to_bsize[TX_SIZES];
 extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
 extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+extern const uint8_t need_top_left[INTRA_MODES];
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH

 #ifdef __cplusplus
 }  // extern "C"
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@ -298,196 +298,168 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {

 static void filter_selectively_vert_row2(int subsampling_factor,
                                         uint8_t *s, int pitch,
-                                         unsigned int mask_16x16_l,
-                                         unsigned int mask_8x8_l,
-                                         unsigned int mask_4x4_l,
-                                         unsigned int mask_4x4_int_l,
-                                         const loop_filter_info_n *lfi_n,
+                                         unsigned int mask_16x16,
+                                         unsigned int mask_8x8,
+                                         unsigned int mask_4x4,
+                                         unsigned int mask_4x4_int,
+                                         const loop_filter_thresh *lfthr,
                                         const uint8_t *lfl) {
-  const int mask_shift = subsampling_factor ? 4 : 8;
-  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff;
  const int lfl_forward = subsampling_factor ? 4 : 8;
-
-  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
-  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
-  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
-  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
-  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  const unsigned int dual_one = 1 | (1 << lfl_forward);
  unsigned int mask;
+  uint8_t *ss[2];
+  ss[0] = s;

-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
-              mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
-       mask; mask >>= 1) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+  for (mask =
+           (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff;
+       mask; mask = (mask & ~dual_one) >> 1) {
+    if (mask & dual_one) {
+      const loop_filter_thresh *lfis[2];
+      lfis[0] = lfthr + *lfl;
+      lfis[1] = lfthr + *(lfl + lfl_forward);
+      ss[1] = ss[0] + 8 * pitch;

-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                   lfi0->hev_thr);
-        } else if (mask_16x16_0 & 1) {
-          vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
-                              lfi0->hev_thr);
+      if (mask_16x16 & dual_one) {
+        if ((mask_16x16 & dual_one) == dual_one) {
+          vpx_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                   lfis[0]->hev_thr);
        } else {
-          vpx_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
-                              lfi1->lim, lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)];
+          vpx_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim,
+                              lfi->lim, lfi->hev_thr);
        }
      }

-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_8x8_0 & 1) {
-          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+      if (mask_8x8 & dual_one) {
+        if ((mask_8x8 & dual_one) == dual_one) {
+          vpx_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                  lfis[0]->hev_thr, lfis[1]->mblim,
+                                  lfis[1]->lim, lfis[1]->hev_thr);
        } else {
-          vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)];
+          vpx_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim, lfi->lim,
+                             lfi->hev_thr);
        }
      }

-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_0 & 1) {
-          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+      if (mask_4x4 & dual_one) {
+        if ((mask_4x4 & dual_one) == dual_one) {
+          vpx_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                  lfis[0]->hev_thr, lfis[1]->mblim,
+                                  lfis[1]->lim, lfis[1]->hev_thr);
        } else {
-          vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)];
+          vpx_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim, lfi->lim,
+                             lfi->hev_thr);
        }
      }

-      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
-        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_int_0 & 1) {
-          vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                             lfi0->hev_thr);
+      if (mask_4x4_int & dual_one) {
+        if ((mask_4x4_int & dual_one) == dual_one) {
+          vpx_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim,
+                                  lfis[0]->lim, lfis[0]->hev_thr,
+                                  lfis[1]->mblim, lfis[1]->lim,
+                                  lfis[1]->hev_thr);
        } else {
-          vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)];
+          vpx_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch, lfi->mblim,
+                             lfi->lim, lfi->hev_thr);
        }
      }
    }

-    s += 8;
+    ss[0] += 8;
    lfl += 1;
-    mask_16x16_0 >>= 1;
-    mask_8x8_0 >>= 1;
-    mask_4x4_0 >>= 1;
-    mask_4x4_int_0 >>= 1;
-    mask_16x16_1 >>= 1;
-    mask_8x8_1 >>= 1;
-    mask_4x4_1 >>= 1;
-    mask_4x4_int_1 >>= 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
  }
 }

 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                                uint16_t *s, int pitch,
-                                                unsigned int mask_16x16_l,
-                                                unsigned int mask_8x8_l,
-                                                unsigned int mask_4x4_l,
-                                                unsigned int mask_4x4_int_l,
-                                                const loop_filter_info_n *lfi_n,
+                                                unsigned int mask_16x16,
+                                                unsigned int mask_8x8,
+                                                unsigned int mask_4x4,
+                                                unsigned int mask_4x4_int,
+                                                const loop_filter_thresh *lfthr,
                                                const uint8_t *lfl, int bd) {
-  const int mask_shift = subsampling_factor ? 4 : 8;
-  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff;
  const int lfl_forward = subsampling_factor ? 4 : 8;
-
-  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
-  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
-  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
-  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
-  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  const unsigned int dual_one = 1 | (1 << lfl_forward);
  unsigned int mask;
+  uint16_t *ss[2];
+  ss[0] = s;

-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
-       mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
-       mask; mask >>= 1) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+  for (mask =
+           (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff;
+       mask; mask = (mask & ~dual_one) >> 1) {
+    if (mask & dual_one) {
+      const loop_filter_thresh *lfis[2];
+      lfis[0] = lfthr + *lfl;
+      lfis[1] = lfthr + *(lfl + lfl_forward);
+      ss[1] = ss[0] + 8 * pitch;

-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                          lfi0->hev_thr, bd);
-        } else if (mask_16x16_0 & 1) {
-          vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr, bd);
+      if (mask_16x16 & dual_one) {
+        if ((mask_16x16 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim,
+                                          lfis[0]->lim, lfis[0]->hev_thr, bd);
        } else {
-          vpx_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
-                                     lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)];
+          vpx_highbd_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim,
+                                     lfi->lim, lfi->hev_thr, bd);
        }
      }

-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_8x8_0 & 1) {
-          vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
+      if (mask_8x8 & dual_one) {
+        if ((mask_8x8 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
        } else {
-          vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)];
+          vpx_highbd_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim,
+                                    lfi->lim, lfi->hev_thr, bd);
        }
      }

-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_0 & 1) {
-          vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
+      if (mask_4x4 & dual_one) {
+        if ((mask_4x4 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
        } else {
-          vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)];
+          vpx_highbd_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim,
+                                    lfi->lim, lfi->hev_thr, bd);
        }
      }

-      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
-        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_int_0 & 1) {
-          vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
+      if (mask_4x4_int & dual_one) {
+        if ((mask_4x4_int & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
        } else {
-          vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)];
+          vpx_highbd_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch,
+                                    lfi->mblim, lfi->lim, lfi->hev_thr, bd);
        }
      }
    }

-    s += 8;
+    ss[0] += 8;
    lfl += 1;
-    mask_16x16_0 >>= 1;
-    mask_8x8_0 >>= 1;
-    mask_4x4_0 >>= 1;
-    mask_4x4_int_0 >>= 1;
-    mask_16x16_1 >>= 1;
-    mask_8x8_1 >>= 1;
-    mask_4x4_1 >>= 1;
-    mask_4x4_int_1 >>= 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
  }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@ -497,17 +469,17 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
                                     unsigned int mask_8x8,
                                     unsigned int mask_4x4,
                                     unsigned int mask_4x4_int,
-                                     const loop_filter_info_n *lfi_n,
+                                     const loop_filter_thresh *lfthr,
                                     const uint8_t *lfl) {
  unsigned int mask;
  int count;

  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
       mask; mask >>= count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-
    count = 1;
    if (mask & 1) {
+      const loop_filter_thresh *lfi = lfthr + *lfl;
+
      if (mask_16x16 & 1) {
        if ((mask_16x16 & 3) == 3) {
          vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
@ -520,7 +492,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
      } else if (mask_8x8 & 1) {
        if ((mask_8x8 & 3) == 3) {
          // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);

          vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, lfin->mblim, lfin->lim,
@ -549,7 +521,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
      } else if (mask_4x4 & 1) {
        if ((mask_4x4 & 3) == 3) {
          // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);

          vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, lfin->mblim, lfin->lim,
@ -574,7 +546,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr);
        }
-      } else if (mask_4x4_int & 1) {
+      } else {
        vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                             lfi->hev_thr);
      }
@ -594,17 +566,17 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
                                            unsigned int mask_8x8,
                                            unsigned int mask_4x4,
                                            unsigned int mask_4x4_int,
-                                            const loop_filter_info_n *lfi_n,
+                                            const loop_filter_thresh *lfthr,
                                            const uint8_t *lfl, int bd) {
  unsigned int mask;
  int count;

  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
       mask; mask >>= count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-
    count = 1;
    if (mask & 1) {
+      const loop_filter_thresh *lfi = lfthr + *lfl;
+
      if (mask_16x16 & 1) {
        if ((mask_16x16 & 3) == 3) {
          vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
@ -617,7 +589,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
      } else if (mask_8x8 & 1) {
        if ((mask_8x8 & 3) == 3) {
          // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);

          vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                           lfi->hev_thr, lfin->mblim, lfin->lim,
@ -650,7 +622,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
      } else if (mask_4x4 & 1) {
        if ((mask_4x4 & 3) == 3) {
          // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);

          vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                           lfi->hev_thr, lfin->mblim, lfin->lim,
@ -679,7 +651,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
                                        lfi->lim, lfi->hev_thr, bd);
          }
        }
-      } else if (mask_4x4_int & 1) {
+      } else {
        vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, bd);
      }
@ -700,7 +672,6 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
 // whether there were any coefficients encoded, and the loop filter strength
 // block we are currently looking at. Shift is used to position the
 // 1's we produce.
-// TODO(JBB) Need another function for different resolution color..
 static void build_masks(const loop_filter_info_n *const lfi_n,
                        const MODE_INFO *mi, const int shift_y,
                        const int shift_uv,
@ -935,7 +906,6 @@ void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row,

 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
-// TODO(JBB): This function only works for yv12.
 void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
                    MODE_INFO **mi, const int mode_info_stride,
                    LOOP_FILTER_MASK *lfm) {
@ -971,9 +941,6 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
  vp9_zero(*lfm);
  assert(mip[0] != NULL);

-  // TODO(jimbankoski): Try moving most of the following code into decode
-  // loop and storing lfm in the mbmi structure so that we don't have to go
-  // through the recursive loop structure multiple times.
  switch (mip[0]->sb_type) {
    case BLOCK_64X64:
      build_masks(lfi_n, mip[0] , 0, 0, lfm);
@ -1077,8 +1044,6 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
      }
      break;
  }
-
-  vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 }

 static void filter_selectively_vert(uint8_t *s, int pitch,
@ -1086,13 +1051,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
                                    unsigned int mask_8x8,
                                    unsigned int mask_4x4,
                                    unsigned int mask_4x4_int,
-                                    const loop_filter_info_n *lfi_n,
+                                    const loop_filter_thresh *lfthr,
                                    const uint8_t *lfl) {
  unsigned int mask;

  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
       mask; mask >>= 1) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi = lfthr + *lfl;

    if (mask & 1) {
      if (mask_16x16 & 1) {
@ -1120,13 +1085,13 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
                                           unsigned int mask_8x8,
                                           unsigned int mask_4x4,
                                           unsigned int mask_4x4_int,
-                                           const loop_filter_info_n *lfi_n,
+                                           const loop_filter_thresh *lfthr,
                                           const uint8_t *lfl, int bd) {
  unsigned int mask;

  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
       mask; mask >>= 1) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi = lfthr + *lfl;

    if (mask & 1) {
      if (mask_16x16 & 1) {
@ -1257,23 +1222,18 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
                                     mask_8x8_c & border_mask,
                                     mask_4x4_c & border_mask,
                                     mask_4x4_int[r],
-                                     &cm->lf_info, &lfl[r << 3],
+                                     cm->lf_info.lfthr, &lfl[r << 3],
                                     (int)cm->bit_depth);
    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
      filter_selectively_vert(dst->buf, dst->stride,
                              mask_16x16_c & border_mask,
                              mask_8x8_c & border_mask,
                              mask_4x4_c & border_mask,
                              mask_4x4_int[r],
-                              &cm->lf_info, &lfl[r << 3]);
+                              cm->lf_info.lfthr, &lfl[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
    }
-#else
-    filter_selectively_vert(dst->buf, dst->stride,
-                            mask_16x16_c & border_mask,
-                            mask_8x8_c & border_mask,
-                            mask_4x4_c & border_mask,
-                            mask_4x4_int[r],
-                            &cm->lf_info, &lfl[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
    dst->buf += 8 * dst->stride;
    mi_8x8 += row_step_stride;
@ -1306,23 +1266,18 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
                                      mask_8x8_r,
                                      mask_4x4_r,
                                      mask_4x4_int_r,
-                                      &cm->lf_info, &lfl[r << 3],
+                                      cm->lf_info.lfthr, &lfl[r << 3],
                                      (int)cm->bit_depth);
    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
      filter_selectively_horiz(dst->buf, dst->stride,
                               mask_16x16_r,
                               mask_8x8_r,
                               mask_4x4_r,
                               mask_4x4_int_r,
-                               &cm->lf_info, &lfl[r << 3]);
+                               cm->lf_info.lfthr, &lfl[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
    }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride,
-                             mask_16x16_r,
-                             mask_8x8_r,
-                             mask_4x4_r,
-                             mask_4x4_int_r,
-                             &cm->lf_info, &lfl[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
    dst->buf += 8 * dst->stride;
  }
@ -1344,27 +1299,29 @@ void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,

  // Vertical pass: do 2 rows at one time
  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
-    unsigned int mask_16x16_l = mask_16x16 & 0xffff;
-    unsigned int mask_8x8_l = mask_8x8 & 0xffff;
-    unsigned int mask_4x4_l = mask_4x4 & 0xffff;
-    unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
-
-// Disable filtering on the leftmost column.
+    // Disable filtering on the leftmost column.
 #if CONFIG_VP9_HIGHBITDEPTH
    if (cm->use_highbitdepth) {
-      highbd_filter_selectively_vert_row2(
-          plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_y[r << 3], (int)cm->bit_depth);
+      highbd_filter_selectively_vert_row2(plane->subsampling_x,
+                                          CONVERT_TO_SHORTPTR(dst->buf),
+                                          dst->stride,
+                                          (unsigned int)mask_16x16,
+                                          (unsigned int)mask_8x8,
+                                          (unsigned int)mask_4x4,
+                                          (unsigned int)mask_4x4_int,
+                                          cm->lf_info.lfthr,
+                                          &lfm->lfl_y[r << 3],
+                                          (int)cm->bit_depth);
    } else {
-      filter_selectively_vert_row2(
-          plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-          mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride,
+                                   (unsigned int)mask_16x16,
+                                   (unsigned int)mask_8x8,
+                                   (unsigned int)mask_4x4,
+                                   (unsigned int)mask_4x4_int,
+                                   cm->lf_info.lfthr, &lfm->lfl_y[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
    }
-#else
-    filter_selectively_vert_row2(
-        plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-        mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
    dst->buf += 16 * dst->stride;
    mask_16x16 >>= 16;
@ -1397,19 +1354,18 @@ void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,

 #if CONFIG_VP9_HIGHBITDEPTH
    if (cm->use_highbitdepth) {
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
-          mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3],
-          (int)cm->bit_depth);
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride, mask_16x16_r, mask_8x8_r,
+                                      mask_4x4_r, mask_4x4_int & 0xff,
+                                      cm->lf_info.lfthr, &lfm->lfl_y[r << 3],
+                                      (int)cm->bit_depth);
    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
      filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                               mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                               &lfm->lfl_y[r << 3]);
+                               mask_4x4_r, mask_4x4_int & 0xff,
+                               cm->lf_info.lfthr, &lfm->lfl_y[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
    }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                             mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                             &lfm->lfl_y[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH

    dst->buf += 8 * dst->stride;
@ -1443,38 +1399,35 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
      lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
    }

-    {
-      unsigned int mask_16x16_l = mask_16x16 & 0xff;
-      unsigned int mask_8x8_l = mask_8x8 & 0xff;
-      unsigned int mask_4x4_l = mask_4x4 & 0xff;
-      unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
-
-// Disable filtering on the leftmost column.
+    // Disable filtering on the leftmost column.
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        highbd_filter_selectively_vert_row2(
-            plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfl_uv[r << 1], (int)cm->bit_depth);
-      } else {
-        filter_selectively_vert_row2(
-            plane->subsampling_x, dst->buf, dst->stride,
-            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfl_uv[r << 1]);
-      }
-#else
-      filter_selectively_vert_row2(
-          plane->subsampling_x, dst->buf, dst->stride,
-          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfl_uv[r << 1]);
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert_row2(plane->subsampling_x,
+                                          CONVERT_TO_SHORTPTR(dst->buf),
+                                          dst->stride,
+                                          (unsigned int)mask_16x16,
+                                          (unsigned int)mask_8x8,
+                                          (unsigned int)mask_4x4,
+                                          (unsigned int)mask_4x4_int,
+                                          cm->lf_info.lfthr, &lfl_uv[r << 1],
+                                          (int)cm->bit_depth);
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride,
+                                   (unsigned int)mask_16x16,
+                                   (unsigned int)mask_8x8,
+                                   (unsigned int)mask_4x4,
+                                   (unsigned int)mask_4x4_int,
+                                   cm->lf_info.lfthr, &lfl_uv[r << 1]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
 #endif  // CONFIG_VP9_HIGHBITDEPTH

-      dst->buf += 16 * dst->stride;
-      mask_16x16 >>= 8;
-      mask_8x8 >>= 8;
-      mask_4x4 >>= 8;
-      mask_4x4_int >>= 8;
-    }
+    dst->buf += 16 * dst->stride;
+    mask_16x16 >>= 8;
+    mask_8x8 >>= 8;
+    mask_4x4 >>= 8;
+    mask_4x4_int >>= 8;
  }

  // Horizontal pass
@ -1506,17 +1459,16 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
    if (cm->use_highbitdepth) {
      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                      dst->stride, mask_16x16_r, mask_8x8_r,
-                                      mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                                      &lfl_uv[r << 1], (int)cm->bit_depth);
+                                      mask_4x4_r, mask_4x4_int_r,
+                                      cm->lf_info.lfthr, &lfl_uv[r << 1],
+                                      (int)cm->bit_depth);
    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
      filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                               mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                               mask_4x4_r, mask_4x4_int_r, cm->lf_info.lfthr,
                               &lfl_uv[r << 1]);
+#if CONFIG_VP9_HIGHBITDEPTH
    }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                             mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                             &lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH

    dst->buf += 8 * dst->stride;
@ -1552,7 +1504,7 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm,

      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

-      // TODO(JBB): Make setup_mask work for non 420.
+      // TODO(jimbankoski): For 444 only need to do y mask.
      vp9_adjust_mask(cm, mi_row, mi_col, lfm);

      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
@ -1592,6 +1544,8 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
 }

 // Used by the encoder to build the loopfilter masks.
+// TODO(slavarnway): Do the encoder the same way the decoder does it and
+//                   build the masks in line as part of the encode process.
 void vp9_build_mask_frame(VP9_COMMON *cm, int frame_filter_level,
                          int partial_frame) {
  int start_mi_row, end_mi_row, mi_rows_to_filter;
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@ -12,6 +12,7 @@
 #include <stdlib.h>
 #include <stdio.h>

+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
 #include "./vp9_rtcd.h"
@ -587,32 +588,6 @@ static void fillrd(struct postproc_state *state, int q, int a) {
  state->last_noise = a;
 }

-void vp9_plane_add_noise_c(uint8_t *start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int width, unsigned int height, int pitch) {
-  unsigned int i, j;
-
-  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
-  // fix..
-  (void) bothclamp;
-  for (i = 0; i < height; i++) {
-    uint8_t *pos = start + i * pitch;
-    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
-
-    for (j = 0; j < width; j++) {
-      if (pos[j] < blackclamp[0])
-        pos[j] = blackclamp[0];
-
-      if (pos[j] > 255 + whiteclamp[0])
-        pos[j] = 255 + whiteclamp[0];
-
-      pos[j] += ref[j];
-    }
-  }
-}
-
 static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
  // Current mip will be the prev_mip for the next frame.
  MODE_INFO *temp = cm->postproc_state.prev_mip;
@ -726,8 +701,7 @@ int vp9_post_proc_frame(struct VP9Common *cm,
        ppstate->last_noise != noise_level) {
      fillrd(ppstate, 63 - q, noise_level);
    }
-
-    vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+    vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
                        ppstate->whiteclamp, ppstate->bothclamp,
                        ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
  }
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@ -28,9 +28,9 @@ int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {

  if (left_type == above_type)
    return left_type;
-  else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
+  else if (left_type == SWITCHABLE_FILTERS)
    return above_type;
-  else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
+  else if (above_type == SWITCHABLE_FILTERS)
    return left_type;
  else
    return SWITCHABLE_FILTERS;
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@ -20,19 +20,6 @@
 #include "vp9/common/vp9_reconintra.h"

 #if CONFIG_VP9_HIGHBITDEPTH
-void high_inter_predictor(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride,
-                                 const int subpel_x,
-                                 const int subpel_y,
-                                 const struct scale_factors *sf,
-                                 int w, int h, int ref,
-                                 const InterpKernel *kernel,
-                                 int xs, int ys, int bd) {
-  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride,
-      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
-}
-
 void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride,
                                      const MV *src_mv,
@ -50,8 +37,9 @@ void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,

  src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);

-  high_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
-                       sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4, bd);
+  highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+                         sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4,
+                         bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH

@ -222,9 +210,9 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,

 #if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
-                           subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
-                           xd->bd);
+      highbd_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                             subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
+                             xd->bd);
    } else {
      inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
                      subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@ -34,14 +34,18 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
 }

 #if CONFIG_VP9_HIGHBITDEPTH
-void high_inter_predictor(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride,
-                                 const int subpel_x,
-                                 const int subpel_y,
-                                 const struct scale_factors *sf,
-                                 int w, int h, int ref,
-                                 const InterpKernel *kernel,
-                                 int xs, int ys, int bd);
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+                                          uint8_t *dst, int dst_stride,
+                                          const int subpel_x,
+                                          const int subpel_y,
+                                          const struct scale_factors *sf,
+                                          int w, int h, int ref,
+                                          const InterpKernel *kernel,
+                                          int xs, int ys, int bd) {
+  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
+      src, src_stride, dst, dst_stride,
+      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH

 MV average_split_mvs(const struct macroblockd_plane *pd, const MODE_INFO *mi,
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@ -142,6 +142,7 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
  // 129  C   D  ..  W   X
  // 129  E   F  ..  U   V
  // 129  G   H  ..  S   T   T   T   T   T
+  // For 10 bit and 12 bit, 127 and 129 are replaced by base -1 and base + 1.

  // Get current frame pointer, width and height.
  if (plane == 0) {
@ -177,7 +178,6 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
          left_col[i] = ref[i * ref_stride - 1];
      }
    } else {
-      // TODO(Peter): this value should probably change for high bitdepth
      vpx_memset16(left_col, base + 1, bs);
    }
  }
@ -239,7 +239,6 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
          vpx_memset16(above_row + r, above_row[r - 1],
                       x0 + 2 * bs - frame_width);
        }
-        // TODO(Peter) this value should probably change for high bitdepth
        above_row[-1] = left_available ? above_ref[-1] : (base + 1);
      } else {
        /* faster path if the block does not need extension */
@ -251,13 +250,11 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
            memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
          else
            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
-          // TODO(Peter): this value should probably change for high bitdepth
          above_row[-1] = left_available ? above_ref[-1] : (base + 1);
        }
      }
    } else {
      vpx_memset16(above_row, base - 1, bs * 2);
-      // TODO(Peter): this value should probably change for high bitdepth
      above_row[-1] = base - 1;
    }
  }
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@ -70,10 +70,6 @@ add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8
 specialize qw/vp9_post_proc_down_and_across sse2/;
 $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;

-add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp9_plane_add_noise sse2/;
-$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
-
 add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp9_filter_by_weight16x16 sse2 msa/;

@ -169,9 +165,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

    add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
    specialize qw/vp9_highbd_post_proc_down_and_across/;
-
-    add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-    specialize qw/vp9_highbd_plane_add_noise/;
  }

  #
@ -252,7 +245,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  specialize qw/vp9_fht16x16 sse2/;

  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+  specialize qw/vp9_fwht4x4/, "$sse2_x86inc";
 } else {
  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/vp9_fht4x4 sse2 msa/;
@ -264,7 +257,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  specialize qw/vp9_fht16x16 sse2 msa/;

  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc";
+  specialize qw/vp9_fwht4x4 msa/, "$sse2_x86inc";
 }

 #
@ -276,7 +269,7 @@ $vp9_full_search_sad_sse3=vp9_full_search_sadx3;
 $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;

 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad/;
+specialize qw/vp9_diamond_search_sad avx/;

 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
 specialize qw/vp9_temporal_filter_apply sse2 msa/;
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@ -28,6 +28,7 @@ static const int seg_feature_data_max[SEG_LVL_MAX] = {
 void vp9_clearall_segfeatures(struct segmentation *seg) {
  vp9_zero(seg->feature_data);
  vp9_zero(seg->feature_mask);
+  seg->aq_av_offset = 0;
 }

 void vp9_enable_segfeature(struct segmentation *seg, int segment_id,
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@ -47,6 +47,7 @@ struct segmentation {

  int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
  unsigned int feature_mask[MAX_SEGMENTS];
+  int aq_av_offset;
 };

 static INLINE int segfeature_active(const struct segmentation *seg,
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+#include "./vp9_rtcd.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ b/vp9/common/x86/vp9_postproc_sse2.asm
@ -624,68 +624,6 @@ sym(vp9_mbpost_proc_across_ip_xmm):
 %undef flimit4


-;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp9_plane_add_noise_wmt) PRIVATE
-sym(vp9_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 rd42:
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@ -525,8 +525,8 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
  }

  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
-                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+    highbd_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                           subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
  } else {
    inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
                    subpel_y, sf, w, h, ref, kernel, xs, ys);
@ -699,8 +699,8 @@ static void dec_build_inter_predictors(VPxWorker *const worker, MACROBLOCKD *xd,
  }
 #if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+    highbd_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                           subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
  } else {
    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
                    subpel_y, sf, w, h, ref, kernel, xs, ys);
@ -1315,11 +1315,16 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
  BufferPool *const pool = cm->buffer_pool;
  for (i = 0; i < REFS_PER_FRAME; ++i) {
    if (vpx_rb_read_bit(rb)) {
-      YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
-      width = buf->y_crop_width;
-      height = buf->y_crop_height;
-      found = 1;
-      break;
+      if (cm->frame_refs[i].idx != INVALID_IDX) {
+        YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
+        width = buf->y_crop_width;
+        height = buf->y_crop_height;
+        found = 1;
+        break;
+      } else {
+        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Failed to decode frame size");
+      }
    }
  }

@ -1334,22 +1339,23 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
  // has valid dimensions.
  for (i = 0; i < REFS_PER_FRAME; ++i) {
    RefBuffer *const ref_frame = &cm->frame_refs[i];
-    has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width,
-                                                ref_frame->buf->y_crop_height,
-                                                width, height);
+    has_valid_ref_frame |= (ref_frame->idx != INVALID_IDX &&
+                            valid_ref_frame_size(ref_frame->buf->y_crop_width,
+                                                 ref_frame->buf->y_crop_height,
+                                                 width, height));
  }
  if (!has_valid_ref_frame)
    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                       "Referenced frame has invalid size");
  for (i = 0; i < REFS_PER_FRAME; ++i) {
    RefBuffer *const ref_frame = &cm->frame_refs[i];
-    if (!valid_ref_frame_img_fmt(
-            ref_frame->buf->bit_depth,
-            ref_frame->buf->subsampling_x,
-            ref_frame->buf->subsampling_y,
-            cm->bit_depth,
-            cm->subsampling_x,
-            cm->subsampling_y))
+    if (ref_frame->idx == INVALID_IDX ||
+        !valid_ref_frame_img_fmt(ref_frame->buf->bit_depth,
+                                 ref_frame->buf->subsampling_x,
+                                 ref_frame->buf->subsampling_y,
+                                 cm->bit_depth,
+                                 cm->subsampling_x,
+                                 cm->subsampling_y))
      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                         "Referenced frame has incompatible color format");
  }
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@ -371,9 +371,9 @@ static int dec_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {

  if (left_type == above_type)
    return left_type;
-  else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
+  else if (left_type == SWITCHABLE_FILTERS)
    return above_type;
-  else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
+  else if (above_type == SWITCHABLE_FILTERS)
    return left_type;
  else
    return SWITCHABLE_FILTERS;
@ -902,4 +902,10 @@ void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
      frame_mvs += cm->mi_cols;
    }
  }
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+        (xd->above_mi == NULL || xd->left_mi == NULL) &&
+        !is_inter_block(mi) && need_top_left[mi->uv_mode])
+      assert(0);
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 }
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@ -505,7 +505,7 @@ vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
        uint32_t this_sz = 0;

        for (j = 0; j < mag; ++j)
-          this_sz |= (*x++) << (j * 8);
+          this_sz |= ((uint32_t)(*x++)) << (j * 8);
        sizes[i] = this_sz;
      }
      *count = frames;
--- a/vp9/encoder/vp9_aq_360.c
+++ b/vp9/encoder/vp9_aq_360.c
@ -13,6 +13,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"

+#include "vp9/encoder/vp9_aq_360.h"
 #include "vp9/encoder/vp9_aq_variance.h"

 #include "vp9/common/vp9_seg_common.h"
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@ -22,7 +22,6 @@
 #include "vp9/encoder/vp9_segmentation.h"
 CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
  size_t last_coded_q_map_size;
-  size_t consec_zero_mv_size;
  CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
  if (cr == NULL)
    return NULL;
@ -40,21 +39,12 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
  }
  assert(MAXQ <= 255);
  memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
-
-  consec_zero_mv_size = mi_rows * mi_cols * sizeof(*cr->consec_zero_mv);
-  cr->consec_zero_mv = vpx_malloc(consec_zero_mv_size);
-  if (cr->consec_zero_mv == NULL) {
-    vp9_cyclic_refresh_free(cr);
-    return NULL;
-  }
-  memset(cr->consec_zero_mv, 0, consec_zero_mv_size);
  return cr;
 }

 void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
  vpx_free(cr->map);
  vpx_free(cr->last_coded_q_map);
-  vpx_free(cr->consec_zero_mv);
  vpx_free(cr);
 }

@ -244,7 +234,6 @@ void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi,
                                             BLOCK_SIZE bsize) {
  const VP9_COMMON *const cm = &cpi->common;
  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  MV mv = mi->mv[0].as_mv;
  const int bw = num_8x8_blocks_wide_lookup[bsize];
  const int bh = num_8x8_blocks_high_lookup[bsize];
  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
@ -268,15 +257,8 @@ void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi,
            clamp(cm->base_qindex + cr->qindex_delta[mi->segment_id],
                  0, MAXQ),
            cr->last_coded_q_map[map_offset]);
-      // Update the consecutive zero/low_mv count.
-      if (is_inter_block(mi) && (abs(mv.row) < 8 && abs(mv.col) < 8)) {
-        if (cr->consec_zero_mv[map_offset] < 255)
-          cr->consec_zero_mv[map_offset]++;
-      } else {
-        cr->consec_zero_mv[map_offset] = 0;
      }
    }
-  }
 }

 // Update the actual number of blocks that were applied the segment delta q.
@ -410,13 +392,18 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
  cr->target_num_seg_blocks = 0;
  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
    consec_zero_mv_thresh = 100;
-   if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium)
-     consec_zero_mv_thresh = 80;
  }
  qindex_thresh =
      cpi->oxcf.content == VP9E_CONTENT_SCREEN
      ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
      : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
+  // More aggressive settings for noisy content.
+  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+    consec_zero_mv_thresh = 80;
+    qindex_thresh =
+        VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex),
+                              7 * cm->base_qindex >> 3);
+  }
  do {
    int sum_map = 0;
    // Get the mi_row/mi_col corresponding to superblock index i.
@ -441,7 +428,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
        if (cr->map[bl_index2] == 0) {
          count_tot++;
          if (cr->last_coded_q_map[bl_index2] > qindex_thresh ||
-              cr->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) {
+              cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) {
            sum_map++;
            count_sel++;
          }
@ -480,6 +467,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
    cr->percent_refresh = 5;
  cr->max_qdelta_perc = 50;
  cr->time_for_refresh = 0;
+  cr->motion_thresh = 32;
+  cr->rate_boost_fac = 15;
  // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
  // periods of the refresh cycle, after a key frame.
  // Account for larger interval on base layer for temporal layers.
@ -489,9 +478,11 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
    cr->rate_ratio_qdelta = 3.0;
  } else {
    cr->rate_ratio_qdelta = 2.0;
-  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium)
-    // Reduce the delta-qp if the estimated source noise is above threshold.
-    cr->rate_ratio_qdelta = 1.5;
+    if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+      // Reduce the delta-qp if the estimated source noise is above threshold.
+      cr->rate_ratio_qdelta = 1.7;
+      cr->rate_boost_fac = 13;
+    }
  }
  // Adjust some parameters for low resolutions at low bitrates.
  if (cm->width <= 352 &&
@ -499,9 +490,6 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
      rc->avg_frame_bandwidth < 3400) {
    cr->motion_thresh = 4;
    cr->rate_boost_fac = 10;
-  } else {
-    cr->motion_thresh = 32;
-    cr->rate_boost_fac = 15;
  }
  if (cpi->svc.spatial_layer_id > 0) {
    cr->motion_thresh = 4;
@ -544,8 +532,6 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
    if (cm->frame_type == KEY_FRAME) {
      memset(cr->last_coded_q_map, MAXQ,
             cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
-      memset(cr->consec_zero_mv, 0,
-             cm->mi_rows * cm->mi_cols * sizeof(*cr->consec_zero_mv));
      cr->sb_index = 0;
    }
    return;
@ -620,7 +606,6 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
  memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
  memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols);
-  memset(cr->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols);
  cr->sb_index = 0;
  cpi->refresh_golden_frame = 1;
  cpi->refresh_alt_ref_frame = 1;
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@ -53,8 +53,6 @@ struct CYCLIC_REFRESH {
  signed char *map;
  // Map of the last q a block was coded at.
  uint8_t *last_coded_q_map;
-  // Count on how many consecutive times a block uses ZER0MV for encoding.
-  uint8_t *consec_zero_mv;
  // Thresholds applied to the projected rate/distortion of the coding block,
  // when deciding whether block should be refreshed.
  int64_t thresh_rate_sb;
--- a/vp9/encoder/vp9_aq_variance.c
+++ b/vp9/encoder/vp9_aq_variance.c
@ -167,7 +167,7 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
                vp9_64_zeros, 0, bw, bh, &sse, &avg);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
    var = sse - (((int64_t)avg * avg) / (bw * bh));
-    return (256 * var) / (bw * bh);
+    return (unsigned int)(((uint64_t)256 * var) / (bw * bh));
  } else {
 #if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@ -185,7 +185,7 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
                             x->plane[0].src.stride,
                             vp9_64_zeros, 0, &sse);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    return (256 * var) >> num_pels_log2_lookup[bs];
+    return (unsigned int)(((uint64_t)256 * var) >> num_pels_log2_lookup[bs]);
  }
 }

--- a/Показать больше
+++ b/Показать больше