Bug 1284803 part 1 - Update libyuv to rev 1602 r=jesup

2016-07-25 23:58:15 -07:00 · 2016-07-25 23:58:15 -07:00 · 9952d80a52
--- a/media/libyuv/Android.mk
+++ b/media/libyuv/Android.mk
@ -8,7 +8,8 @@ LOCAL_CPP_EXTENSION := .cc
 LOCAL_SRC_FILES := \
    source/compare.cc           \
    source/compare_common.cc    \
-    source/compare_posix.cc     \
+    source/compare_neon64.cc    \
+    source/compare_gcc.cc       \
    source/convert.cc           \
    source/convert_argb.cc      \
    source/convert_from.cc      \
@ -16,20 +17,26 @@ LOCAL_SRC_FILES := \
    source/convert_to_argb.cc   \
    source/convert_to_i420.cc   \
    source/cpu_id.cc            \
-    source/format_conversion.cc \
    source/planar_functions.cc  \
    source/rotate.cc            \
+    source/rotate_any.cc        \
    source/rotate_argb.cc       \
+    source/rotate_common.cc     \
    source/rotate_mips.cc       \
+    source/rotate_neon64.cc     \
+    source/rotate_gcc.cc        \
    source/row_any.cc           \
    source/row_common.cc        \
    source/row_mips.cc          \
-    source/row_posix.cc         \
+    source/row_neon64.cc        \
+    source/row_gcc.cc	        \
    source/scale.cc             \
+    source/scale_any.cc         \
    source/scale_argb.cc        \
    source/scale_common.cc      \
    source/scale_mips.cc        \
-    source/scale_posix.cc       \
+    source/scale_neon64.cc      \
+    source/scale_gcc.cc         \
    source/video_common.cc

 # TODO(fbarchard): Enable mjpeg encoder.
--- a/media/libyuv/BUILD.gn
+++ b/media/libyuv/BUILD.gn
@ -0,0 +1,135 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import("//build/config/arm.gni")
+import("//build/config/sanitizers/sanitizers.gni")
+
+config("libyuv_config") {
+  include_dirs = [
+    ".",
+    "include",
+  ]
+}
+
+use_neon = current_cpu == "arm64" || (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))
+
+source_set("libyuv") {
+  sources = [
+    # Headers
+    "include/libyuv.h",
+    "include/libyuv/basic_types.h",
+    "include/libyuv/compare.h",
+    "include/libyuv/convert.h",
+    "include/libyuv/convert_argb.h",
+    "include/libyuv/convert_from.h",
+    "include/libyuv/convert_from_argb.h",
+    "include/libyuv/cpu_id.h",
+    "include/libyuv/mjpeg_decoder.h",
+    "include/libyuv/planar_functions.h",
+    "include/libyuv/rotate.h",
+    "include/libyuv/rotate_argb.h",
+    "include/libyuv/rotate_row.h",
+    "include/libyuv/row.h",
+    "include/libyuv/scale.h",
+    "include/libyuv/scale_argb.h",
+    "include/libyuv/scale_row.h",
+    "include/libyuv/version.h",
+    "include/libyuv/video_common.h",
+
+    # Source Files
+    "source/compare.cc",
+    "source/compare_common.cc",
+    "source/compare_gcc.cc",
+    "source/compare_win.cc",
+    "source/convert.cc",
+    "source/convert_argb.cc",
+    "source/convert_from.cc",
+    "source/convert_from_argb.cc",
+    "source/convert_jpeg.cc",
+    "source/convert_to_argb.cc",
+    "source/convert_to_i420.cc",
+    "source/cpu_id.cc",
+    "source/mjpeg_decoder.cc",
+    "source/mjpeg_validate.cc",
+    "source/planar_functions.cc",
+    "source/rotate.cc",
+    "source/rotate_any.cc",
+    "source/rotate_argb.cc",
+    "source/rotate_common.cc",
+    "source/rotate_mips.cc",
+    "source/rotate_gcc.cc",
+    "source/rotate_win.cc",
+    "source/row_any.cc",
+    "source/row_common.cc",
+    "source/row_mips.cc",
+    "source/row_gcc.cc",
+    "source/row_win.cc",
+    "source/scale.cc",
+    "source/scale_any.cc",
+    "source/scale_argb.cc",
+    "source/scale_common.cc",
+    "source/scale_mips.cc",
+    "source/scale_gcc.cc",
+    "source/scale_win.cc",
+    "source/video_common.cc",
+  ]
+
+  configs -= [ "//build/config/compiler:chromium_code" ]
+  configs += [ "//build/config/compiler:no_chromium_code" ]
+
+  public_configs = [ ":libyuv_config" ]
+
+  defines = []
+
+  if (!is_ios) {
+    defines += [ "HAVE_JPEG" ]
+  }
+
+  if (is_msan) {
+    # MemorySanitizer does not support assembly code yet.
+    # http://crbug.com/344505
+    defines += [ "LIBYUV_DISABLE_X86" ]
+  }
+
+  deps = [
+    "//third_party:jpeg",
+  ]
+
+  if (use_neon) {
+    deps += [ ":libyuv_neon" ]
+  }
+
+  if (is_nacl) {
+    # Always enable optimization under NaCl to workaround crbug.com/538243 .
+    configs -= [ "//build/config/compiler:default_optimization" ]
+    configs += [ "//build/config/compiler:optimize_max" ]
+  }
+}
+
+if (use_neon) {
+  static_library("libyuv_neon") {
+    sources = [
+      # ARM Source Files
+      "source/compare_neon.cc",
+      "source/compare_neon64.cc",
+      "source/rotate_neon.cc",
+      "source/rotate_neon64.cc",
+      "source/row_neon.cc",
+      "source/row_neon64.cc",
+      "source/scale_neon.cc",
+      "source/scale_neon64.cc",
+    ]
+
+    public_configs = [ ":libyuv_config" ]
+
+    if (current_cpu != "arm64") {
+      configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
+      cflags = [ "-mfpu=neon" ]
+    }
+  }
+}
--- a/media/libyuv/CMakeLists.txt
+++ b/media/libyuv/CMakeLists.txt
@ -0,0 +1,142 @@
+cmake_minimum_required(VERSION 2.8)
+
+# CMakeLists for libyuv
+# Originally created for "roxlu build system" to compile libyuv on windows
+# Run with -DTEST=ON to build unit tests
+option(TEST "Built unit tests" OFF)
+
+set(ly_base_dir ${CMAKE_CURRENT_LIST_DIR})
+set(ly_src_dir ${ly_base_dir}/source/)
+set(ly_inc_dir ${ly_base_dir}/include)
+set(ly_lib_name "yuv")
+
+set(ly_source_files
+  ${ly_src_dir}/compare.cc
+  ${ly_src_dir}/compare_common.cc
+  ${ly_src_dir}/compare_neon.cc
+  ${ly_src_dir}/compare_neon64.cc
+  ${ly_src_dir}/compare_gcc.cc
+  ${ly_src_dir}/compare_win.cc
+  ${ly_src_dir}/convert.cc
+  ${ly_src_dir}/convert_argb.cc
+  ${ly_src_dir}/convert_from.cc
+  ${ly_src_dir}/convert_from_argb.cc
+  ${ly_src_dir}/convert_jpeg.cc
+  ${ly_src_dir}/convert_to_argb.cc
+  ${ly_src_dir}/convert_to_i420.cc
+  ${ly_src_dir}/cpu_id.cc
+  ${ly_src_dir}/mjpeg_decoder.cc
+  ${ly_src_dir}/mjpeg_validate.cc
+  ${ly_src_dir}/planar_functions.cc
+  ${ly_src_dir}/rotate.cc
+  ${ly_src_dir}/rotate_any.cc
+  ${ly_src_dir}/rotate_argb.cc
+  ${ly_src_dir}/rotate_common.cc
+  ${ly_src_dir}/rotate_mips.cc
+  ${ly_src_dir}/rotate_neon.cc
+  ${ly_src_dir}/rotate_neon64.cc
+  ${ly_src_dir}/rotate_gcc.cc
+  ${ly_src_dir}/rotate_win.cc
+  ${ly_src_dir}/row_any.cc
+  ${ly_src_dir}/row_common.cc
+  ${ly_src_dir}/row_mips.cc
+  ${ly_src_dir}/row_neon.cc
+  ${ly_src_dir}/row_neon64.cc
+  ${ly_src_dir}/row_gcc.cc
+  ${ly_src_dir}/row_win.cc
+  ${ly_src_dir}/scale.cc
+  ${ly_src_dir}/scale_any.cc
+  ${ly_src_dir}/scale_argb.cc
+  ${ly_src_dir}/scale_common.cc
+  ${ly_src_dir}/scale_mips.cc
+  ${ly_src_dir}/scale_neon.cc
+  ${ly_src_dir}/scale_neon64.cc
+  ${ly_src_dir}/scale_gcc.cc
+  ${ly_src_dir}/scale_win.cc
+  ${ly_src_dir}/video_common.cc
+)
+
+set(ly_unittest_sources
+  ${ly_base_dir}/unit_test/basictypes_test.cc
+  ${ly_base_dir}/unit_test/color_test.cc
+  ${ly_base_dir}/unit_test/compare_test.cc
+  ${ly_base_dir}/unit_test/convert_test.cc
+  ${ly_base_dir}/unit_test/cpu_test.cc
+  ${ly_base_dir}/unit_test/math_test.cc
+  ${ly_base_dir}/unit_test/planar_test.cc
+  ${ly_base_dir}/unit_test/rotate_argb_test.cc
+  ${ly_base_dir}/unit_test/rotate_test.cc
+  ${ly_base_dir}/unit_test/scale_argb_test.cc
+  ${ly_base_dir}/unit_test/scale_test.cc
+  ${ly_base_dir}/unit_test/unit_test.cc
+  ${ly_base_dir}/unit_test/video_common_test.cc
+)
+
+set(ly_header_files
+  ${ly_inc_dir}/libyuv/basic_types.h
+  ${ly_inc_dir}/libyuv/compare.h
+  ${ly_inc_dir}/libyuv/convert.h
+  ${ly_inc_dir}/libyuv/convert_argb.h
+  ${ly_inc_dir}/libyuv/convert_from.h
+  ${ly_inc_dir}/libyuv/convert_from_argb.h
+  ${ly_inc_dir}/libyuv/cpu_id.h
+  ${ly_inc_dir}/libyuv/planar_functions.h
+  ${ly_inc_dir}/libyuv/rotate.h
+  ${ly_inc_dir}/libyuv/rotate_argb.h
+  ${ly_inc_dir}/libyuv/rotate_row.h
+  ${ly_inc_dir}/libyuv/row.h
+  ${ly_inc_dir}/libyuv/scale.h
+  ${ly_inc_dir}/libyuv/scale_argb.h
+  ${ly_inc_dir}/libyuv/scale_row.h
+  ${ly_inc_dir}/libyuv/version.h
+  ${ly_inc_dir}/libyuv/video_common.h
+  ${ly_inc_dir}/libyuv/mjpeg_decoder.h
+)
+
+include_directories(${ly_inc_dir})
+
+add_library(${ly_lib_name} STATIC ${ly_source_files})
+
+add_executable(convert ${ly_base_dir}/util/convert.cc)
+target_link_libraries(convert ${ly_lib_name})
+
+include(FindJPEG)
+if (JPEG_FOUND)
+  include_directories(${JPEG_INCLUDE_DIR})
+  target_link_libraries(convert ${JPEG_LIBRARY})
+  add_definitions(-DHAVE_JPEG)
+endif()
+
+if(TEST)
+  find_library(GTEST_LIBRARY gtest)
+  if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND")
+    set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources")
+    if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc)
+      message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}")
+      set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc)
+      add_library(gtest STATIC ${gtest_sources})
+      include_directories(${GTEST_SRC_DIR})
+      include_directories(${GTEST_SRC_DIR}/include)
+      set(GTEST_LIBRARY gtest)
+    else()
+      message(FATAL_ERROR "TEST is set but unable to find gtest library")
+    endif()
+  endif()
+
+  add_executable(libyuv_unittest ${ly_unittest_sources})
+  target_link_libraries(libyuv_unittest ${ly_lib_name} ${GTEST_LIBRARY} pthread)
+  if (JPEG_FOUND)
+    target_link_libraries(libyuv_unittest ${JPEG_LIBRARY})
+  endif()
+  
+  if(NACL AND NACL_LIBC STREQUAL "newlib")
+    target_link_libraries(libyuv_unittest glibc-compat)
+  endif()
+
+  target_link_libraries(libyuv_unittest gflags)
+  
+endif()
+
+install(TARGETS ${ly_lib_name} DESTINATION lib)
+install(FILES ${ly_header_files} DESTINATION include/libyuv)
+install(FILES ${ly_inc_dir}/libyuv.h DESTINATION include/)
--- a/media/libyuv/DEPS
+++ b/media/libyuv/DEPS
@ -1,130 +1,42 @@
-use_relative_paths = True
-
 vars = {
-  "libyuv_trunk" : "https://libyuv.googlecode.com/svn/trunk",
-
  # Override root_dir in your .gclient's custom_vars to specify a custom root
  # folder name.
-  "root_dir": "trunk",
-  "extra_gyp_flag": "-Dextra_gyp_flag=0",
+  'root_dir': 'libyuv',
+  'extra_gyp_flag': '-Dextra_gyp_flag=0',
+  'chromium_git': 'https://chromium.googlesource.com',

-  # Use this googlecode_url variable only if there is an internal mirror for it.
-  # If you do not know, use the full path while defining your new deps entry.
-  "googlecode_url": "http://%s.googlecode.com/svn",
-  "chromium_trunk" : "http://src.chromium.org/svn/trunk",
-  # chrome://version/ for revision of canary Chrome.
-  "chromium_revision": "232627",
+  # Roll the Chromium Git hash to pick up newer versions of all the
+  # dependencies and tools linked to in setup_links.py.
+  'chromium_revision': '2a818f54130d8c93f81490adce5a1e87307bf5f0',
 }

 # NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
 # https; the latter can cause problems for users behind proxies.
 deps = {
-  "../chromium_deps":
-    File(Var("chromium_trunk") + "/src/DEPS@" + Var("chromium_revision")),
-
-  "build":
-    Var("chromium_trunk") + "/src/build@" + Var("chromium_revision"),
-
-  # Needed by common.gypi.
-  "google_apis/build":
-    Var("chromium_trunk") + "/src/google_apis/build@" + Var("chromium_revision"),
-
-  "testing":
-    Var("chromium_trunk") + "/src/testing@" + Var("chromium_revision"),
-
-  "testing/gtest":
-    From("chromium_deps", "src/testing/gtest"),
-
-  "tools/clang":
-    Var("chromium_trunk") + "/src/tools/clang@" + Var("chromium_revision"),
-
-  "tools/gyp":
-    From("chromium_deps", "src/tools/gyp"),
-
-  "tools/python":
-    Var("chromium_trunk") + "/src/tools/python@" + Var("chromium_revision"),
-
-  "tools/valgrind":
-    Var("chromium_trunk") + "/src/tools/valgrind@" + Var("chromium_revision"),
-
-  # Needed by build/common.gypi.
-  "tools/win/supalink":
-    Var("chromium_trunk") + "/src/tools/win/supalink@" + Var("chromium_revision"),
-
-  "third_party/libjpeg_turbo":
-    From("chromium_deps", "src/third_party/libjpeg_turbo"),
-
-  # Yasm assember required for libjpeg_turbo
-  "third_party/yasm":
-    Var("chromium_trunk") + "/src/third_party/yasm@" + Var("chromium_revision"),
-
-  "third_party/yasm/source/patched-yasm":
-    Var("chromium_trunk") + "/deps/third_party/yasm/patched-yasm@" + Var("chromium_revision"),
+  Var('root_dir') + '/third_party/gflags/src':
+    Var('chromium_git') + '/external/github.com/gflags/gflags@03bebcb065c83beff83d50ae025a55a4bf94dfca',
 }

-deps_os = {
-  "win": {
-    # Use WebRTC's, stripped down, version of Cygwin (required by GYP).
-    "third_party/cygwin":
-      (Var("googlecode_url") % "webrtc") + "/deps/third_party/cygwin@2672",
-
-    # Used by libjpeg-turbo.
-    # TODO(fbarchard): Remove binaries and run yasm from build folder.
-    "third_party/yasm/binaries":
-      Var("chromium_trunk") + "/deps/third_party/yasm/binaries@" + Var("chromium_revision"),
-    "third_party/yasm": None,
-  },
-  "unix": {
-    "third_party/gold":
-      From("chromium_deps", "src/third_party/gold"),
-  },
-  "android": {
-    "third_party/android_tools":
-      From("chromium_deps", "src/third_party/android_tools"),
-
-    "third_party/libjpeg":
-      From("chromium_deps", "src/third_party/libjpeg"),
-  },
-  "ios": {
-    # NSS, for SSLClientSocketNSS.
-    "third_party/nss":
-      From("chromium_deps", "src/third_party/nss"),
-
-    "net/third_party/nss":
-      Var("chromium_trunk") + "/src/net/third_party/nss@" + Var("chromium_revision"),
-
-    # class-dump utility to generate header files for undocumented SDKs.
-    "testing/iossim/third_party/class-dump":
-      From("chromium_deps", "src/testing/iossim/third_party/class-dump"),
-
-    # Helper for running under the simulator.
-    "testing/iossim":
-      Var("chromium_trunk") + "/src/testing/iossim@" + Var("chromium_revision"),
-  },
-}
+# Define rules for which include paths are allowed in our source.
+include_rules = [ '+gflags' ]

 hooks = [
  {
-    # Pull clang on mac. If nothing changed, or on non-mac platforms, this takes
-    # zero seconds to run. If something changed, it downloads a prebuilt clang.
-    "pattern": ".",
-    "action": ["python", Var("root_dir") + "/tools/clang/scripts/update.py",
-               "--mac-only"],
+    # Clone chromium and its deps.
+    'name': 'sync chromium',
+    'pattern': '.',
+    'action': ['python', '-u', Var('root_dir') + '/sync_chromium.py',
+               '--target-revision', Var('chromium_revision')],
+  },
+  {
+    # Create links to shared dependencies in Chromium.
+    'name': 'setup_links',
+    'pattern': '.',
+    'action': ['python', Var('root_dir') + '/setup_links.py'],
  },
  {
    # A change to a .gyp, .gypi, or to GYP itself should run the generator.
-    "pattern": ".",
-    "action": ["python", Var("root_dir") + "/build/gyp_chromium",
-               "--depth=" + Var("root_dir"), Var("root_dir") + "/all.gyp",
-               Var("extra_gyp_flag")],
-  },
-  {
-    # Update the cygwin mount on Windows.
-    # This is necessary to get the correct mapping between e.g. /bin and the
-    # cygwin path on Windows. Without it we can't run bash scripts in actions.
-    # Ideally this should be solved in "pylib/gyp/msvs_emulation.py".
-    "pattern": ".",
-    "action": ["python", Var("root_dir") + "/build/win/setup_cygwin_mount.py",
-               "--win-only"],
+    'pattern': '.',
+    'action': ['python', Var('root_dir') + '/gyp_libyuv'],
  },
 ]
--- a/media/libyuv/OWNERS
+++ b/media/libyuv/OWNERS
@ -1,2 +1,13 @@
-fbarchard@chromium.org
-mflodman@chromium.org
+fbarchard@chromium.org
+magjed@chromium.org
+torbjorng@chromium.org
+
+per-file *.gyp=kjellander@chromium.org
+per-file *.gn=kjellander@chromium.org
+per-file .gitignore=*
+per-file AUTHORS=*
+per-file DEPS=*
+per-file PRESUBMIT.py=kjellander@chromium.org
+per-file gyp_libyuv.py=kjellander@chromium.org
+per-file setup_links.py=*
+per-file sync_chromium.py=kjellander@chromium.org
--- a/media/libyuv/PRESUBMIT.py
+++ b/media/libyuv/PRESUBMIT.py
@ -0,0 +1,65 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import re
+import sys
+
+
+def GetDefaultTryConfigs(bots=None):
+  """Returns a list of ('bot', set(['tests']), optionally filtered by [bots].
+
+  For WebRTC purposes, we always return an empty list of tests, since we want
+  to run all tests by default on all our trybots.
+  """
+  return { 'tryserver.libyuv': dict((bot, []) for bot in bots)}
+
+
+# pylint: disable=W0613
+def GetPreferredTryMasters(project, change):
+  files = change.LocalPaths()
+  bots = [
+    'win',
+    'win_rel',
+    'win_x64_rel',
+    'win_x64_gn',
+    'win_x64_gn_rel',
+    'win_clang',
+    'win_clang_rel',
+    'win_x64_clang_rel',
+    'mac',
+    'mac_rel',
+    'mac_gn',
+    'mac_gn_rel',
+    'mac_asan',
+    'ios',
+    'ios_rel',
+    'ios_arm64',
+    'ios_arm64_rel',
+    'linux',
+    'linux_rel',
+    'linux_gn',
+    'linux_gn_rel',
+    'linux_memcheck',
+    'linux_tsan2',
+    'linux_asan',
+    'linux_msan',
+    'linux_ubsan',
+    'linux_ubsan_vptr',
+    'android',
+    'android_rel',
+    'android_clang',
+    'android_arm64',
+    'android_mips',
+    'android_x64',
+    'android_x86',
+    'android_gn',
+    'android_gn_rel',
+  ]
+  if not files or all(re.search(r'[\\/]OWNERS$', f) for f in files):
+    return {}
+  return GetDefaultTryConfigs(bots)
--- a/media/libyuv/README.chromium
+++ b/media/libyuv/README.chromium
@ -1,9 +1,8 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 971
+Version: 1602
 License: BSD
 License File: LICENSE

 Description:
-libyuv is an open source project that includes
-YUV conversion and scaling functionality.
+libyuv is an open source project that includes YUV conversion and scaling functionality.
--- a/media/libyuv/README.md
+++ b/media/libyuv/README.md
@ -0,0 +1,18 @@
+**libyuv** is an open source project that includes YUV scaling and conversion functionality.
+
+* Scale YUV to prepare content for compression, with point, bilinear or box filter.
+* Convert to YUV from webcam formats.
+* Convert from YUV to formats for rendering/effects.
+* Rotate by 90/180/270 degrees to adjust for mobile devices in portrait mode.
+* Optimized for SSE2/SSSE3/AVX2 on x86/x64.
+* Optimized for Neon on Arm.
+* Optimized for DSP R2 on Mips.
+
+### Development
+
+See [Getting started] [1] for instructions on how to get started developing.
+
+You can also browse the [docs directory] [2] for more documentation.
+
+[1]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/getting_started.md
+[2]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/
--- a/media/libyuv/build_overrides/build.gni
+++ b/media/libyuv/build_overrides/build.gni
@ -0,0 +1,15 @@
+# Copyright 2016 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Using same overrides as WebRTC
+# See https://bugs.chromium.org/p/webrtc/issues/detail?id=5453.
+# Some WebRTC targets require the 10.7 deployment version of the Mac SDK and a
+# 10.11 min SDK but those targets are only used in non-Chromium builds. We can
+# remove this when Chromium drops 10.6 support and also requires 10.7.
+mac_sdk_min_build_override = "10.11"
+mac_deployment_target_build_override = "10.7"
--- a/media/libyuv/chromium/README
+++ b/media/libyuv/chromium/README
@ -0,0 +1,5 @@
+This .gclient file is used to do download a copy of Chromium.
+Libyuv uses the Chromium build toolchain and a number of shared
+dependencies by creating symlinks to folders in this checkout,
+using the ../setup_links.py script.
+
--- a/media/libyuv/codereview.settings
+++ b/media/libyuv/codereview.settings
@ -1,9 +1,10 @@
 # This file is used by gcl to get repository specific information.
-# The LibYuv code review is via WebRtc's code review
-CODE_REVIEW_SERVER: webrtc-codereview.appspot.com
+CODE_REVIEW_SERVER: codereview.chromium.org
 #CC_LIST:
-VIEW_VC: https://code.google.com/p/libyuv/source/detail?r=
+VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
 #STATUS:
+FORCE_HTTPS_COMMIT_URL: True
+PROJECT: libyuv
 TRY_ON_UPLOAD: False
 TRYSERVER_ROOT: src
 TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try-libyuv
--- a/media/libyuv/docs/environment_variables.md
+++ b/media/libyuv/docs/environment_variables.md
@ -0,0 +1,32 @@
+# Introduction
+
+For test purposes, environment variables can be set to control libyuv behavior.  These should only be used for testing, to narrow down bugs or to test performance.
+
+# CPU
+
+By default the cpu is detected and the most advanced form of SIMD is used.  But you can disable instruction sets selectively, or completely, falling back on C code.  Set the variable to 1 to disable the specified instruction set.
+
+    LIBYUV_DISABLE_ASM
+    LIBYUV_DISABLE_X86
+    LIBYUV_DISABLE_SSE2
+    LIBYUV_DISABLE_SSSE3
+    LIBYUV_DISABLE_SSE41
+    LIBYUV_DISABLE_SSE42
+    LIBYUV_DISABLE_AVX
+    LIBYUV_DISABLE_AVX2
+    LIBYUV_DISABLE_AVX3
+    LIBYUV_DISABLE_ERMS
+    LIBYUV_DISABLE_FMA3
+    LIBYUV_DISABLE_DSPR2
+    LIBYUV_DISABLE_NEON
+
+# Test Width/Height/Repeat
+
+The unittests default to a small image (128x72) to run fast.  This can be set by environment variable to test a specific resolutions.
+You can also repeat the test a specified number of iterations, allowing benchmarking and profiling.
+
+    set LIBYUV_WIDTH=1280
+    set LIBYUV_HEIGHT=720
+    set LIBYUV_REPEAT=999
+    set LIBYUV_FLAGS=-1
+    set LIBYUV_CPU_INFO=-1
--- a/media/libyuv/docs/filtering.md
+++ b/media/libyuv/docs/filtering.md
@ -0,0 +1,196 @@
+# Introduction
+
+This document discusses the current state of filtering in libyuv. An emphasis on maximum performance while avoiding memory exceptions, and minimal amount of code/complexity.  See future work at end.
+
+# LibYuv Filter Subsampling
+
+There are 2 challenges with subsampling
+
+1. centering of samples, which involves clamping on edges
+2. clipping a source region
+
+Centering depends on scale factor and filter mode.
+
+# Down Sampling
+
+If scaling down, the stepping rate is always src_width / dst_width.
+
+    dx = src_width / dst_width;
+
+e.g. If scaling from 1280x720 to 640x360, the step thru the source will be 2.0, stepping over 2 pixels of source for each pixel of destination.
+
+Centering, depends on filter mode.
+
+*Point* downsampling takes the middle pixel.
+
+    x = dx >> 1;
+
+For odd scale factors (e.g. 3x down) this is exactly the middle.  For even scale factors, this rounds up and takes the pixel to the right of center.  e.g. scale of 4x down will take pixel 2.
+
+**Bilinear** filter, uses the 2x2 pixels in the middle.
+
+    x = dx / 2 - 0.5;
+
+For odd scale factors (e.g. 3x down) this is exactly the middle, and point sampling is used.
+For even scale factors, this evenly filters the middle 2x2 pixels.  e.g. 4x down will filter pixels 1,2 at 50% in both directions.
+
+**Box** filter averages the entire box so sampling starts at 0.
+
+    x = 0;
+
+For a scale factor of 2x down, this is equivalent to bilinear.
+
+# Up Sampling
+
+**Point** upsampling use stepping rate of src_width / dst_width and a starting coordinate of 0.
+
+    x = 0;
+    dx = src_width / dst_width;
+
+e.g. If scaling from 640x360 to 1280x720 the step thru the source will be 0.0, stepping half a pixel of source for each pixel of destination. Each pixel is replicated by the scale factor.
+
+**Bilinear** filter stretches such that the first pixel of source maps to the first pixel of destination, and the last pixel of source maps to the last pixel of destination.
+
+    x = 0;
+    dx = (src_width - 1) / (dst_width - 1);
+
+This method is not technically correct, and will likely change in the future.
+
+* It is inconsistent with the bilinear down sampler.  The same method could be used for down sampling, and then it would be more reversible, but that would prevent specialized 2x down sampling.
+* Although centered, the image is slightly magnified.
+* The filtering was changed in early 2013 - previously it used:
+
+        x = 0;
+        dx = (src_width - 1) / (dst_width - 1);
+
+Which is the correct scale factor, but shifted the image left, and extruded the last pixel.  The reason for the change was to remove the extruding code from the low level row functions, allowing 3 functions to sshare the same row functions - ARGBScale, I420Scale, and ARGBInterpolate.  Then the one function was ported to many cpu variations: SSE2, SSSE3, AVX2, Neon and 'Any' version for any number of pixels and alignment.  The function is also specialized for 0,25,50,75%.
+
+The above goes still has the potential to read the last pixel 100% and last pixel + 1 0%, which may cause a memory exception.  So the left pixel goes to a fraction less than the last pixel, but filters in the minimum amount of it, and the maximum of the last pixel.
+
+    dx = FixedDiv((src_width << 16) - 0x00010001, (dst << 16) - 0x00010000);
+
+**Box** filter for upsampling switches over to Bilinear.
+
+# Scale snippet:
+
+    #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+    #define FIXEDDIV1(src, dst) FixedDiv((src << 16) - 0x00010001, \
+                                         (dst << 16) - 0x00010000);
+
+    // Compute slope values for stepping.
+    void ScaleSlope(int src_width, int src_height,
+                    int dst_width, int dst_height,
+                    FilterMode filtering,
+                    int* x, int* y, int* dx, int* dy) {
+      assert(x != NULL);
+      assert(y != NULL);
+      assert(dx != NULL);
+      assert(dy != NULL);
+      assert(src_width != 0);
+      assert(src_height != 0);
+      assert(dst_width > 0);
+      assert(dst_height > 0);
+      if (filtering == kFilterBox) {
+        // Scale step for point sampling duplicates all pixels equally.
+        *dx = FixedDiv(Abs(src_width), dst_width);
+        *dy = FixedDiv(src_height, dst_height);
+        *x = 0;
+        *y = 0;
+      } else if (filtering == kFilterBilinear) {
+        // Scale step for bilinear sampling renders last pixel once for upsample.
+        if (dst_width <= Abs(src_width)) {
+          *dx = FixedDiv(Abs(src_width), dst_width);
+          *x = CENTERSTART(*dx, -32768);
+        } else if (dst_width > 1) {
+          *dx = FIXEDDIV1(Abs(src_width), dst_width);
+          *x = 0;
+        }
+        if (dst_height <= src_height) {
+          *dy = FixedDiv(src_height,  dst_height);
+          *y = CENTERSTART(*dy, -32768);  // 32768 = -0.5 to center bilinear.
+        } else if (dst_height > 1) {
+          *dy = FIXEDDIV1(src_height, dst_height);
+          *y = 0;
+        }
+      } else if (filtering == kFilterLinear) {
+        // Scale step for bilinear sampling renders last pixel once for upsample.
+        if (dst_width <= Abs(src_width)) {
+          *dx = FixedDiv(Abs(src_width), dst_width);
+          *x = CENTERSTART(*dx, -32768);
+        } else if (dst_width > 1) {
+          *dx = FIXEDDIV1(Abs(src_width), dst_width);
+          *x = 0;
+        }
+        *dy = FixedDiv(src_height, dst_height);
+        *y = *dy >> 1;
+      } else {
+        // Scale step for point sampling duplicates all pixels equally.
+        *dx = FixedDiv(Abs(src_width), dst_width);
+        *dy = FixedDiv(src_height, dst_height);
+        *x = CENTERSTART(*dx, 0);
+        *y = CENTERSTART(*dy, 0);
+      }
+      // Negative src_width means horizontally mirror.
+      if (src_width < 0) {
+        *x += (dst_width - 1) * *dx;
+        *dx = -*dx;
+        src_width = -src_width;
+      }
+    }
+
+# Future Work
+
+Point sampling should ideally be the same as bilinear, but pixel by pixel, round to nearest neighbor.  But as is, it is reversible and exactly matches ffmpeg at all scale factors, both up and down.  The scale factor is
+
+    dx = src_width / dst_width;
+
+The step value is centered for down sample:
+
+    x = dx / 2;
+
+Or starts at 0 for upsample.
+
+    x = 0;
+
+Bilinear filtering is currently correct for down sampling, but not for upsampling.
+Upsampling is stretching the first and last pixel of source, to the first and last pixel of destination.
+
+    dx = (src_width - 1) / (dst_width - 1);<br>
+    x = 0;
+
+It should be stretching such that the first pixel is centered in the middle of the scale factor, to match the pixel that would be sampled for down sampling by the same amount.  And same on last pixel.
+
+    dx = src_width / dst_width;<br>
+    x = dx / 2 - 0.5;
+
+This would start at -0.5 and go to last pixel + 0.5, sampling 50% from last pixel + 1.
+Then clamping would be needed.  On GPUs there are numerous ways to clamp.
+
+1. Clamp the coordinate to the edge of the texture, duplicating the first and last pixel.
+2. Blend with a constant color, such as transparent black.  Typically best for fonts.
+3. Mirror the UV coordinate, which is similar to clamping.  Good for continuous tone images.
+4. Wrap the coordinate, for texture tiling.
+5. Allow the coordinate to index beyond the image, which may be the correct data if sampling a subimage.
+6. Extrapolate the edge based on the previous pixel.  pixel -0.5 is computed from slope of pixel 0 and 1.
+
+Some of these are computational, even for a GPU, which is one reason textures are sometimes limited to power of 2 sizes.
+We do care about the clipping case, where allowing coordinates to become negative and index pixels before the image is the correct data.  But normally for simple scaling, we want to clamp to the edge pixel.  For example, if bilinear scaling from 3x3 to 30x30, we’d essentially want 10 pixels of each of the original 3 pixels.  But we want the original pixels to land in the middle of each 10 pixels, at offsets 5, 15 and 25.  There would be filtering between 5 and 15 between the original pixels 0 and 1.  And filtering between 15 and 25 from original pixels 1 and 2.  The first 5 pixels are clamped to pixel 0 and the last 5 pixels are clamped to pixel 2.
+The easiest way to implement this is copy the original 3 pixels to a buffer, and duplicate the first and last pixels.  0,1,2 becomes 0, 0,1,2, 2.  Then implement a filtering without clamping.  We call this source extruding.  Its only necessary on up sampling, since down sampler will always have valid surrounding pixels.
+Extruding is practical when the image is already copied to a temporary buffer.   It could be done to the original image, as long as the original memory is restored, but valgrind and/or memory protection would disallow this, so it requires a memcpy to a temporary buffer, which may hurt performance.  The memcpy has a performance advantage, from a cache point of view, that can actually make this technique faster, depending on hardware characteristics.
+Vertical extrusion can be done with a memcpy of the first/last row, or clamping a pointer.
+
+
+The other way to implement clamping is handle the edges with a memset.  e.g. Read first source pixel and memset the first 5 pixels.  Filter pixels 0,1,2 to 5 to 25.  Read last pixel and memset the last 5 pixels.  Blur is implemented with this method like this, which has 3 loops per row - left, middle and right.
+
+Box filter is only used for 2x down sample or more.  Its based on integer sized boxes.  Technically it should be filtered edges, but thats substantially slower (roughly 100x), and at that point you may as well do a cubic filter which is more correct.
+
+Box filter currently sums rows into a row buffer.  It does this with
+
+Mirroring will use the same slope as normal, but with a negative.
+The starting coordinate needs to consider the scale factor and filter.  e.g. box filter of 30x30 to 3x3 with mirroring would use -10 for step, but x = 20.  width (30) - dx.
+
+Step needs to be accurate, so it uses an integer divide.  This is as much as 5% of the profile.  An approximated divide is substantially faster, but the inaccuracy causes stepping beyond the original image boundaries.  3 general solutions:
+
+1. copy image to buffer with padding.  allows for small errors in stepping.
+2. hash the divide, so common values are quickly found.
+3. change api so caller provides the slope.
--- a/media/libyuv/docs/formats.md
+++ b/media/libyuv/docs/formats.md
@ -0,0 +1,133 @@
+# Introduction
+
+Formats (FOURCC) supported by libyuv are detailed here.
+
+# Core Formats
+
+There are 2 core formats supported by libyuv - I420 and ARGB.  All YUV formats can be converted to/from I420.  All RGB formats can be converted to/from ARGB.
+
+Filtering functions such as scaling and planar functions work on I420 and/or ARGB.
+
+# OSX Core Media Pixel Formats
+
+This is how OSX formats map to libyuv
+
+    enum {
+      kCMPixelFormat_32ARGB          = 32,      FOURCC_BGRA
+      kCMPixelFormat_32BGRA          = 'BGRA',  FOURCC_ARGB
+      kCMPixelFormat_24RGB           = 24,      FOURCC_RAW
+      kCMPixelFormat_16BE555         = 16,      Not supported.
+      kCMPixelFormat_16BE565         = 'B565',  Not supported.
+      kCMPixelFormat_16LE555         = 'L555',  FOURCC_RGBO
+      kCMPixelFormat_16LE565         = 'L565',  FOURCC_RGBP
+      kCMPixelFormat_16LE5551        = '5551',  FOURCC_RGBO
+      kCMPixelFormat_422YpCbCr8      = '2vuy',  FOURCC_UYVY
+      kCMPixelFormat_422YpCbCr8_yuvs = 'yuvs',  FOURCC_YUY2
+      kCMPixelFormat_444YpCbCr8      = 'v308',  FOURCC_I444 ?
+      kCMPixelFormat_4444YpCbCrA8    = 'v408',  Not supported.
+      kCMPixelFormat_422YpCbCr16     = 'v216',  Not supported.
+      kCMPixelFormat_422YpCbCr10     = 'v210',  FOURCC_V210 previously.  Removed now.
+      kCMPixelFormat_444YpCbCr10     = 'v410',  Not supported.
+      kCMPixelFormat_8IndexedGray_WhiteIsZero = 0x00000028,  Not supported.
+    };
+
+
+# FOURCC (Four Charactacter Code) List
+
+The following is extracted from video_common.h as a complete list of formats supported by libyuv.
+
+    enum FourCC {
+      // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+      FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+      FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+      FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+      FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+      FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+      FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+      FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+      FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+      FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+      // 2 Secondary YUV formats: row biplanar.
+      FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+      FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+
+      // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+      FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+      FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+      FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+      FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+      FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
+      FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+      FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
+      FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
+      FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
+
+      // 4 Secondary RGB formats: 4 Bayer Patterns.
+      FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+      FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+      FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+      FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+      // 1 Primary Compressed YUV format.
+      FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+      // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+      FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+      FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+      FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+      FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
+      FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+      FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+      // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
+      FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
+      FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
+      FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
+      FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
+      FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
+      FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
+      FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
+      FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
+      FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
+      FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
+      FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
+      FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
+      FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
+      FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
+      FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
+      FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
+      FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
+
+      // 1 Auxiliary compressed YUV format set aside for capturer.
+      FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+# The ARGB FOURCC
+
+There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA.  ARGB is most common by far, used for screen formats, and windows webcam drivers.
+
+The fourcc describes the order of channels in a ***register***.
+
+A fourcc provided by capturer, can be thought of string, e.g. "ARGB".
+
+On little endian machines, as an int, this would have 'A' in the lowest byte.  The FOURCC macro reverses the order:
+
+    #define FOURCC(a, b, c, d) (((uint32)(a)) | ((uint32)(b) << 8) | ((uint32)(c) << 16) | ((uint32)(d) << 24))
+
+So the "ARGB" string, read as an uint32, is
+
+    FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B')
+
+If you were to read ARGB pixels as uint32's, the alpha would be in the high byte, and the blue in the lowest byte.  In memory, these are stored little endian, so 'B' is first, then 'G', 'R' and 'A' last.
+
+When calling conversion functions, the names match the FOURCC, so in this case it would be I420ToARGB().
+
+All formats can be converted to/from ARGB.
+
+Most 'planar_functions' work on ARGB (e.g. ARGBBlend).
+
+Some are channel order agnostic (e.g. ARGBScale).
+
+Some functions are symmetric (e.g. ARGBToBGRA is the same as BGRAToARGB, so its a macro).
+
+ARGBBlend expects preattenuated ARGB. The R,G,B are premultiplied by alpha.  Other functions don't care.
--- a/media/libyuv/docs/getting_started.md
+++ b/media/libyuv/docs/getting_started.md
@ -0,0 +1,429 @@
+# Getting Started
+
+How to get and build the libyuv code.
+
+## Pre-requisites
+
+You'll need to have depot tools installed: https://www.chromium.org/developers/how-tos/install-depot-tools
+Refer to chromium instructions for each platform for other prerequisites.
+
+## Getting the Code
+
+Create a working directory, enter it, and run:
+
+    gclient config https://chromium.googlesource.com/libyuv/libyuv
+    gclient sync
+
+
+Then you'll get a .gclient file like:
+
+    solutions = [
+      { "name"        : "libyuv",
+        "url"         : "https://chromium.googlesource.com/libyuv/libyuv",
+        "deps_file"   : "DEPS",
+        "managed"     : True,
+        "custom_deps" : {
+        },
+        "safesync_url": "",
+      },
+    ];
+
+
+For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.`
+
+Browse the Git reprository: https://chromium.googlesource.com/libyuv/libyuv/+/master
+
+### Android
+For Android add `;target_os=['android'];` to your Linux .gclient
+
+
+    solutions = [
+      { "name"        : "libyuv",
+        "url"         : "https://chromium.googlesource.com/libyuv/libyuv",
+        "deps_file"   : "DEPS",
+        "managed"     : True,
+        "custom_deps" : {
+        },
+        "safesync_url": "",
+      },
+    ];
+    target_os = ["android", "unix"];
+
+Then run:
+
+    export GYP_DEFINES="OS=android"
+    gclient sync
+
+Caveat: Theres an error with Google Play services updates.  If you get the error "Your version of the Google Play services library is not up to date", run the following:
+    cd chromium/src
+    ./build/android/play_services/update.py download
+    cd ../..
+
+For Windows the gclient sync must be done from an Administrator command prompt.
+
+The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks`
+
+To get just the source (not buildable):
+    git clone https://chromium.googlesource.com/libyuv/libyuv
+
+
+## Building the Library and Unittests
+
+### Windows
+
+    set GYP_DEFINES=target_arch=ia32
+    call python gyp_libyuv -fninja -G msvs_version=2013
+    ninja -j7 -C out\Release
+    ninja -j7 -C out\Debug
+
+    set GYP_DEFINES=target_arch=x64
+    call python gyp_libyuv -fninja -G msvs_version=2013
+    ninja -C out\Debug_x64
+    ninja -C out\Release_x64
+
+#### Building with clangcl
+    set GYP_DEFINES=clang=1 target_arch=ia32 libyuv_enable_svn=1
+    set LLVM_REPO_URL=svn://svn.chromium.org/llvm-project
+    call python tools\clang\scripts\update.py
+    call python gyp_libyuv -fninja libyuv_test.gyp
+    ninja -C out\Debug
+    ninja -C out\Release
+
+### OSX
+
+Clang 64 bit shown. Remove `clang=1` for GCC and change x64 to ia32 for 32 bit.
+
+    GYP_DEFINES="clang=1 target_arch=x64" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+    GYP_DEFINES="clang=1 target_arch=ia32" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+### iOS
+http://www.chromium.org/developers/how-tos/build-instructions-ios
+
+Add to .gclient last line: `target_os=['ios'];`
+
+armv7
+
+    GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+    ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+    ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+arm64
+
+    GYP_DEFINES="OS=ios target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+    ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+    ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+both armv7 and arm64 (fat)
+
+    GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=both" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+    ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+    ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+simulator
+
+    GYP_DEFINES="OS=ios target_arch=ia32 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_sim" ./gyp_libyuv
+    ninja -j7 -C out_sim/Debug-iphonesimulator libyuv_unittest
+    ninja -j7 -C out_sim/Release-iphonesimulator libyuv_unittest
+
+### Android
+https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
+
+Add to .gclient last line: `target_os=['android'];`
+
+armv7
+
+    GYP_DEFINES="OS=android" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+arm64
+
+    GYP_DEFINES="OS=android target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+ia32
+
+    GYP_DEFINES="OS=android target_arch=ia32" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+    GYP_DEFINES="OS=android target_arch=ia32 android_full_debug=1" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+
+mipsel
+
+    GYP_DEFINES="OS=android target_arch=mipsel" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+arm32 disassembly:
+
+    third_party/android_tools/ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o
+
+arm64 disassembly:
+
+    third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o
+
+Running tests:
+
+    util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
+
+Running test as benchmark:
+
+    util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1"
+
+Running test with C code:
+
+    util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1"
+
+#### Building with GN
+
+    gn gen out/Release "--args=is_debug=false target_cpu=\"x86\""
+    gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\""
+    ninja -C out/Release
+    ninja -C out/Debug
+
+### Building Offical with GN
+
+    gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true"
+    ninja -C out/Official
+
+### Linux
+
+    GYP_DEFINES="target_arch=x64" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+    GYP_DEFINES="target_arch=ia32" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+#### CentOS
+
+On CentOS 32 bit the following work around allows a sync:
+
+    export GYP_DEFINES="host_arch=ia32"
+    gclient sync
+
+### Windows Shared Library
+
+Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'.
+
+    gclient runhooks
+
+After this command follow the building the library instructions above.
+
+If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+
+
+### Build targets
+
+    ninja -C out/Debug libyuv
+    ninja -C out/Debug libyuv_unittest
+    ninja -C out/Debug compare
+    ninja -C out/Debug convert
+    ninja -C out/Debug psnr
+    ninja -C out/Debug cpuid
+
+
+## Building the Library with make
+
+### Linux
+
+    make -j7 V=1 -f linux.mk
+    make -j7 V=1 -f linux.mk clean
+    make -j7 V=1 -f linux.mk CXX=clang++
+
+## Building the Library with cmake
+
+Install cmake: http://www.cmake.org/
+
+Default debug build:
+
+    mkdir out
+    cd out
+    cmake ..
+    cmake --build .
+
+Release build/install
+
+    mkdir out
+    cd out
+    cmake -DCMAKE_INSTALL_PREFIX="/usr/lib" -DCMAKE_BUILD_TYPE="Release" ..
+    cmake --build . --config Release
+    sudo cmake --build . --target install --config Release
+
+### Windows 8 Phone
+
+Pre-requisite:
+
+* Install Visual Studio 2012 and Arm to your environment.<br>
+
+Then:
+
+    call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+
+or with Visual Studio 2013:
+
+    call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+    nmake /f winarm.mk clean
+    nmake /f winarm.mk
+
+### Windows Shared Library
+
+Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. Then run this.
+
+    gclient runhooks
+
+After this command follow the building the library instructions above.
+
+If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+
+### 64 bit Windows
+
+    set GYP_DEFINES=target_arch=x64
+    gclient runhooks V=1
+
+### ARM Linux
+
+    export GYP_DEFINES="target_arch=arm"
+    export CROSSTOOL=`<path>`/arm-none-linux-gnueabi
+    export CXX=$CROSSTOOL-g++
+    export CC=$CROSSTOOL-gcc
+    export AR=$CROSSTOOL-ar
+    export AS=$CROSSTOOL-as
+    export RANLIB=$CROSSTOOL-ranlib
+    gclient runhooks
+
+## Running Unittests
+
+### Windows
+
+    out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*"
+
+### OSX
+
+    out/Release/libyuv_unittest --gtest_filter="*"
+
+### Linux
+
+    out/Release/libyuv_unittest --gtest_filter="*"
+
+Replace --gtest_filter="*" with specific unittest to run.  May include wildcards. e.g.
+
+    out/Release/libyuv_unittest --gtest_filter=libyuvTest.I420ToARGB_Opt
+
+## CPU Emulator tools
+
+### Intel SDE (Software Development Emulator)
+
+Pre-requisite: Install IntelSDE for Windows: http://software.intel.com/en-us/articles/intel-software-development-emulator
+
+Then run:
+
+    c:\intelsde\sde -hsw -- out\release\libyuv_unittest.exe --gtest_filter=*
+
+
+## Memory tools
+
+### Running Dr Memory memcheck for Windows
+
+Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html
+
+    set GYP_DEFINES=build_for_tool=drmemory target_arch=ia32
+    call python gyp_libyuv -fninja -G msvs_version=2013
+    ninja -C out\Debug
+    drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*
+
+### Running UBSan
+
+See Chromium instructions for sanitizers: https://www.chromium.org/developers/testing/undefinedbehaviorsanitizer
+
+Sanitizers available: TSan, MSan, ASan, UBSan, LSan
+
+    GYP_DEFINES='ubsan=1' gclient runhooks
+    ninja -C out/Release
+
+### Running Valgrind memcheck
+
+Memory errors and race conditions can be found by running tests under special memory tools. [Valgrind] [1] is an instrumentation framework for building dynamic analysis tools. Various tests and profilers are built upon it to find memory handling errors and memory leaks, for instance.
+
+[1]: http://valgrind.org
+
+    solutions = [
+      { "name"        : "libyuv",
+        "url"         : "https://chromium.googlesource.com/libyuv/libyuv",
+        "deps_file"   : "DEPS",
+        "managed"     : True,
+        "custom_deps" : {
+           "libyuv/chromium/src/third_party/valgrind": "https://chromium.googlesource.com/chromium/deps/valgrind/binaries",
+        },
+        "safesync_url": "",
+      },
+    ]
+
+Then run:
+
+    GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=memcheck" python gyp_libyuv
+    ninja -C out/Debug
+    valgrind out/Debug/libyuv_unittest
+
+
+For more information, see http://www.chromium.org/developers/how-tos/using-valgrind
+
+### Running Thread Sanitizer (TSan)
+
+    GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=tsan" python gyp_libyuv
+    ninja -C out/Debug
+    valgrind out/Debug/libyuv_unittest
+
+For more info, see http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer
+
+### Running Address Sanitizer (ASan)
+
+    GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=asan" python gyp_libyuv
+    ninja -C out/Debug
+    valgrind out/Debug/libyuv_unittest
+
+For more info, see http://dev.chromium.org/developers/testing/addresssanitizer
+
+## Benchmarking
+
+The unittests can be used to benchmark.
+
+### Windows
+
+    set LIBYUV_WIDTH=1280
+    set LIBYUV_HEIGHT=720
+    set LIBYUV_REPEAT=999
+    set LIBYUV_FLAGS=-1
+    out\Release\libyuv_unittest.exe --gtest_filter=*I420ToARGB_Opt
+
+
+### Linux and Mac
+
+    LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=1000 out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt
+
+    libyuvTest.I420ToARGB_Opt (547 ms)
+
+Indicates 0.547 ms/frame for 1280 x 720.
+
+## Making a change
+
+    gclient sync
+    git checkout -b mycl -t origin/master
+    git pull
+    <edit files>
+    git add -u
+    git commit -m "my change"
+    git cl lint
+    git cl try
+    git cl upload -r a-reviewer@chomium.org -s
+    <once approved..>
+    git cl land
--- a/media/libyuv/docs/rotation.md
+++ b/media/libyuv/docs/rotation.md
@ -0,0 +1,103 @@
+# Introduction
+
+Rotation by multiplies of 90 degrees allows mobile devices to rotate webcams from landscape to portrait.  The higher level functions ConvertToI420 and ConvertToARGB allow rotation of any format.  Optimized functionality is supported for I420, ARGB, NV12 and NV21.
+
+# ConvertToI420
+
+    int ConvertToI420(const uint8* src_frame, size_t src_size,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int crop_x, int crop_y,
+                      int src_width, int src_height,
+                      int crop_width, int crop_height,
+                      enum RotationMode rotation,
+                      uint32 format);
+
+This function crops, converts, and rotates.  You should think of it in that order.
+  * Crops the original image, which is src_width x src_height, to crop_width x crop_height.  At this point the image is still not rotated.
+  * Converts the cropped region to I420.  Supports inverted source for src_height negative.
+  * Rotates by 90, 180 or 270 degrees.
+The buffer the caller provides should account for rotation.  Be especially important to get stride of the destination correct.
+
+e.g.
+640 x 480 NV12 captured<br>
+Crop to 640 x 360<br>
+Rotate by 90 degrees to 360 x 640.<br>
+Caller passes stride of 360 for Y and 360 / 2 for U and V.<br>
+Caller passes crop_width of 640, crop_height of 360.<br>
+
+# ConvertToARGB
+
+    int ConvertToARGB(const uint8* src_frame, size_t src_size,
+                      uint8* dst_argb, int dst_stride_argb,
+                      int crop_x, int crop_y,
+                      int src_width, int src_height,
+                      int crop_width, int crop_height,
+                      enum RotationMode rotation,
+                      uint32 format);
+
+Same as I420, but implementation is less optimized - reads columns and writes rows, 16 bytes at a time.
+
+# I420Rotate
+
+    int I420Rotate(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int src_width, int src_height, enum RotationMode mode);
+
+Destination is rotated, so pass dst_stride_y etc that consider rotation.<br>
+Rotate by 180 can be done in place, but 90 and 270 can not.
+
+Implementation (Neon/SSE2) uses 8 x 8 block transpose, so best efficiency is with sizes and pointers that are aligned to 8.
+
+Cropping can be achieved by adjusting the src_y/u/v pointers and src_width, src_height.
+
+Lower level plane functions are provided, allowing other planar formats to be rotated.  (e.g. I444)
+
+For other planar YUV formats (I444, I422, I411, I400, NV16, NV24), the planar functions are exposed and can be called directly
+
+
+    // Rotate a plane by 0, 90, 180, or 270.
+    int RotatePlane(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int src_width, int src_height, enum RotationMode mode);
+
+# ARGBRotate
+
+    LIBYUV_API
+    int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int src_width, int src_height, enum RotationMode mode);
+
+Same as I420, but implementation is less optimized - reads columns and writes rows.
+
+Rotate by 90, or any angle, can be achieved using ARGBAffine.
+
+# Mirror - Horizontal Flip
+
+Mirror functions for horizontally flipping an image, which can be useful for 'self view' of a webcam.
+
+    int I420Mirror(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height);
+    int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height);
+
+Mirror functionality can also be achieved with the I420Scale and ARGBScale functions by passing negative width and/or height.
+
+# Invert - Vertical Flip
+
+Inverting can be achieved with almost any libyuv function by passing a negative source height.
+
+I420Mirror and ARGBMirror can also be used to rotate by 180 degrees by passing a negative height.
+
+
--- a/media/libyuv/download_vs_toolchain.py
+++ b/media/libyuv/download_vs_toolchain.py
@ -0,0 +1,29 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This script is used to run the vs_toolchain.py script to download the
+# Visual Studio toolchain. It's just a temporary measure while waiting for the
+# Chrome team to move find_depot_tools into src/build to get rid of these
+# workarounds (similar one in gyp_libyuv).
+
+import os
+import sys
+
+
+checkout_root = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(checkout_root, 'build'))
+sys.path.insert(0, os.path.join(checkout_root, 'tools', 'find_depot_tools'))
+
+
+import vs_toolchain
+
+
+if __name__ == '__main__':
+  sys.exit(vs_toolchain.main())
--- a/media/libyuv/gyp_libyuv
+++ b/media/libyuv/gyp_libyuv
@ -0,0 +1,101 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This script is used to run GYP for libyuv. It contains selected parts of the
+# main function from the src/build/gyp_chromium file.
+
+import glob
+import os
+import shlex
+import sys
+
+checkout_root = os.path.dirname(os.path.realpath(__file__))
+
+sys.path.insert(0, os.path.join(checkout_root, 'build'))
+import gyp_chromium
+import gyp_helper
+import vs_toolchain
+
+sys.path.insert(0, os.path.join(checkout_root, 'tools', 'gyp', 'pylib'))
+import gyp
+
+def GetSupplementalFiles():
+  """Returns a list of the supplemental files that are included in all GYP
+  sources."""
+  # Can't use the one in gyp_chromium since the directory location of the root
+  # is different.
+  return glob.glob(os.path.join(checkout_root, '*', 'supplement.gypi'))
+
+
+if __name__ == '__main__':
+  args = sys.argv[1:]
+
+  if int(os.environ.get('GYP_CHROMIUM_NO_ACTION', 0)):
+    print 'Skipping gyp_libyuv due to GYP_CHROMIUM_NO_ACTION env var.'
+    sys.exit(0)
+
+  # This could give false positives since it doesn't actually do real option
+  # parsing.  Oh well.
+  gyp_file_specified = False
+  for arg in args:
+    if arg.endswith('.gyp'):
+      gyp_file_specified = True
+      break
+
+  # If we didn't get a file, assume 'all.gyp' in the root of the checkout.
+  if not gyp_file_specified:
+    # Because of a bug in gyp, simply adding the abspath to all.gyp doesn't
+    # work, but chdir'ing and adding the relative path does. Spooky :/
+    os.chdir(checkout_root)
+    args.append('all.gyp')
+
+  # There shouldn't be a circular dependency relationship between .gyp files,
+  args.append('--no-circular-check')
+
+  # Default to ninja unless GYP_GENERATORS is set.
+  if not os.environ.get('GYP_GENERATORS'):
+    os.environ['GYP_GENERATORS'] = 'ninja'
+
+  vs2013_runtime_dll_dirs = None
+  if int(os.environ.get('DEPOT_TOOLS_WIN_TOOLCHAIN', '1')):
+    vs2013_runtime_dll_dirs = vs_toolchain.SetEnvironmentAndGetRuntimeDllDirs()
+
+  # Enforce gyp syntax checking. This adds about 20% execution time.
+  args.append('--check')
+
+  supplemental_includes = gyp_chromium.GetSupplementalFiles()
+  gyp_vars_dict = gyp_chromium.GetGypVars(supplemental_includes)
+
+  # Automatically turn on crosscompile support for platforms that need it.
+  if all(('ninja' in os.environ.get('GYP_GENERATORS', ''),
+          gyp_vars_dict.get('OS') in ['android', 'ios'],
+          'GYP_CROSSCOMPILE' not in os.environ)):
+    os.environ['GYP_CROSSCOMPILE'] = '1'
+
+  args.extend(['-I' + i for i in
+               gyp_chromium.additional_include_files(supplemental_includes,
+                                                     args)])
+
+  # Set the gyp depth variable to the root of the checkout.
+  args.append('--depth=' + os.path.relpath(checkout_root))
+
+  print 'Updating projects from gyp files...'
+  sys.stdout.flush()
+
+  # Off we go...
+  gyp_rc = gyp.main(args)
+
+  if vs2013_runtime_dll_dirs:
+    x64_runtime, x86_runtime = vs2013_runtime_dll_dirs
+    vs_toolchain.CopyVsRuntimeDlls(
+        os.path.join(checkout_root, gyp_chromium.GetOutputDirectory()),
+        (x86_runtime, x64_runtime))
+
+  sys.exit(gyp_rc)
--- a/media/libyuv/gyp_libyuv.py
+++ b/media/libyuv/gyp_libyuv.py
@ -0,0 +1,28 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+
+# This script is a modified copy of the src/build/gyp_chromium.py file. 
+# It is needed for parallel processing.
+
+# This file is (possibly, depending on python version) imported by
+# gyp_libyuv when GYP_PARALLEL=1 and it creates sub-processes
+# through the multiprocessing library.
+
+# Importing in Python 2.6 (fixed in 2.7) on Windows doesn't search for
+# imports that don't end in .py (and aren't directories with an
+# __init__.py). This wrapper makes "import gyp_libyuv" work with
+# those old versions and makes it possible to execute gyp_libyuv.py
+# directly on Windows where the extension is useful.
+
+import os
+
+path = os.path.abspath(os.path.split(__file__)[0])
+execfile(os.path.join(path, 'gyp_libyuv'))
--- a/media/libyuv/include/libyuv.h
+++ b/media/libyuv/include/libyuv.h
@ -18,7 +18,6 @@
 #include "libyuv/convert_from.h"
 #include "libyuv/convert_from_argb.h"
 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
 #include "libyuv/mjpeg_decoder.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
--- a/media/libyuv/include/libyuv/basic_types.h
+++ b/media/libyuv/include/libyuv/basic_types.h
@ -13,26 +13,12 @@

 #include <stddef.h>  // for NULL, size_t

-#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
+#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
+#include <sys/types.h>  // for uintptr_t on x86
+#else
 #include <stdint.h>  // for uintptr_t
 #endif

-typedef uint64_t uint64;
-typedef int64_t  int64;
-#if defined(_MSC_VER)
-// nsprpub/pr/include/obsolete/protypes.h defines these weirdly
-typedef long int32;
-typedef unsigned long uint32;
-#else
-typedef uint32_t uint32;
-typedef int32_t  int32;
-#endif
-typedef uint16_t uint16;
-typedef int16_t  int16;
-typedef uint8_t  uint8;
-typedef int8_t   int8;
-#define INT_TYPES_DEFINED 1
-
 #ifndef GG_LONGLONG
 #ifndef INT_TYPES_DEFINED
 #define INT_TYPES_DEFINED
--- a/media/libyuv/include/libyuv/compare.h
+++ b/media/libyuv/include/libyuv/compare.h
@ -22,6 +22,11 @@ extern "C" {
 LIBYUV_API
 uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);

+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+
 // Sum Square Error - used to compute Mean Square Error or PSNR.
 LIBYUV_API
 uint64 ComputeSumSquareError(const uint8* src_a,
--- a/media/libyuv/include/libyuv/compare_row.h
+++ b/media/libyuv/include/libyuv/compare_row.h
@ -0,0 +1,84 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+    defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_HASHDJB2_AVX2
+#endif
+
+// The following are available for Visual C and GCC:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86)))
+#define HAS_HASHDJB2_SSE41
+#define HAS_SUMSQUAREERROR_SSE2
+#endif
+
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_HASHDJB2_AVX2
+#define HAS_SUMSQUAREERROR_AVX2
+#endif
+
+// The following are available for Neon:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SUMSQUAREERROR_NEON
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_COMPARE_ROW_H_  NOLINT
--- a/media/libyuv/include/libyuv/convert.h
+++ b/media/libyuv/include/libyuv/convert.h
@ -12,10 +12,8 @@
 #define INCLUDE_LIBYUV_CONVERT_H_

 #include "libyuv/basic_types.h"
-// TODO(fbarchard): Remove the following headers includes.
-#include "libyuv/convert_from.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
+
+#include "libyuv/rotate.h"  // For enum RotationMode.

 #ifdef __cplusplus
 namespace libyuv {
@ -71,6 +69,8 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

+#define J400ToJ420 I400ToI420
+
 // Convert NV12 to I420.
 LIBYUV_API
 int NV12ToI420(const uint8* src_y, int src_stride_y,
@ -113,15 +113,6 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

-// Convert Q420 to I420.
-LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
 // ARGB little endian (bgra in memory) to I420.
 LIBYUV_API
 int ARGBToI420(const uint8* src_frame, int src_stride_frame,
@ -211,8 +202,6 @@ int MJPGSize(const uint8* sample, size_t sample_size,
             int* width, int* height);
 #endif

-// Note Bayer formats (BGGR) To I420 are in format_conversion.h
-
 // Convert camera sample to I420 with cropping, rotation and vertical flip.
 // "src_size" is needed to parse MJPG.
 // "dst_stride_y" number of bytes in a row of the dst_y plane.
--- a/media/libyuv/include/libyuv/convert_argb.h
+++ b/media/libyuv/include/libyuv/convert_argb.h
@ -12,13 +12,10 @@
 #define INCLUDE_LIBYUV_CONVERT_ARGB_H_

 #include "libyuv/basic_types.h"
-// TODO(fbarchard): Remove the following headers includes
-#include "libyuv/convert_from.h"
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
+
+#include "libyuv/rotate.h"  // For enum RotationMode.

 // TODO(fbarchard): This set of functions should exactly match convert.h
-// Add missing Q420.
 // TODO(fbarchard): Add tests. Create random content of right size and convert
 // with C vs Opt and or to I420 and compare.
 // TODO(fbarchard): Some of these functions lack parameter setting.
@ -61,6 +58,22 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);

+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
 // Convert I411 to ARGB.
 LIBYUV_API
 int I411ToARGB(const uint8* src_y, int src_stride_y,
@ -69,20 +82,38 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);

-// Convert I400 (grey) to ARGB.
+// Convert I420 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int attenuate);
+
+// Convert I420 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_abgr, int dst_stride_abgr,
+                    int width, int height, int attenuate);
+
+// Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
 LIBYUV_API
 int I400ToARGB(const uint8* src_y, int src_stride_y,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);

-// Alias.
-#define YToARGB I400ToARGB_Reference
-
-// Convert I400 to ARGB. Reverse of ARGBToI400.
+// Convert J400 (jpeg grey) to ARGB.
 LIBYUV_API
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
-                         uint8* dst_argb, int dst_stride_argb,
-                         int width, int height);
+int J400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Alias.
+#define YToARGB I400ToARGB

 // Convert NV12 to ARGB.
 LIBYUV_API
@ -104,13 +135,6 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);

-// TODO(fbarchard): Convert Q420 to ARGB.
-// LIBYUV_API
-// int Q420ToARGB(const uint8* src_y, int src_stride_y,
-//                const uint8* src_yuy2, int src_stride_yuy2,
-//                uint8* dst_argb, int dst_stride_argb,
-//                int width, int height);
-
 // Convert YUY2 to ARGB.
 LIBYUV_API
 int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
@ -123,6 +147,70 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);

+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
 // BGRA little endian (argb in memory) to ARGB.
 LIBYUV_API
 int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
@ -184,8 +272,6 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
               int dst_width, int dst_height);
 #endif

-// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
-
 // Convert camera sample to ARGB with cropping, rotation and vertical flip.
 // "src_size" is needed to parse MJPG.
 // "dst_stride_argb" number of bytes in a row of the dst_argb plane.
--- a/media/libyuv/include/libyuv/convert_from.h
+++ b/media/libyuv/include/libyuv/convert_from.h
@ -56,9 +56,6 @@ int I400Copy(const uint8* src_y, int src_stride_y,
             uint8* dst_y, int dst_stride_y,
             int width, int height);

-// TODO(fbarchard): I420ToM420
-// TODO(fbarchard): I420ToQ420
-
 LIBYUV_API
 int I420ToNV12(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
@ -138,6 +135,17 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
                 uint8* dst_frame, int dst_stride_frame,
                 int width, int height);

+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_frame, int dst_stride_frame,
+                       const uint8* dither4x4, int width, int height);
+
 LIBYUV_API
 int I420ToARGB1555(const uint8* src_y, int src_stride_y,
                   const uint8* src_u, int src_stride_u,
@ -152,8 +160,6 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
                   uint8* dst_frame, int dst_stride_frame,
                   int width, int height);

-// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
-
 // Convert I420 to specified format.
 // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
 //    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
--- a/media/libyuv/include/libyuv/convert_from_argb.h
+++ b/media/libyuv/include/libyuv/convert_from_argb.h
@ -25,24 +25,22 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
             uint8* dst_argb, int dst_stride_argb,
             int width, int height);

-// Convert ARGB To BGRA. (alias)
-#define ARGBToBGRA BGRAToARGB
+// Convert ARGB To BGRA.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
+int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_bgra, int dst_stride_bgra,
               int width, int height);

-// Convert ARGB To ABGR. (alias)
-#define ARGBToABGR ABGRToARGB
+// Convert ARGB To ABGR.
 LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
+int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_abgr, int dst_stride_abgr,
               int width, int height);

 // Convert ARGB To RGBA.
 LIBYUV_API
-int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_rgba, int dst_stride_rgba,
               int width, int height);

 // Convert ARGB To RGB24.
@ -63,6 +61,16 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
                 uint8* dst_rgb565, int dst_stride_rgb565,
                 int width, int height);

+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8(*dither)[4][4];
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height);
+
 // Convert ARGB To ARGB1555.
 LIBYUV_API
 int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
@ -107,6 +115,14 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
 // Convert ARGB To I411.
 LIBYUV_API
 int ARGBToI411(const uint8* src_argb, int src_stride_argb,
@ -127,6 +143,12 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               int width, int height);

+// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8* src_argb, int src_stride_argb,
+            uint8* dst_g, int dst_stride_g,
+            int width, int height);
+
 // Convert ARGB To NV12.
 LIBYUV_API
 int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
--- a/media/libyuv/include/libyuv/cpu_id.h
+++ b/media/libyuv/include/libyuv/cpu_id.h
@ -18,9 +18,8 @@ namespace libyuv {
 extern "C" {
 #endif

-// TODO(fbarchard): Consider overlapping bits for different architectures.
 // Internal flag to indicate cpuid requires initialization.
-#define kCpuInit 0x1
+static const int kCpuInitialized = 0x1;

 // These flags are only valid on ARM processors.
 static const int kCpuHasARM = 0x2;
@ -37,12 +36,12 @@ static const int kCpuHasAVX = 0x200;
 static const int kCpuHasAVX2 = 0x400;
 static const int kCpuHasERMS = 0x800;
 static const int kCpuHasFMA3 = 0x1000;
+static const int kCpuHasAVX3 = 0x2000;
 // 0x2000, 0x4000, 0x8000 reserved for future X86 flags.

 // These flags are only valid on MIPS processors.
 static const int kCpuHasMIPS = 0x10000;
-static const int kCpuHasMIPS_DSP = 0x20000;
-static const int kCpuHasMIPS_DSPR2 = 0x40000;
+static const int kCpuHasDSPR2 = 0x20000;

 // Internal function used to auto-init.
 LIBYUV_API
@ -57,13 +56,13 @@ int ArmCpuCaps(const char* cpuinfo_name);
 // returns non-zero if instruction set is detected
 static __inline int TestCpuFlag(int test_flag) {
  LIBYUV_API extern int cpu_info_;
-  return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
+  return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
 }

 // For testing, allow CPU flags to be disabled.
 // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
 // MaskCpuFlags(-1) to enable all cpu specific optimizations.
-// MaskCpuFlags(0) to disable all cpu specific optimizations.
+// MaskCpuFlags(1) to disable all cpu specific optimizations.
 LIBYUV_API
 void MaskCpuFlags(int enable_flags);

--- a/media/libyuv/include/libyuv/format_conversion.h
+++ b/media/libyuv/include/libyuv/format_conversion.h
@ -1,168 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_  // NOLINT
-#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert Bayer RGB formats to I420.
-LIBYUV_API
-int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-LIBYUV_API
-int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-// Temporary API mapper.
-#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
-    BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
-
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height,
-                uint32 src_fourcc_bayer);
-
-// Convert I420 to Bayer RGB formats.
-LIBYUV_API
-int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-LIBYUV_API
-int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-LIBYUV_API
-int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-LIBYUV_API
-int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-// Temporary API mapper.
-#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
-    I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
-
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_frame, int dst_stride_frame,
-                int width, int height,
-                uint32 dst_fourcc_bayer);
-
-// Convert Bayer RGB formats to ARGB.
-LIBYUV_API
-int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-LIBYUV_API
-int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-// Temporary API mapper.
-#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
-
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height,
-                uint32 src_fourcc_bayer);
-
-// Converts ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-LIBYUV_API
-int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-LIBYUV_API
-int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-LIBYUV_API
-int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-// Temporary API mapper.
-#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
-
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_bayer, int dst_stride_bayer,
-                int width, int height,
-                uint32 dst_fourcc_bayer);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_FORMATCONVERSION_H_  NOLINT
--- a/media/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/media/libyuv/include/libyuv/mjpeg_decoder.h
@ -43,6 +43,17 @@ enum JpegSubsamplingType {
  kJpegUnknown
 };

+struct Buffer {
+  const uint8* data;
+  int len;
+};
+
+struct BufferVector {
+  Buffer* buffers;
+  int len;
+  int pos;
+};
+
 struct SetJmpErrorMgr;

 // MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
@ -142,27 +153,6 @@ class LIBYUV_API MJpegDecoder {
     int* subsample_x, int* subsample_y, int number_of_components);

 private:
-  struct Buffer {
-    const uint8* data;
-    int len;
-  };
-
-  struct BufferVector {
-    Buffer* buffers;
-    int len;
-    int pos;
-  };
-
-  // Methods that are passed to jpeglib.
-  static int fill_input_buffer(jpeg_decompress_struct* cinfo);
-  static void init_source(jpeg_decompress_struct* cinfo);
-  static void skip_input_data(jpeg_decompress_struct* cinfo,
-                              long num_bytes);  // NOLINT
-  static void term_source(jpeg_decompress_struct* cinfo);
-
-  static void ErrorHandler(jpeg_common_struct* cinfo);
-  static void OutputHandler(jpeg_common_struct* cinfo);
-
  void AllocOutputBuffers(int num_outbufs);
  void DestroyOutputBuffers();

--- a/media/libyuv/include/libyuv/planar_functions.h
+++ b/media/libyuv/include/libyuv/planar_functions.h
@ -28,6 +28,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
               uint8* dst_y, int dst_stride_y,
               int width, int height);

+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+                  uint16* dst_y, int dst_stride_y,
+                  int width, int height);
+
 // Set a plane of data to a 32 bit value.
 LIBYUV_API
 void SetPlane(uint8* dst_y, int dst_stride_y,
@ -40,6 +45,7 @@ int I400ToI400(const uint8* src_y, int src_stride_y,
               uint8* dst_y, int dst_stride_y,
               int width, int height);

+#define J400ToJ400 I400ToI400

 // Copy I422 to I422.
 #define I422ToI422 I422Copy
@ -79,6 +85,18 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
 // Convert I420 to I400. (calls CopyPlane ignoring u/v).
 LIBYUV_API
 int I420ToI400(const uint8* src_y, int src_stride_y,
@ -88,6 +106,7 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
               int width, int height);

 // Alias
+#define J420ToJ400 I420ToI400
 #define I420ToI420Mirror I420Mirror

 // I420 mirror.
@ -126,13 +145,6 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
                 uint8* dst_rgb565, int dst_stride_rgb565,
                 int width, int height);

-// Convert NV21 to RGB565.
-LIBYUV_API
-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
-
 // I422ToARGB is in convert_argb.h
 // Convert I422 to BGRA.
 LIBYUV_API
@ -158,6 +170,14 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
               uint8* dst_rgba, int dst_stride_rgba,
               int width, int height);

+// Alias
+#define RGB24ToRAW RAWToRGB24
+
+LIBYUV_API
+int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
+               uint8* dst_rgb24, int dst_stride_rgb24,
+               int width, int height);
+
 // Draw a rectangle into I420.
 LIBYUV_API
 int I420Rect(uint8* dst_y, int dst_stride_y,
@ -262,13 +282,19 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
             uint8* dst_argb, int dst_stride_argb,
             int width, int height);

-// Copy ARGB to ARGB.
+// Copy Alpha channel of ARGB to alpha of ARGB.
 LIBYUV_API
 int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
                  uint8* dst_argb, int dst_stride_argb,
                  int width, int height);

-// Copy ARGB to ARGB.
+// Extract the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_a, int dst_stride_a,
+                     int width, int height);
+
+// Copy Y channel to Alpha of ARGB.
 LIBYUV_API
 int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
                     uint8* dst_argb, int dst_stride_argb,
@ -282,6 +308,7 @@ LIBYUV_API
 ARGBBlendRow GetARGBBlend();

 // Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
 // Alpha of destination is set to 255.
 LIBYUV_API
 int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
@ -289,6 +316,31 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height);

+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8* src_y0, int src_stride_y0,
+               const uint8* src_y1, int src_stride_y1,
+               const uint8* alpha, int alpha_stride,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8* src_y0, int src_stride_y0,
+              const uint8* src_u0, int src_stride_u0,
+              const uint8* src_v0, int src_stride_v0,
+              const uint8* src_y1, int src_stride_y1,
+              const uint8* src_u1, int src_stride_u1,
+              const uint8* src_v1, int src_stride_v1,
+              const uint8* alpha, int alpha_stride,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height);
+
 // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
 LIBYUV_API
 int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
@ -338,12 +390,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
                    uint8* dst_argb, int dst_stride_argb,
                    int width, int height);

-// Convert MJPG to ARGB.
-LIBYUV_API
-int MJPGToARGB(const uint8* sample, size_t sample_size,
-               uint8* argb, int argb_stride,
-               int w, int h, int dw, int dh);
-
 // Internal function - do not call directly.
 // Computes table of cumulative sum for image where the value is the sum
 // of all values above and to the left of the entry. Used by ARGBBlur.
@ -370,36 +416,63 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height, uint32 value);

-// Interpolate between two ARGB images using specified amount of interpolation
+// Interpolate between two images using specified amount of interpolation
 // (0 to 255) and store to destination.
-// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
-// and 255 means 1% src_argb0 and 99% src_argb1.
-// Internally uses ARGBScale bilinear filtering.
-// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane(const uint8* src0, int src_stride0,
+                     const uint8* src1, int src_stride1,
+                     uint8* dst, int dst_stride,
+                     int width, int height, int interpolation);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// Internally calls InterpolatePlane with width * 4 (bpp).
 LIBYUV_API
 int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                    const uint8* src_argb1, int src_stride_argb1,
                    uint8* dst_argb, int dst_stride_argb,
                    int width, int height, int interpolation);

-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
+// Interpolate between two YUV images using specified amount of interpolation
+// Internally calls InterpolatePlane on each plane where the U and V planes
+// are half width and half height.
+LIBYUV_API
+int I420Interpolate(const uint8* src0_y, int src0_stride_y,
+                    const uint8* src0_u, int src0_stride_u,
+                    const uint8* src0_v, int src0_stride_v,
+                    const uint8* src1_y, int src1_stride_y,
+                    const uint8* src1_u, int src1_stride_u,
+                    const uint8* src1_v, int src1_stride_v,
+                    uint8* dst_y, int dst_stride_y,
+                    uint8* dst_u, int dst_stride_u,
+                    uint8* dst_v, int dst_stride_v,
+                    int width, int height, int interpolation);
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif

-// Row functions for copying a pixels from a source with a slope to a row
+// Row function for copying pixels from a source with a slope to a row
 // of destination. Useful for scaling, rotation, mirror, texture mapping.
 LIBYUV_API
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                     uint8* dst_argb, const float* uv_dudv, int width);
-// The following are available on all x86 platforms:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width);
-#define HAS_ARGBAFFINEROW_SSE2
-#endif  // LIBYUV_DISABLE_X86

 // Shuffle ARGB channel order.  e.g. BGRA to ARGB.
 // shuffler is 16 bytes and must be aligned.
--- a/media/libyuv/include/libyuv/rotate_row.h
+++ b/media/libyuv/include/libyuv/rotate_row.h
@ -0,0 +1,121 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+#define HAS_TRANSPOSEWX8_SSSE3
+#endif
+
+// The following are available for 64 bit GCC but not NaCL:
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+    defined(__x86_64__)
+#define HAS_TRANSPOSEWX8_FAST_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSEWX8_NEON
+#define HAS_TRANSPOSEUVWX8_NEON
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+    defined(__mips__) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_TRANSPOSEWX8_DSPR2
+#define HAS_TRANSPOSEUVWX8_DSPR2
+#endif  // defined(__mips__)
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width, int height);
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width);
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width);
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+void TransposeWx8_DSPR2(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+
+void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
+                           uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
+                            uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
+                                 uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
+                            uint8* dst, int dst_stride, int width);
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b,
+                      int width, int height);
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
+                          uint8* dst_a, int dst_stride_a,
+                          uint8* dst_b, int dst_stride_b, int width);
+
+void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
+                              uint8* dst_a, int dst_stride_a,
+                              uint8* dst_b, int dst_stride_b, int width);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_  NOLINT
--- a/media/libyuv/include/libyuv/row.h
+++ b/media/libyuv/include/libyuv/row.h
--- a/media/libyuv/include/libyuv/scale.h
+++ b/media/libyuv/include/libyuv/scale.h
@ -34,6 +34,13 @@ void ScalePlane(const uint8* src, int src_stride,
                int dst_width, int dst_height,
                enum FilterMode filtering);

+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+                   int src_width, int src_height,
+                   uint16* dst, int dst_stride,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering);
+
 // Scales a YUV 4:2:0 image from the src width and height to the
 // dst width and height.
 // If filtering is kFilterNone, a simple nearest-neighbor algorithm is
@ -55,6 +62,17 @@ int I420Scale(const uint8* src_y, int src_stride_y,
              int dst_width, int dst_height,
              enum FilterMode filtering);

+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering);
+
 #ifdef __cplusplus
 // Legacy API.  Deprecated.
 LIBYUV_API
--- a/media/libyuv/include/libyuv/scale_argb.h
+++ b/media/libyuv/include/libyuv/scale_argb.h
@ -35,7 +35,6 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
                  int clip_x, int clip_y, int clip_width, int clip_height,
                  enum FilterMode filtering);

-// TODO(fbarchard): Implement this.
 // Scale with YUV conversion to ARGB and clipping.
 LIBYUV_API
 int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
--- a/media/libyuv/include/libyuv/scale_row.h
+++ b/media/libyuv/include/libyuv/scale_row.h
@ -12,54 +12,94 @@
 #define INCLUDE_LIBYUV_SCALE_ROW_H_

 #include "libyuv/basic_types.h"
+#include "libyuv/scale.h"

 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif

-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012

 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_SCALEROWDOWN2_SSE2
-#define HAS_SCALEROWDOWN4_SSE2
-#define HAS_SCALEROWDOWN34_SSSE3
-#define HAS_SCALEROWDOWN38_SSSE3
-#define HAS_SCALEADDROWS_SSE2
-#define HAS_SCALEFILTERCOLS_SSSE3
-#define HAS_SCALECOLSUP2_SSE2
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
 #define HAS_SCALEARGBROWDOWN2_SSE2
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
-#define HAS_SCALEARGBCOLS_SSE2
-#define HAS_SCALEARGBFILTERCOLS_SSSE3
-#define HAS_SCALEARGBCOLSUP2_SSE2
-#define HAS_FIXEDDIV_X86
-#define HAS_FIXEDDIV1_X86
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSSE3
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSSE3
+#define HAS_SCALEADDROW_SSE2
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
 #endif

 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
 #define HAS_SCALEROWDOWN2_NEON
-#define HAS_SCALEROWDOWN4_NEON
 #define HAS_SCALEROWDOWN34_NEON
 #define HAS_SCALEROWDOWN38_NEON
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
 #endif

 // The following are available on Mips platforms:
 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
    defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_SCALEROWDOWN2_MIPS_DSPR2
-#define HAS_SCALEROWDOWN4_MIPS_DSPR2
-#define HAS_SCALEROWDOWN34_MIPS_DSPR2
-#define HAS_SCALEROWDOWN38_MIPS_DSPR2
+#define HAS_SCALEROWDOWN2_DSPR2
+#define HAS_SCALEROWDOWN4_DSPR2
+#define HAS_SCALEROWDOWN34_DSPR2
+#define HAS_SCALEROWDOWN38_DSPR2
 #endif

 // Scale ARGB vertically with bilinear interpolation.
@ -70,6 +110,13 @@ void ScalePlaneVertical(int src_height,
                        int x, int y, int dy,
                        int bpp, enum FilterMode filtering);

+void ScalePlaneVertical_16(int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering);
+
 // Simplify the filtering based on scale factors.
 enum FilterMode ScaleFilterReduce(int src_width, int src_height,
                                  int dst_width, int dst_height,
@ -97,37 +144,70 @@ void ScaleSlope(int src_width, int src_height,

 void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
                     uint8* dst, int dst_width);
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width);
 void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                              uint16* dst, int dst_width);
 void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width);
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width);
 void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
                     uint8* dst, int dst_width);
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width);
 void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width);
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width);
 void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
                      uint8* dst, int dst_width);
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width);
 void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* d, int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width);
 void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width);
 void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
                 int dst_width, int x, int dx);
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                    int dst_width, int x, int dx);
 void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
                    int dst_width, int, int);
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int, int);
 void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx);
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                          int dst_width, int x, int dx);
 void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
                         int dst_width, int x, int dx);
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                            int dst_width, int x, int dx);
 void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
                      uint8* dst, int dst_width);
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width);
 void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width);
 void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height);
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
 void ScaleARGBRowDown2_C(const uint8* src_argb,
                         ptrdiff_t src_stride,
                         uint8* dst_argb, int dst_width);
@ -154,25 +234,28 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
 void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
                             int dst_width, int x, int dx);

-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+// Specialized scalers for x86.
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
-                                        ptrdiff_t src_stride,
-                                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width);
+
 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width);
 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
@ -189,46 +272,128 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width);
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width,
-                       int src_height);
+void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx);
 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx);
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
-                            ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width);
+
+
+// ARGB Column functions
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx);
 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx);
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx);
-// Row functions.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
-                                  int src_stepx,
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx);
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                                  int dst_width, int x, int dx);
+void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                            int dst_width, int x, int dx);
+
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst, int dst_width);
+
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);

 // ScaleRowDown2Box also used by planar functions
 // NEON downscalers with interpolation.
@ -236,7 +401,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 // Note - not static due to reuse in convert for 444 to 420.
 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width);
-
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width);

@ -271,27 +437,63 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);

-void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst, int dst_width);
-void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                 uint8* dst, int dst_width);
-void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst, int dst_width);
-void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                 uint8* dst, int dst_width);
-void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst, int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width);
-void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                     uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                     uint8* d, int dst_width);
-void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width);
-void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx);
+
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                              int dst_width, int x, int dx);
+
+void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width);
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width);
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width);
+void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width);
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);

 #ifdef __cplusplus
 }  // extern "C"
--- a/media/libyuv/include/libyuv/version.h
+++ b/media/libyuv/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 971
+#define LIBYUV_VERSION 1602

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/media/libyuv/include/libyuv/video_common.h
+++ b/media/libyuv/include/libyuv/video_common.h
@ -62,7 +62,7 @@ enum FourCC {

  // 2 Secondary YUV formats: row biplanar.
  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
-  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),  // deprecated.

  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@ -75,7 +75,7 @@ enum FourCC {
  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.

-  // 4 Secondary RGB formats: 4 Bayer Patterns.
+  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
@ -90,7 +90,8 @@ enum FourCC {
  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
  FOURCC_J420 = FOURCC('J', '4', '2', '0'),
-  FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+  FOURCC_J400 = FOURCC('J', '4', '0', '0'),  // unofficial fourcc
+  FOURCC_H420 = FOURCC('H', '4', '2', '0'),  // unofficial fourcc

  // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
@ -150,6 +151,7 @@ enum FourCCBpp {
  FOURCC_BPP_YU12 = 12,
  FOURCC_BPP_J420 = 12,
  FOURCC_BPP_J400 = 8,
+  FOURCC_BPP_H420 = 12,
  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
  FOURCC_BPP_H264 = 0,
  FOURCC_BPP_IYUV = 12,
--- a/media/libyuv/libyuv.gyp
+++ b/media/libyuv/libyuv.gyp
@ -10,92 +10,83 @@
  'includes': [
    'libyuv.gypi',
  ],
+  # Make sure that if we are being compiled to an xcodeproj, nothing tries to
+  # include a .pch.
+  'xcode_settings': {
+    'GCC_PREFIX_HEADER': '',
+    'GCC_PRECOMPILE_PREFIX_HEADER': 'NO',
+  },
  'variables': {
    'use_system_libjpeg%': 0,
-    'yuv_disable_asm%': 0,
-    'yuv_disable_avx2%': 0,
+    'libyuv_disable_jpeg%': 0,
+    # 'chromium_code' treats libyuv as internal and increases warning level.
+    'chromium_code': 1,
+    # clang compiler default variable usable by other apps that include libyuv.
+    'clang%': 0,
+    # Link-Time Optimizations.
+    'use_lto%': 0,
    'build_neon': 0,
    'conditions': [
-       ['target_arch == "arm" and arm_version >= 7 and (arm_neon == 1 or arm_neon_optional == 1)', {
+       ['(target_arch == "armv7" or target_arch == "armv7s" or \
+       (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
+       and (arm_neon == 1 or arm_neon_optional == 1)',
+       {
         'build_neon': 1,
       }],
    ],
  },
-  'conditions': [
-    [ 'build_neon != 0', {
-      'targets': [
-        # The NEON-specific components.
-        {
-          'target_name': 'libyuv_neon',
-          'type': 'static_library',
-          'standalone_static_library': 1,
-          'defines': [
-            'LIBYUV_NEON',
-          ],
-          # TODO(noahric): This should remove whatever mfpu is set, not
-          # just vfpv3-d16.
-          'cflags!': [
-            '-mfpu=vfp',
-            '-mfpu=vfpv3',
-            '-mfpu=vfpv3-d16',
-          ],
-          # XXX Doesn't work currently
-          'cflags_mozilla!': [
-            '-mfpu=vfp',
-            '-mfpu=vfpv3',
-            '-mfpu=vfpv3-d16',
-          ],
-          'cflags': [
-            '-mfpu=neon',
-          ],
-          'cflags_mozilla': [
-            '-mfpu=neon',
-          ],
-          'include_dirs': [
-            'include',
-            '.',
-          ],
-          'direct_dependent_settings': {
-            'include_dirs': [
-              'include',
-              '.',
-            ],
-          },
-          'sources': [
-            # sources.
-            'source/compare_neon.cc',
-            'source/rotate_neon.cc',
-            'source/row_neon.cc',
-            'source/scale_neon.cc',
-          ],
-        },
-      ],
-    }],
-  ],
+
  'targets': [
    {
      'target_name': 'libyuv',
      # Change type to 'shared_library' to build .so or .dll files.
      'type': 'static_library',
+      'variables': {
+        'optimize': 'max',  # enable O2 and ltcg.
+      },
      # Allows libyuv.a redistributable library without external dependencies.
-      # 'standalone_static_library': 1,
+      'standalone_static_library': 1,
      'conditions': [
-        # TODO(fbarchard): Use gyp define to enable jpeg.
-        [ 'build_with_mozilla==1', {
+       # Disable -Wunused-parameter
+        ['clang == 1', {
+          'cflags': [
+            '-Wno-unused-parameter',
+         ],
+        }],
+        ['build_neon != 0', {
          'defines': [
-            'HAVE_JPEG'
+            'LIBYUV_NEON',
          ],
-          'cflags_mozilla': [
-            '$(MOZ_JPEG_CFLAGS)',
+          'cflags!': [
+            '-mfpu=vfp',
+            '-mfpu=vfpv3',
+            '-mfpu=vfpv3-d16',
+            # '-mthumb',  # arm32 not thumb
+          ],
+          'conditions': [
+            # Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
+            ['clang == 0 and use_lto == 1', {
+              'cflags!': [
+                '-flto',
+                '-ffat-lto-objects',
+              ],
+            }],
+            # arm64 does not need -mfpu=neon option as neon is not optional
+            ['target_arch != "arm64"', {
+              'cflags': [
+                '-mfpu=neon',
+                # '-marm',  # arm32 not thumb
+              ],
+            }],
          ],
        }],
-        [ 'OS != "ios" and build_with_mozilla!=1', {
+        ['OS != "ios" and libyuv_disable_jpeg != 1', {
          'defines': [
            'HAVE_JPEG'
          ],
          'conditions': [
-            # Android uses libjpeg for system jpeg support.
-            [ 'OS == "android" and use_system_libjpeg == 1', {
+            # Caveat system jpeg support may not support motion jpeg
+            [ 'use_system_libjpeg == 1', {
              'dependencies': [
                 '<(DEPTH)/third_party/libjpeg/libjpeg.gyp:libjpeg',
              ],
@ -113,37 +104,15 @@
            }],
          ],
        }],
-        [ 'build_neon != 0', {
-          'dependencies': [
-            'libyuv_neon',
-          ],
-          'defines': [
-            'LIBYUV_NEON',
-          ]
-        }],
-        [ 'yuv_disable_asm!=0', {
-          'defines': [
-            # Enable the following 3 macros to turn off assembly for specified CPU.
-            'LIBYUV_DISABLE_X86',
-            'LIBYUV_DISABLE_NEON',
-            'LIBYUV_DISABLE_MIPS',
-          ],
-        }],
-        [ 'yuv_disable_avx2==1', {
-          'defines': [
-            'LIBYUV_DISABLE_AVX2',
-          ]
-        }],
-      ],
+      ], #conditions
      'defines': [
        # Enable the following 3 macros to turn off assembly for specified CPU.
        # 'LIBYUV_DISABLE_X86',
        # 'LIBYUV_DISABLE_NEON',
        # 'LIBYUV_DISABLE_MIPS',
-        # This disables AVX2 (Haswell) support, overriding compiler checks
-        # 'LIBYUV_DISABLE_AVX2',
        # Enable the following macro to build libyuv as a shared library (dll).
        # 'LIBYUV_USING_SHARED_LIBRARY',
+        # TODO(fbarchard): Make these into gyp defines.
      ],
      'include_dirs': [
        'include',
@ -154,6 +123,18 @@
          'include',
          '.',
        ],
+        'conditions': [
+          ['OS == "android" and target_arch == "arm64"', {
+            'ldflags': [
+              '-Wl,--dynamic-linker,/system/bin/linker64',
+            ],
+          }],
+          ['OS == "android" and target_arch != "arm64"', {
+            'ldflags': [
+              '-Wl,--dynamic-linker,/system/bin/linker',
+            ],
+          }],
+        ], #conditions
      },
      'sources': [
        '<@(libyuv_sources)',
--- a/media/libyuv/libyuv.gypi
+++ b/media/libyuv/libyuv.gypi
@ -18,11 +18,11 @@
      'include/libyuv/convert_from.h',
      'include/libyuv/convert_from_argb.h',
      'include/libyuv/cpu_id.h',
-      'include/libyuv/format_conversion.h',
      'include/libyuv/mjpeg_decoder.h',
      'include/libyuv/planar_functions.h',
      'include/libyuv/rotate.h',
      'include/libyuv/rotate_argb.h',
+      'include/libyuv/rotate_row.h',
      'include/libyuv/row.h',
      'include/libyuv/scale.h',
      'include/libyuv/scale_argb.h',
@ -33,7 +33,9 @@
      # sources.
      'source/compare.cc',
      'source/compare_common.cc',
-      'source/compare_posix.cc',
+      'source/compare_gcc.cc',
+      'source/compare_neon.cc',
+      'source/compare_neon64.cc',
      'source/compare_win.cc',
      'source/convert.cc',
      'source/convert_argb.cc',
@ -43,23 +45,33 @@
      'source/convert_to_argb.cc',
      'source/convert_to_i420.cc',
      'source/cpu_id.cc',
-      'source/format_conversion.cc',
      'source/mjpeg_decoder.cc',
      'source/mjpeg_validate.cc',
      'source/planar_functions.cc',
      'source/rotate.cc',
+      'source/rotate_any.cc',
      'source/rotate_argb.cc',
+      'source/rotate_common.cc',
+      'source/rotate_gcc.cc',
      'source/rotate_mips.cc',
+      'source/rotate_neon.cc',
+      'source/rotate_neon64.cc',
+      'source/rotate_win.cc',
      'source/row_any.cc',
      'source/row_common.cc',
+      'source/row_gcc.cc',
      'source/row_mips.cc',
-      'source/row_posix.cc',
+      'source/row_neon.cc',
+      'source/row_neon64.cc',
      'source/row_win.cc',
      'source/scale.cc',
+      'source/scale_any.cc',
      'source/scale_argb.cc',
      'source/scale_common.cc',
+      'source/scale_gcc.cc',
      'source/scale_mips.cc',
-      'source/scale_posix.cc',
+      'source/scale_neon.cc',
+      'source/scale_neon64.cc',
      'source/scale_win.cc',
      'source/video_common.cc',
    ],
--- a/media/libyuv/libyuv_nacl.gyp
+++ b/media/libyuv/libyuv_nacl.gyp
@ -21,9 +21,6 @@
        'build_newlib': 0,
        'build_pnacl_newlib': 1,
      },
-      'dependencies': [
-        '../../native_client/tools.gyp:prep_toolchain',
-      ],
      'include_dirs': [
        'include',
      ],
--- a/media/libyuv/libyuv_test.gyp
+++ b/media/libyuv/libyuv_test.gyp
@ -7,24 +7,25 @@
 # be found in the AUTHORS file in the root of the source tree.

 {
+  'variables': {
+    'libyuv_disable_jpeg%': 0,
+  },
  'targets': [
    {
      'target_name': 'libyuv_unittest',
-      'type': 'executable',
+      'type': '<(gtest_target_type)',
      'dependencies': [
        'libyuv.gyp:libyuv',
-        # The tests are based on gtest
        'testing/gtest.gyp:gtest',
-        'testing/gtest.gyp:gtest_main',
+        'third_party/gflags/gflags.gyp:gflags',
      ],
-      'defines': [
-        'LIBYUV_SVNREVISION="<!(svnversion -n)"',
-        # Enable the following 3 macros to turn off assembly for specified CPU.
-        # 'LIBYUV_DISABLE_X86',
-        # 'LIBYUV_DISABLE_NEON',
-        # 'LIBYUV_DISABLE_MIPS',
-        # Enable the following macro to build libyuv as a shared library (dll).
-        # 'LIBYUV_USING_SHARED_LIBRARY',
+      'direct_dependent_settings': {
+        'defines': [
+          'GTEST_RELATIVE_PATH',
+        ],
+      },
+      'export_dependent_settings': [
+        '<(DEPTH)/testing/gtest.gyp:gtest',
      ],
      'sources': [
        # headers
@ -33,6 +34,7 @@
        # sources
        'unit_test/basictypes_test.cc',
        'unit_test/compare_test.cc',
+        'unit_test/color_test.cc',
        'unit_test/convert_test.cc',
        'unit_test/cpu_test.cc',
        'unit_test/math_test.cc',
@ -43,7 +45,6 @@
        'unit_test/scale_test.cc',
        'unit_test/unit_test.cc',
        'unit_test/video_common_test.cc',
-        'unit_test/version_test.cc',
      ],
      'conditions': [
        ['OS=="linux"', {
@ -51,14 +52,55 @@
            '-fexceptions',
          ],
        }],
-        [ 'OS != "ios"', {
+        [ 'OS == "ios" and target_subarch == 64', {
+          'defines': [
+            'LIBYUV_DISABLE_NEON'
+          ],
+        }],
+        [ 'OS == "ios"', {
+          'xcode_settings': {
+            'DEBUGGING_SYMBOLS': 'YES',
+            'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym',
+            # Work around compile issue with isosim.mm, see
+            # https://code.google.com/p/libyuv/issues/detail?id=548 for details.
+            'WARNING_CFLAGS': [
+              '-Wno-sometimes-uninitialized',
+            ],
+          },
+          'cflags': [
+            '-Wno-sometimes-uninitialized',
+          ],
+        }],
+        [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
          'defines': [
            'HAVE_JPEG',
          ],
        }],
+        ['OS=="android"', {
+          'dependencies': [
+            '<(DEPTH)/testing/android/native_test.gyp:native_test_native_code',
+          ],
+        }],
+        # TODO(YangZhang): These lines can be removed when high accuracy
+        # YUV to RGB to Neon is ported.
+        [ '(target_arch == "armv7" or target_arch == "armv7s" \
+          or (target_arch == "arm" and arm_version >= 7) \
+          or target_arch == "arm64") \
+          and (arm_neon == 1 or arm_neon_optional == 1)', {
+          'defines': [
+            'LIBYUV_NEON'
+          ],
+        }],
      ], # conditions
+      'defines': [
+        # Enable the following 3 macros to turn off assembly for specified CPU.
+        # 'LIBYUV_DISABLE_X86',
+        # 'LIBYUV_DISABLE_NEON',
+        # 'LIBYUV_DISABLE_MIPS',
+        # Enable the following macro to build libyuv as a shared library (dll).
+        # 'LIBYUV_USING_SHARED_LIBRARY',
+      ],
    },
-
    {
      'target_name': 'compare',
      'type': 'executable',
@ -105,7 +147,24 @@
        'util/psnr.cc',
        'util/ssim.cc',
      ],
+      'dependencies': [
+        'libyuv.gyp:libyuv',
+      ],
+      'conditions': [
+        [ 'OS == "ios" and target_subarch == 64', {
+          'defines': [
+            'LIBYUV_DISABLE_NEON'
+          ],
+        }],
+
+        [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
+          'defines': [
+            'HAVE_JPEG',
+          ],
+        }],
+      ], # conditions
    },
+
    {
      'target_name': 'cpuid',
      'type': 'executable',
@ -118,6 +177,50 @@
      ],
    },
  ], # targets
+  'conditions': [
+    ['OS=="android"', {
+      'targets': [
+        {
+          # TODO(kjellander): Figure out what to change in build/apk_test.gypi
+          # to it can be used instead of the copied code below. Using it in its
+          # current version was not possible, since the target starts with 'lib',
+          # which somewhere confuses the variables.
+          'target_name': 'libyuv_unittest_apk',
+          'type': 'none',
+          'variables': {
+            # These are used to configure java_apk.gypi included below.
+            'test_type': 'gtest',
+            'apk_name': 'libyuv_unittest',
+            'test_suite_name': 'libyuv_unittest',
+            'intermediate_dir': '<(PRODUCT_DIR)/libyuv_unittest_apk',
+            'input_shlib_path': '<(SHARED_LIB_DIR)/<(SHARED_LIB_PREFIX)libyuv_unittest<(SHARED_LIB_SUFFIX)',
+            'final_apk_path': '<(intermediate_dir)/libyuv_unittest-debug.apk',
+            'java_in_dir': '<(DEPTH)/testing/android/native_test/java',
+            'test_runner_path': '<(DEPTH)/util/android/test_runner.py',
+            'native_lib_target': 'libyuv_unittest',
+            'gyp_managed_install': 0,
+          },
+          'includes': [
+            'build/android/test_runner.gypi',
+            'build/java_apk.gypi',
+           ],
+          'dependencies': [
+            '<(DEPTH)/base/base.gyp:base_java',
+            # TODO(kjellander): Figure out why base_build_config_gen is needed
+            # here. It really shouldn't since it's a dependency of base_java
+            # above, but there's always 0 tests run if it's missing.
+            '<(DEPTH)/base/base.gyp:base_build_config_gen',
+            '<(DEPTH)/build/android/pylib/device/commands/commands.gyp:chromium_commands',
+            '<(DEPTH)/build/android/pylib/remote/device/dummy/dummy.gyp:remote_device_dummy_apk',
+            '<(DEPTH)/testing/android/appurify_support.gyp:appurify_support_java',
+            '<(DEPTH)/testing/android/on_device_instrumentation.gyp:reporter_java',
+            '<(DEPTH)/tools/android/android_tools.gyp:android_tools',
+            'libyuv_unittest',
+          ],
+        },
+      ],
+    }],
+  ],
 }

 # Local Variables:
--- a/media/libyuv/linux.mk
+++ b/media/libyuv/linux.mk
@ -1,48 +1,81 @@
 # This is a generic makefile for libyuv for gcc.
-# make -f linux.mk CC=clang++
+# make -f linux.mk CXX=clang++

-CC=g++
-CCFLAGS=-O2 -fomit-frame-pointer -Iinclude/
+CC?=gcc
+CFLAGS?=-O2 -fomit-frame-pointer
+CFLAGS+=-Iinclude/
+
+CXX?=g++
+CXXFLAGS?=-O2 -fomit-frame-pointer
+CXXFLAGS+=-Iinclude/

 LOCAL_OBJ_FILES := \
-    source/compare.o           \
-    source/compare_common.o    \
-    source/compare_posix.o     \
-    source/convert.o           \
-    source/convert_argb.o      \
-    source/convert_from.o      \
-    source/convert_from_argb.o \
-    source/convert_to_argb.o   \
-    source/convert_to_i420.o   \
-    source/cpu_id.o            \
-    source/format_conversion.o \
-    source/planar_functions.o  \
-    source/rotate.o            \
-    source/rotate_argb.o       \
-    source/rotate_mips.o       \
-    source/row_any.o           \
-    source/row_common.o        \
-    source/row_mips.o          \
-    source/row_posix.o         \
-    source/scale.o             \
-    source/scale_argb.o        \
-    source/scale_common.o      \
-    source/scale_mips.o        \
-    source/scale_posix.o       \
-    source/video_common.o
+	source/compare.o           \
+	source/compare_common.o    \
+	source/compare_gcc.o       \
+	source/compare_neon64.o    \
+	source/compare_neon.o      \
+	source/compare_win.o       \
+	source/convert_argb.o      \
+	source/convert.o           \
+	source/convert_from_argb.o \
+	source/convert_from.o      \
+	source/convert_jpeg.o      \
+	source/convert_to_argb.o   \
+	source/convert_to_i420.o   \
+	source/cpu_id.o            \
+	source/mjpeg_decoder.o     \
+	source/mjpeg_validate.o    \
+	source/planar_functions.o  \
+	source/rotate_any.o        \
+	source/rotate_argb.o       \
+	source/rotate.o            \
+	source/rotate_common.o     \
+	source/rotate_gcc.o        \
+	source/rotate_mips.o       \
+	source/rotate_neon64.o     \
+	source/rotate_neon.o       \
+	source/rotate_win.o        \
+	source/row_any.o           \
+	source/row_common.o        \
+	source/row_gcc.o           \
+	source/row_mips.o          \
+	source/row_neon64.o        \
+	source/row_neon.o          \
+	source/row_win.o           \
+	source/scale_any.o         \
+	source/scale_argb.o        \
+	source/scale.o             \
+	source/scale_common.o      \
+	source/scale_gcc.o         \
+	source/scale_mips.o        \
+	source/scale_neon64.o      \
+	source/scale_neon.o        \
+	source/scale_win.o         \
+	source/video_common.o

 .cc.o:
-	$(CC) -c $(CCFLAGS) $*.cc -o $*.o
+	$(CXX) -c $(CXXFLAGS) $*.cc -o $*.o

-all: libyuv.a convert linux.mk
+.c.o:
+	$(CC) -c $(CFLAGS) $*.c -o $*.o

-libyuv.a: $(LOCAL_OBJ_FILES) linux.mk
-	$(AR) $(ARFLAGS) -o $@ $(LOCAL_OBJ_FILES)
+all: libyuv.a convert cpuid psnr

-# A test utility that uses libyuv conversion.
-convert: util/convert.cc linux.mk
-	$(CC) $(CCFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a
+libyuv.a: $(LOCAL_OBJ_FILES)
+	$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
+
+# A C++ test utility that uses libyuv conversion.
+convert: util/convert.cc libyuv.a
+	$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a
+
+# A standalone test utility
+psnr: util/psnr.cc
+	$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc
+
+# A C test utility that uses libyuv conversion from C.
+cpuid: util/cpuid.c libyuv.a
+	$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a

 clean:
-	/bin/rm -f source/*.o *.ii *.s libyuv.a convert
-
+	/bin/rm -f source/*.o *.ii *.s libyuv.a convert cpuid psnr
--- a/media/libyuv/setup_links.py
+++ b/media/libyuv/setup_links.py
@ -0,0 +1,497 @@
+#!/usr/bin/env python
+# Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Setup links to a Chromium checkout for WebRTC.
+
+WebRTC standalone shares a lot of dependencies and build tools with Chromium.
+To do this, many of the paths of a Chromium checkout is emulated by creating
+symlinks to files and directories. This script handles the setup of symlinks to
+achieve this.
+
+It also handles cleanup of the legacy Subversion-based approach that was used
+before Chrome switched over their master repo from Subversion to Git.
+"""
+
+
+import ctypes
+import errno
+import logging
+import optparse
+import os
+import shelve
+import shutil
+import subprocess
+import sys
+import textwrap
+
+
+DIRECTORIES = [
+  'build',
+  'buildtools',
+  'mojo',  # TODO(kjellander): Remove, see webrtc:5629.
+  'native_client',
+  'net',
+  'testing',
+  'third_party/binutils',
+  'third_party/drmemory',
+  'third_party/instrumented_libraries',
+  'third_party/libjpeg',
+  'third_party/libjpeg_turbo',
+  'third_party/llvm-build',
+  'third_party/lss',
+  'third_party/yasm',
+  'third_party/WebKit',  # TODO(kjellander): Remove, see webrtc:5629.
+  'tools/clang',
+  'tools/gn',
+  'tools/gyp',
+  'tools/memory',
+  'tools/python',
+  'tools/swarming_client',
+  'tools/valgrind',
+  'tools/vim',
+  'tools/win',
+]
+
+from sync_chromium import get_target_os_list
+target_os = get_target_os_list()
+if 'android' in target_os:
+  DIRECTORIES += [
+    'base',
+    'third_party/android_platform',
+    'third_party/android_tools',
+    'third_party/appurify-python',
+    'third_party/ashmem',
+    'third_party/catapult',
+    'third_party/icu',
+    'third_party/ijar',
+    'third_party/jsr-305',
+    'third_party/junit',
+    'third_party/libxml',
+    'third_party/mockito',
+    'third_party/modp_b64',
+    'third_party/protobuf',
+    'third_party/requests',
+    'third_party/robolectric',
+    'tools/android',
+    'tools/grit',
+  ]
+if 'ios' in target_os:
+  DIRECTORIES.append('third_party/class-dump')
+
+FILES = {
+  'tools/isolate_driver.py': None,
+  'third_party/BUILD.gn': None,
+}
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHROMIUM_CHECKOUT = os.path.join('chromium', 'src')
+LINKS_DB = 'links'
+
+# Version management to make future upgrades/downgrades easier to support.
+SCHEMA_VERSION = 1
+
+
+def query_yes_no(question, default=False):
+  """Ask a yes/no question via raw_input() and return their answer.
+
+  Modified from http://stackoverflow.com/a/3041990.
+  """
+  prompt = " [%s/%%s]: "
+  prompt = prompt % ('Y' if default is True  else 'y')
+  prompt = prompt % ('N' if default is False else 'n')
+
+  if default is None:
+    default = 'INVALID'
+
+  while True:
+    sys.stdout.write(question + prompt)
+    choice = raw_input().lower()
+    if choice == '' and default != 'INVALID':
+      return default
+
+    if 'yes'.startswith(choice):
+      return True
+    elif 'no'.startswith(choice):
+      return False
+
+    print "Please respond with 'yes' or 'no' (or 'y' or 'n')."
+
+
+# Actions
+class Action(object):
+  def __init__(self, dangerous):
+    self.dangerous = dangerous
+
+  def announce(self, planning):
+    """Log a description of this action.
+
+    Args:
+      planning - True iff we're in the planning stage, False if we're in the
+                 doit stage.
+    """
+    pass
+
+  def doit(self, links_db):
+    """Execute the action, recording what we did to links_db, if necessary."""
+    pass
+
+
+class Remove(Action):
+  def __init__(self, path, dangerous):
+    super(Remove, self).__init__(dangerous)
+    self._priority = 0
+    self._path = path
+
+  def announce(self, planning):
+    log = logging.warn
+    filesystem_type = 'file'
+    if not self.dangerous:
+      log = logging.info
+      filesystem_type = 'link'
+    if planning:
+      log('Planning to remove %s: %s', filesystem_type, self._path)
+    else:
+      log('Removing %s: %s', filesystem_type, self._path)
+
+  def doit(self, _):
+    os.remove(self._path)
+
+
+class Rmtree(Action):
+  def __init__(self, path):
+    super(Rmtree, self).__init__(dangerous=True)
+    self._priority = 0
+    self._path = path
+
+  def announce(self, planning):
+    if planning:
+      logging.warn('Planning to remove directory: %s', self._path)
+    else:
+      logging.warn('Removing directory: %s', self._path)
+
+  def doit(self, _):
+    if sys.platform.startswith('win'):
+      # shutil.rmtree() doesn't work on Windows if any of the directories are
+      # read-only, which svn repositories are.
+      subprocess.check_call(['rd', '/q', '/s', self._path], shell=True)
+    else:
+      shutil.rmtree(self._path)
+
+
+class Makedirs(Action):
+  def __init__(self, path):
+    super(Makedirs, self).__init__(dangerous=False)
+    self._priority = 1
+    self._path = path
+
+  def doit(self, _):
+    try:
+      os.makedirs(self._path)
+    except OSError as e:
+      if e.errno != errno.EEXIST:
+        raise
+
+
+class Symlink(Action):
+  def __init__(self, source_path, link_path):
+    super(Symlink, self).__init__(dangerous=False)
+    self._priority = 2
+    self._source_path = source_path
+    self._link_path = link_path
+
+  def announce(self, planning):
+    if planning:
+      logging.info(
+          'Planning to create link from %s to %s', self._link_path,
+          self._source_path)
+    else:
+      logging.debug(
+          'Linking from %s to %s', self._link_path, self._source_path)
+
+  def doit(self, links_db):
+    # Files not in the root directory need relative path calculation.
+    # On Windows, use absolute paths instead since NTFS doesn't seem to support
+    # relative paths for symlinks.
+    if sys.platform.startswith('win'):
+      source_path = os.path.abspath(self._source_path)
+    else:
+      if os.path.dirname(self._link_path) != self._link_path:
+        source_path = os.path.relpath(self._source_path,
+                                      os.path.dirname(self._link_path))
+
+    os.symlink(source_path, os.path.abspath(self._link_path))
+    links_db[self._source_path] = self._link_path
+
+
+class LinkError(IOError):
+  """Failed to create a link."""
+  pass
+
+
+# Handles symlink creation on the different platforms.
+if sys.platform.startswith('win'):
+  def symlink(source_path, link_path):
+    flag = 1 if os.path.isdir(source_path) else 0
+    if not ctypes.windll.kernel32.CreateSymbolicLinkW(
+        unicode(link_path), unicode(source_path), flag):
+      raise OSError('Failed to create symlink to %s. Notice that only NTFS '
+                    'version 5.0 and up has all the needed APIs for '
+                    'creating symlinks.' % source_path)
+  os.symlink = symlink
+
+
+class WebRTCLinkSetup(object):
+  def __init__(self, links_db, force=False, dry_run=False, prompt=False):
+    self._force = force
+    self._dry_run = dry_run
+    self._prompt = prompt
+    self._links_db = links_db
+
+  def CreateLinks(self, on_bot):
+    logging.debug('CreateLinks')
+    # First, make a plan of action
+    actions = []
+
+    for source_path, link_path in FILES.iteritems():
+      actions += self._ActionForPath(
+          source_path, link_path, check_fn=os.path.isfile, check_msg='files')
+    for source_dir in DIRECTORIES:
+      actions += self._ActionForPath(
+          source_dir, None, check_fn=os.path.isdir,
+          check_msg='directories')
+
+    if not on_bot and self._force:
+      # When making the manual switch from legacy SVN checkouts to the new
+      # Git-based Chromium DEPS, the .gclient_entries file that contains cached
+      # URLs for all DEPS entries must be removed to avoid future sync problems.
+      entries_file = os.path.join(os.path.dirname(ROOT_DIR), '.gclient_entries')
+      if os.path.exists(entries_file):
+        actions.append(Remove(entries_file, dangerous=True))
+
+    actions.sort()
+
+    if self._dry_run:
+      for action in actions:
+        action.announce(planning=True)
+      logging.info('Not doing anything because dry-run was specified.')
+      sys.exit(0)
+
+    if any(a.dangerous for a in actions):
+      logging.warn('Dangerous actions:')
+      for action in (a for a in actions if a.dangerous):
+        action.announce(planning=True)
+      print
+
+      if not self._force:
+        logging.error(textwrap.dedent("""\
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+                              A C T I O N     R E Q I R E D
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        Because chromium/src is transitioning to Git (from SVN), we needed to
+        change the way that the WebRTC standalone checkout works. Instead of
+        individually syncing subdirectories of Chromium in SVN, we're now
+        syncing Chromium (and all of its DEPS, as defined by its own DEPS file),
+        into the `chromium/src` directory.
+
+        As such, all Chromium directories which are currently pulled by DEPS are
+        now replaced with a symlink into the full Chromium checkout.
+
+        To avoid disrupting developers, we've chosen to not delete your
+        directories forcibly, in case you have some work in progress in one of
+        them :).
+
+        ACTION REQUIRED:
+        Before running `gclient sync|runhooks` again, you must run:
+        %s%s --force
+
+        Which will replace all directories which now must be symlinks, after
+        prompting with a summary of the work-to-be-done.
+        """), 'python ' if sys.platform.startswith('win') else '', sys.argv[0])
+        sys.exit(1)
+      elif self._prompt:
+        if not query_yes_no('Would you like to perform the above plan?'):
+          sys.exit(1)
+
+    for action in actions:
+      action.announce(planning=False)
+      action.doit(self._links_db)
+
+    if not on_bot and self._force:
+      logging.info('Completed!\n\nNow run `gclient sync|runhooks` again to '
+                   'let the remaining hooks (that probably were interrupted) '
+                   'execute.')
+
+  def CleanupLinks(self):
+    logging.debug('CleanupLinks')
+    for source, link_path  in self._links_db.iteritems():
+      if source == 'SCHEMA_VERSION':
+        continue
+      if os.path.islink(link_path) or sys.platform.startswith('win'):
+        # os.path.islink() always returns false on Windows
+        # See http://bugs.python.org/issue13143.
+        logging.debug('Removing link to %s at %s', source, link_path)
+        if not self._dry_run:
+          if os.path.exists(link_path):
+            if sys.platform.startswith('win') and os.path.isdir(link_path):
+              subprocess.check_call(['rmdir', '/q', '/s', link_path],
+                                    shell=True)
+            else:
+              os.remove(link_path)
+          del self._links_db[source]
+
+  @staticmethod
+  def _ActionForPath(source_path, link_path=None, check_fn=None,
+                     check_msg=None):
+    """Create zero or more Actions to link to a file or directory.
+
+    This will be a symlink on POSIX platforms. On Windows this requires
+    that NTFS is version 5.0 or higher (Vista or newer).
+
+    Args:
+      source_path: Path relative to the Chromium checkout root.
+        For readability, the path may contain slashes, which will
+        automatically be converted to the right path delimiter on Windows.
+      link_path: The location for the link to create. If omitted it will be the
+        same path as source_path.
+      check_fn: A function returning true if the type of filesystem object is
+        correct for the attempted call. Otherwise an error message with
+        check_msg will be printed.
+      check_msg: String used to inform the user of an invalid attempt to create
+        a file.
+    Returns:
+      A list of Action objects.
+    """
+    def fix_separators(path):
+      if sys.platform.startswith('win'):
+        return path.replace(os.altsep, os.sep)
+      else:
+        return path
+
+    assert check_fn
+    assert check_msg
+    link_path = link_path or source_path
+    link_path = fix_separators(link_path)
+
+    source_path = fix_separators(source_path)
+    source_path = os.path.join(CHROMIUM_CHECKOUT, source_path)
+    if os.path.exists(source_path) and not check_fn:
+      raise LinkError('_LinkChromiumPath can only be used to link to %s: '
+                      'Tried to link to: %s' % (check_msg, source_path))
+
+    if not os.path.exists(source_path):
+      logging.debug('Silently ignoring missing source: %s. This is to avoid '
+                    'errors on platform-specific dependencies.', source_path)
+      return []
+
+    actions = []
+
+    if os.path.exists(link_path) or os.path.islink(link_path):
+      if os.path.islink(link_path):
+        actions.append(Remove(link_path, dangerous=False))
+      elif os.path.isfile(link_path):
+        actions.append(Remove(link_path, dangerous=True))
+      elif os.path.isdir(link_path):
+        actions.append(Rmtree(link_path))
+      else:
+        raise LinkError('Don\'t know how to plan: %s' % link_path)
+
+    # Create parent directories to the target link if needed.
+    target_parent_dirs = os.path.dirname(link_path)
+    if (target_parent_dirs and
+        target_parent_dirs != link_path and
+        not os.path.exists(target_parent_dirs)):
+      actions.append(Makedirs(target_parent_dirs))
+
+    actions.append(Symlink(source_path, link_path))
+
+    return actions
+
+def _initialize_database(filename):
+  links_database = shelve.open(filename)
+
+  # Wipe the database if this version of the script ends up looking at a
+  # newer (future) version of the links db, just to be sure.
+  version = links_database.get('SCHEMA_VERSION')
+  if version and version != SCHEMA_VERSION:
+    logging.info('Found database with schema version %s while this script only '
+                 'supports %s. Wiping previous database contents.', version,
+                 SCHEMA_VERSION)
+    links_database.clear()
+  links_database['SCHEMA_VERSION'] = SCHEMA_VERSION
+  return links_database
+
+
+def main():
+  on_bot = os.environ.get('CHROME_HEADLESS') == '1'
+
+  parser = optparse.OptionParser()
+  parser.add_option('-d', '--dry-run', action='store_true', default=False,
+                    help='Print what would be done, but don\'t perform any '
+                         'operations. This will automatically set logging to '
+                         'verbose.')
+  parser.add_option('-c', '--clean-only', action='store_true', default=False,
+                    help='Only clean previously created links, don\'t create '
+                         'new ones. This will automatically set logging to '
+                         'verbose.')
+  parser.add_option('-f', '--force', action='store_true', default=on_bot,
+                    help='Force link creation. CAUTION: This deletes existing '
+                         'folders and files in the locations where links are '
+                         'about to be created.')
+  parser.add_option('-n', '--no-prompt', action='store_false', dest='prompt',
+                    default=(not on_bot),
+                    help='Prompt if we\'re planning to do a dangerous action')
+  parser.add_option('-v', '--verbose', action='store_const',
+                    const=logging.DEBUG, default=logging.INFO,
+                    help='Print verbose output for debugging.')
+  options, _ = parser.parse_args()
+
+  if options.dry_run or options.force or options.clean_only:
+    options.verbose = logging.DEBUG
+  logging.basicConfig(format='%(message)s', level=options.verbose)
+
+  # Work from the root directory of the checkout.
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  os.chdir(script_dir)
+
+  if sys.platform.startswith('win'):
+    def is_admin():
+      try:
+        return os.getuid() == 0
+      except AttributeError:
+        return ctypes.windll.shell32.IsUserAnAdmin() != 0
+    if not is_admin():
+      logging.error('On Windows, you now need to have administrator '
+                    'privileges for the shell running %s (or '
+                    '`gclient sync|runhooks`).\nPlease start another command '
+                    'prompt as Administrator and try again.', sys.argv[0])
+      return 1
+
+  if not os.path.exists(CHROMIUM_CHECKOUT):
+    logging.error('Cannot find a Chromium checkout at %s. Did you run "gclient '
+                  'sync" before running this script?', CHROMIUM_CHECKOUT)
+    return 2
+
+  links_database = _initialize_database(LINKS_DB)
+  try:
+    symlink_creator = WebRTCLinkSetup(links_database, options.force,
+                                      options.dry_run, options.prompt)
+    symlink_creator.CleanupLinks()
+    if not options.clean_only:
+      symlink_creator.CreateLinks(on_bot)
+  except LinkError as e:
+    print >> sys.stderr, e.message
+    return 3
+  finally:
+    links_database.close()
+  return 0
+
+
+if __name__ == '__main__':
+  sys.exit(main())
--- a/media/libyuv/source/compare.cc
+++ b/media/libyuv/source/compare.cc
@ -17,38 +17,23 @@
 #endif

 #include "libyuv/basic_types.h"
+#include "libyuv/compare_row.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/row.h"
+#include "libyuv/video_common.h"

 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif

-// hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
-
-// This module is for Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
-#define HAS_HASHDJB2_SSE41
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
-
-#if _MSC_VER >= 1700
-#define HAS_HASHDJB2_AVX2
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
-#endif
-
-#endif  // HAS_HASHDJB2_SSE41
-
 // hash seed of 5381 recommended.
 LIBYUV_API
 uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
  const int kBlockSize = 1 << 15;  // 32768;
  int remainder;
-  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
+      HashDjb2_C;
 #if defined(HAS_HASHDJB2_SSE41)
  if (TestCpuFlag(kCpuHasSSE41)) {
    HashDjb2_SSE = HashDjb2_SSE41;
@ -78,22 +63,53 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
  return seed;
 }

-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
-#if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_SUMSQUAREERROR_NEON
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
-#endif
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_SUMSQUAREERROR_SSE2
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
-#endif
-// Visual C 2012 required for AVX2.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
-#define HAS_SUMSQUAREERROR_AVX2
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
-#endif
+static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
+      return FOURCC_BGRA;
+    }
+    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
+      return FOURCC_ARGB;
+    }
+    if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
+      return FOURCC_BGRA;
+    }
+    if (argb[7] != 255) {  // Second pixel 4th byte is not Alpha of 255.
+      return FOURCC_ARGB;
+    }
+    argb += 8;
+  }
+  if (width & 1) {
+    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
+      return FOURCC_BGRA;
+    }
+    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
+      return FOURCC_ARGB;
+    }
+  }
+  return 0;
+}
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
+  uint32 fourcc = 0;
+  int h;
+
+  // Coalesce rows.
+  if (stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    stride_argb = 0;
+  }
+  for (h = 0; h < height && fourcc == 0; ++h) {
+    fourcc = ARGBDetectRow_C(argb, width);
+    argb += stride_argb;
+  }
+  return fourcc;
+}

 // TODO(fbarchard): Refactor into row function.
 LIBYUV_API
@ -114,8 +130,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
  }
 #endif
 #if defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
    // Note only used for multiples of 16 so count is not checked.
    SumSquareError = SumSquareError_SSE2;
  }
--- a/media/libyuv/source/compare_common.cc
+++ b/media/libyuv/source/compare_common.cc
@ -10,6 +10,8 @@

 #include "libyuv/basic_types.h"

+#include "libyuv/compare_row.h"
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
--- a/media/libyuv/source/compare_posix.cc
+++ b/media/libyuv/source/compare_posix.cc
@ -9,6 +9,8 @@
 */

 #include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
 #include "libyuv/row.h"

 #ifdef __cplusplus
@ -16,20 +18,21 @@ namespace libyuv {
 extern "C" {
 #endif

-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  uint32 sse;
-  asm volatile (  // NOLINT
+  asm volatile (
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10, 1) ",%1          \n"
-    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psubusb   %%xmm2,%%xmm1                   \n"
    "psubusb   %%xmm3,%%xmm2                   \n"
@ -41,6 +44,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
    "pmaddwd   %%xmm2,%%xmm2                   \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"

    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
@ -53,20 +57,11 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
    "+r"(src_b),      // %1
    "+r"(count),      // %2
    "=g"(sse)         // %3
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );  // NOLINT
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
  return sse;
 }

-#endif  // defined(__x86_64__) || defined(__i386__)
-
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-#define HAS_HASHDJB2_SSE41
 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
 static uvec32 kHashMul0 = {
  0x0c3525e1,  // 33 ^ 15
@ -95,7 +90,7 @@ static uvec32 kHashMul3 = {

 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  uint32 hash;
-  asm volatile (  // NOLINT
+  asm volatile (
    "movd      %2,%%xmm0                       \n"
    "pxor      %%xmm7,%%xmm7                   \n"
    "movdqa    %4,%%xmm6                       \n"
@ -124,13 +119,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
    "pmulld    %%xmm5,%%xmm1                   \n"
    "paddd     %%xmm4,%%xmm3                   \n"
    "paddd     %%xmm2,%%xmm1                   \n"
-    "sub       $0x10,%1                        \n"
    "paddd     %%xmm3,%%xmm1                   \n"
    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
    "paddd     %%xmm2,%%xmm1                   \n"
    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
    "paddd     %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%1                        \n"
    "jg        1b                              \n"
    "movd      %%xmm0,%3                       \n"
  : "+r"(src),        // %0
@ -143,10 +138,8 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
    "m"(kHashMul2),   // %7
    "m"(kHashMul3)    // %8
  : "memory", "cc"
-#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );  // NOLINT
+  );
  return hash;
 }
 #endif  // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
--- a/media/libyuv/source/compare_neon.cc
+++ b/media/libyuv/source/compare_neon.cc
@ -10,12 +10,16 @@

 #include "libyuv/basic_types.h"

+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif

-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)

 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
  volatile uint32 sse;
@ -25,9 +29,10 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
    "vmov.u8    q9, #0                         \n"
    "vmov.u8    q11, #0                        \n"

-    ".p2align  2                               \n"
  "1:                                          \n"
+    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"
+    MEMACCESS(1)
    "vld1.8     {q1}, [%1]!                    \n"
    "subs       %2, %2, #16                    \n"
    "vsubl.u8   q2, d0, d2                     \n"
@ -53,7 +58,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
  return sse;
 }

-#endif  // __ARM_NEON__
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

 #ifdef __cplusplus
 }  // extern "C"
--- a/media/libyuv/source/compare_neon64.cc
+++ b/media/libyuv/source/compare_neon64.cc
@ -0,0 +1,64 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "eor        v16.16b, v16.16b, v16.16b      \n"
+    "eor        v18.16b, v18.16b, v18.16b      \n"
+    "eor        v17.16b, v17.16b, v17.16b      \n"
+    "eor        v19.16b, v19.16b, v19.16b      \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"
+    "subs       %w2, %w2, #16                  \n"
+    "usubl      v2.8h, v0.8b, v1.8b            \n"
+    "usubl2     v3.8h, v0.16b, v1.16b          \n"
+    "smlal      v16.4s, v2.4h, v2.4h           \n"
+    "smlal      v17.4s, v3.4h, v3.4h           \n"
+    "smlal2     v18.4s, v2.8h, v2.8h           \n"
+    "smlal2     v19.4s, v3.8h, v3.8h           \n"
+    "b.gt       1b                             \n"
+
+    "add        v16.4s, v16.4s, v17.4s         \n"
+    "add        v18.4s, v18.4s, v19.4s         \n"
+    "add        v19.4s, v16.4s, v18.4s         \n"
+    "addv       s0, v19.4s                     \n"
+    "fmov       %w3, s0                        \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/media/libyuv/source/compare_win.cc
+++ b/media/libyuv/source/compare_win.cc
@ -9,6 +9,8 @@
 */

 #include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
 #include "libyuv/row.h"

 #ifdef __cplusplus
@ -16,7 +18,8 @@ namespace libyuv {
 extern "C" {
 #endif

-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

 __declspec(naked)
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
@ -27,13 +30,11 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
    pxor       xmm0, xmm0
    pxor       xmm5, xmm5

-    align      4
  wloop:
-    movdqa     xmm1, [eax]
+    movdqu     xmm1, [eax]
    lea        eax,  [eax + 16]
-    movdqa     xmm2, [edx]
+    movdqu     xmm2, [edx]
    lea        edx,  [edx + 16]
-    sub        ecx, 16
    movdqa     xmm3, xmm1  // abs trick
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
@ -45,6 +46,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
+    sub        ecx, 16
    jg         wloop

    pshufd     xmm1, xmm0, 0xee
@ -70,12 +72,10 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
    sub        edx, eax

-    align      4
  wloop:
    vmovdqu    ymm1, [eax]
    vmovdqu    ymm2, [eax + edx]
    lea        eax,  [eax + 32]
-    sub        ecx, 32
    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
    vpsubusb   ymm2, ymm2, ymm1
    vpor       ymm1, ymm2, ymm3
@ -85,6 +85,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
    vpmaddwd   ymm1, ymm1, ymm1
    vpaddd     ymm0, ymm0, ymm1
    vpaddd     ymm0, ymm0, ymm2
+    sub        ecx, 32
    jg         wloop

    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
@ -100,41 +101,32 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
 }
 #endif  // _MSC_VER >= 1700

-#define HAS_HASHDJB2_SSE41
-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
-static uvec32 kHashMul0 = {
+uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+uvec32 kHashMul0 = {
  0x0c3525e1,  // 33 ^ 15
  0xa3476dc1,  // 33 ^ 14
  0x3b4039a1,  // 33 ^ 13
  0x4f5f0981,  // 33 ^ 12
 };
-static uvec32 kHashMul1 = {
+uvec32 kHashMul1 = {
  0x30f35d61,  // 33 ^ 11
  0x855cb541,  // 33 ^ 10
  0x040a9121,  // 33 ^ 9
  0x747c7101,  // 33 ^ 8
 };
-static uvec32 kHashMul2 = {
+uvec32 kHashMul2 = {
  0xec41d4e1,  // 33 ^ 7
  0x4cfa3cc1,  // 33 ^ 6
  0x025528a1,  // 33 ^ 5
  0x00121881,  // 33 ^ 4
 };
-static uvec32 kHashMul3 = {
+uvec32 kHashMul3 = {
  0x00008c61,  // 33 ^ 3
  0x00000441,  // 33 ^ 2
  0x00000021,  // 33 ^ 1
  0x00000001,  // 33 ^ 0
 };

-// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
-// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
-// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
-// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
-// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
-#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
-    _asm _emit 0x40 _asm _emit reg
-
 __declspec(naked)
 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  __asm {
@ -143,34 +135,32 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
    movd       xmm0, [esp + 12]  // seed

    pxor       xmm7, xmm7        // constant 0 for unpck
-    movdqa     xmm6, kHash16x33
+    movdqa     xmm6, xmmword ptr kHash16x33

-    align      4
  wloop:
    movdqu     xmm1, [eax]       // src[0-15]
    lea        eax, [eax + 16]
-    pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 16
-    movdqa     xmm5, kHashMul0
+    pmulld     xmm0, xmm6        // hash *= 33 ^ 16
+    movdqa     xmm5, xmmword ptr kHashMul0
    movdqa     xmm2, xmm1
    punpcklbw  xmm2, xmm7        // src[0-7]
    movdqa     xmm3, xmm2
    punpcklwd  xmm3, xmm7        // src[0-3]
-    pmulld(0xdd)                 // pmulld     xmm3, xmm5
-    movdqa     xmm5, kHashMul1
+    pmulld     xmm3, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul1
    movdqa     xmm4, xmm2
    punpckhwd  xmm4, xmm7        // src[4-7]
-    pmulld(0xe5)                 // pmulld     xmm4, xmm5
-    movdqa     xmm5, kHashMul2
+    pmulld     xmm4, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul2
    punpckhbw  xmm1, xmm7        // src[8-15]
    movdqa     xmm2, xmm1
    punpcklwd  xmm2, xmm7        // src[8-11]
-    pmulld(0xd5)                 // pmulld     xmm2, xmm5
-    movdqa     xmm5, kHashMul3
+    pmulld     xmm2, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul3
    punpckhwd  xmm1, xmm7        // src[12-15]
-    pmulld(0xcd)                 // pmulld     xmm1, xmm5
+    pmulld     xmm1, xmm5
    paddd      xmm3, xmm4        // add 16 results
    paddd      xmm1, xmm2
-    sub        ecx, 16
    paddd      xmm1, xmm3

    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
@ -178,6 +168,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
    pshufd     xmm2, xmm1, 0x01
    paddd      xmm1, xmm2
    paddd      xmm0, xmm1
+    sub        ecx, 16
    jg         wloop

    movd       eax, xmm0         // return hash
@ -192,39 +183,38 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
    mov        ecx, [esp + 8]    // count
-    movd       xmm0, [esp + 12]  // seed
-    movdqa     xmm6, kHash16x33
+    vmovd      xmm0, [esp + 12]  // seed

-    align      4
  wloop:
-    vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]
-    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
-    vpmovzxbd  xmm4, dword ptr [eax + 4]  // src[4-7]
-    pmulld     xmm3, kHashMul0
-    vpmovzxbd  xmm2, dword ptr [eax + 8]  // src[8-11]
-    pmulld     xmm4, kHashMul1
-    vpmovzxbd  xmm1, dword ptr [eax + 12]  // src[12-15]
-    pmulld     xmm2, kHashMul2
+    vpmovzxbd  xmm3, [eax]  // src[0-3]
+    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
+    vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
+    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
+    vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
+    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
+    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
+    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
    lea        eax, [eax + 16]
-    pmulld     xmm1, kHashMul3
-    paddd      xmm3, xmm4        // add 16 results
-    paddd      xmm1, xmm2
+    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
+    vpaddd     xmm3, xmm3, xmm4        // add 16 results
+    vpaddd     xmm1, xmm1, xmm2
+    vpaddd     xmm1, xmm1, xmm3
+    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
+    vpaddd     xmm1, xmm1,xmm2
+    vpshufd    xmm2, xmm1, 0x01
+    vpaddd     xmm1, xmm1, xmm2
+    vpaddd     xmm0, xmm0, xmm1
    sub        ecx, 16
-    paddd      xmm1, xmm3
-    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
-    paddd      xmm1, xmm2
-    pshufd     xmm2, xmm1, 0x01
-    paddd      xmm1, xmm2
-    paddd      xmm0, xmm1
    jg         wloop

-    movd       eax, xmm0         // return hash
+    vmovd      eax, xmm0         // return hash
+    vzeroupper
    ret
  }
 }
 #endif  // _MSC_VER >= 1700

-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

 #ifdef __cplusplus
 }  // extern "C"
--- a/media/libyuv/source/convert.cc
+++ b/media/libyuv/source/convert.cc
--- a/media/libyuv/source/convert_argb.cc
+++ b/media/libyuv/source/convert_argb.cc
--- a/media/libyuv/source/convert_from.cc
+++ b/media/libyuv/source/convert_from.cc
--- a/media/libyuv/source/convert_from_argb.cc
+++ b/media/libyuv/source/convert_from_argb.cc
--- a/media/libyuv/source/convert_jpeg.cc
+++ b/media/libyuv/source/convert_jpeg.cc
@ -9,6 +9,7 @@
 */

 #include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"

 #ifdef HAVE_JPEG
 #include "libyuv/mjpeg_decoder.h"
@ -218,7 +219,7 @@ int MJPGToI420(const uint8* sample,
      return 1;
    }
  }
-  return ret ? 0 : -1;
+  return ret ? 0 : 1;
 }

 #ifdef HAVE_JPEG
@ -380,7 +381,7 @@ int MJPGToARGB(const uint8* sample,
      return 1;
    }
  }
-  return ret ? 0 : -1;
+  return ret ? 0 : 1;
 }
 #endif

--- a/media/libyuv/source/convert_to_argb.cc
+++ b/media/libyuv/source/convert_to_argb.cc
@ -11,7 +11,6 @@
 #include "libyuv/convert_argb.h"

 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
 #ifdef HAVE_JPEG
 #include "libyuv/mjpeg_decoder.h"
 #endif
@ -24,7 +23,7 @@ namespace libyuv {
 extern "C" {
 #endif

-// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
 // src_width is used for source stride computation
 // src_height is used to compute location of planes, and indicate inversion
 // sample_size is measured in bytes and is the size of the frame.
@ -52,8 +51,8 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
  // also enable temporary buffer.
  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
      crop_argb == sample;
-  uint8* tmp_argb = crop_argb;
-  int tmp_argb_stride = argb_stride;
+  uint8* dest_argb = crop_argb;
+  int dest_argb_stride = argb_stride;
  uint8* rotate_buffer = NULL;
  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;

@ -67,13 +66,13 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
  }

  if (need_buf) {
-    int argb_size = crop_width * abs_crop_height * 4;
+    int argb_size = crop_width * 4 * abs_crop_height;
    rotate_buffer = (uint8*)malloc(argb_size);
    if (!rotate_buffer) {
      return 1;  // Out of memory runtime error.
    }
    crop_argb = rotate_buffer;
-    argb_stride = crop_width;
+    argb_stride = crop_width * 4;
  }

  switch (format) {
@ -144,36 +143,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
                         crop_argb, argb_stride,
                         crop_width, inv_crop_height);
      break;
-    // TODO(fbarchard): Support cropping Bayer by odd numbers
-    // by adjusting fourcc.
-    case FOURCC_BGGR:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerBGGRToARGB(src, src_width,
-                          crop_argb, argb_stride,
-                          crop_width, inv_crop_height);
-      break;
-
-    case FOURCC_GBRG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGBRGToARGB(src, src_width,
-                          crop_argb, argb_stride,
-                          crop_width, inv_crop_height);
-      break;
-
-    case FOURCC_GRBG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGRBGToARGB(src, src_width,
-                          crop_argb, argb_stride,
-                          crop_width, inv_crop_height);
-      break;
-
-    case FOURCC_RGGB:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerRGGBToARGB(src, src_width,
-                          crop_argb, argb_stride,
-                          crop_width, inv_crop_height);
-      break;
-
    case FOURCC_I400:
      src = sample + src_width * crop_y + crop_x;
      r = I400ToARGB(src, src_width,
@ -205,18 +174,8 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
                     crop_argb, argb_stride,
                     crop_width, inv_crop_height);
      break;
-//    case FOURCC_Q420:
-//      src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
-//      src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
-//               src_width + crop_x * 2;
-//      r = Q420ToARGB(src, src_width * 3,
-//                    src_uv, src_width * 3,
-//                    crop_argb, argb_stride,
-//                    crop_width, inv_crop_height);
-//      break;
    // Triplanar formats
    case FOURCC_I420:
-    case FOURCC_YU12:
    case FOURCC_YV12: {
      const uint8* src_y = sample + (src_width * crop_y + crop_x);
      const uint8* src_u;
@ -241,6 +200,25 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
                     crop_width, inv_crop_height);
      break;
    }
+
+    case FOURCC_J420: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      src_u = sample + src_width * abs_src_height +
+          (halfwidth * crop_y + crop_x) / 2;
+      src_v = sample + src_width * abs_src_height +
+          halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = J420ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+
    case FOURCC_I422:
    case FOURCC_YV16: {
      const uint8* src_y = sample + src_width * crop_y + crop_x;
@ -312,7 +290,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
  if (need_buf) {
    if (!r) {
      r = ARGBRotate(crop_argb, argb_stride,
-                     tmp_argb, tmp_argb_stride,
+                     dest_argb, dest_argb_stride,
                     crop_width, abs_crop_height, rotation);
    }
    free(rotate_buffer);
--- a/media/libyuv/source/convert_to_i420.cc
+++ b/media/libyuv/source/convert_to_i420.cc
@ -12,7 +12,6 @@

 #include "libyuv/convert.h"

-#include "libyuv/format_conversion.h"
 #include "libyuv/video_common.h"

 #ifdef __cplusplus
@ -40,12 +39,13 @@ int ConvertToI420(const uint8* sample,
  int aligned_src_width = (src_width + 1) & ~1;
  const uint8* src;
  const uint8* src_uv;
-  int abs_src_height = (src_height < 0) ? -src_height : src_height;
-  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  // TODO(nisse): Why allow crop_height < 0?
+  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
  int r = 0;
  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
      format != FOURCC_NV12 && format != FOURCC_NV21 &&
-      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+      format != FOURCC_YV12) || y == sample;
  uint8* tmp_y = y;
  uint8* tmp_u = u;
  uint8* tmp_v = v;
@ -53,16 +53,14 @@ int ConvertToI420(const uint8* sample,
  int tmp_u_stride = u_stride;
  int tmp_v_stride = v_stride;
  uint8* rotate_buffer = NULL;
-  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  const int inv_crop_height =
+      (src_height < 0) ? -abs_crop_height : abs_crop_height;

  if (!y || !u || !v || !sample ||
      src_width <= 0 || crop_width <= 0  ||
      src_height == 0 || crop_height == 0) {
    return -1;
  }
-  if (src_height < 0) {
-    inv_crop_height = -inv_crop_height;
-  }

  // One pass rotation is available for some formats. For the rest, convert
  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
@ -173,40 +171,6 @@ int ConvertToI420(const uint8* sample,
                     v, v_stride,
                     crop_width, inv_crop_height);
      break;
-    // TODO(fbarchard): Support cropping Bayer by odd numbers
-    // by adjusting fourcc.
-    case FOURCC_BGGR:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerBGGRToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          crop_width, inv_crop_height);
-      break;
-    case FOURCC_GBRG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGBRGToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          crop_width, inv_crop_height);
-      break;
-    case FOURCC_GRBG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGRBGToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          crop_width, inv_crop_height);
-      break;
-    case FOURCC_RGGB:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerRGGBToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          crop_width, inv_crop_height);
-      break;
    case FOURCC_I400:
      src = sample + src_width * crop_y + crop_x;
      r = I400ToI420(src, src_width,
@ -218,7 +182,8 @@ int ConvertToI420(const uint8* sample,
    // Biplanar formats
    case FOURCC_NV12:
      src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      src_uv = sample + (src_width * src_height) +
+        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
      r = NV12ToI420Rotate(src, src_width,
                           src_uv, aligned_src_width,
                           y, y_stride,
@ -228,7 +193,8 @@ int ConvertToI420(const uint8* sample,
      break;
    case FOURCC_NV21:
      src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      src_uv = sample + (src_width * src_height) +
+        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
      // Call NV12 but with u and v parameters swapped.
      r = NV12ToI420Rotate(src, src_width,
                           src_uv, aligned_src_width,
@ -245,20 +211,8 @@ int ConvertToI420(const uint8* sample,
                     v, v_stride,
                     crop_width, inv_crop_height);
      break;
-    case FOURCC_Q420:
-      src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
-      src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
-               src_width + crop_x * 2;
-      r = Q420ToI420(src, src_width * 3,
-                    src_uv, src_width * 3,
-                    y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    crop_width, inv_crop_height);
-      break;
    // Triplanar formats
    case FOURCC_I420:
-    case FOURCC_YU12:
    case FOURCC_YV12: {
      const uint8* src_y = sample + (src_width * crop_y + crop_x);
      const uint8* src_u;
--- a/media/libyuv/source/cpu_id.cc
+++ b/media/libyuv/source/cpu_id.cc
@ -10,12 +10,12 @@

 #include "libyuv/cpu_id.h"

-#if defined(_MSC_VER) && !defined(__clang__)
+#if defined(_MSC_VER)
 #include <intrin.h>  // For __cpuidex()
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
-    !defined(__native_client__) && defined(_M_X64) && \
-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+    !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
 #include <immintrin.h>  // For _xgetbv()
 #endif

@ -36,20 +36,22 @@ extern "C" {

 // For functions that use the stack and have runtime checks for overflow,
 // use SAFEBUFFERS to avoid additional check.
-#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
+    !defined(__clang__)
 #define SAFEBUFFERS __declspec(safebuffers)
 #else
 #define SAFEBUFFERS
 #endif

-// Low level cpuid for X86. Returns zeros on other CPUs.
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
-    (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__))
+// Low level cpuid for X86.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER)
 LIBYUV_API
 void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if defined(_MSC_VER) && !defined(__clang__)
-#if (_MSC_FULL_VER >= 160040219)
+#if defined(_MSC_VER)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
 #elif defined(_M_IX86)
  __asm {
@ -62,16 +64,17 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
    mov        [edi + 8], ecx
    mov        [edi + 12], edx
  }
-#else
+#else  // Visual C but not x86
  if (info_ecx == 0) {
    __cpuid((int*)(cpu_info), info_eax);
  } else {
    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
  }
 #endif
+// GCC version uses inline x86 assembly.
 #else  // defined(_MSC_VER)
  uint32 info_ebx, info_edx;
-  asm volatile (  // NOLINT
+  asm volatile (
 #if defined( __i386__) && defined(__PIC__)
    // Preserve ebx for fpic 32 bit.
    "mov %%ebx, %%edi                          \n"
@ -89,76 +92,78 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
  cpu_info[3] = info_edx;
 #endif  // defined(_MSC_VER)
 }
-
-#if !defined(__native_client__)
-#define HAS_XGETBV
-// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
-int TestOsSaveYmm() {
-  uint32 xcr0 = 0u;
-#if defined(_MSC_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
-  xcr0 = (uint32)(_xgetbv(_XCR_XFEATURE_ENABLED_MASK));
-#elif defined(_MSC_VER) && defined(_M_IX86)
-  __asm {
-    xor        ecx, ecx    // xcr 0
-    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
-    mov        xcr0, eax
-  }
-#elif defined(__i386__) || defined(__x86_64__)
-  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
-#endif  // defined(_MSC_VER)
-  return((xcr0 & 6) == 6);  // Is ymm saved?
-}
-#endif  // !defined(__native_client__)
-#else
+#else  // (defined(_M_IX86) || defined(_M_X64) ...
 LIBYUV_API
 void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
  cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
 }
 #endif

+// For VS2010 and earlier emit can be used:
+//   _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
+//  __asm {
+//    xor        ecx, ecx    // xcr 0
+//    xgetbv
+//    mov        xcr0, eax
+//  }
+// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+// https://code.google.com/p/libyuv/issues/detail?id=529
+#if defined(_M_IX86) && (_MSC_VER < 1900)
+#pragma optimize("g", off)
+#endif
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
+#define HAS_XGETBV
+// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
+int GetXCR0() {
+  uint32 xcr0 = 0u;
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
+#elif defined(__i386__) || defined(__x86_64__)
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+#endif  // defined(__i386__) || defined(__x86_64__)
+  return xcr0;
+}
+#endif  // defined(_M_IX86) || defined(_M_X64) ..
+// Return optimization to previous setting.
+#if defined(_M_IX86) && (_MSC_VER < 1900)
+#pragma optimize("g", on)
+#endif
+
 // based on libvpx arm_cpudetect.c
 // For Arm, but public to allow testing on any CPU
 LIBYUV_API SAFEBUFFERS
 int ArmCpuCaps(const char* cpuinfo_name) {
+  char cpuinfo_line[512];
  FILE* f = fopen(cpuinfo_name, "r");
-  if (f) {
-    char cpuinfo_line[512];
-    while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
-      if (memcmp(cpuinfo_line, "Features", 8) == 0) {
-        char* p = strstr(cpuinfo_line, " neon");
-        if (p && (p[5] == ' ' || p[5] == '\n')) {
-          fclose(f);
-          return kCpuHasNEON;
-        }
-      }
-    }
-    fclose(f);
+  if (!f) {
+    // Assume Neon if /proc/cpuinfo is unavailable.
+    // This will occur for Chrome sandbox for Pepper or Render process.
+    return kCpuHasNEON;
  }
-  return 0;
-}
-
-#if defined(__mips__) && defined(__linux__)
-static int MipsCpuCaps(const char* search_string) {
-  const char* file_name = "/proc/cpuinfo";
-  char cpuinfo_line[256];
-  FILE* f = NULL;
-  if ((f = fopen(file_name, "r")) != NULL) {
-    while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
-      if (strstr(cpuinfo_line, search_string) != NULL) {
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+    if (memcmp(cpuinfo_line, "Features", 8) == 0) {
+      char* p = strstr(cpuinfo_line, " neon");
+      if (p && (p[5] == ' ' || p[5] == '\n')) {
        fclose(f);
-        return kCpuHasMIPS_DSP;
+        return kCpuHasNEON;
+      }
+      // aarch64 uses asimd for Neon.
+      p = strstr(cpuinfo_line, " asimd");
+      if (p && (p[6] == ' ' || p[6] == '\n')) {
+        fclose(f);
+        return kCpuHasNEON;
      }
    }
-    fclose(f);
  }
-  /* Did not find string in the proc file, or not Linux ELF. */
+  fclose(f);
  return 0;
 }
-#endif

 // CPU detect function for SIMD instruction sets.
 LIBYUV_API
-int cpu_info_ = kCpuInit;  // cpu_info is not initialized yet.
+int cpu_info_ = 0;  // cpu_info is not initialized yet.

 // Test environment variable for disabling CPU features. Any non-zero value
 // to disable. Zero ignored to make it easy to set the variable on/off.
@ -181,93 +186,109 @@ static LIBYUV_BOOL TestEnv(const char*) {

 LIBYUV_API SAFEBUFFERS
 int InitCpuFlags(void) {
+  // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
+  int cpu_info = 0;
 #if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
-
+  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  CpuId(0, 0, cpu_info0);
  CpuId(1, 0, cpu_info1);
-  CpuId(7, 0, cpu_info7);
-  cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
-              ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
-              ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
-              ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
-              ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
-              ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
-              kCpuHasX86;
+  if (cpu_info0[0] >= 7) {
+    CpuId(7, 0, cpu_info7);
+  }
+  cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+             ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+             ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+             ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+             ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+             kCpuHasX86;
+
 #ifdef HAS_XGETBV
-  if ((cpu_info1[2] & 0x18000000) == 0x18000000 &&  // AVX and OSSave
-      TestOsSaveYmm()) {  // Saves YMM.
-    cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
-                 kCpuHasAVX;
+  // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
+  if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
+      ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
+    cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
+
+    // Detect AVX512bw
+    if ((GetXCR0() & 0xe0) == 0xe0) {
+      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
+    }
  }
 #endif
+
  // Environment variable overrides for testing.
  if (TestEnv("LIBYUV_DISABLE_X86")) {
-    cpu_info_ &= ~kCpuHasX86;
+    cpu_info &= ~kCpuHasX86;
  }
  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
-    cpu_info_ &= ~kCpuHasSSE2;
+    cpu_info &= ~kCpuHasSSE2;
  }
  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
-    cpu_info_ &= ~kCpuHasSSSE3;
+    cpu_info &= ~kCpuHasSSSE3;
  }
  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
-    cpu_info_ &= ~kCpuHasSSE41;
+    cpu_info &= ~kCpuHasSSE41;
  }
  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
-    cpu_info_ &= ~kCpuHasSSE42;
+    cpu_info &= ~kCpuHasSSE42;
  }
  if (TestEnv("LIBYUV_DISABLE_AVX")) {
-    cpu_info_ &= ~kCpuHasAVX;
+    cpu_info &= ~kCpuHasAVX;
  }
  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
-    cpu_info_ &= ~kCpuHasAVX2;
+    cpu_info &= ~kCpuHasAVX2;
  }
  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
-    cpu_info_ &= ~kCpuHasERMS;
+    cpu_info &= ~kCpuHasERMS;
  }
  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
-    cpu_info_ &= ~kCpuHasFMA3;
+    cpu_info &= ~kCpuHasFMA3;
  }
-#elif defined(__mips__) && defined(__linux__)
-  // Linux mips parse text file for dsp detect.
-  cpu_info_ = MipsCpuCaps("dsp");  // set kCpuHasMIPS_DSP.
+  if (TestEnv("LIBYUV_DISABLE_AVX3")) {
+    cpu_info &= ~kCpuHasAVX3;
+  }
+#endif
+#if defined(__mips__) && defined(__linux__)
 #if defined(__mips_dspr2)
-  cpu_info_ |= kCpuHasMIPS_DSPR2;
+  cpu_info |= kCpuHasDSPR2;
 #endif
-  cpu_info_ |= kCpuHasMIPS;
-
-  if (getenv("LIBYUV_DISABLE_MIPS")) {
-    cpu_info_ &= ~kCpuHasMIPS;
+  cpu_info |= kCpuHasMIPS;
+  if (getenv("LIBYUV_DISABLE_DSPR2")) {
+    cpu_info &= ~kCpuHasDSPR2;
  }
-  if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
-    cpu_info_ &= ~kCpuHasMIPS_DSP;
-  }
-  if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
-    cpu_info_ &= ~kCpuHasMIPS_DSPR2;
-  }
-#elif defined(__arm__)
-#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) && \
-    !defined(__native_client__)
+#endif
+#if defined(__arm__) || defined(__aarch64__)
+// gcc -mfpu=neon defines __ARM_NEON__
+// __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
+// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+  cpu_info = kCpuHasNEON;
+// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+// flag in it.
+// So for aarch64, neon enabling is hard coded here.
+#endif
+#if defined(__aarch64__)
+  cpu_info = kCpuHasNEON;
+#else
  // Linux arm parse text file for neon detect.
-  cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
-#elif defined(__ARM_NEON__) || defined(__native_client__)
-  // gcc -mfpu=neon defines __ARM_NEON__
-  // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
-  // to disable Neon on devices that do not have it.
-  cpu_info_ = kCpuHasNEON;
+  cpu_info = ArmCpuCaps("/proc/cpuinfo");
 #endif
-  cpu_info_ |= kCpuHasARM;
+  cpu_info |= kCpuHasARM;
  if (TestEnv("LIBYUV_DISABLE_NEON")) {
-    cpu_info_ &= ~kCpuHasNEON;
+    cpu_info &= ~kCpuHasNEON;
  }
 #endif  // __arm__
  if (TestEnv("LIBYUV_DISABLE_ASM")) {
-    cpu_info_ = 0;
+    cpu_info = 0;
  }
-  return cpu_info_;
+  cpu_info  |= kCpuInitialized;
+  cpu_info_ = cpu_info;
+  return cpu_info;
 }

+// Note that use of this function is not thread safe.
 LIBYUV_API
 void MaskCpuFlags(int enable_flags) {
  cpu_info_ = InitCpuFlags() & enable_flags;
--- a/media/libyuv/source/format_conversion.cc
+++ b/media/libyuv/source/format_conversion.cc
@ -1,538 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/format_conversion.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/video_common.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// generate a selector mask useful for pshufb
-static uint32 GenerateSelector(int select0, int select1) {
-  return (uint32)(select0) |
-         (uint32)((select1 + 4) << 8) |
-         (uint32)((select0 + 8) << 16) |
-         (uint32)((select1 + 12) << 24);
-}
-
-static int MakeSelectors(const int blue_index,
-                         const int green_index,
-                         const int red_index,
-                         uint32 dst_fourcc_bayer,
-                         uint32* index_map) {
-  // Now build a lookup table containing the indices for the four pixels in each
-  // 2x2 Bayer grid.
-  switch (dst_fourcc_bayer) {
-    case FOURCC_BGGR:
-      index_map[0] = GenerateSelector(blue_index, green_index);
-      index_map[1] = GenerateSelector(green_index, red_index);
-      break;
-    case FOURCC_GBRG:
-      index_map[0] = GenerateSelector(green_index, blue_index);
-      index_map[1] = GenerateSelector(red_index, green_index);
-      break;
-    case FOURCC_RGGB:
-      index_map[0] = GenerateSelector(red_index, green_index);
-      index_map[1] = GenerateSelector(green_index, blue_index);
-      break;
-    case FOURCC_GRBG:
-      index_map[0] = GenerateSelector(green_index, red_index);
-      index_map[1] = GenerateSelector(blue_index, green_index);
-      break;
-    default:
-      return -1;  // Bad FourCC
-  }
-  return 0;
-}
-
-// Converts 32 bit ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_bayer, int dst_stride_bayer,
-                int width, int height,
-                uint32 dst_fourcc_bayer) {
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix) = ARGBToBayerRow_C;
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
-    }
-  }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerRow_NEON;
-    }
-  }
-#endif
-  const int blue_index = 0;  // Offsets for ARGB format
-  const int green_index = 1;
-  const int red_index = 2;
-  uint32 index_map[2];
-  if (MakeSelectors(blue_index, green_index, red_index,
-                    dst_fourcc_bayer, index_map)) {
-    return -1;  // Bad FourCC
-  }
-
-  for (int y = 0; y < height; ++y) {
-    ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
-    src_argb += src_stride_argb;
-    dst_bayer += dst_stride_bayer;
-  }
-  return 0;
-}
-
-#define AVG(a, b) (((a) + (b)) >> 1)
-
-static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 g = src_bayer0[1];
-  uint8 r = src_bayer1[1];
-  for (int x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = src_bayer0[0];
-    dst_argb[1] = AVG(g, src_bayer0[1]);
-    dst_argb[2] = AVG(r, src_bayer1[1]);
-    dst_argb[3] = 255U;
-    dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = src_bayer1[1];
-    dst_argb[7] = 255U;
-    g = src_bayer0[1];
-    r = src_bayer1[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = src_bayer0[0];
-  dst_argb[1] = AVG(g, src_bayer0[1]);
-  dst_argb[2] = AVG(r, src_bayer1[1]);
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer0[0];
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = src_bayer1[1];
-    dst_argb[7] = 255U;
-  }
-}
-
-static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 g = src_bayer0[1];
-  uint8 b = src_bayer1[1];
-  for (int x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = AVG(b, src_bayer1[1]);
-    dst_argb[1] = AVG(g, src_bayer0[1]);
-    dst_argb[2] = src_bayer0[0];
-    dst_argb[3] = 255U;
-    dst_argb[4] = src_bayer1[1];
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[7] = 255U;
-    g = src_bayer0[1];
-    b = src_bayer1[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = AVG(b, src_bayer1[1]);
-  dst_argb[1] = AVG(g, src_bayer0[1]);
-  dst_argb[2] = src_bayer0[0];
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer1[1];
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = src_bayer0[0];
-    dst_argb[7] = 255U;
-  }
-}
-
-static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 b = src_bayer0[1];
-  for (int x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = AVG(b, src_bayer0[1]);
-    dst_argb[1] = src_bayer0[0];
-    dst_argb[2] = src_bayer1[0];
-    dst_argb[3] = 255U;
-    dst_argb[4] = src_bayer0[1];
-    dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
-    dst_argb[7] = 255U;
-    b = src_bayer0[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = AVG(b, src_bayer0[1]);
-  dst_argb[1] = src_bayer0[0];
-  dst_argb[2] = src_bayer1[0];
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer0[1];
-    dst_argb[5] = src_bayer0[0];
-    dst_argb[6] = src_bayer1[0];
-    dst_argb[7] = 255U;
-  }
-}
-
-static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 r = src_bayer0[1];
-  for (int x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = src_bayer1[0];
-    dst_argb[1] = src_bayer0[0];
-    dst_argb[2] = AVG(r, src_bayer0[1]);
-    dst_argb[3] = 255U;
-    dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
-    dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[6] = src_bayer0[1];
-    dst_argb[7] = 255U;
-    r = src_bayer0[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = src_bayer1[0];
-  dst_argb[1] = src_bayer0[0];
-  dst_argb[2] = AVG(r, src_bayer0[1]);
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer1[0];
-    dst_argb[5] = src_bayer0[0];
-    dst_argb[6] = src_bayer0[1];
-    dst_argb[7] = 255U;
-  }
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height,
-                uint32 src_fourcc_bayer) {
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-  switch (src_fourcc_bayer) {
-    case FOURCC_BGGR:
-      BayerRow0 = BayerRowBG;
-      BayerRow1 = BayerRowGR;
-      break;
-    case FOURCC_GBRG:
-      BayerRow0 = BayerRowGB;
-      BayerRow1 = BayerRowRG;
-      break;
-    case FOURCC_GRBG:
-      BayerRow0 = BayerRowGR;
-      BayerRow1 = BayerRowBG;
-      break;
-    case FOURCC_RGGB:
-      BayerRow0 = BayerRowRG;
-      BayerRow1 = BayerRowGB;
-      break;
-    default:
-      return -1;    // Bad FourCC
-  }
-
-  for (int y = 0; y < height - 1; y += 2) {
-    BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
-    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
-              dst_argb + dst_stride_argb, width);
-    src_bayer += src_stride_bayer * 2;
-    dst_argb += dst_stride_argb * 2;
-  }
-  if (height & 1) {
-    BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
-  }
-  return 0;
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height,
-                uint32 src_fourcc_bayer) {
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    int halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
-    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
-    }
-  }
-#elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
-    }
-  }
-#endif
-
-  switch (src_fourcc_bayer) {
-    case FOURCC_BGGR:
-      BayerRow0 = BayerRowBG;
-      BayerRow1 = BayerRowGR;
-      break;
-    case FOURCC_GBRG:
-      BayerRow0 = BayerRowGB;
-      BayerRow1 = BayerRowRG;
-      break;
-    case FOURCC_GRBG:
-      BayerRow0 = BayerRowGR;
-      BayerRow1 = BayerRowBG;
-      break;
-    case FOURCC_RGGB:
-      BayerRow0 = BayerRowRG;
-      BayerRow1 = BayerRowGB;
-      break;
-    default:
-      return -1;  // Bad FourCC
-  }
-
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
-  for (int y = 0; y < height - 1; y += 2) {
-    BayerRow0(src_bayer, src_stride_bayer, row, width);
-    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
-              row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-    src_bayer += src_stride_bayer * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    BayerRow0(src_bayer, src_stride_bayer, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-  }
-  free_aligned_buffer_64(row);
-  return 0;
-}
-
-// Convert I420 to Bayer.
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_bayer, int dst_stride_bayer,
-                int width, int height,
-                uint32 dst_fourcc_bayer) {
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    int halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (halfheight - 1) * src_stride_u;
-    src_v = src_v + (halfheight - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
-  }
-#endif
-
-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix) = ARGBToBayerRow_C;
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
-    }
-  }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerRow_NEON;
-    }
-  }
-#endif
-
-  const int blue_index = 0;  // Offsets for ARGB format
-  const int green_index = 1;
-  const int red_index = 2;
-  uint32 index_map[2];
-  if (MakeSelectors(blue_index, green_index, red_index,
-                    dst_fourcc_bayer, index_map)) {
-    return -1;  // Bad FourCC
-  }
-  // Allocate a row of ARGB.
-  align_buffer_64(row, width * 4);
-  for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, row, width);
-    ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
-    dst_bayer += dst_stride_bayer;
-    src_y += src_stride_y;
-    if (y & 1) {
-      src_u += src_stride_u;
-      src_v += src_stride_v;
-    }
-  }
-  free_aligned_buffer_64(row);
-  return 0;
-}
-
-#define MAKEBAYERFOURCC(BAYER)                                                 \
-LIBYUV_API                                                                     \
-int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer,         \
-                         uint8* dst_y, int dst_stride_y,                       \
-                         uint8* dst_u, int dst_stride_u,                       \
-                         uint8* dst_v, int dst_stride_v,                       \
-                         int width, int height) {                              \
-  return BayerToI420(src_bayer, src_stride_bayer,                              \
-                     dst_y, dst_stride_y,                                      \
-                     dst_u, dst_stride_u,                                      \
-                     dst_v, dst_stride_v,                                      \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}                                                                              \
-                                                                               \
-LIBYUV_API                                                                     \
-int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y,                   \
-                       const uint8* src_u, int src_stride_u,                   \
-                       const uint8* src_v, int src_stride_v,                   \
-                       uint8* dst_bayer, int dst_stride_bayer,                 \
-                       int width, int height) {                                \
-  return I420ToBayer(src_y, src_stride_y,                                      \
-                     src_u, src_stride_u,                                      \
-                     src_v, src_stride_v,                                      \
-                     dst_bayer, dst_stride_bayer,                              \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}                                                                              \
-                                                                               \
-LIBYUV_API                                                                     \
-int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb,             \
-                       uint8* dst_bayer, int dst_stride_bayer,                 \
-                       int width, int height) {                                \
-  return ARGBToBayer(src_argb, src_stride_argb,                                \
-                     dst_bayer, dst_stride_bayer,                              \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}                                                                              \
-                                                                               \
-LIBYUV_API                                                                     \
-int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer,         \
-                         uint8* dst_argb, int dst_stride_argb,                 \
-                         int width, int height) {                              \
-  return BayerToARGB(src_bayer, src_stride_bayer,                              \
-                     dst_argb, dst_stride_argb,                                \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}
-
-MAKEBAYERFOURCC(BGGR)
-MAKEBAYERFOURCC(GBRG)
-MAKEBAYERFOURCC(GRBG)
-MAKEBAYERFOURCC(RGGB)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
--- a/media/libyuv/source/mjpeg_decoder.cc
+++ b/media/libyuv/source/mjpeg_decoder.cc
@ -13,13 +13,20 @@
 #ifdef HAVE_JPEG
 #include <assert.h>

-#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
-    !defined(TARGET_IPHONE_SIMULATOR)
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 // Must be included before jpeglib.
 #include <setjmp.h>
 #define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable:4324)
 #endif

+#endif
+struct FILE;  // For jpeglib.h.
+
 // C++ build requires extern C for jpeg internals.
 #ifdef __cplusplus
 extern "C" {
@ -49,6 +56,13 @@ const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
 const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
 const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;

+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes);  // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+
 MJpegDecoder::MJpegDecoder()
    : has_scanline_padding_(LIBYUV_FALSE),
      num_outbufs_(0),
@ -63,9 +77,6 @@ MJpegDecoder::MJpegDecoder()
  decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
  // Override standard exit()-based error handler.
  error_mgr_->base.error_exit = &ErrorHandler;
-#ifndef DEBUG_MJPEG
-  error_mgr_->base.output_message = &OutputHandler;
-#endif
 #endif
  decompress_struct_->client_data = NULL;
  source_mgr_->init_source = &init_source;
@ -95,7 +106,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
  }

  buf_.data = src;
-  buf_.len = (int)(src_len);
+  buf_.len = static_cast<int>(src_len);
  buf_vec_.pos = 0;
  decompress_struct_->client_data = &buf_vec_;
 #ifdef HAVE_SETJMP
@ -400,12 +411,12 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
  return FinishDecode();
 }

-void MJpegDecoder::init_source(j_decompress_ptr cinfo) {
+void init_source(j_decompress_ptr cinfo) {
  fill_input_buffer(cinfo);
 }

-boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
-  BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+  BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
  if (buf_vec->pos >= buf_vec->len) {
    assert(0 && "No more data");
    // ERROR: No more data
@ -417,17 +428,16 @@ boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
  return TRUE;
 }

-void MJpegDecoder::skip_input_data(j_decompress_ptr cinfo,
-                                   long num_bytes) {  // NOLINT
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT
  cinfo->src->next_input_byte += num_bytes;
 }

-void MJpegDecoder::term_source(j_decompress_ptr cinfo) {
+void term_source(j_decompress_ptr cinfo) {
  // Nothing to do.
 }

 #ifdef HAVE_SETJMP
-void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
+void ErrorHandler(j_common_ptr cinfo) {
  // This is called when a jpeglib command experiences an error. Unfortunately
  // jpeglib's error handling model is not very flexible, because it expects the
  // error handler to not return--i.e., it wants the program to terminate. To
@ -441,18 +451,12 @@ void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
  // ERROR: Error in jpeglib: buf
 #endif

-  SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
+  SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
  // This rewinds the call stack to the point of the corresponding setjmp()
  // and causes it to return (for a second time) with value 1.
  longjmp(mgr->setjmp_buffer, 1);
 }
-
-#ifndef DEBUG_MJPEG
-void MJpegDecoder::OutputHandler(j_common_ptr cinfo) {
-  // silently eat messages
-}
 #endif
-#endif // HAVE_SETJMP

 void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
  if (num_outbufs != num_outbufs_) {
@ -499,11 +503,11 @@ LIBYUV_BOOL MJpegDecoder::StartDecode() {
  decompress_struct_->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
  decompress_struct_->dither_mode = JDITHER_NONE;
  // Not applicable to 'raw':
-  decompress_struct_->do_fancy_upsampling = LIBYUV_FALSE;
+  decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
  // Only for buffered mode:
-  decompress_struct_->enable_2pass_quant = LIBYUV_FALSE;
+  decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
  // Blocky but fast:
-  decompress_struct_->do_block_smoothing = LIBYUV_FALSE;
+  decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);

  if (!jpeg_start_decompress(decompress_struct_)) {
    // ERROR: Couldn't start JPEG decompressor";
--- a/media/libyuv/source/mjpeg_validate.cc
+++ b/media/libyuv/source/mjpeg_validate.cc
@ -10,36 +10,60 @@

 #include "libyuv/mjpeg_decoder.h"

+#include <string.h>  // For memchr.
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif

-// Helper function to validate the jpeg appears intact.
-// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
-  size_t i;
-  if (sample_size < 64) {
-    // ERROR: Invalid jpeg size: sample_size
-    return LIBYUV_FALSE;
-  }
-  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image
-    // ERROR: Invalid jpeg initial start code
-    return LIBYUV_FALSE;
-  }
-  for (i = sample_size - 2; i > 1;) {
-    if (sample[i] != 0xd9) {
-      if (sample[i] == 0xff && sample[i + 1] == 0xd9) {  // End Of Image
+// Helper function to scan for EOI marker (0xff 0xd9).
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+  if (sample_size >= 2) {
+    const uint8* end = sample + sample_size - 1;
+    const uint8* it = sample;
+    while (it < end) {
+      // TODO(fbarchard): scan for 0xd9 instead.
+      it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
+      if (it == NULL) {
+        break;
+      }
+      if (it[1] == 0xd9) {
        return LIBYUV_TRUE;  // Success: Valid jpeg.
      }
-      --i;
+      ++it;  // Skip over current 0xff.
    }
-    --i;
  }
  // ERROR: Invalid jpeg end code not found. Size sample_size
  return LIBYUV_FALSE;
 }

+// Helper function to validate the jpeg appears intact.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+  // Maximum size that ValidateJpeg will consider valid.
+  const size_t kMaxJpegSize = 0x7fffffffull;
+  const size_t kBackSearchSize = 1024;
+  if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
+    // ERROR: Invalid jpeg size: sample_size
+    return LIBYUV_FALSE;
+  }
+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // SOI marker
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+
+  // Look for the End Of Image (EOI) marker near the end of the buffer.
+  if (sample_size > kBackSearchSize) {
+    if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    // Reduce search size for forward search.
+    sample_size = sample_size - kBackSearchSize + 1;
+  }
+  // Step over SOI marker and scan for EOI.
+  return ScanEOI(sample + 2, sample_size - 2);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/media/libyuv/source/planar_functions.cc
+++ b/media/libyuv/source/planar_functions.cc
--- a/media/libyuv/source/rotate.cc
+++ b/media/libyuv/source/rotate.cc
--- a/media/libyuv/source/rotate_any.cc
+++ b/media/libyuv/source/rotate_any.cc
@ -0,0 +1,80 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define TANY(NAMEANY, TPOS_SIMD, MASK)                                         \
+    void NAMEANY(const uint8* src, int src_stride,                             \
+                 uint8* dst, int dst_stride, int width) {                      \
+      int r = width & MASK;                                                    \
+      int n = width - r;                                                       \
+      if (n > 0) {                                                             \
+        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                        \
+      }                                                                        \
+      TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
+    }
+
+#ifdef HAS_TRANSPOSEWX8_NEON
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_SSSE3
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
+#endif
+#ifdef HAS_TRANSPOSEWX8_DSPR2
+TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
+#endif
+#undef TANY
+
+#define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \
+    void NAMEANY(const uint8* src, int src_stride,                             \
+                uint8* dst_a, int dst_stride_a,                                \
+                uint8* dst_b, int dst_stride_b, int width) {                   \
+      int r = width & MASK;                                                    \
+      int n = width - r;                                                       \
+      if (n > 0) {                                                             \
+        TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,   \
+                  n);                                                          \
+      }                                                                        \
+      TransposeUVWx8_C(src + n * 2, src_stride,                                \
+                       dst_a + n * dst_stride_a, dst_stride_a,                 \
+                       dst_b + n * dst_stride_b, dst_stride_b, r);             \
+    }
+
+#ifdef HAS_TRANSPOSEUVWX8_NEON
+TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_SSE2
+TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_DSPR2
+TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
+#endif
+#undef TUVANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+
+
+
+
--- a/media/libyuv/source/rotate_argb.cc
+++ b/media/libyuv/source/rotate_argb.cc
@ -27,36 +27,31 @@ extern "C" {
    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
 void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
-                               int src_stepx,
-                               uint8* dst_ptr, int dst_width);
+                               int src_stepx, uint8* dst_ptr, int dst_width);
 #endif
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
 void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
-                               int src_stepx,
-                               uint8* dst_ptr, int dst_width);
+                               int src_stepx, uint8* dst_ptr, int dst_width);
 #endif

 void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
-                            int src_stepx,
-                            uint8* dst_ptr, int dst_width);
+                            int src_stepx, uint8* dst_ptr, int dst_width);

 static void ARGBTranspose(const uint8* src, int src_stride,
-                          uint8* dst, int dst_stride,
-                          int width, int height) {
+                          uint8* dst, int dst_stride, int width, int height) {
  int i;
  int src_pixel_step = src_stride >> 2;
  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) &&  // Width of dest.
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
  }
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) &&  // Width of dest.
-      IS_ALIGNED(src, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
  }
 #endif
@ -69,8 +64,7 @@ static void ARGBTranspose(const uint8* src, int src_stride,
 }

 void ARGBRotate90(const uint8* src, int src_stride,
-                  uint8* dst, int dst_stride,
-                  int width, int height) {
+                  uint8* dst, int dst_stride, int width, int height) {
  // Rotate by 90 is a ARGBTranspose with the source read
  // from bottom to top. So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
@ -80,8 +74,7 @@ void ARGBRotate90(const uint8* src, int src_stride,
 }

 void ARGBRotate270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+                    uint8* dst, int dst_stride, int width, int height) {
  // Rotate by 270 is a ARGBTranspose with the destination written
  // from bottom to top. So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
@ -91,8 +84,7 @@ void ARGBRotate270(const uint8* src, int src_stride,
 }

 void ARGBRotate180(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height) {
+                   uint8* dst, int dst_stride, int width, int height) {
  // Swap first and last row and mirror the content. Uses a temporary row.
  align_buffer_64(row, width * 4);
  const uint8* src_bot = src + src_stride * (height - 1);
@ -102,38 +94,38 @@ void ARGBRotate180(const uint8* src, int src_stride,
  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
      ARGBMirrorRow_C;
  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
+    }
  }
 #endif
 #if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
-    ARGBMirrorRow = ARGBMirrorRow_AVX2;
-  }
-#endif
-#if defined(HAS_ARGBMIRRORROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
-    ARGBMirrorRow = ARGBMirrorRow_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
-    CopyRow = CopyRow_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86)) {
-    CopyRow = CopyRow_X86;
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_AVX2;
+    }
  }
 #endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    CopyRow = CopyRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
  }
 #endif
 #if defined(HAS_COPYROW_ERMS)
@ -141,6 +133,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
    CopyRow = CopyRow_ERMS;
  }
 #endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
  if (TestCpuFlag(kCpuHasMIPS)) {
    CopyRow = CopyRow_MIPS;
@ -162,8 +159,7 @@ void ARGBRotate180(const uint8* src, int src_stride,

 LIBYUV_API
 int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height,
+               uint8* dst_argb, int dst_stride_argb, int width, int height,
               enum RotationMode mode) {
  if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
    return -1;
--- a/media/libyuv/source/rotate_common.cc
+++ b/media/libyuv/source/rotate_common.cc
@ -0,0 +1,92 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst[0] = src[0 * src_stride];
+    dst[1] = src[1 * src_stride];
+    dst[2] = src[2 * src_stride];
+    dst[3] = src[3 * src_stride];
+    dst[4] = src[4 * src_stride];
+    dst[5] = src[5 * src_stride];
+    dst[6] = src[6 * src_stride];
+    dst[7] = src[7 * src_stride];
+    ++src;
+    dst += dst_stride;
+  }
+}
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst_a[0] = src[0 * src_stride + 0];
+    dst_b[0] = src[0 * src_stride + 1];
+    dst_a[1] = src[1 * src_stride + 0];
+    dst_b[1] = src[1 * src_stride + 1];
+    dst_a[2] = src[2 * src_stride + 0];
+    dst_b[2] = src[2 * src_stride + 1];
+    dst_a[3] = src[3 * src_stride + 0];
+    dst_b[3] = src[3 * src_stride + 1];
+    dst_a[4] = src[4 * src_stride + 0];
+    dst_b[4] = src[4 * src_stride + 1];
+    dst_a[5] = src[5 * src_stride + 0];
+    dst_b[5] = src[5 * src_stride + 1];
+    dst_a[6] = src[6 * src_stride + 0];
+    dst_b[6] = src[6 * src_stride + 1];
+    dst_a[7] = src[7 * src_stride + 0];
+    dst_b[7] = src[7 * src_stride + 1];
+    src += 2;
+    dst_a += dst_stride_a;
+    dst_b += dst_stride_b;
+  }
+}
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int j;
+    for (j = 0; j < height; ++j) {
+      dst[i * dst_stride + j] = src[j * src_stride + i];
+    }
+  }
+}
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b,
+                      int width, int height) {
+  int i;
+  for (i = 0; i < width * 2; i += 2) {
+    int j;
+    for (j = 0; j < height; ++j) {
+      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+    }
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/media/libyuv/source/rotate_gcc.cc
+++ b/media/libyuv/source/rotate_gcc.cc
@ -0,0 +1,368 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  asm volatile (
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    LABELALIGN
+  "1:                                            \n"
+    "movq       (%0),%%xmm0                      \n"
+    "movq       (%0,%3),%%xmm1                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "movq       (%0),%%xmm2                      \n"
+    "movdqa     %%xmm0,%%xmm1                    \n"
+    "palignr    $0x8,%%xmm1,%%xmm1               \n"
+    "movq       (%0,%3),%%xmm3                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm3                    \n"
+    "movq       (%0),%%xmm4                      \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "movq       (%0,%3),%%xmm5                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "movdqa     %%xmm4,%%xmm5                    \n"
+    "movq       (%0),%%xmm6                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       (%0,%3),%%xmm7                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "neg        %3                               \n"
+    "movdqa     %%xmm6,%%xmm7                    \n"
+    "lea        0x8(%0,%3,8),%0                  \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "neg        %3                               \n"
+     // Second round of bit swap.
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "palignr    $0x8,%%xmm2,%%xmm2               \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm4,%%xmm6                    \n"
+    "movdqa     %%xmm5,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movq       %%xmm0,(%1)                      \n"
+    "movdqa     %%xmm0,%%xmm4                    \n"
+    "palignr    $0x8,%%xmm4,%%xmm4               \n"
+    "movq       %%xmm4,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm6                    \n"
+    "movq       %%xmm2,(%1)                      \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movq       %%xmm6,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm1,%%xmm5                    \n"
+    "movq       %%xmm1,(%1)                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       %%xmm5,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movq       %%xmm3,(%1)                      \n"
+    "movdqa     %%xmm3,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "sub        $0x8,%2                          \n"
+    "movq       %%xmm7,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "jg         1b                               \n"
+    : "+r"(src),    // %0
+      "+r"(dst),    // %1
+      "+r"(width)   // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "r"((intptr_t)(dst_stride))   // %4
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
+
+// Transpose 16x8. 64 bit
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width) {
+  asm volatile (
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu     (%0),%%xmm0                      \n"
+    "movdqu     (%0,%3),%%xmm1                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "punpckhbw  %%xmm1,%%xmm8                    \n"
+    "movdqu     (%0),%%xmm2                      \n"
+    "movdqa     %%xmm0,%%xmm1                    \n"
+    "movdqa     %%xmm8,%%xmm9                    \n"
+    "palignr    $0x8,%%xmm1,%%xmm1               \n"
+    "palignr    $0x8,%%xmm9,%%xmm9               \n"
+    "movdqu     (%0,%3),%%xmm3                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm2,%%xmm10                   \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "punpckhbw  %%xmm3,%%xmm10                   \n"
+    "movdqa     %%xmm2,%%xmm3                    \n"
+    "movdqa     %%xmm10,%%xmm11                  \n"
+    "movdqu     (%0),%%xmm4                      \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "palignr    $0x8,%%xmm11,%%xmm11             \n"
+    "movdqu     (%0,%3),%%xmm5                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm4,%%xmm12                   \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "punpckhbw  %%xmm5,%%xmm12                   \n"
+    "movdqa     %%xmm4,%%xmm5                    \n"
+    "movdqa     %%xmm12,%%xmm13                  \n"
+    "movdqu     (%0),%%xmm6                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "palignr    $0x8,%%xmm13,%%xmm13             \n"
+    "movdqu     (%0,%3),%%xmm7                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm6,%%xmm14                   \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "punpckhbw  %%xmm7,%%xmm14                   \n"
+    "neg        %3                               \n"
+    "movdqa     %%xmm6,%%xmm7                    \n"
+    "movdqa     %%xmm14,%%xmm15                  \n"
+    "lea        0x10(%0,%3,8),%0                 \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "palignr    $0x8,%%xmm15,%%xmm15             \n"
+    "neg        %3                               \n"
+     // Second round of bit swap.
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "palignr    $0x8,%%xmm2,%%xmm2               \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm4,%%xmm6                    \n"
+    "movdqa     %%xmm5,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "punpcklwd  %%xmm10,%%xmm8                   \n"
+    "punpcklwd  %%xmm11,%%xmm9                   \n"
+    "movdqa     %%xmm8,%%xmm10                   \n"
+    "movdqa     %%xmm9,%%xmm11                   \n"
+    "palignr    $0x8,%%xmm10,%%xmm10             \n"
+    "palignr    $0x8,%%xmm11,%%xmm11             \n"
+    "punpcklwd  %%xmm14,%%xmm12                  \n"
+    "punpcklwd  %%xmm15,%%xmm13                  \n"
+    "movdqa     %%xmm12,%%xmm14                  \n"
+    "movdqa     %%xmm13,%%xmm15                  \n"
+    "palignr    $0x8,%%xmm14,%%xmm14             \n"
+    "palignr    $0x8,%%xmm15,%%xmm15             \n"
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movq       %%xmm0,(%1)                      \n"
+    "movdqa     %%xmm0,%%xmm4                    \n"
+    "palignr    $0x8,%%xmm4,%%xmm4               \n"
+    "movq       %%xmm4,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm6                    \n"
+    "movq       %%xmm2,(%1)                      \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movq       %%xmm6,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm1,%%xmm5                    \n"
+    "movq       %%xmm1,(%1)                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       %%xmm5,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movq       %%xmm3,(%1)                      \n"
+    "movdqa     %%xmm3,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "movq       %%xmm7,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm12,%%xmm8                   \n"
+    "movq       %%xmm8,(%1)                      \n"
+    "movdqa     %%xmm8,%%xmm12                   \n"
+    "palignr    $0x8,%%xmm12,%%xmm12             \n"
+    "movq       %%xmm12,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm14,%%xmm10                  \n"
+    "movdqa     %%xmm10,%%xmm14                  \n"
+    "movq       %%xmm10,(%1)                     \n"
+    "palignr    $0x8,%%xmm14,%%xmm14             \n"
+    "punpckldq  %%xmm13,%%xmm9                   \n"
+    "movq       %%xmm14,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm9,%%xmm13                   \n"
+    "movq       %%xmm9,(%1)                      \n"
+    "palignr    $0x8,%%xmm13,%%xmm13             \n"
+    "movq       %%xmm13,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm15,%%xmm11                  \n"
+    "movq       %%xmm11,(%1)                     \n"
+    "movdqa     %%xmm11,%%xmm15                  \n"
+    "palignr    $0x8,%%xmm15,%%xmm15             \n"
+    "sub        $0x10,%2                         \n"
+    "movq       %%xmm15,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "jg         1b                               \n"
+    : "+r"(src),    // %0
+      "+r"(dst),    // %1
+      "+r"(width)   // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "r"((intptr_t)(dst_stride))   // %4
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
+  );
+}
+#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+
+// Transpose UV 8x8.  64 bit.
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width) {
+  asm volatile (
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu     (%0),%%xmm0                      \n"
+    "movdqu     (%0,%4),%%xmm1                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "punpckhbw  %%xmm1,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm1                    \n"
+    "movdqu     (%0),%%xmm2                      \n"
+    "movdqu     (%0,%4),%%xmm3                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm2,%%xmm8                    \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "punpckhbw  %%xmm3,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm3                    \n"
+    "movdqu     (%0),%%xmm4                      \n"
+    "movdqu     (%0,%4),%%xmm5                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm4,%%xmm8                    \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "punpckhbw  %%xmm5,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm5                    \n"
+    "movdqu     (%0),%%xmm6                      \n"
+    "movdqu     (%0,%4),%%xmm7                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm6,%%xmm8                    \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "neg        %4                               \n"
+    "lea        0x10(%0,%4,8),%0                 \n"
+    "punpckhbw  %%xmm7,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm7                    \n"
+    "neg        %4                               \n"
+     // Second round of bit swap.
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "movdqa     %%xmm1,%%xmm9                    \n"
+    "punpckhwd  %%xmm2,%%xmm8                    \n"
+    "punpckhwd  %%xmm3,%%xmm9                    \n"
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm8,%%xmm2                    \n"
+    "movdqa     %%xmm9,%%xmm3                    \n"
+    "movdqa     %%xmm4,%%xmm8                    \n"
+    "movdqa     %%xmm5,%%xmm9                    \n"
+    "punpckhwd  %%xmm6,%%xmm8                    \n"
+    "punpckhwd  %%xmm7,%%xmm9                    \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm8,%%xmm6                    \n"
+    "movdqa     %%xmm9,%%xmm7                    \n"
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
+    "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
+    "punpckhdq  %%xmm4,%%xmm8                    \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "movdqa     %%xmm2,%%xmm8                    \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movlpd     %%xmm2,(%1)                      \n"
+    "movhpd     %%xmm2,(%2)                      \n"
+    "punpckhdq  %%xmm6,%%xmm8                    \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "movdqa     %%xmm1,%%xmm8                    \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movlpd     %%xmm1,(%1)                      \n"
+    "movhpd     %%xmm1,(%2)                      \n"
+    "punpckhdq  %%xmm5,%%xmm8                    \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "movdqa     %%xmm3,%%xmm8                    \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movlpd     %%xmm3,(%1)                      \n"
+    "movhpd     %%xmm3,(%2)                      \n"
+    "punpckhdq  %%xmm7,%%xmm8                    \n"
+    "sub        $0x8,%3                          \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "jg         1b                               \n"
+    : "+r"(src),    // %0
+      "+r"(dst_a),  // %1
+      "+r"(dst_b),  // %2
+      "+r"(width)   // %3
+    : "r"((intptr_t)(src_stride)),    // %4
+      "r"((intptr_t)(dst_stride_a)),  // %5
+      "r"((intptr_t)(dst_stride_b))   // %6
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9"
+  );
+}
+#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/media/libyuv/source/rotate_mips.cc
+++ b/media/libyuv/source/rotate_mips.cc
@ -9,6 +9,7 @@
 */

 #include "libyuv/row.h"
+#include "libyuv/rotate_row.h"

 #include "libyuv/basic_types.h"

@ -18,11 +19,11 @@ extern "C" {
 #endif

 #if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)

-void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride,
-                             int width) {
+void TransposeWx8_DSPR2(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
   __asm__ __volatile__ (
      ".set push                                         \n"
      ".set noreorder                                    \n"
@ -105,9 +106,8 @@ void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
  );
 }

-void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
-                                  uint8* dst, int dst_stride,
-                                  int width) {
+void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width) {
  __asm__ __volatile__ (
      ".set noat                                         \n"
      ".set push                                         \n"
@ -303,17 +303,15 @@ void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
       [width] "+r" (width)
      :[src_stride] "r" (src_stride),
       [dst_stride] "r" (dst_stride)
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-        "t6", "t7", "t8", "t9",
-        "s0", "s1", "s2", "s3", "s4",
-        "s5", "s6", "s7"
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
  );
 }

-void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
-                               uint8* dst_a, int dst_stride_a,
-                               uint8* dst_b, int dst_stride_b,
-                               int width) {
+void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
+                          uint8* dst_a, int dst_stride_a,
+                          uint8* dst_b, int dst_stride_b,
+                          int width) {
  __asm__ __volatile__ (
      ".set push                                         \n"
      ".set noreorder                                    \n"
--- a/media/libyuv/source/rotate_neon.cc
+++ b/media/libyuv/source/rotate_neon.cc
@ -9,6 +9,7 @@
 */

 #include "libyuv/row.h"
+#include "libyuv/rotate_row.h"

 #include "libyuv/basic_types.h"

@ -17,32 +18,42 @@ namespace libyuv {
 extern "C" {
 #endif

-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+
 static uvec8 kVTbl4x4Transpose =
  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };

 void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
+  const uint8* src_temp;
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
-    "sub         %4, #8                        \n"
+    "sub         %5, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
-    ".p2align  2                               \n"
    "1:                                        \n"
-      "mov         r9, %0                      \n"
+      "mov         %0, %1                      \n"

-      "vld1.8      {d0}, [r9], %1              \n"
-      "vld1.8      {d1}, [r9], %1              \n"
-      "vld1.8      {d2}, [r9], %1              \n"
-      "vld1.8      {d3}, [r9], %1              \n"
-      "vld1.8      {d4}, [r9], %1              \n"
-      "vld1.8      {d5}, [r9], %1              \n"
-      "vld1.8      {d6}, [r9], %1              \n"
-      "vld1.8      {d7}, [r9]                  \n"
+      MEMACCESS(0)
+      "vld1.8      {d0}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d1}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d2}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d3}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d4}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d5}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d6}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d7}, [%0]                  \n"

      "vtrn.8      d1, d0                      \n"
      "vtrn.8      d3, d2                      \n"
@ -64,48 +75,65 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
      "vrev16.8    q2, q2                      \n"
      "vrev16.8    q3, q3                      \n"

-      "mov         r9, %2                      \n"
+      "mov         %0, %3                      \n"

-      "vst1.8      {d1}, [r9], %3              \n"
-      "vst1.8      {d0}, [r9], %3              \n"
-      "vst1.8      {d3}, [r9], %3              \n"
-      "vst1.8      {d2}, [r9], %3              \n"
-      "vst1.8      {d5}, [r9], %3              \n"
-      "vst1.8      {d4}, [r9], %3              \n"
-      "vst1.8      {d7}, [r9], %3              \n"
-      "vst1.8      {d6}, [r9]                  \n"
+    MEMACCESS(0)
+      "vst1.8      {d1}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d0}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d3}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d2}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d5}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d4}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d7}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d6}, [%0]                  \n"

-      "add         %0, #8                      \n"  // src += 8
-      "add         %2, %2, %3, lsl #3          \n"  // dst += 8 * dst_stride
-      "subs        %4,  #8                     \n"  // w   -= 8
+      "add         %1, #8                      \n"  // src += 8
+      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
+      "subs        %5,  #8                     \n"  // w   -= 8
      "bge         1b                          \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
-    "adds        %4, #8                        \n"
+    "adds        %5, #8                        \n"
    "beq         4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %4, #2                        \n"
+    "cmp         %5, #2                        \n"
    "blt         3f                            \n"

-    "cmp         %4, #4                        \n"
+    "cmp         %5, #4                        \n"
    "blt         2f                            \n"

    // 4x8 block
-    "mov         r9, %0                        \n"
-    "vld1.32     {d0[0]}, [r9], %1             \n"
-    "vld1.32     {d0[1]}, [r9], %1             \n"
-    "vld1.32     {d1[0]}, [r9], %1             \n"
-    "vld1.32     {d1[1]}, [r9], %1             \n"
-    "vld1.32     {d2[0]}, [r9], %1             \n"
-    "vld1.32     {d2[1]}, [r9], %1             \n"
-    "vld1.32     {d3[0]}, [r9], %1             \n"
-    "vld1.32     {d3[1]}, [r9]                 \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.32     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d2[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d2[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d3[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d3[1]}, [%0]                 \n"

-    "mov         r9, %2                        \n"
+    "mov         %0, %3                        \n"

-    "vld1.8      {q3}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8      {q3}, [%6]                    \n"

    "vtbl.8      d4, {d0, d1}, d6              \n"
    "vtbl.8      d5, {d0, d1}, d7              \n"
@ -114,73 +142,101 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
-    "vst1.32     {d4[0]}, [r9], %3             \n"
-    "vst1.32     {d4[1]}, [r9], %3             \n"
-    "vst1.32     {d5[0]}, [r9], %3             \n"
-    "vst1.32     {d5[1]}, [r9]                 \n"
+    MEMACCESS(0)
+    "vst1.32     {d4[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d4[1]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d5[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d5[1]}, [%0]                 \n"

-    "add         r9, %2, #4                    \n"
-    "vst1.32     {d0[0]}, [r9], %3             \n"
-    "vst1.32     {d0[1]}, [r9], %3             \n"
-    "vst1.32     {d1[0]}, [r9], %3             \n"
-    "vst1.32     {d1[1]}, [r9]                 \n"
+    "add         %0, %3, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d0[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d0[1]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d1[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d1[1]}, [%0]                 \n"

-    "add         %0, #4                        \n"  // src += 4
-    "add         %2, %2, %3, lsl #2            \n"  // dst += 4 * dst_stride
-    "subs        %4,  #4                       \n"  // w   -= 4
+    "add         %1, #4                        \n"  // src += 4
+    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
+    "subs        %5,  #4                       \n"  // w   -= 4
    "beq         4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
-    "cmp         %4, #2                        \n"
+    "cmp         %5, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
-    "mov         r9, %0                        \n"
-    "vld1.16     {d0[0]}, [r9], %1             \n"
-    "vld1.16     {d1[0]}, [r9], %1             \n"
-    "vld1.16     {d0[1]}, [r9], %1             \n"
-    "vld1.16     {d1[1]}, [r9], %1             \n"
-    "vld1.16     {d0[2]}, [r9], %1             \n"
-    "vld1.16     {d1[2]}, [r9], %1             \n"
-    "vld1.16     {d0[3]}, [r9], %1             \n"
-    "vld1.16     {d1[3]}, [r9]                 \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[2]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[2]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[3]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[3]}, [%0]                 \n"

    "vtrn.8      d0, d1                        \n"

-    "mov         r9, %2                        \n"
+    "mov         %0, %3                        \n"

-    "vst1.64     {d0}, [r9], %3                \n"
-    "vst1.64     {d1}, [r9]                    \n"
+    MEMACCESS(0)
+    "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
+    "vst1.64     {d1}, [%0]                    \n"

-    "add         %0, #2                        \n"  // src += 2
-    "add         %2, %2, %3, lsl #1            \n"  // dst += 2 * dst_stride
-    "subs        %4,  #2                       \n"  // w   -= 2
+    "add         %1, #2                        \n"  // src += 2
+    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
+    "subs        %5,  #2                       \n"  // w   -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
-    "vld1.8      {d0[0]}, [%0], %1             \n"
-    "vld1.8      {d0[1]}, [%0], %1             \n"
-    "vld1.8      {d0[2]}, [%0], %1             \n"
-    "vld1.8      {d0[3]}, [%0], %1             \n"
-    "vld1.8      {d0[4]}, [%0], %1             \n"
-    "vld1.8      {d0[5]}, [%0], %1             \n"
-    "vld1.8      {d0[6]}, [%0], %1             \n"
-    "vld1.8      {d0[7]}, [%0]                 \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[0]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[1]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[2]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[3]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[4]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[5]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[6]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[7]}, [%1]                 \n"

-    "vst1.64     {d0}, [%2]                    \n"
+    MEMACCESS(3)
+    "vst1.64     {d0}, [%3]                    \n"

    "4:                                        \n"

-    : "+r"(src),               // %0
-      "+r"(src_stride),        // %1
-      "+r"(dst),               // %2
-      "+r"(dst_stride),        // %3
-      "+r"(width)              // %4
-    : "r"(&kVTbl4x4Transpose)  // %5
-    : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
+    : "=&r"(src_temp),         // %0
+      "+r"(src),               // %1
+      "+r"(src_stride),        // %2
+      "+r"(dst),               // %3
+      "+r"(dst_stride),        // %4
+      "+r"(width)              // %5
+    : "r"(&kVTbl4x4Transpose)  // %6
+    : "memory", "cc", "q0", "q1", "q2", "q3"
  );
 }

@ -191,25 +247,33 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
+  const uint8* src_temp;
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
-    "sub         %6, #8                        \n"
+    "sub         %7, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
-    ".p2align  2                               \n"
    "1:                                        \n"
-      "mov         r9, %0                      \n"
+      "mov         %0, %1                      \n"

-      "vld2.8      {d0,  d1},  [r9], %1        \n"
-      "vld2.8      {d2,  d3},  [r9], %1        \n"
-      "vld2.8      {d4,  d5},  [r9], %1        \n"
-      "vld2.8      {d6,  d7},  [r9], %1        \n"
-      "vld2.8      {d16, d17}, [r9], %1        \n"
-      "vld2.8      {d18, d19}, [r9], %1        \n"
-      "vld2.8      {d20, d21}, [r9], %1        \n"
-      "vld2.8      {d22, d23}, [r9]            \n"
+      MEMACCESS(0)
+      "vld2.8      {d0,  d1},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d2,  d3},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d4,  d5},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d6,  d7},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d16, d17}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d18, d19}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d20, d21}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d22, d23}, [%0]            \n"

      "vtrn.8      q1, q0                      \n"
      "vtrn.8      q3, q2                      \n"
@ -235,59 +299,84 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
      "vrev16.8    q10, q10                    \n"
      "vrev16.8    q11, q11                    \n"

-      "mov         r9, %2                      \n"
+      "mov         %0, %3                      \n"

-      "vst1.8      {d2},  [r9], %3             \n"
-      "vst1.8      {d0},  [r9], %3             \n"
-      "vst1.8      {d6},  [r9], %3             \n"
-      "vst1.8      {d4},  [r9], %3             \n"
-      "vst1.8      {d18}, [r9], %3             \n"
-      "vst1.8      {d16}, [r9], %3             \n"
-      "vst1.8      {d22}, [r9], %3             \n"
-      "vst1.8      {d20}, [r9]                 \n"
+    MEMACCESS(0)
+      "vst1.8      {d2},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d0},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d6},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d4},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d18}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d16}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d22}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d20}, [%0]                 \n"

-      "mov         r9, %4                      \n"
+      "mov         %0, %5                      \n"

-      "vst1.8      {d3},  [r9], %5             \n"
-      "vst1.8      {d1},  [r9], %5             \n"
-      "vst1.8      {d7},  [r9], %5             \n"
-      "vst1.8      {d5},  [r9], %5             \n"
-      "vst1.8      {d19}, [r9], %5             \n"
-      "vst1.8      {d17}, [r9], %5             \n"
-      "vst1.8      {d23}, [r9], %5             \n"
-      "vst1.8      {d21}, [r9]                 \n"
+    MEMACCESS(0)
+      "vst1.8      {d3},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d1},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d7},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d5},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d19}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d17}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d23}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d21}, [%0]                 \n"

-      "add         %0, #8*2                    \n"  // src   += 8*2
-      "add         %2, %2, %3, lsl #3          \n"  // dst_a += 8 * dst_stride_a
-      "add         %4, %4, %5, lsl #3          \n"  // dst_b += 8 * dst_stride_b
-      "subs        %6,  #8                     \n"  // w     -= 8
+      "add         %1, #8*2                    \n"  // src   += 8*2
+      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
+      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
+      "subs        %7,  #8                     \n"  // w     -= 8
      "bge         1b                          \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
-    "adds        %6, #8                        \n"
+    "adds        %7, #8                        \n"
    "beq         4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %6, #2                        \n"
+    "cmp         %7, #2                        \n"
    "blt         3f                            \n"

-    "cmp         %6, #4                        \n"
+    "cmp         %7, #4                        \n"
    "blt         2f                            \n"

-    //TODO(frkoenig): Clean this up
+    // TODO(frkoenig): Clean this up
    // 4x8 block
-    "mov         r9, %0                        \n"
-    "vld1.64     {d0}, [r9], %1                \n"
-    "vld1.64     {d1}, [r9], %1                \n"
-    "vld1.64     {d2}, [r9], %1                \n"
-    "vld1.64     {d3}, [r9], %1                \n"
-    "vld1.64     {d4}, [r9], %1                \n"
-    "vld1.64     {d5}, [r9], %1                \n"
-    "vld1.64     {d6}, [r9], %1                \n"
-    "vld1.64     {d7}, [r9]                    \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.64     {d0}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d1}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d2}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d3}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d4}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d5}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d6}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d7}, [%0]                    \n"

-    "vld1.8      {q15}, [%7]                   \n"
+    MEMACCESS(8)
+    "vld1.8      {q15}, [%8]                   \n"

    "vtrn.8      q0, q1                        \n"
    "vtrn.8      q2, q3                        \n"
@ -301,103 +390,142 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
    "vtbl.8      d22, {d6, d7}, d30            \n"
    "vtbl.8      d23, {d6, d7}, d31            \n"

-    "mov         r9, %2                        \n"
+    "mov         %0, %3                        \n"

-    "vst1.32     {d16[0]},  [r9], %3           \n"
-    "vst1.32     {d16[1]},  [r9], %3           \n"
-    "vst1.32     {d17[0]},  [r9], %3           \n"
-    "vst1.32     {d17[1]},  [r9], %3           \n"
+    MEMACCESS(0)
+    "vst1.32     {d16[0]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d16[1]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d17[0]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d17[1]},  [%0], %4           \n"

-    "add         r9, %2, #4                    \n"
-    "vst1.32     {d20[0]}, [r9], %3            \n"
-    "vst1.32     {d20[1]}, [r9], %3            \n"
-    "vst1.32     {d21[0]}, [r9], %3            \n"
-    "vst1.32     {d21[1]}, [r9]                \n"
+    "add         %0, %3, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d20[0]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d20[1]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d21[0]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d21[1]}, [%0]                \n"

-    "mov         r9, %4                        \n"
+    "mov         %0, %5                        \n"

-    "vst1.32     {d18[0]}, [r9], %5            \n"
-    "vst1.32     {d18[1]}, [r9], %5            \n"
-    "vst1.32     {d19[0]}, [r9], %5            \n"
-    "vst1.32     {d19[1]}, [r9], %5            \n"
+    MEMACCESS(0)
+    "vst1.32     {d18[0]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d18[1]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d19[0]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d19[1]}, [%0], %6            \n"

-    "add         r9, %4, #4                    \n"
-    "vst1.32     {d22[0]},  [r9], %5           \n"
-    "vst1.32     {d22[1]},  [r9], %5           \n"
-    "vst1.32     {d23[0]},  [r9], %5           \n"
-    "vst1.32     {d23[1]},  [r9]               \n"
+    "add         %0, %5, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d22[0]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d22[1]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d23[0]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d23[1]},  [%0]               \n"

-    "add         %0, #4*2                      \n"  // src   += 4 * 2
-    "add         %2, %2, %3, lsl #2            \n"  // dst_a += 4 * dst_stride_a
-    "add         %4, %4, %5, lsl #2            \n"  // dst_b += 4 * dst_stride_b
-    "subs        %6,  #4                       \n"  // w     -= 4
+    "add         %1, #4*2                      \n"  // src   += 4 * 2
+    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
+    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
+    "subs        %7,  #4                       \n"  // w     -= 4
    "beq         4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
-    "cmp         %6, #2                        \n"
+    "cmp         %7, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
-    "mov         r9, %0                        \n"
-    "vld2.16     {d0[0], d2[0]}, [r9], %1      \n"
-    "vld2.16     {d1[0], d3[0]}, [r9], %1      \n"
-    "vld2.16     {d0[1], d2[1]}, [r9], %1      \n"
-    "vld2.16     {d1[1], d3[1]}, [r9], %1      \n"
-    "vld2.16     {d0[2], d2[2]}, [r9], %1      \n"
-    "vld2.16     {d1[2], d3[2]}, [r9], %1      \n"
-    "vld2.16     {d0[3], d2[3]}, [r9], %1      \n"
-    "vld2.16     {d1[3], d3[3]}, [r9]          \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[3], d3[3]}, [%0]          \n"

    "vtrn.8      d0, d1                        \n"
    "vtrn.8      d2, d3                        \n"

-    "mov         r9, %2                        \n"
+    "mov         %0, %3                        \n"

-    "vst1.64     {d0}, [r9], %3                \n"
-    "vst1.64     {d2}, [r9]                    \n"
+    MEMACCESS(0)
+    "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
+    "vst1.64     {d2}, [%0]                    \n"

-    "mov         r9, %4                        \n"
+    "mov         %0, %5                        \n"

-    "vst1.64     {d1}, [r9], %5                \n"
-    "vst1.64     {d3}, [r9]                    \n"
+    MEMACCESS(0)
+    "vst1.64     {d1}, [%0], %6                \n"
+    MEMACCESS(0)
+    "vst1.64     {d3}, [%0]                    \n"

-    "add         %0, #2*2                      \n"  // src   += 2 * 2
-    "add         %2, %2, %3, lsl #1            \n"  // dst_a += 2 * dst_stride_a
-    "add         %4, %4, %5, lsl #1            \n"  // dst_b += 2 * dst_stride_b
-    "subs        %6,  #2                       \n"  // w     -= 2
+    "add         %1, #2*2                      \n"  // src   += 2 * 2
+    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
+    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
+    "subs        %7,  #2                       \n"  // w     -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
-    "vld2.8      {d0[0], d1[0]}, [%0], %1      \n"
-    "vld2.8      {d0[1], d1[1]}, [%0], %1      \n"
-    "vld2.8      {d0[2], d1[2]}, [%0], %1      \n"
-    "vld2.8      {d0[3], d1[3]}, [%0], %1      \n"
-    "vld2.8      {d0[4], d1[4]}, [%0], %1      \n"
-    "vld2.8      {d0[5], d1[5]}, [%0], %1      \n"
-    "vld2.8      {d0[6], d1[6]}, [%0], %1      \n"
-    "vld2.8      {d0[7], d1[7]}, [%0]          \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[7], d1[7]}, [%1]          \n"

-    "vst1.64     {d0}, [%2]                    \n"
-    "vst1.64     {d1}, [%4]                    \n"
+    MEMACCESS(3)
+    "vst1.64     {d0}, [%3]                    \n"
+    MEMACCESS(5)
+    "vst1.64     {d1}, [%5]                    \n"

    "4:                                        \n"

-    : "+r"(src),                 // %0
-      "+r"(src_stride),          // %1
-      "+r"(dst_a),               // %2
-      "+r"(dst_stride_a),        // %3
-      "+r"(dst_b),               // %4
-      "+r"(dst_stride_b),        // %5
-      "+r"(width)                // %6
-    : "r"(&kVTbl4x4TransposeDi)  // %7
-    : "memory", "cc", "r9",
+    : "=&r"(src_temp),           // %0
+      "+r"(src),                 // %1
+      "+r"(src_stride),          // %2
+      "+r"(dst_a),               // %3
+      "+r"(dst_stride_a),        // %4
+      "+r"(dst_b),               // %5
+      "+r"(dst_stride_b),        // %6
+      "+r"(width)                // %7
+    : "r"(&kVTbl4x4TransposeDi)  // %8
+    : "memory", "cc",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
 }
-#endif
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

 #ifdef __cplusplus
 }  // extern "C"
--- a/media/libyuv/source/rotate_neon64.cc
+++ b/media/libyuv/source/rotate_neon64.cc
@ -0,0 +1,543 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose =
+  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width) {
+  const uint8* src_temp;
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter gets to or below 0. starting the counter
+    // at w-8 allow for this
+    "sub         %3, %3, #8                      \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                          \n"
+      "mov         %0, %1                        \n"
+
+      MEMACCESS(0)
+      "ld1        {v0.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v1.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v2.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v3.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v4.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v5.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v6.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v7.8b}, [%0]                  \n"
+
+      "trn2     v16.8b, v0.8b, v1.8b             \n"
+      "trn1     v17.8b, v0.8b, v1.8b             \n"
+      "trn2     v18.8b, v2.8b, v3.8b             \n"
+      "trn1     v19.8b, v2.8b, v3.8b             \n"
+      "trn2     v20.8b, v4.8b, v5.8b             \n"
+      "trn1     v21.8b, v4.8b, v5.8b             \n"
+      "trn2     v22.8b, v6.8b, v7.8b             \n"
+      "trn1     v23.8b, v6.8b, v7.8b             \n"
+
+      "trn2     v3.4h, v17.4h, v19.4h            \n"
+      "trn1     v1.4h, v17.4h, v19.4h            \n"
+      "trn2     v2.4h, v16.4h, v18.4h            \n"
+      "trn1     v0.4h, v16.4h, v18.4h            \n"
+      "trn2     v7.4h, v21.4h, v23.4h            \n"
+      "trn1     v5.4h, v21.4h, v23.4h            \n"
+      "trn2     v6.4h, v20.4h, v22.4h            \n"
+      "trn1     v4.4h, v20.4h, v22.4h            \n"
+
+      "trn2     v21.2s, v1.2s, v5.2s             \n"
+      "trn1     v17.2s, v1.2s, v5.2s             \n"
+      "trn2     v20.2s, v0.2s, v4.2s             \n"
+      "trn1     v16.2s, v0.2s, v4.2s             \n"
+      "trn2     v23.2s, v3.2s, v7.2s             \n"
+      "trn1     v19.2s, v3.2s, v7.2s             \n"
+      "trn2     v22.2s, v2.2s, v6.2s             \n"
+      "trn1     v18.2s, v2.2s, v6.2s             \n"
+
+      "mov         %0, %2                        \n"
+
+    MEMACCESS(0)
+      "st1      {v17.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v16.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v19.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v18.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v21.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v20.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v23.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v22.8b}, [%0]                   \n"
+
+      "add         %1, %1, #8                    \n"  // src += 8
+      "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
+      "subs        %3, %3, #8                    \n"  // w   -= 8
+      "b.ge        1b                            \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %3, %3, #8                      \n"
+    "b.eq        4f                              \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp         %3, #2                          \n"
+    "b.lt        3f                              \n"
+
+    "cmp         %3, #4                          \n"
+    "b.lt        2f                              \n"
+
+    // 4x8 block
+    "mov         %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[3], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[3], [%0]                     \n"
+
+    "mov         %0, %2                          \n"
+
+    MEMACCESS(4)
+    "ld1      {v2.16b}, [%4]                     \n"
+
+    "tbl      v3.16b, {v0.16b}, v2.16b           \n"
+    "tbl      v0.16b, {v1.16b}, v2.16b           \n"
+
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
+    MEMACCESS(0)
+    "st1 {v3.s}[0], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[1], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[2], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[3], [%0]                         \n"
+
+    "add         %0, %2, #4                      \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[0], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[1], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[2], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[3], [%0]                         \n"
+
+    "add         %1, %1, #4                      \n"  // src += 4
+    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
+    "subs        %3, %3, #4                      \n"  // w   -= 4
+    "b.eq        4f                              \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %3, #2                          \n"
+    "b.lt        3f                              \n"
+
+    // 2x8 block
+    "2:                                          \n"
+    "mov         %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[3], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[3], [%0]                     \n"
+
+    "trn2    v2.8b, v0.8b, v1.8b                 \n"
+    "trn1    v3.8b, v0.8b, v1.8b                 \n"
+
+    "mov         %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1     {v3.8b}, [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1     {v2.8b}, [%0]                       \n"
+
+    "add         %1, %1, #2                      \n"  // src += 2
+    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
+    "subs        %3, %3,  #2                     \n"  // w   -= 2
+    "b.eq        4f                              \n"
+
+    // 1x8 block
+    "3:                                          \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[0], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[1], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[2], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[3], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[4], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[5], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[6], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[7], [%1]                 \n"
+
+    MEMACCESS(2)
+    "st1         {v0.8b}, [%2]                   \n"
+
+    "4:                                          \n"
+
+    : "=&r"(src_temp),                            // %0
+      "+r"(src),                                  // %1
+      "+r"(dst),                                  // %2
+      "+r"(width64)                               // %3
+    : "r"(&kVTbl4x4Transpose),                    // %4
+      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+  );
+}
+
+static uint8 kVTbl4x4TransposeDi[32] =
+  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
+    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) {
+  const uint8* src_temp;
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter gets to or below 0. starting the counter
+    // at w-8 allow for this
+    "sub       %4, %4, #8                      \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                        \n"
+    "mov       %0, %1                          \n"
+
+    MEMACCESS(0)
+    "ld1       {v0.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v1.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v2.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v3.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v4.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v5.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v6.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v7.16b}, [%0]                  \n"
+
+    "trn1      v16.16b, v0.16b, v1.16b         \n"
+    "trn2      v17.16b, v0.16b, v1.16b         \n"
+    "trn1      v18.16b, v2.16b, v3.16b         \n"
+    "trn2      v19.16b, v2.16b, v3.16b         \n"
+    "trn1      v20.16b, v4.16b, v5.16b         \n"
+    "trn2      v21.16b, v4.16b, v5.16b         \n"
+    "trn1      v22.16b, v6.16b, v7.16b         \n"
+    "trn2      v23.16b, v6.16b, v7.16b         \n"
+
+    "trn1      v0.8h, v16.8h, v18.8h           \n"
+    "trn2      v1.8h, v16.8h, v18.8h           \n"
+    "trn1      v2.8h, v20.8h, v22.8h           \n"
+    "trn2      v3.8h, v20.8h, v22.8h           \n"
+    "trn1      v4.8h, v17.8h, v19.8h           \n"
+    "trn2      v5.8h, v17.8h, v19.8h           \n"
+    "trn1      v6.8h, v21.8h, v23.8h           \n"
+    "trn2      v7.8h, v21.8h, v23.8h           \n"
+
+    "trn1      v16.4s, v0.4s, v2.4s            \n"
+    "trn2      v17.4s, v0.4s, v2.4s            \n"
+    "trn1      v18.4s, v1.4s, v3.4s            \n"
+    "trn2      v19.4s, v1.4s, v3.4s            \n"
+    "trn1      v20.4s, v4.4s, v6.4s            \n"
+    "trn2      v21.4s, v4.4s, v6.4s            \n"
+    "trn1      v22.4s, v5.4s, v7.4s            \n"
+    "trn2      v23.4s, v5.4s, v7.4s            \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v16.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v17.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v19.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v16.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v17.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v19.d}[1], [%0]                \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v20.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v22.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v21.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v23.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v20.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v22.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v21.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v23.d}[1], [%0]                \n"
+
+    "add       %1, %1, #16                     \n"  // src   += 8*2
+    "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
+    "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
+    "subs      %4, %4,  #8                     \n"  // w     -= 8
+    "b.ge      1b                              \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds      %4, %4, #8                      \n"
+    "b.eq      4f                              \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp       %4, #2                          \n"
+    "b.lt      3f                              \n"
+
+    "cmp       %4, #4                          \n"
+    "b.lt      2f                              \n"
+
+    // TODO(frkoenig): Clean this up
+    // 4x8 block
+    "mov       %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1       {v0.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v1.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v2.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v3.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v4.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v5.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v6.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v7.8b}, [%0]                   \n"
+
+    MEMACCESS(8)
+    "ld1       {v30.16b}, [%8], #16            \n"
+    "ld1       {v31.16b}, [%8]                 \n"
+
+    "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
+    "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
+    "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
+    "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v16.s}[0],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[1],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[2],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[3],  [%0], %6           \n"
+
+    "add       %0, %2, #4                      \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[2], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[3], [%0]                \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v17.s}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[2], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[3], [%0], %7            \n"
+
+    "add       %0, %3, #4                      \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[0],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[1],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[2],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[3],  [%0]               \n"
+
+    "add       %1, %1, #8                      \n"  // src   += 4 * 2
+    "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
+    "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
+    "subs      %4,  %4,  #4                    \n"  // w     -= 4
+    "b.eq      4f                              \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp       %4, #2                          \n"
+    "b.lt      3f                              \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov       %0, %1                          \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[3], [%0]           \n"
+
+    "trn1      v4.8b, v0.8b, v2.8b             \n"
+    "trn2      v5.8b, v0.8b, v2.8b             \n"
+    "trn1      v6.8b, v1.8b, v3.8b             \n"
+    "trn2      v7.8b, v1.8b, v3.8b             \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v4.d}[0], [%0], %6             \n"
+    MEMACCESS(0)
+    "st1       {v6.d}[0], [%0]                 \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v5.d}[0], [%0], %7             \n"
+    MEMACCESS(0)
+    "st1       {v7.d}[0], [%0]                 \n"
+
+    "add       %1, %1, #4                      \n"  // src   += 2 * 2
+    "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
+    "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
+    "subs      %4,  %4,  #2                    \n"  // w     -= 2
+    "b.eq      4f                              \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[7], [%1]           \n"
+
+    MEMACCESS(2)
+    "st1       {v0.d}[0], [%2]                 \n"
+    MEMACCESS(3)
+    "st1       {v1.d}[0], [%3]                 \n"
+
+    "4:                                        \n"
+
+    : "=&r"(src_temp),                            // %0
+      "+r"(src),                                  // %1
+      "+r"(dst_a),                                // %2
+      "+r"(dst_b),                                // %3
+      "+r"(width64)                               // %4
+    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
+      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
+      "r"(&kVTbl4x4TransposeDi)                   // %8
+    : "memory", "cc",
+      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+      "v30", "v31"
+  );
+}
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/media/libyuv/source/rotate_win.cc
+++ b/media/libyuv/source/rotate_win.cc
@ -0,0 +1,247 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+__declspec(naked)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  __asm {
+    push      edi
+    push      esi
+    push      ebp
+    mov       eax, [esp + 12 + 4]   // src
+    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       edx, [esp + 12 + 12]  // dst
+    mov       esi, [esp + 12 + 16]  // dst_stride
+    mov       ecx, [esp + 12 + 20]  // width
+
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    align      4
+ convertloop:
+    movq      xmm0, qword ptr [eax]
+    lea       ebp, [eax + 8]
+    movq      xmm1, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm0, xmm1
+    movq      xmm2, qword ptr [eax]
+    movdqa    xmm1, xmm0
+    palignr   xmm1, xmm1, 8
+    movq      xmm3, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm2, xmm3
+    movdqa    xmm3, xmm2
+    movq      xmm4, qword ptr [eax]
+    palignr   xmm3, xmm3, 8
+    movq      xmm5, qword ptr [eax + edi]
+    punpcklbw xmm4, xmm5
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm5, xmm4
+    movq      xmm6, qword ptr [eax]
+    palignr   xmm5, xmm5, 8
+    movq      xmm7, qword ptr [eax + edi]
+    punpcklbw xmm6, xmm7
+    mov       eax, ebp
+    movdqa    xmm7, xmm6
+    palignr   xmm7, xmm7, 8
+    // Second round of bit swap.
+    punpcklwd xmm0, xmm2
+    punpcklwd xmm1, xmm3
+    movdqa    xmm2, xmm0
+    movdqa    xmm3, xmm1
+    palignr   xmm2, xmm2, 8
+    palignr   xmm3, xmm3, 8
+    punpcklwd xmm4, xmm6
+    punpcklwd xmm5, xmm7
+    movdqa    xmm6, xmm4
+    movdqa    xmm7, xmm5
+    palignr   xmm6, xmm6, 8
+    palignr   xmm7, xmm7, 8
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    punpckldq xmm0, xmm4
+    movq      qword ptr [edx], xmm0
+    movdqa    xmm4, xmm0
+    palignr   xmm4, xmm4, 8
+    movq      qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    punpckldq xmm2, xmm6
+    movdqa    xmm6, xmm2
+    palignr   xmm6, xmm6, 8
+    movq      qword ptr [edx], xmm2
+    punpckldq xmm1, xmm5
+    movq      qword ptr [edx + esi], xmm6
+    lea       edx, [edx + 2 * esi]
+    movdqa    xmm5, xmm1
+    movq      qword ptr [edx], xmm1
+    palignr   xmm5, xmm5, 8
+    punpckldq xmm3, xmm7
+    movq      qword ptr [edx + esi], xmm5
+    lea       edx, [edx + 2 * esi]
+    movq      qword ptr [edx], xmm3
+    movdqa    xmm7, xmm3
+    palignr   xmm7, xmm7, 8
+    sub       ecx, 8
+    movq      qword ptr [edx + esi], xmm7
+    lea       edx, [edx + 2 * esi]
+    jg        convertloop
+
+    pop       ebp
+    pop       esi
+    pop       edi
+    ret
+  }
+}
+
+__declspec(naked)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int w) {
+  __asm {
+    push      ebx
+    push      esi
+    push      edi
+    push      ebp
+    mov       eax, [esp + 16 + 4]   // src
+    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       edx, [esp + 16 + 12]  // dst_a
+    mov       esi, [esp + 16 + 16]  // dst_stride_a
+    mov       ebx, [esp + 16 + 20]  // dst_b
+    mov       ebp, [esp + 16 + 24]  // dst_stride_b
+    mov       ecx, esp
+    sub       esp, 4 + 16
+    and       esp, ~15
+    mov       [esp + 16], ecx
+    mov       ecx, [ecx + 16 + 28]  // w
+
+    align      4
+ convertloop:
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    movdqu    xmm0, [eax]
+    movdqu    xmm1, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm0  // use xmm7 as temp register.
+    punpcklbw xmm0, xmm1
+    punpckhbw xmm7, xmm1
+    movdqa    xmm1, xmm7
+    movdqu    xmm2, [eax]
+    movdqu    xmm3, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm2
+    punpcklbw xmm2, xmm3
+    punpckhbw xmm7, xmm3
+    movdqa    xmm3, xmm7
+    movdqu    xmm4, [eax]
+    movdqu    xmm5, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm4
+    punpcklbw xmm4, xmm5
+    punpckhbw xmm7, xmm5
+    movdqa    xmm5, xmm7
+    movdqu    xmm6, [eax]
+    movdqu    xmm7, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqu    [esp], xmm5  // backup xmm5
+    neg       edi
+    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    punpcklbw xmm6, xmm7
+    punpckhbw xmm5, xmm7
+    movdqa    xmm7, xmm5
+    lea       eax, [eax + 8 * edi + 16]
+    neg       edi
+    // Second round of bit swap.
+    movdqa    xmm5, xmm0
+    punpcklwd xmm0, xmm2
+    punpckhwd xmm5, xmm2
+    movdqa    xmm2, xmm5
+    movdqa    xmm5, xmm1
+    punpcklwd xmm1, xmm3
+    punpckhwd xmm5, xmm3
+    movdqa    xmm3, xmm5
+    movdqa    xmm5, xmm4
+    punpcklwd xmm4, xmm6
+    punpckhwd xmm5, xmm6
+    movdqa    xmm6, xmm5
+    movdqu    xmm5, [esp]  // restore xmm5
+    movdqu    [esp], xmm6  // backup xmm6
+    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    punpcklwd xmm5, xmm7
+    punpckhwd xmm6, xmm7
+    movdqa    xmm7, xmm6
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    movdqa    xmm6, xmm0
+    punpckldq xmm0, xmm4
+    punpckhdq xmm6, xmm4
+    movdqa    xmm4, xmm6
+    movdqu    xmm6, [esp]  // restore xmm6
+    movlpd    qword ptr [edx], xmm0
+    movhpd    qword ptr [ebx], xmm0
+    movlpd    qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm4
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    punpckldq xmm2, xmm6
+    movlpd    qword ptr [edx], xmm2
+    movhpd    qword ptr [ebx], xmm2
+    punpckhdq xmm0, xmm6
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    punpckldq xmm1, xmm5
+    movlpd    qword ptr [edx], xmm1
+    movhpd    qword ptr [ebx], xmm1
+    punpckhdq xmm0, xmm5
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    punpckldq xmm3, xmm7
+    movlpd    qword ptr [edx], xmm3
+    movhpd    qword ptr [ebx], xmm3
+    punpckhdq xmm0, xmm7
+    sub       ecx, 8
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    jg        convertloop
+
+    mov       esp, [esp + 16]
+    pop       ebp
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+  }
+}
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/media/libyuv/source/row_any.cc
+++ b/media/libyuv/source/row_any.cc
--- a/media/libyuv/source/row_common.cc
+++ b/media/libyuv/source/row_common.cc
--- a/media/libyuv/source/row_posix.cc
+++ b/media/libyuv/source/row_posix.cc
--- a/media/libyuv/source/row_mips.cc
+++ b/media/libyuv/source/row_mips.cc
@ -16,13 +16,8 @@ extern "C" {
 #endif

 // The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
-
-#include <sgidefs.h>
-
-#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5)
-#define HAS_MIPS_PREFETCH 1
-#endif
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)

 #ifdef HAS_COPYROW_MIPS
 void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
@ -66,31 +61,23 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
    // we will use "pref 30,128(a1)", so "t0-160" is the limit
    "subu      $t9, $t0, 160                     \n"
-#ifdef HAS_MIPS_PREFETCH
    // t9 is the "last safe pref 30,128(a1)" address
    "pref      0, 0(%[src])                      \n"  // first line of src
    "pref      0, 32(%[src])                     \n"  // second line of src
    "pref      0, 64(%[src])                     \n"
    "pref      30, 32(%[dst])                    \n"
-#endif
    // In case the a1 > t9 don't use "pref 30" at all
    "sgtu      $v1, %[dst], $t9                  \n"
    "bgtz      $v1, $loop16w                     \n"
    "nop                                         \n"
    // otherwise, start with using pref30
-#ifdef HAS_MIPS_PREFETCH
    "pref      30, 64(%[dst])                    \n"
-#endif
    "$loop16w:                                    \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      0, 96(%[src])                     \n"
-#endif
    "lw        $t0, 0(%[src])                    \n"
    "bgtz      $v1, $skip_pref30_96              \n"  // skip
    "lw        $t1, 4(%[src])                    \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      30, 96(%[dst])                    \n"  // continue
-#endif
    "$skip_pref30_96:                            \n"
    "lw        $t2, 8(%[src])                    \n"
    "lw        $t3, 12(%[src])                   \n"
@ -98,9 +85,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    "lw        $t5, 20(%[src])                   \n"
    "lw        $t6, 24(%[src])                   \n"
    "lw        $t7, 28(%[src])                   \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      0, 128(%[src])                    \n"
-#endif
    //  bring the next lines of src, addr 128
    "sw        $t0, 0(%[dst])                    \n"
    "sw        $t1, 4(%[dst])                    \n"
@ -113,9 +98,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    "lw        $t0, 32(%[src])                   \n"
    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
    "lw        $t1, 36(%[src])                   \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
-#endif
    "$skip_pref30_128:                           \n"
    "lw        $t2, 40(%[src])                   \n"
    "lw        $t3, 44(%[src])                   \n"
@ -123,9 +106,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    "lw        $t5, 52(%[src])                   \n"
    "lw        $t6, 56(%[src])                   \n"
    "lw        $t7, 60(%[src])                   \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      0, 160(%[src])                    \n"
-#endif
    // bring the next lines of src, addr 160
    "sw        $t0, 32(%[dst])                   \n"
    "sw        $t1, 36(%[dst])                   \n"
@ -145,9 +126,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    // Here we have src and dest word-aligned but less than 64-bytes to go

    "chk8w:                                      \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      0, 0x0(%[src])                    \n"
-#endif
    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
    // the t8 is the reminder count past 32-bytes
    "beq       %[count], $t8, chk1w              \n"
@ -235,12 +214,10 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
    "subu      $t9, $t0, 160                     \n"
    // t9 is the "last safe pref 30,128(a1)" address
-#ifdef HAS_MIPS_PREFETCH
    "pref      0, 0(%[src])                      \n"  // first line of src
    "pref      0, 32(%[src])                     \n"  // second line  addr 32
    "pref      0, 64(%[src])                     \n"
    "pref      30, 32(%[dst])                    \n"
-#endif
    // safe, as we have at least 64 bytes ahead
    // In case the a1 > t9 don't use "pref 30" at all
    "sgtu      $v1, %[dst], $t9                  \n"
@ -248,21 +225,15 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    // skip "pref 30,64(a1)" for too short arrays
    " nop                                        \n"
    // otherwise, start with using pref30
-#ifdef HAS_MIPS_PREFETCH
    "pref      30, 64(%[dst])                    \n"
-#endif
    "$ua_loop16w:                                \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      0, 96(%[src])                     \n"
-#endif
    "lwr       $t0, 0(%[src])                    \n"
    "lwl       $t0, 3(%[src])                    \n"
    "lwr       $t1, 4(%[src])                    \n"
    "bgtz      $v1, $ua_skip_pref30_96           \n"
    " lwl      $t1, 7(%[src])                    \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      30, 96(%[dst])                    \n"
-#endif
    // continue setting up the dest, addr 96
    "$ua_skip_pref30_96:                         \n"
    "lwr       $t2, 8(%[src])                    \n"
@ -277,9 +248,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    "lwl       $t6, 27(%[src])                   \n"
    "lwr       $t7, 28(%[src])                   \n"
    "lwl       $t7, 31(%[src])                   \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      0, 128(%[src])                    \n"
-#endif
    // bring the next lines of src, addr 128
    "sw        $t0, 0(%[dst])                    \n"
    "sw        $t1, 4(%[dst])                    \n"
@ -294,9 +263,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    "lwr       $t1, 36(%[src])                   \n"
    "bgtz      $v1, ua_skip_pref30_128           \n"
    " lwl      $t1, 39(%[src])                   \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      30, 128(%[dst])                   \n"
-#endif
    // continue setting up the dest, addr 128
    "ua_skip_pref30_128:                         \n"

@ -312,9 +279,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    "lwl       $t6, 59(%[src])                   \n"
    "lwr       $t7, 60(%[src])                   \n"
    "lwl       $t7, 63(%[src])                   \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      0, 160(%[src])                    \n"
-#endif
    // bring the next lines of src, addr 160
    "sw        $t0, 32(%[dst])                   \n"
    "sw        $t1, 36(%[dst])                   \n"
@ -334,9 +299,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
    // Here we have src and dest word-aligned but less than 64-bytes to go

    "ua_chk8w:                                   \n"
-#ifdef HAS_MIPS_PREFETCH
    "pref      0, 0x0(%[src])                    \n"
-#endif
    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
    // the t8 is the reminder count
    "beq       %[count], $t8, $ua_chk1w          \n"
@ -412,11 +375,13 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
 }
 #endif  // HAS_COPYROW_MIPS

-// MIPS DSPR2 functions
+// DSPR2 functions
 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
-    (__mips_dsp_rev >= 2)
-void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                           int width) {
+    (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+
+void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width) {
  __asm__ __volatile__ (
    ".set push                                     \n"
    ".set noreorder                                \n"
@ -424,7 +389,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
    "blez            $t4, 2f                       \n"
    " andi           %[width], %[width], 0xf       \n"  // residual

-    ".p2align        2                             \n"
  "1:                                              \n"
    "addiu           $t4, $t4, -1                  \n"
    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
@ -482,90 +446,7 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  );
 }

-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
-                                     uint8* dst_v, int width) {
-  __asm__ __volatile__ (
-    ".set push                                     \n"
-    ".set noreorder                                \n"
-    "srl             $t4, %[width], 4              \n"  // multiplies of 16
-    "blez            $t4, 2f                       \n"
-    " andi           %[width], %[width], 0xf       \n"  // residual
-
-    ".p2align        2                             \n"
-  "1:                                              \n"
-    "addiu           $t4, $t4, -1                  \n"
-    "lwr             $t0, 0(%[src_uv])             \n"
-    "lwl             $t0, 3(%[src_uv])             \n"  // V1 | U1 | V0 | U0
-    "lwr             $t1, 4(%[src_uv])             \n"
-    "lwl             $t1, 7(%[src_uv])             \n"  // V3 | U3 | V2 | U2
-    "lwr             $t2, 8(%[src_uv])             \n"
-    "lwl             $t2, 11(%[src_uv])            \n"  // V5 | U5 | V4 | U4
-    "lwr             $t3, 12(%[src_uv])            \n"
-    "lwl             $t3, 15(%[src_uv])            \n"  // V7 | U7 | V6 | U6
-    "lwr             $t5, 16(%[src_uv])            \n"
-    "lwl             $t5, 19(%[src_uv])            \n"  // V9 | U9 | V8 | U8
-    "lwr             $t6, 20(%[src_uv])            \n"
-    "lwl             $t6, 23(%[src_uv])            \n"  // V11 | U11 | V10 | U10
-    "lwr             $t7, 24(%[src_uv])            \n"
-    "lwl             $t7, 27(%[src_uv])            \n"  // V13 | U13 | V12 | U12
-    "lwr             $t8, 28(%[src_uv])            \n"
-    "lwl             $t8, 31(%[src_uv])            \n"  // V15 | U15 | V14 | U14
-    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
-    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
-    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
-    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
-    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
-    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
-    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
-    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
-    "addiu           %[src_uv], %[src_uv], 32      \n"
-    "swr             $t9, 0(%[dst_v])              \n"
-    "swl             $t9, 3(%[dst_v])              \n"
-    "swr             $t0, 0(%[dst_u])              \n"
-    "swl             $t0, 3(%[dst_u])              \n"
-    "swr             $t1, 4(%[dst_v])              \n"
-    "swl             $t1, 7(%[dst_v])              \n"
-    "swr             $t2, 4(%[dst_u])              \n"
-    "swl             $t2, 7(%[dst_u])              \n"
-    "swr             $t3, 8(%[dst_v])              \n"
-    "swl             $t3, 11(%[dst_v])             \n"
-    "swr             $t5, 8(%[dst_u])              \n"
-    "swl             $t5, 11(%[dst_u])             \n"
-    "swr             $t6, 12(%[dst_v])             \n"
-    "swl             $t6, 15(%[dst_v])             \n"
-    "swr             $t7, 12(%[dst_u])             \n"
-    "swl             $t7, 15(%[dst_u])             \n"
-    "addiu           %[dst_u], %[dst_u], 16        \n"
-    "bgtz            $t4, 1b                       \n"
-    " addiu          %[dst_v], %[dst_v], 16        \n"
-
-    "beqz            %[width], 3f                  \n"
-    " nop                                          \n"
-
-  "2:                                              \n"
-    "lbu             $t0, 0(%[src_uv])             \n"
-    "lbu             $t1, 1(%[src_uv])             \n"
-    "addiu           %[src_uv], %[src_uv], 2       \n"
-    "addiu           %[width], %[width], -1        \n"
-    "sb              $t0, 0(%[dst_u])              \n"
-    "sb              $t1, 0(%[dst_v])              \n"
-    "addiu           %[dst_u], %[dst_u], 1         \n"
-    "bgtz            %[width], 2b                  \n"
-    " addiu          %[dst_v], %[dst_v], 1         \n"
-
-  "3:                                              \n"
-    ".set pop                                      \n"
-     : [src_uv] "+r" (src_uv),
-       [width] "+r" (width),
-       [dst_u] "+r" (dst_u),
-       [dst_v] "+r" (dst_v)
-     :
-     : "t0", "t1", "t2", "t3",
-     "t4", "t5", "t6", "t7", "t8", "t9"
-  );
-}
-
-void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
  __asm__ __volatile__ (
    ".set push                             \n"
    ".set noreorder                        \n"
@ -575,7 +456,6 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
    "blez      $t4, 2f                     \n"
    " addu     %[src], %[src], %[width]    \n"  // src += width

-    ".p2align  2                           \n"
   "1:                                     \n"
    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
@ -616,10 +496,10 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
  );
 }

-void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                            int width) {
-  int x = 0;
-  int y = 0;
+void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                       int width) {
+  int x;
+  int y;
  __asm__ __volatile__ (
    ".set push                                    \n"
    ".set noreorder                               \n"
@ -630,7 +510,6 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
    "blez            %[x], 2f                     \n"
    " addu           %[src_uv], %[src_uv], $t4    \n"

-    ".p2align        2                            \n"
   "1:                                            \n"
    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
@ -700,7 +579,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
        [dst_u] "+r" (dst_u),
        [dst_v] "+r" (dst_v),
        [x] "=&r" (x),
-        [y] "+r" (y)
+        [y] "=&r" (y)
      : [width] "r" (width)
      : "t0", "t1", "t2", "t3", "t4",
      "t5", "t7", "t8", "t9"
@ -714,7 +593,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 // t8 = | 0 | G1 | 0 | g1 |
 // t2 = | 0 | R0 | 0 | r0 |
 // t1 = | 0 | R1 | 0 | r1 |
-#define I422ToTransientMipsRGB                                                 \
+#define YUVTORGB                                                               \
      "lw                $t0, 0(%[y_buf])       \n"                            \
      "lhu               $t1, 0(%[u_buf])       \n"                            \
      "lhu               $t2, 0(%[v_buf])       \n"                            \
@ -773,11 +652,13 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
      "addu.ph           $t2, $t2, $s5          \n"                            \
      "addu.ph           $t1, $t1, $s5          \n"

-void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width) {
+// TODO(fbarchard): accept yuv conversion constants.
+void I422ToARGBRow_DSPR2(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
  __asm__ __volatile__ (
    ".set push                                \n"
    ".set noreorder                           \n"
@ -791,9 +672,8 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
    "lui               $s6, 0xff00            \n"
    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|ff|

-    ".p2align          2                      \n"
   "1:                                        \n"
-      I422ToTransientMipsRGB
+      YUVTORGB
 // Arranging into argb format
    "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|
    "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|
@ -835,136 +715,10 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
  );
 }

-void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width) {
-  __asm__ __volatile__ (
-    ".set push                                \n"
-    ".set noreorder                           \n"
-    "beqz              %[width], 2f           \n"
-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
-    "repl.ph           $s5, 128               \n"  // |128|128|
-    "lui               $s6, 0xff00            \n"
-    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|
-
-    ".p2align          2                       \n"
-   "1:                                         \n"
-      I422ToTransientMipsRGB
-// Arranging into abgr format
-    "precr.qb.ph      $t0, $t8, $t1           \n"  // |G1|g1|R1|r1|
-    "precr.qb.ph      $t3, $t9, $t2           \n"  // |G0|g0|R0|r0|
-    "precrq.qb.ph     $t8, $t0, $t3           \n"  // |G1|R1|G0|R0|
-    "precr.qb.ph      $t9, $t0, $t3           \n"  // |g1|r1|g0|r0|
-
-    "precr.qb.ph       $t2, $t4, $t5          \n"  // |B1|b1|B0|b0|
-    "addiu             %[width], -4           \n"
-    "addiu             %[y_buf], 4            \n"
-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |B1|0 |B0|
-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |b1|0 |b0|
-    "or                $t1, $t1, $s6          \n"  // |ff|B1|ff|B0|
-    "or                $t2, $t2, $s6          \n"  // |ff|b1|ff|b0|
-    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|b1|g1|r1|
-    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|B1|G1|R1|
-    "sll               $t9, $t9, 16           \n"
-    "sll               $t8, $t8, 16           \n"
-    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|b0|g0|r0|
-    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|B0|G0|R0|
-// Store results.
-    "sw                $t2, 0(%[rgb_buf])     \n"
-    "sw                $t0, 4(%[rgb_buf])     \n"
-    "sw                $t1, 8(%[rgb_buf])     \n"
-    "sw                $t3, 12(%[rgb_buf])    \n"
-    "bnez              %[width], 1b           \n"
-    " addiu            %[rgb_buf], 16         \n"
-   "2:                                        \n"
-    ".set pop                                 \n"
-      :[y_buf] "+r" (y_buf),
-       [u_buf] "+r" (u_buf),
-       [v_buf] "+r" (v_buf),
-       [width] "+r" (width),
-       [rgb_buf] "+r" (rgb_buf)
-      :
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-      "t6", "t7", "t8", "t9",
-      "s0", "s1", "s2", "s3",
-      "s4", "s5", "s6"
-  );
-}
-
-void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width) {
-  __asm__ __volatile__ (
-    ".set push                                \n"
-    ".set noreorder                           \n"
-    "beqz              %[width], 2f           \n"
-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74 |74 |
-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
-    "repl.ph           $s5, 128               \n"  // |128|128|
-    "lui               $s6, 0xff              \n"
-    "ori               $s6, 0xff              \n"  // |00|ff|00|ff|
-
-    ".p2align          2                      \n"
-   "1:                                        \n"
-      I422ToTransientMipsRGB
-      // Arranging into bgra format
-    "precr.qb.ph       $t4, $t4, $t8          \n"  // |B1|b1|G1|g1|
-    "precr.qb.ph       $t5, $t5, $t9          \n"  // |B0|b0|G0|g0|
-    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |B1|G1|B0|G0|
-    "precr.qb.ph       $t9, $t4, $t5          \n"  // |b1|g1|b0|g0|
-
-    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
-    "addiu             %[width], -4           \n"
-    "addiu             %[y_buf], 4            \n"
-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
-    "sll               $t1, $t1, 8            \n"  // |R1|0 |R0|0 |
-    "sll               $t2, $t2, 8            \n"  // |r1|0 |r0|0 |
-    "or                $t1, $t1, $s6          \n"  // |R1|ff|R0|ff|
-    "or                $t2, $t2, $s6          \n"  // |r1|ff|r0|ff|
-    "precrq.ph.w       $t0, $t9, $t2          \n"  // |b1|g1|r1|ff|
-    "precrq.ph.w       $t3, $t8, $t1          \n"  // |B1|G1|R1|ff|
-    "sll               $t1, $t1, 16           \n"
-    "sll               $t2, $t2, 16           \n"
-    "packrl.ph         $t2, $t9, $t2          \n"  // |b0|g0|r0|ff|
-    "packrl.ph         $t1, $t8, $t1          \n"  // |B0|G0|R0|ff|
-// Store results.
-    "sw                $t2, 0(%[rgb_buf])     \n"
-    "sw                $t0, 4(%[rgb_buf])     \n"
-    "sw                $t1, 8(%[rgb_buf])     \n"
-    "sw                $t3, 12(%[rgb_buf])    \n"
-    "bnez              %[width], 1b           \n"
-    " addiu            %[rgb_buf], 16         \n"
-   "2:                                        \n"
-    ".set pop                                 \n"
-      :[y_buf] "+r" (y_buf),
-       [u_buf] "+r" (u_buf),
-       [v_buf] "+r" (v_buf),
-       [width] "+r" (width),
-       [rgb_buf] "+r" (rgb_buf)
-      :
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-      "t6", "t7", "t8", "t9",
-      "s0", "s1", "s2", "s3",
-      "s4", "s5", "s6"
-  );
-}
-
 // Bilinear filter 8x2 -> 8x1
-void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                                ptrdiff_t src_stride, int dst_width,
-                                int source_y_fraction) {
+void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
    int y0_fraction = 256 - source_y_fraction;
    const uint8* src_ptr1 = src_ptr + src_stride;

@ -975,7 +729,6 @@ void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
     "replv.ph          $t0, %[y0_fraction]               \n"
     "replv.ph          $t1, %[source_y_fraction]         \n"

-    ".p2align           2                                 \n"
   "1:                                                    \n"
     "lw                $t2, 0(%[src_ptr])                \n"
     "lw                $t3, 0(%[src_ptr1])               \n"
--- a/media/libyuv/source/row_neon.cc
+++ b/media/libyuv/source/row_neon.cc
--- a/media/libyuv/source/row_neon64.cc
+++ b/media/libyuv/source/row_neon64.cc
--- a/media/libyuv/source/row_win.cc
+++ b/media/libyuv/source/row_win.cc
--- a/media/libyuv/source/row_x86.asm
+++ b/media/libyuv/source/row_x86.asm
@ -1,146 +0,0 @@
-;
-; Copyright 2012 The LibYuv Project Authors. All rights reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%ifdef __YASM_VERSION_ID__
-%if __YASM_VERSION_ID__ < 01020000h
-%error AVX2 is supported only by yasm 1.2.0 or later.
-%endif
-%endif
-%include "x86inc.asm"
-
-SECTION .text
-
-; cglobal numeric constants are parameters, gpr regs, mm regs
-
-; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
-
-%macro YUY2TOYROW 2-3
-cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
-%ifidn %1,YUY2
-    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
-    psrlw      m2, m2, 8
-%endif
-
-    ALIGN      4
-.convertloop:
-    mov%2      m0, [src_yuy2q]
-    mov%2      m1, [src_yuy2q + mmsize]
-    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
-%ifidn %1,YUY2
-    pand       m0, m0, m2   ; YUY2 even bytes are Y
-    pand       m1, m1, m2
-%else
-    psrlw      m0, m0, 8    ; UYVY odd bytes are Y
-    psrlw      m1, m1, 8
-%endif
-    packuswb   m0, m0, m1
-%if cpuflag(AVX2)
-    vpermq     m0, m0, 0xd8
-%endif
-    sub        pixd, mmsize
-    mov%2      [dst_yq], m0
-    lea        dst_yq, [dst_yq + mmsize]
-    jg         .convertloop
-    REP_RET
-%endmacro
-
-; TODO(fbarchard): Remove MMX.  Add SSSE3 pshufb version.
-INIT_MMX MMX
-YUY2TOYROW YUY2,a,
-YUY2TOYROW YUY2,u,_Unaligned
-YUY2TOYROW UYVY,a,
-YUY2TOYROW UYVY,u,_Unaligned
-INIT_XMM SSE2
-YUY2TOYROW YUY2,a,
-YUY2TOYROW YUY2,u,_Unaligned
-YUY2TOYROW UYVY,a,
-YUY2TOYROW UYVY,u,_Unaligned
-INIT_YMM AVX2
-YUY2TOYROW YUY2,a,
-YUY2TOYROW UYVY,a,
-
-; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
-
-%macro SplitUVRow 1-2
-cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
-    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
-    psrlw      m4, m4, 8
-    sub        dst_vq, dst_uq
-
-    ALIGN      4
-.convertloop:
-    mov%1      m0, [src_uvq]
-    mov%1      m1, [src_uvq + mmsize]
-    lea        src_uvq, [src_uvq + mmsize * 2]
-    psrlw      m2, m0, 8         ; odd bytes
-    psrlw      m3, m1, 8
-    pand       m0, m0, m4        ; even bytes
-    pand       m1, m1, m4
-    packuswb   m0, m0, m1
-    packuswb   m2, m2, m3
-%if cpuflag(AVX2)
-    vpermq     m0, m0, 0xd8
-    vpermq     m2, m2, 0xd8
-%endif
-    mov%1      [dst_uq], m0
-    mov%1      [dst_uq + dst_vq], m2
-    lea        dst_uq, [dst_uq + mmsize]
-    sub        pixd, mmsize
-    jg         .convertloop
-    REP_RET
-%endmacro
-
-INIT_MMX MMX
-SplitUVRow a,
-SplitUVRow u,_Unaligned
-INIT_XMM SSE2
-SplitUVRow a,
-SplitUVRow u,_Unaligned
-INIT_YMM AVX2
-SplitUVRow a,
-
-; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-;                      int width);
-
-%macro MergeUVRow_ 1-2
-cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
-    sub        src_vq, src_uq
-
-    ALIGN      4
-.convertloop:
-    mov%1      m0, [src_uq]
-    mov%1      m1, [src_vq]
-    lea        src_uq, [src_uq + mmsize]
-    punpcklbw  m2, m0, m1       // first 8 UV pairs
-    punpckhbw  m0, m0, m1       // next 8 UV pairs
-%if cpuflag(AVX2)
-    vperm2i128 m1, m2, m0, 0x20  // low 128 of ymm2 and low 128 of ymm0
-    vperm2i128 m2, m2, m0, 0x31  // high 128 of ymm2 and high 128 of ymm0
-    mov%1      [dst_uvq], m1
-    mov%1      [dst_uvq + mmsize], m2
-%else
-    mov%1      [dst_uvq], m2
-    mov%1      [dst_uvq + mmsize], m0
-%endif
-    lea        dst_uvq, [dst_uvq + mmsize * 2]
-    sub        pixd, mmsize
-    jg         .convertloop
-    REP_RET
-%endmacro
-
-INIT_MMX MMX
-MergeUVRow_ a,
-MergeUVRow_ u,_Unaligned
-INIT_XMM SSE2
-MergeUVRow_ a,
-MergeUVRow_ u,_Unaligned
-INIT_YMM AVX2
-MergeUVRow_ a,
-
--- a/media/libyuv/source/scale.cc
+++ b/media/libyuv/source/scale.cc
--- a/media/libyuv/source/scale_any.cc
+++ b/media/libyuv/source/scale_any.cc
@ -0,0 +1,221 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
+                 int dst_width, int x, int dx) {                               \
+      int n = dst_width & ~MASK;                                               \
+      if (n > 0) {                                                             \
+        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
+      }                                                                        \
+      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
+             dst_width & MASK, x + n * dx, dx);                                \
+    }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C, 4, 3)
+#endif
+#undef CANY
+
+// Fixed scale down.
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
+                     dst_ptr + n * BPP, r);                                    \
+    }
+
+// Fixed scale down for odd source width.  Used by I420Blend subsampling.
+// Since dst_width is (width + 1) / 2, this function scales one less pixel
+// and copies the last pixel.
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1));               \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
+                     dst_ptr + n * BPP, r);                                    \
+    }
+
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
+      ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
+      2, 1, 15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
+      ScaleRowDown2Linear_C, 2, 1, 31)
+SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
+      2, 1, 31)
+SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
+      2, 1, 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
+      ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_C, 2, 1, 15)
+SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_SSSE3
+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
+      4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
+      4, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
+      4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
+      ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
+      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
+      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
+      ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
+      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
+      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
+      ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
+      ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
+      ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+#endif
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
+      ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
+      ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
+      ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
+      ScaleARGBRowDown2_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
+      ScaleARGBRowDown2Linear_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
+      ScaleARGBRowDown2Box_C, 2, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
+      ScaleARGBRowDown2_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
+      ScaleARGBRowDown2Linear_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
+      ScaleARGBRowDown2Box_C, 2, 4, 7)
+#endif
+#undef SDANY
+
+// Scale down by even scale factor.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)          \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,    \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);         \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride,              \
+                     src_stepx, dst_ptr + n * BPP, r);                         \
+    }
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
+       ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
+       ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
+       ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
+       ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+
+// Add rows box filter scale down.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)                  \
+  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) {         \
+      int n = src_width & ~MASK;                                               \
+      if (n > 0) {                                                             \
+        SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                                 \
+      }                                                                        \
+      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);               \
+    }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif
+#undef SAANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+
+
+
+
--- a/media/libyuv/source/scale_argb.cc
+++ b/media/libyuv/source/scale_argb.cc
@ -53,18 +53,27 @@ static void ScaleARGBDown2(int src_width, int src_height,
  }

 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
-        ScaleARGBRowDown2Box_SSE2);
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+        ScaleARGBRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+          ScaleARGBRowDown2Box_SSE2);
+    }
  }
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
-    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
-        ScaleARGBRowDown2_NEON;
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+        ScaleARGBRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+          ScaleARGBRowDown2Box_NEON);
+    }
  }
 #endif

@ -88,7 +97,7 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
                              int x, int dx, int y, int dy) {
  int j;
  // Allocate 2 rows of ARGB.
-  const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
+  const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
  align_buffer_64(row, kRowSize * 2);
  int row_stride = src_stride * (dy >> 16);
  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
@ -98,17 +107,22 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
  assert(dx == 65536 * 4);  // Test scale factor of 4.
  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
-  }
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
-    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+    }
  }
 #endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+    }
+  }
+#endif
+
  for (j = 0; j < dst_height; ++j) {
    ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
@ -139,16 +153,23 @@ static void ScaleARGBDownEven(int src_width, int src_height,
  assert(IS_ALIGNED(src_height, 2));
  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
-        ScaleARGBRowDownEven_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+        ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+          ScaleARGBRowDownEven_SSE2;
+    }
  }
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_argb, 4)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
-        ScaleARGBRowDownEven_NEON;
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+        ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+          ScaleARGBRowDownEven_NEON;
+    }
  }
 #endif

@ -170,42 +191,35 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
                                  int x, int dx, int y, int dy,
                                  enum FilterMode filtering) {
  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
  int64 xlast = x + (int64)(dst_width - 1) * dx;
  int64 xl = (dx >= 0) ? x : xlast;
  int64 xr = (dx >= 0) ? xlast : x;
  int clip_src_width;
  xl = (xl >> 16) & ~3;  // Left edge aligned.
-  xr = (xr >> 16) + 1;  // Right most pixel used.
-  clip_src_width = (((xr - xl) + 1 + 3) & ~3) * 4;  // Width aligned to 4.
+  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
+  if (xr > src_width) {
+    xr = src_width;
+  }
+  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
  src_argb += xl * 4;
  x -= (int)(xl << 16);
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(clip_src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
-    }
-  }
-#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
    if (IS_ALIGNED(clip_src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
    if (IS_ALIGNED(clip_src_width, 32)) {
      InterpolateRow = InterpolateRow_AVX2;
@ -213,52 +227,62 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
    InterpolateRow = InterpolateRow_Any_NEON;
    if (IS_ALIGNED(clip_src_width, 16)) {
      InterpolateRow = InterpolateRow_NEON;
    }
  }
 #endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    InterpolateRow = InterpolateRow_Any_DSPR2;
    if (IS_ALIGNED(clip_src_width, 4)) {
-      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+      InterpolateRow = InterpolateRow_DSPR2;
    }
  }
 #endif
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
-      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
 #endif
  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
  // Allocate a row of ARGB.
-  align_buffer_64(row, clip_src_width * 4);
+  {
+    align_buffer_64(row, clip_src_width * 4);

-  const int max_y = (src_height - 1) << 16;
-  for (j = 0; j < dst_height; ++j) {
+    const int max_y = (src_height - 1) << 16;
    if (y > max_y) {
      y = max_y;
    }
-    int yi = y >> 16;
-    const uint8* src = src_argb + yi * src_stride;
-    if (filtering == kFilterLinear) {
-      ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
-    } else {
-      int yf = (y >> 8) & 255;
-      InterpolateRow(row, src, src_stride, clip_src_width, yf);
-      ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+    for (j = 0; j < dst_height; ++j) {
+      int yi = y >> 16;
+      const uint8* src = src_argb + yi * src_stride;
+      if (filtering == kFilterLinear) {
+        ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(row, src, src_stride, clip_src_width, yf);
+        ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+      }
+      dst_argb += dst_stride;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
    }
-    dst_argb += dst_stride;
-    y += dy;
+    free_aligned_buffer_64(row);
  }
-  free_aligned_buffer_64(row);
 }

 // Scale ARGB up with bilinear interpolation.
@ -275,30 +299,17 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
      int dst_width, int x, int dx) =
      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
-    }
-  }
-#endif
+  const int max_y = (src_height - 1) << 16;
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
    if (IS_ALIGNED(dst_width, 8)) {
      InterpolateRow = InterpolateRow_AVX2;
@ -306,17 +317,17 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasNEON)) {
    InterpolateRow = InterpolateRow_Any_NEON;
    if (IS_ALIGNED(dst_width, 4)) {
      InterpolateRow = InterpolateRow_NEON;
    }
  }
 #endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    InterpolateRow = InterpolateRow_DSPR2;
  }
 #endif
  if (src_width >= 32768) {
@ -328,70 +339,86 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
  }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
 #endif
  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
    }
 #endif
  }

-  const int max_y = (src_height - 1) << 16;
  if (y > max_y) {
    y = max_y;
  }
-  int yi = y >> 16;
-  const uint8* src = src_argb + yi * src_stride;

-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (dst_width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
+  {
+    int yi = y >> 16;
+    const uint8* src = src_argb + yi * src_stride;

-  uint8* rowptr = row;
-  int rowstride = kRowSize;
-  int lasty = yi;
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (dst_width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);

-  ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
-  if (src_height > 1) {
+    uint8* rowptr = row;
+    int rowstride = kRowSize;
+    int lasty = yi;
+
+    ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {
+      src += src_stride;
+    }
+    ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
    src += src_stride;
-  }
-  ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
-  src += src_stride;

-  for (j = 0; j < dst_height; ++j) {
-    yi = y >> 16;
-    if (yi != lasty) {
-      if (y > max_y) {
-        y = max_y;
-        yi = y >> 16;
-        src = src_argb + yi * src_stride;
-      }
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
      if (yi != lasty) {
-        ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
-        rowptr += rowstride;
-        rowstride = -rowstride;
-        lasty = yi;
-        src += src_stride;
+        if (y > max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_argb + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+      }
+      dst_argb += dst_stride;
+      y += dy;
    }
-    if (filtering == kFilterLinear) {
-      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
-    } else {
-      int yf = (y >> 8) & 255;
-      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
-    }
-    dst_argb += dst_stride;
-    y += dy;
+    free_aligned_buffer_64(row);
  }
-  free_aligned_buffer_64(row);
 }

 #ifdef YUVSCALEUP
@ -415,18 +442,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
                        uint8* rgb_buf,
                        int width) = I422ToARGBRow_C;
 #if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(src_width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
    }
  }
 #endif
 #if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
    if (IS_ALIGNED(src_width, 16)) {
      I422ToARGBRow = I422ToARGBRow_AVX2;
@ -434,50 +458,36 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
  }
 #endif
 #if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
    I422ToARGBRow = I422ToARGBRow_Any_NEON;
    if (IS_ALIGNED(src_width, 8)) {
      I422ToARGBRow = I422ToARGBRow_NEON;
    }
  }
 #endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+    I422ToARGBRow = I422ToARGBRow_DSPR2;
  }
 #endif

  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
      InterpolateRow_C;
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
-    }
-  }
-#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
    if (IS_ALIGNED(dst_width, 8)) {
      InterpolateRow = InterpolateRow_AVX2;
@ -485,17 +495,17 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasNEON)) {
    InterpolateRow = InterpolateRow_Any_NEON;
    if (IS_ALIGNED(dst_width, 4)) {
      InterpolateRow = InterpolateRow_NEON;
    }
  }
 #endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    InterpolateRow = InterpolateRow_DSPR2;
  }
 #endif

@ -511,17 +521,31 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
  }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
 #endif
  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
    }
 #endif
@ -539,7 +563,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
  const uint8* src_row_v = src_v + uv_yi * src_stride_v;

  // Allocate 2 rows of ARGB.
-  const int kRowSize = (dst_width * 4 + 15) & ~15;
+  const int kRowSize = (dst_width * 4 + 31) & ~31;
  align_buffer_64(row, kRowSize * 2);

  // Allocate 1 row of ARGB for source conversion.
@ -624,13 +648,19 @@ static void ScaleARGBSimple(int src_width, int src_height,
  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
    ScaleARGBCols = ScaleARGBCols_SSE2;
  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBCols_NEON;
+    }
+  }
 #endif
  if (src_width * 2 == dst_width && x < 0x8000) {
    ScaleARGBCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
    }
 #endif
@ -764,6 +794,7 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
  if (!src_argb || src_width == 0 || src_height == 0 ||
      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
      clip_x < 0 || clip_y < 0 ||
+      clip_width > 32768 || clip_height > 32768 ||
      (clip_x + clip_width) > dst_width ||
      (clip_y + clip_height) > dst_height) {
    return -1;
@ -782,6 +813,7 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
              int dst_width, int dst_height,
              enum FilterMode filtering) {
  if (!src_argb || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
      !dst_argb || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
@ -791,6 +823,36 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
  return 0;
 }

+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint32 src_fourcc,
+                       int src_width, int src_height,
+                       uint8* dst_argb, int dst_stride_argb,
+                       uint32 dst_fourcc,
+                       int dst_width, int dst_height,
+                       int clip_x, int clip_y, int clip_width, int clip_height,
+                       enum FilterMode filtering) {
+  uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
+  int r;
+  I420ToARGB(src_y, src_stride_y,
+             src_u, src_stride_u,
+             src_v, src_stride_v,
+             argb_buffer, src_width * 4,
+             src_width, src_height);
+
+  r = ARGBScaleClip(argb_buffer, src_width * 4,
+                    src_width, src_height,
+                    dst_argb, dst_stride_argb,
+                    dst_width, dst_height,
+                    clip_x, clip_y, clip_width, clip_height,
+                    filtering);
+  free(argb_buffer);
+  return r;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/media/libyuv/source/scale_common.cc
+++ b/media/libyuv/source/scale_common.cc
@ -42,6 +42,20 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }

+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[1];
+    dst[1] = src_ptr[3];
+    dst += 2;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[1];
+  }
+}
+
 void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
@ -57,6 +71,21 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }

+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                              uint16* dst, int dst_width) {
+  const uint16* s = src_ptr;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+    dst[1] = (s[2] + s[3] + 1) >> 1;
+    dst += 2;
+    s += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+  }
+}
+
 void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
@ -74,6 +103,45 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }

+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  int x;
+  dst_width -= 1;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+    dst += 2;
+    s += 4;
+    t += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst += 1;
+    s += 2;
+    t += 2;
+  }
+  dst[0] = (s[0] + t[0] + 1) >> 1;
+}
+
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width) {
+  const uint16* s = src_ptr;
+  const uint16* t = src_ptr + src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+    dst += 2;
+    s += 4;
+    t += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+  }
+}
+
 void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
                     uint8* dst, int dst_width) {
  int x;
@ -88,6 +156,20 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }

+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[2];
+    dst[1] = src_ptr[6];
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[2];
+  }
+}
+
 void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  intptr_t stride = src_stride;
@ -124,6 +206,42 @@ void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }

+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width) {
+  intptr_t stride = src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+             src_ptr[stride + 0] + src_ptr[stride + 1] +
+             src_ptr[stride + 2] + src_ptr[stride + 3] +
+             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+             8) >> 4;
+    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+             src_ptr[stride + 4] + src_ptr[stride + 5] +
+             src_ptr[stride + 6] + src_ptr[stride + 7] +
+             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+             8) >> 4;
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+             src_ptr[stride + 0] + src_ptr[stride + 1] +
+             src_ptr[stride + 2] + src_ptr[stride + 3] +
+             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+             8) >> 4;
+  }
+}
+
 void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
                      uint8* dst, int dst_width) {
  int x;
@ -137,6 +255,19 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }

+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width) {
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[1];
+    dst[2] = src_ptr[3];
+    dst += 3;
+    src_ptr += 4;
+  }
+}
+
 // Filter rows 0 and 1 together, 3 : 1
 void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* d, int dst_width) {
@ -160,6 +291,28 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }

+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width) {
+  const uint16* s = src_ptr;
+  const uint16* t = src_ptr + src_stride;
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 * 3 + b0 + 2) >> 2;
+    d[1] = (a1 * 3 + b1 + 2) >> 2;
+    d[2] = (a2 * 3 + b2 + 2) >> 2;
+    d += 3;
+    s += 4;
+    t += 4;
+  }
+}
+
 // Filter rows 1 and 2 together, 1 : 1
 void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* d, int dst_width) {
@ -183,6 +336,28 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }

+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width) {
+  const uint16* s = src_ptr;
+  const uint16* t = src_ptr + src_stride;
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 + b0 + 1) >> 1;
+    d[1] = (a1 + b1 + 1) >> 1;
+    d[2] = (a2 + b2 + 1) >> 1;
+    d += 3;
+    s += 4;
+    t += 4;
+  }
+}
+
 // Scales a single row of pixels using point sampling.
 void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
                 int dst_width, int x, int dx) {
@ -199,6 +374,21 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
  }
 }

+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                    int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[0] = src_ptr[x >> 16];
+    x += dx;
+    dst_ptr[1] = src_ptr[x >> 16];
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    dst_ptr[0] = src_ptr[x >> 16];
+  }
+}
+
 // Scales a single row of pixels up by 2x using point sampling.
 void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
                    int dst_width, int x, int dx) {
@ -213,9 +403,28 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
  }
 }

+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+    src_ptr += 1;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    dst_ptr[0] = src_ptr[0];
+  }
+}
+
 // (1-f)a + fb can be replaced with a + f(b-a)
+#if defined(__arm__) || defined(__aarch64__)
 #define BLENDER(a, b, f) (uint8)((int)(a) + \
-    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#else
+// inteluses 7 bit math with rounding.
+#define BLENDER(a, b, f) (uint8)((int)(a) + \
+    (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+#endif

 void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
@ -267,6 +476,60 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
 }
 #undef BLENDER

+// Same as 8 bit arm blender but return is cast to uint16
+#define BLENDER(a, b, f) (uint16)((int)(a) + \
+    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+  }
+}
+
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int64 xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int64 xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+  }
+}
+#undef BLENDER
+
 void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
                      uint8* dst, int dst_width) {
  int x;
@ -280,6 +543,19 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }

+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width) {
+  int x;
+  assert(dst_width % 3 == 0);
+  for (x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[3];
+    dst[2] = src_ptr[6];
+    dst += 3;
+    src_ptr += 8;
+  }
+}
+
 // 8x3 -> 3x1
 void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
                            ptrdiff_t src_stride,
@ -307,6 +583,32 @@ void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
  }
 }

+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[stride + 0] + src_ptr[stride + 1] +
+        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+        (65536 / 9) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[stride + 3] + src_ptr[stride + 4] +
+        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+        (65536 / 9) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[stride + 6] + src_ptr[stride + 7] +
+        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+        (65536 / 6) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
 // 8x2 -> 3x1
 void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
@ -328,21 +630,51 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }

-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[stride + 0] + src_ptr[stride + 1] +
+        src_ptr[stride + 2]) * (65536 / 6) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[stride + 3] + src_ptr[stride + 4] +
+        src_ptr[stride + 5]) * (65536 / 6) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[stride + 6] + src_ptr[stride + 7]) *
+        (65536 / 4) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  int x;
  assert(src_width > 0);
-  assert(src_height > 0);
-  for (x = 0; x < src_width; ++x) {
-    const uint8* s = src_ptr + x;
-    unsigned int sum = 0u;
-    int y;
-    for (y = 0; y < src_height; ++y) {
-      sum += s[0];
-      s += src_stride;
-    }
-    // TODO(fbarchard): Consider limitting height to 256 to avoid overflow.
-    dst_ptr[x] = sum < 65535u ? sum : 65535u;
+  for (x = 0; x < src_width - 1; x += 2) {
+    dst_ptr[0] += src_ptr[0];
+    dst_ptr[1] += src_ptr[1];
+    src_ptr += 2;
+    dst_ptr += 2;
+  }
+  if (src_width & 1) {
+    dst_ptr[0] += src_ptr[0];
+  }
+}
+
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+  int x;
+  assert(src_width > 0);
+  for (x = 0; x < src_width - 1; x += 2) {
+    dst_ptr[0] += src_ptr[0];
+    dst_ptr[1] += src_ptr[1];
+    src_ptr += 2;
+    dst_ptr += 2;
+  }
+  if (src_width & 1) {
+    dst_ptr[0] += src_ptr[0];
  }
 }

@ -484,6 +816,7 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
  }
 }

+// TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
 // Mimics SSSE3 blender
 #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
 #define BLENDERC(a, b, f, s) (uint32)( \
@ -573,32 +906,16 @@ void ScalePlaneVertical(int src_height,
  assert(dst_width > 0);
  assert(dst_height > 0);
  src_argb += (x >> 16) * bpp;
-#if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
-    InterpolateRow = InterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
-    }
-  }
-#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
    if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
    if (IS_ALIGNED(dst_width_bytes, 32)) {
      InterpolateRow = InterpolateRow_AVX2;
@ -606,20 +923,20 @@ void ScalePlaneVertical(int src_height,
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
    InterpolateRow = InterpolateRow_Any_NEON;
    if (IS_ALIGNED(dst_width_bytes, 16)) {
      InterpolateRow = InterpolateRow_NEON;
    }
  }
 #endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    InterpolateRow = InterpolateRow_Any_DSPR2;
    if (IS_ALIGNED(dst_width_bytes, 4)) {
-      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+      InterpolateRow = InterpolateRow_DSPR2;
    }
  }
 #endif
@ -637,6 +954,80 @@ void ScalePlaneVertical(int src_height,
    y += dy;
  }
 }
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher wpp.
+  int dst_width_words = dst_width * wpp;
+  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_16_DSPR2;
+    if (IS_ALIGNED(dst_width_bytes, 4)) {
+      InterpolateRow = InterpolateRow_16_DSPR2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_words, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}

 // Simplify the filtering based on scale factors.
 enum FilterMode ScaleFilterReduce(int src_width, int src_height,
@ -653,10 +1044,6 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
    if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
      filtering = kFilterBilinear;
    }
-    // If scaling to larger, switch from Box to Bilinear.
-    if (dst_width >= src_width || dst_height >= src_height) {
-      filtering = kFilterBilinear;
-    }
  }
  if (filtering == kFilterBilinear) {
    if (src_height == 1) {
--- a/media/libyuv/source/scale_posix.cc
+++ b/media/libyuv/source/scale_posix.cc
--- a/media/libyuv/source/scale_mips.cc
+++ b/media/libyuv/source/scale_mips.cc
@ -18,10 +18,11 @@ extern "C" {

 // This module is for GCC MIPS DSPR2
 #if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)

-void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst, int dst_width) {
+void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width) {
  __asm__ __volatile__(
    ".set push                                     \n"
    ".set noreorder                                \n"
@ -30,7 +31,6 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    "beqz           $t9, 2f                        \n"
    " nop                                          \n"

-    ".p2align       2                              \n"
  "1:                                              \n"
    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
@ -77,8 +77,8 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

-void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                 uint8* dst, int dst_width) {
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
  const uint8* t = src_ptr + src_stride;

  __asm__ __volatile__ (
@ -89,7 +89,6 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    "bltz           $t9, 2f                       \n"
    " nop                                         \n"

-    ".p2align       2                             \n"
  "1:                                             \n"
    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
@ -177,8 +176,8 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

-void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst, int dst_width) {
+void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                    \n"
      ".set noreorder                               \n"
@ -187,7 +186,6 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
      "beqz           $t9, 2f                       \n"
      " nop                                         \n"

-      ".p2align       2                             \n"
     "1:                                            \n"
      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
@ -233,8 +231,8 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

-void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                 uint8* dst, int dst_width) {
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;
  const uint8* s2 = s1 + stride;
@ -247,7 +245,6 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
      "srl           $t9, %[dst_width], 1         \n"
      "andi          $t8, %[dst_width], 1         \n"

-      ".p2align      2                            \n"
     "1:                                          \n"
      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
@ -313,12 +310,11 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

-void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
+void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                          \n"
      ".set noreorder                                     \n"
-      ".p2align        2                                  \n"
    "1:                                                   \n"
      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
@ -360,14 +356,13 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

-void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                     uint8* d, int dst_width) {
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                         \n"
      ".set noreorder                                    \n"
      "repl.ph           $t3, 3                          \n"  // 0x00030003

-     ".p2align           2                               \n"
    "1:                                                  \n"
      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
@ -417,14 +412,13 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

-void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                     uint8* d, int dst_width) {
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                           \n"
      ".set noreorder                                      \n"
      "repl.ph           $t2, 3                            \n"  // 0x00030003

-      ".p2align          2                                 \n"
    "1:                                                    \n"
      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
@ -470,13 +464,12 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

-void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
+void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"

-      ".p2align   2                                  \n"
    "1:                                              \n"
      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
@ -517,8 +510,8 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

-void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
  intptr_t stride = src_stride;
  const uint8* t = src_ptr + stride;
  const int c = 0x2AAA;
@ -527,7 +520,6 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
      ".set push                                         \n"
      ".set noreorder                                    \n"

-      ".p2align        2                                 \n"
    "1:                                                  \n"
      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
@ -571,9 +563,9 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

-void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;
  stride += stride;
@ -585,7 +577,6 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
      ".set push                                         \n"
      ".set noreorder                                    \n"

-      ".p2align        2                                 \n"
    "1:                                                  \n"
      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
--- a/media/libyuv/source/scale_neon.cc
+++ b/media/libyuv/source/scale_neon.cc
@ -16,7 +16,8 @@ extern "C" {
 #endif

 // This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)

 // NEON downscalers with interpolation.
 // Provided by Fritz Koenig
@ -25,11 +26,12 @@ extern "C" {
 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    // load even pixels into q0, odd into q1
+    MEMACCESS(0)
    "vld2.8     {q0, q1}, [%0]!                \n"
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
@ -40,15 +42,39 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // add adjacent
+    "vpaddl.u8  q1, q1                         \n"
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1"     // Clobber List
+  );
+}
+
 // Read 32x2 average down and write 16x1.
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %0                         \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
+    MEMACCESS(0)
    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+    MEMACCESS(1)
    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
@ -57,6 +83,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
    "vpadal.u8  q1, q3                         \n"
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
+    MEMACCESS(2)
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
@ -71,10 +98,11 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
+    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
    "subs       %2, %2, #8                     \n" // 8 processed per loop
+    MEMACCESS(1)
    "vst1.8     {d2}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
@ -87,16 +115,19 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "add        r4, %0, %3                     \n"
-    "add        r5, r4, %3                     \n"
-    "add        %3, r5, %3                     \n"
-    ".p2align   2                              \n"
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
  "1:                                          \n"
+    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
-    "vld1.8     {q1}, [r4]!                    \n"
-    "vld1.8     {q2}, [r5]!                    \n"
-    "vld1.8     {q3}, [%3]!                    \n"
+    MEMACCESS(3)
+    "vld1.8     {q1}, [%3]!                    \n"
+    MEMACCESS(4)
+    "vld1.8     {q2}, [%4]!                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q3}, [%5]!                    \n"
    "subs       %2, %2, #4                     \n"
    "vpaddl.u8  q0, q0                         \n"
    "vpadal.u8  q0, q1                         \n"
@ -105,13 +136,17 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
    "vpaddl.u16 q0, q0                         \n"
    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
    "vmovn.u16  d0, q0                         \n"
+    MEMACCESS(1)
    "vst1.32    {d0[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(src_stride)         // %3
-  : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width), // %2
+    "+r"(src_ptr1),  // %3
+    "+r"(src_ptr2),  // %4
+    "+r"(src_ptr3)   // %5
+  :
+  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
 }

@ -122,11 +157,12 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
+    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    "subs       %2, %2, #24                  \n"
    "vmov       d2, d3                       \n" // order d0, d1, d2
+    MEMACCESS(1)
    "vst3.8     {d0, d1, d2}, [%1]!          \n"
    "bgt        1b                           \n"
  : "+r"(src_ptr),          // %0
@ -143,9 +179,10 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
+    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
    "subs         %2, %2, #24                  \n"

@ -182,6 +219,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
    "vmlal.u8     q8, d3, d24                  \n"
    "vqrshrn.u16  d2, q8, #2                   \n"

+    MEMACCESS(1)
    "vst3.8       {d0, d1, d2}, [%1]!          \n"

    "bgt          1b                           \n"
@ -200,9 +238,10 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
+    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
    "subs         %2, %2, #24                  \n"
    // average src line 0 with src line 1
@ -222,6 +261,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
    "vmlal.u8     q3, d3, d24                  \n"
    "vqrshrn.u16  d2, q3, #2                   \n"

+    MEMACCESS(1)
    "vst3.8       {d0, d1, d2}, [%1]!          \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
@ -250,14 +290,17 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
+    MEMACCESS(3)
    "vld1.8     {q3}, [%3]                     \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
+    MEMACCESS(0)
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
    "subs       %2, %2, #12                    \n"
    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
+    MEMACCESS(1)
    "vst1.8     {d4}, [%1]!                    \n"
+    MEMACCESS(1)
    "vst1.32    {d5[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
@ -272,22 +315,28 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
  asm volatile (
-    "vld1.16    {q13}, [%4]                    \n"
-    "vld1.8     {q14}, [%5]                    \n"
-    "vld1.8     {q15}, [%6]                    \n"
-    "add        r4, %0, %3, lsl #1             \n"
+    MEMACCESS(5)
+    "vld1.16    {q13}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {q14}, [%6]                    \n"
+    MEMACCESS(7)
+    "vld1.8     {q15}, [%7]                    \n"
    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    "vld4.8       {d16, d17, d18, d19}, [r4]!  \n"
+    MEMACCESS(4)
+    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
    "subs         %2, %2, #12                  \n"

    // Shuffle the input data around to get align the data
@ -364,18 +413,20 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"

+    MEMACCESS(1)
    "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
    "vst1.32      {d4[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  : "r"(&kMult38_Div6),     // %4
-    "r"(&kShuf38_2),        // %5
-    "r"(&kMult38_Div9)      // %6
-  : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
-    "q13", "q14", "q15", "memory", "cc"
+    "+r"(src_stride),       // %3
+    "+r"(src_ptr1)          // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  );
 }

@ -384,17 +435,20 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
+    MEMACCESS(4)
    "vld1.16    {q13}, [%4]                    \n"
+    MEMACCESS(5)
    "vld1.8     {q14}, [%5]                    \n"
    "add        %3, %0                         \n"
-    ".p2align   2                              \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
    "subs         %2, %2, #12                  \n"

@ -461,7 +515,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"

+    MEMACCESS(1)
    "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
    "vst1.32      {d4[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),       // %0
@ -474,6 +530,114 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
  );
 }

+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                    uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp;
+  asm volatile (
+  "1:                                          \n"
+    "mov       %0, %1                          \n"
+    "mov       r12, %5                         \n"
+    "veor      q2, q2, q2                      \n"
+    "veor      q3, q3, q3                      \n"
+  "2:                                          \n"
+    // load 16 pixels into q0
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], %3                 \n"
+    "vaddw.u8   q3, q3, d1                     \n"
+    "vaddw.u8   q2, q2, d0                     \n"
+    "subs       r12, r12, #1                   \n"
+    "bgt        2b                             \n"
+    MEMACCESS(2)
+    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
+    "add        %1, %1, #16                    \n"
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop
+    "bgt        1b                             \n"
+  : "=&r"(src_tmp),    // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_ptr),     // %2
+    "+r"(src_stride),  // %3
+    "+r"(src_width),   // %4
+    "+r"(src_height)   // %5
+  :
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n)                                    \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5                     \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"
+
+// The NEON version mimics this formula:
+// #define BLENDER(a, b, f) (uint8)((int)(a) +
+//    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_ptr;
+  asm volatile (
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q1, q1, q0                     \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "vadd.s32   q2, q1, q3                     \n"
+    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
+  "1:                                          \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
+    "vmov       q10, q1                        \n"
+    "vmov       q11, q2                        \n"
+    "vuzp.16    q10, q11                       \n"
+    "vmovl.u8   q8, d6                         \n"
+    "vmovl.u8   q9, d7                         \n"
+    "vsubl.s16  q11, d18, d16                  \n"
+    "vsubl.s16  q12, d19, d17                  \n"
+    "vmovl.u16  q13, d20                       \n"
+    "vmovl.u16  q10, d21                       \n"
+    "vmul.s32   q11, q11, q13                  \n"
+    "vmul.s32   q12, q12, q10                  \n"
+    "vrshrn.s32  d18, q11, #16                 \n"
+    "vrshrn.s32  d19, q12, #16                 \n"
+    "vadd.s16   q8, q8, q9                     \n"
+    "vmovn.s16  d6, q8                         \n"
+
+    MEMACCESS(0)
+    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
+    "vadd.s32   q1, q1, q0                     \n"
+    "vadd.s32   q2, q2, q0                     \n"
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3",
+    "q8", "q9", "q10", "q11", "q12", "q13"
+  );
+}
+
+#undef LOAD2_DATA8_LANE
+
 // 16x2 -> 16x1
 void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
@ -494,7 +658,9 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
    "vdup.8       d4, %4                       \n"
    // General purpose row blend.
  "1:                                          \n"
+    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vmull.u8     q13, d0, d4                  \n"
@ -503,50 +669,63 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
    "vmlal.u8     q14, d3, d5                  \n"
    "vrshrn.u16   d0, q13, #8                  \n"
    "vrshrn.u16   d1, q14, #8                  \n"
+    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          1b                           \n"
    "b            99f                          \n"

    // Blend 25 / 75.
  "25:                                         \n"
+    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          25b                          \n"
    "b            99f                          \n"

    // Blend 50 / 50.
  "50:                                         \n"
+    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          50b                          \n"
    "b            99f                          \n"

    // Blend 75 / 25.
  "75:                                         \n"
+    MEMACCESS(1)
    "vld1.8       {q1}, [%1]!                  \n"
+    MEMACCESS(2)
    "vld1.8       {q0}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          75b                          \n"
    "b            99f                          \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
+    MEMACCESS(1)
    "vld1.8       {q0}, [%1]!                  \n"
    "subs         %3, %3, #16                  \n"
+    MEMACCESS(0)
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          100b                         \n"

  "99:                                         \n"
+    MEMACCESS(0)
    "vst1.8       {d1[7]}, [%0]                \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
@ -561,13 +740,16 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
-    ".p2align   2                              \n"
  "1:                                          \n"
    // load even pixels into q0, odd into q1
+    MEMACCESS(0)
    "vld2.32    {q0, q1}, [%0]!                \n"
+    MEMACCESS(0)
    "vld2.32    {q2, q3}, [%0]!                \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    MEMACCESS(1)
    "vst1.8     {q3}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
@ -578,21 +760,52 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  );
 }

+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    "vrshrn.u16 d2, q2, #1                     \n"
+    "vrshrn.u16 d3, q3, #1                     \n"
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"
+    "bgt       1b                              \n"
+  : "+r"(src_argb),         // %0
+    "+r"(dst_argb),         // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
+  );
+}
+
 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
+    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS(1)
    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
@ -602,6 +815,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
    "vrshrn.u16 d1, q1, #2                     \n"
    "vrshrn.u16 d2, q2, #2                     \n"
    "vrshrn.u16 d3, q3, #2                     \n"
+    MEMACCESS(2)
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
@ -619,13 +833,17 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov        r12, %3, lsl #2                \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
+    MEMACCESS(0)
    "vld1.32    {d0[0]}, [%0], r12             \n"
+    MEMACCESS(0)
    "vld1.32    {d0[1]}, [%0], r12             \n"
+    MEMACCESS(0)
    "vld1.32    {d1[0]}, [%0], r12             \n"
+    MEMACCESS(0)
    "vld1.32    {d1[1]}, [%0], r12             \n"
    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(1)
    "vst1.8     {q0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),    // %0
@ -644,15 +862,22 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
  asm volatile (
    "mov        r12, %4, lsl #2                \n"
    "add        %1, %1, %0                     \n"
-    ".p2align   2                              \n"
  "1:                                          \n"
+    MEMACCESS(0)
    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
    "vld1.8     {d1}, [%1], r12                \n"
+    MEMACCESS(0)
    "vld1.8     {d2}, [%0], r12                \n"
+    MEMACCESS(1)
    "vld1.8     {d3}, [%1], r12                \n"
+    MEMACCESS(0)
    "vld1.8     {d4}, [%0], r12                \n"
+    MEMACCESS(1)
    "vld1.8     {d5}, [%1], r12                \n"
+    MEMACCESS(0)
    "vld1.8     {d6}, [%0], r12                \n"
+    MEMACCESS(1)
    "vld1.8     {d7}, [%1], r12                \n"
    "vaddl.u8   q0, d0, d1                     \n"
    "vaddl.u8   q1, d2, d3                     \n"
@ -665,6 +890,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(2)
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),    // %0
@ -676,7 +902,118 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
  );
 }

-#endif  // __ARM_NEON__
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(dn, n)                               \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5, lsl #2             \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "vld1.32    {"#dn"["#n"]}, [%6]            \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  int tmp;
+  const uint8* src_tmp = src_argb;
+  asm volatile (
+  "1:                                          \n"
+    LOAD1_DATA32_LANE(d0, 0)
+    LOAD1_DATA32_LANE(d0, 1)
+    LOAD1_DATA32_LANE(d1, 0)
+    LOAD1_DATA32_LANE(d1, 1)
+    LOAD1_DATA32_LANE(d2, 0)
+    LOAD1_DATA32_LANE(d2, 1)
+    LOAD1_DATA32_LANE(d3, 0)
+    LOAD1_DATA32_LANE(d3, 1)
+
+    MEMACCESS(0)
+    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),   // %0
+    "+r"(src_argb),   // %1
+    "+r"(dst_width),  // %2
+    "+r"(x),          // %3
+    "+r"(dx),         // %4
+    "=&r"(tmp),       // %5
+    "+r"(src_tmp)     // %6
+  :
+  : "memory", "cc", "q0", "q1"
+  );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
+    "lsr        %5, %3, #16                           \n"      \
+    "add        %6, %1, %5, lsl #2                    \n"      \
+    "add        %3, %3, %4                            \n"      \
+    MEMACCESS(6)                                               \
+    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_argb;
+  asm volatile (
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"
+    "vmov.i8    q3, #0x7f                      \n"  // 0x7F
+    "vmov.i16   q15, #0x7f                     \n"  // 0x7F
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q8, q1, q0                     \n"
+  "1:                                          \n"
+    // d0, d1: a
+    // d2, d3: b
+    LOAD2_DATA32_LANE(d0, d2, 0)
+    LOAD2_DATA32_LANE(d0, d2, 1)
+    LOAD2_DATA32_LANE(d1, d3, 0)
+    LOAD2_DATA32_LANE(d1, d3, 1)
+    "vshrn.i32   d22, q8, #9                   \n"
+    "vand.16     d22, d22, d30                 \n"
+    "vdup.8      d24, d22[0]                   \n"
+    "vdup.8      d25, d22[2]                   \n"
+    "vdup.8      d26, d22[4]                   \n"
+    "vdup.8      d27, d22[6]                   \n"
+    "vext.8      d4, d24, d25, #4              \n"
+    "vext.8      d5, d26, d27, #4              \n"  // f
+    "veor.8      q10, q2, q3                   \n"  // 0x7f ^ f
+    "vmull.u8    q11, d0, d20                  \n"
+    "vmull.u8    q12, d1, d21                  \n"
+    "vmull.u8    q13, d2, d4                   \n"
+    "vmull.u8    q14, d3, d5                   \n"
+    "vadd.i16    q11, q11, q13                 \n"
+    "vadd.i16    q12, q12, q14                 \n"
+    "vshrn.i16   d0, q11, #7                   \n"
+    "vshrn.i16   d1, q12, #7                   \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
+    "vadd.s32    q8, q8, q9                    \n"
+    "subs        %2, %2, #4                    \n"  // 4 processed per loop
+    "bgt         1b                            \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

 #ifdef __cplusplus
 }  // extern "C"
--- a/media/libyuv/source/scale_neon64.cc
+++ b/media/libyuv/source/scale_neon64.cc
--- a/media/libyuv/source/scale_win.cc
+++ b/media/libyuv/source/scale_win.cc
--- a/media/libyuv/source/video_common.cc
+++ b/media/libyuv/source/video_common.cc
@ -25,6 +25,7 @@ struct FourCCAliasEntry {

 static const struct FourCCAliasEntry kFourCCAliases[] = {
  {FOURCC_IYUV, FOURCC_I420},
+  {FOURCC_YU12, FOURCC_I420},
  {FOURCC_YU16, FOURCC_I422},
  {FOURCC_YU24, FOURCC_I444},
  {FOURCC_YUYV, FOURCC_YUY2},
@ -33,7 +34,7 @@ static const struct FourCCAliasEntry kFourCCAliases[] = {
  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
  {FOURCC_DMB1, FOURCC_MJPG},
-  {FOURCC_BA81, FOURCC_BGGR},
+  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
  {FOURCC_RGB3, FOURCC_RAW },
  {FOURCC_BGR3, FOURCC_24BG},
  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
--- a/media/libyuv/source/x86inc.asm
+++ b/media/libyuv/source/x86inc.asm
--- a/media/libyuv/sync_chromium.py
+++ b/media/libyuv/sync_chromium.py
@ -0,0 +1,154 @@
+#!/usr/bin/env python
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Script to download a Chromium checkout into the workspace.
+
+The script downloads a full Chromium Git clone and its DEPS.
+
+The following environment variable can be used to alter the behavior:
+* CHROMIUM_NO_HISTORY - If set to 1, a Git checkout with no history will be
+  downloaded. This is consumes less bandwidth and disk space but is known to be
+  slower in general if you have a high-speed connection.
+
+After a successful sync has completed, a .last_sync_chromium file is written to
+the chromium directory. While it exists, no more gclient sync operations will be
+performed until the --target-revision changes or the SCRIPT_VERSION constant is
+incremented. The file can be removed manually to force a new sync.
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+
+# Bump this whenever the algorithm changes and you need bots/devs to re-sync,
+# ignoring the .last_sync_chromium file
+SCRIPT_VERSION = 4
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHROMIUM_NO_HISTORY = 'CHROMIUM_NO_HISTORY'
+
+def _parse_gclient_dict():
+  gclient_dict = {}
+  try:
+    main_gclient = os.path.join(os.path.dirname(ROOT_DIR), '.gclient')
+    with open(main_gclient, 'rb') as deps_content:
+      exec(deps_content, gclient_dict)
+  except Exception as e:
+    print >> sys.stderr, 'error while parsing .gclient:', e
+  return gclient_dict
+
+
+def get_cache_dir():
+  return _parse_gclient_dict().get('cache_dir')
+
+
+def get_target_os_list():
+  return ','.join(_parse_gclient_dict().get('target_os', []))
+
+
+def main():
+  CR_DIR = os.path.join(ROOT_DIR, 'chromium')
+
+  p = argparse.ArgumentParser()
+  p.add_argument('--target-revision', required=True,
+                 help='The target chromium git revision [REQUIRED]')
+  p.add_argument('--chromium-dir', default=CR_DIR,
+                 help=('The path to the chromium directory to sync '
+                       '(default: %(default)r)'))
+  opts = p.parse_args()
+  opts.chromium_dir = os.path.abspath(opts.chromium_dir)
+
+  target_os_list = get_target_os_list()
+
+  # Do a quick check to see if we were successful last time to make runhooks
+  # sooper fast.
+  flag_file = os.path.join(opts.chromium_dir, '.last_sync_chromium')
+  flag_file_content = '\n'.join([
+    str(SCRIPT_VERSION),
+    opts.target_revision,
+    repr(target_os_list),
+  ])
+  if (os.path.exists(os.path.join(opts.chromium_dir, 'src')) and
+      os.path.exists(flag_file)):
+    with open(flag_file, 'r') as f:
+      if f.read() == flag_file_content:
+        print 'Chromium already up to date: ', opts.target_revision
+        return 0
+    os.unlink(flag_file)
+
+  env = os.environ.copy()
+
+  # Avoid downloading NaCl toolchain as part of the Chromium hooks.
+  env['GYP_CHROMIUM_NO_ACTION'] = '1'
+  gclient_cmd = 'gclient.bat' if sys.platform.startswith('win') else 'gclient'
+  args = [
+      gclient_cmd, 'sync', '--force', '--revision', 'src@'+opts.target_revision
+  ]
+
+  if os.environ.get('CHROME_HEADLESS') == '1':
+    # Running on a buildbot.
+    args.append('-vvv')
+
+    if sys.platform.startswith('win'):
+      cache_path = os.path.join(os.path.splitdrive(ROOT_DIR)[0] + os.path.sep,
+                                'b', 'git-cache')
+    else:
+      cache_path = '/b/git-cache'
+  else:
+    # Support developers setting the cache_dir in .gclient.
+    cache_path = get_cache_dir()
+
+  # Allow for users with poor internet connections to download a Git clone
+  # without history (saves several gigs but is generally slower and doesn't work
+  # with the Git cache).
+  if os.environ.get(CHROMIUM_NO_HISTORY) == '1':
+    if cache_path:
+      print >> sys.stderr, (
+          'You cannot use "no-history" mode for syncing Chrome (i.e. set the '
+          '%s environment variable to 1) when you have cache_dir configured in '
+          'your .gclient.' % CHROMIUM_NO_HISTORY)
+      return 1
+    args.append('--no-history')
+    gclient_entries_file = os.path.join(opts.chromium_dir, '.gclient_entries')
+  else:
+    # Write a temporary .gclient file that has the cache_dir variable added.
+    gclientfile = os.path.join(opts.chromium_dir, '.gclient')
+    with open(gclientfile, 'rb') as spec:
+      spec = spec.read().splitlines()
+      spec[-1] = 'cache_dir = %r' % (cache_path,)
+    with open(gclientfile + '.tmp', 'wb') as f:
+      f.write('\n'.join(spec))
+
+    args += [
+      '--gclientfile', '.gclient.tmp',
+      '--delete_unversioned_trees', '--reset', '--upstream'
+    ]
+    gclient_entries_file = os.path.join(opts.chromium_dir,
+                                        '.gclient.tmp_entries')
+
+  # To avoid gclient sync problems when DEPS entries have been removed we must
+  # wipe the gclient's entries file that contains cached URLs for all DEPS.
+  if os.path.exists(gclient_entries_file):
+    os.unlink(gclient_entries_file)
+
+  if target_os_list:
+    args += ['--deps=' + target_os_list]
+
+  print 'Running "%s" in %s' % (' '.join(args), opts.chromium_dir)
+  ret = subprocess.call(args, cwd=opts.chromium_dir, env=env)
+  if ret == 0:
+    with open(flag_file, 'wb') as f:
+      f.write(flag_file_content)
+
+  return ret
+
+
+if __name__ == '__main__':
+  sys.exit(main())
--- a/media/libyuv/third_party/gflags/BUILD.gn
+++ b/media/libyuv/third_party/gflags/BUILD.gn
@ -0,0 +1,76 @@
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a copy of WebRTC's BUILD.gn.
+
+if (is_win) {
+  gflags_gen_arch_root = "gen/win"
+} else {
+  gflags_gen_arch_root = "gen/posix"
+}
+
+config("gflags_config") {
+  include_dirs = [
+    "$gflags_gen_arch_root/include",  # For configured files.
+    "src/src",  # For everything else.
+  ]
+
+  defines = [
+    # These macros exist so flags and symbols are properly exported when
+    # building DLLs. Since we don't build DLLs, we need to disable them.
+    "GFLAGS_DLL_DECL=",
+    "GFLAGS_DLL_DECLARE_FLAG=",
+    "GFLAGS_DLL_DEFINE_FLAG=",
+  ]
+
+  # GN orders flags on a target before flags from configs. The default config
+  # adds -Wall, and this flag have to be after -Wall -- so they need to
+  # come from a config and can't be on the target directly.
+  if (is_clang) {
+    cflags = [ "-Wno-unused-local-typedef" ]
+  }
+}
+
+source_set("gflags") {
+  cflags = []
+  sources = [
+    "src/src/gflags.cc",
+    "src/src/gflags_completions.cc",
+    "src/src/gflags_reporting.cc",
+  ]
+  if (is_win) {
+    sources += [ "src/src/windows_port.cc" ]
+
+    cflags += [
+      "/wd4005",  # WIN32_LEAN_AND_MEAN.
+      "/wd4267",  # Conversion from size_t to "type".
+    ]
+  }
+
+  include_dirs = [
+    "$gflags_gen_arch_root/include/gflags",  # For configured files.
+    "$gflags_gen_arch_root/include/private",  # For config.h
+  ]
+
+  public_configs = [ ":gflags_config" ]
+
+  configs -= [ "//build/config/compiler:chromium_code" ]
+  configs += [ "//build/config/compiler:no_chromium_code" ]
+
+  if (is_win) {
+    configs -= [ "//build/config/win:unicode" ]
+  }
+
+  if (is_clang) {
+    # TODO(andrew): Look into fixing this warning upstream:
+    # http://code.google.com/p/webrtc/issues/detail?id=760
+    configs -= [ "//build/config/clang:extra_warnings" ]
+    cflags += [ "-Wno-microsoft-include" ]
+  }
+}
--- a/media/libyuv/third_party/gflags/LICENSE
+++ b/media/libyuv/third_party/gflags/LICENSE
@ -0,0 +1,28 @@
+Copyright (c) 2006, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/media/libyuv/third_party/gflags/README.libyuv
+++ b/media/libyuv/third_party/gflags/README.libyuv
@ -0,0 +1,28 @@
+URL: https://github.com/gflags/gflags
+Version: 2.1.2
+License: New BSD
+License File: LICENSE
+
+Description:
+The gflags package contains a library that implements commandline
+flags processing. As such it's a replacement for getopt(). It has
+increased flexibility, including built-in support for C++ types like
+string, and the ability to define flags in the source file in which
+they're used.
+
+Local Modifications: None
+
+
+How to update platform configuration files:
+The gen/ directory contains pre-generated configuration header files.
+Historically, all operating systems and architectures have generated
+similar configurations except for Windows. This is why there's only
+posix and win directories below gen/.
+When rolling gflags to a newer version, it's a good idea to check if
+new configuration files needs to be generated as well.
+Do this by running ./configure in the newly checked out version of
+gflags. Then diff the generated files with the ones below gen/.
+If you notice a diff, update the files with the updated ones.
+If you suspect platform dependend changes other than Windows, you'll
+have to checkout gflags on the other platforms as well and run
+./configure there too.
--- a/media/libyuv/third_party/gflags/gen/posix/include/gflags/gflags.h
+++ b/media/libyuv/third_party/gflags/gen/posix/include/gflags/gflags.h
@ -0,0 +1,573 @@
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Revamped and reorganized by Craig Silverstein
+//
+// This is the file that should be included by any file which declares
+// or defines a command line flag or wants to parse command line flags
+// or print a program usage message (which will include information about
+// flags).  Executive summary, in the form of an example foo.cc file:
+//
+//    #include "foo.h"         // foo.h has a line "DECLARE_int32(start);"
+//    #include "validators.h"  // hypothetical file defining ValidateIsFile()
+//
+//    DEFINE_int32(end, 1000, "The last record to read");
+//
+//    DEFINE_string(filename, "my_file.txt", "The file to read");
+//    // Crash if the specified file does not exist.
+//    static bool dummy = RegisterFlagValidator(&FLAGS_filename,
+//                                              &ValidateIsFile);
+//
+//    DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...)
+//
+//    void MyFunc() {
+//      if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end);
+//    }
+//
+//    Then, at the command-line:
+//       ./foo --noverbose --start=5 --end=100
+//
+// For more details, see
+//    doc/gflags.html
+//
+// --- A note about thread-safety:
+//
+// We describe many functions in this routine as being thread-hostile,
+// thread-compatible, or thread-safe.  Here are the meanings we use:
+//
+// thread-safe: it is safe for multiple threads to call this routine
+//   (or, when referring to a class, methods of this class)
+//   concurrently.
+// thread-hostile: it is not safe for multiple threads to call this
+//   routine (or methods of this class) concurrently.  In gflags,
+//   most thread-hostile routines are intended to be called early in,
+//   or even before, main() -- that is, before threads are spawned.
+// thread-compatible: it is safe for multiple threads to read from
+//   this variable (when applied to variables), or to call const
+//   methods of this class (when applied to classes), as long as no
+//   other thread is writing to the variable or calling non-const
+//   methods of this class.
+
+#ifndef GFLAGS_GFLAGS_H_
+#define GFLAGS_GFLAGS_H_
+
+#include <string>
+#include <vector>
+
+#include "gflags_declare.h" // IWYU pragma: export
+
+
+// We always want to export variables defined in user code
+#ifndef GFLAGS_DLL_DEFINE_FLAG
+#  ifdef _MSC_VER
+#    define GFLAGS_DLL_DEFINE_FLAG __declspec(dllexport)
+#  else
+#    define GFLAGS_DLL_DEFINE_FLAG
+#  endif
+#endif
+
+
+namespace GFLAGS_NAMESPACE {
+
+
+// --------------------------------------------------------------------
+// To actually define a flag in a file, use DEFINE_bool,
+// DEFINE_string, etc. at the bottom of this file.  You may also find
+// it useful to register a validator with the flag.  This ensures that
+// when the flag is parsed from the commandline, or is later set via
+// SetCommandLineOption, we call the validation function. It is _not_
+// called when you assign the value to the flag directly using the = operator.
+//
+// The validation function should return true if the flag value is valid, and
+// false otherwise. If the function returns false for the new setting of the
+// flag, the flag will retain its current value. If it returns false for the
+// default value, ParseCommandLineFlags() will die.
+//
+// This function is safe to call at global construct time (as in the
+// example below).
+//
+// Example use:
+//    static bool ValidatePort(const char* flagname, int32 value) {
+//       if (value > 0 && value < 32768)   // value is ok
+//         return true;
+//       printf("Invalid value for --%s: %d\n", flagname, (int)value);
+//       return false;
+//    }
+//    DEFINE_int32(port, 0, "What port to listen on");
+//    static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort);
+
+// Returns true if successfully registered, false if not (because the
+// first argument doesn't point to a command-line flag, or because a
+// validator is already registered for this flag).
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const bool*        flag, bool (*validate_fn)(const char*, bool));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int32*       flag, bool (*validate_fn)(const char*, int32));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int64*       flag, bool (*validate_fn)(const char*, int64));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint64*      flag, bool (*validate_fn)(const char*, uint64));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const double*      flag, bool (*validate_fn)(const char*, double));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const std::string* flag, bool (*validate_fn)(const char*, const std::string&));
+
+// Convenience macro for the registration of a flag validator
+#define DEFINE_validator(name, validator) \
+    static const bool name##_validator_registered = \
+            GFLAGS_NAMESPACE::RegisterFlagValidator(&FLAGS_##name, validator)
+
+
+// --------------------------------------------------------------------
+// These methods are the best way to get access to info about the
+// list of commandline flags.  Note that these routines are pretty slow.
+//   GetAllFlags: mostly-complete info about the list, sorted by file.
+//   ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
+//   ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
+//
+// In addition to accessing flags, you can also access argv[0] (the program
+// name) and argv (the entire commandline), which we sock away a copy of.
+// These variables are static, so you should only set them once.
+//
+// No need to export this data only structure from DLL, avoiding VS warning 4251.
+struct CommandLineFlagInfo {
+  std::string name;            // the name of the flag
+  std::string type;            // the type of the flag: int32, etc
+  std::string description;     // the "help text" associated with the flag
+  std::string current_value;   // the current value, as a string
+  std::string default_value;   // the default value, as a string
+  std::string filename;        // 'cleaned' version of filename holding the flag
+  bool has_validator_fn;       // true if RegisterFlagValidator called on this flag
+  bool is_default;             // true if the flag has the default value and
+                               // has not been set explicitly from the cmdline
+                               // or via SetCommandLineOption
+  const void* flag_ptr;        // pointer to the flag's current value (i.e. FLAGS_foo)
+};
+
+// Using this inside of a validator is a recipe for a deadlock.
+// TODO(user) Fix locking when validators are running, to make it safe to
+// call validators during ParseAllFlags.
+// Also make sure then to uncomment the corresponding unit test in
+// gflags_unittest.sh
+extern GFLAGS_DLL_DECL void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
+// These two are actually defined in gflags_reporting.cc.
+extern GFLAGS_DLL_DECL void ShowUsageWithFlags(const char *argv0);  // what --help does
+extern GFLAGS_DLL_DECL void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
+
+// Create a descriptive string for a flag.
+// Goes to some trouble to make pretty line breaks.
+extern GFLAGS_DLL_DECL std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
+
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetArgv(int argc, const char** argv);
+
+// The following functions are thread-safe as long as SetArgv() is
+// only called before any threads start.
+extern GFLAGS_DLL_DECL const std::vector<std::string>& GetArgvs();
+extern GFLAGS_DLL_DECL const char* GetArgv();                      // all of argv as a string
+extern GFLAGS_DLL_DECL const char* GetArgv0();                     // only argv0
+extern GFLAGS_DLL_DECL uint32 GetArgvSum();                        // simple checksum of argv
+extern GFLAGS_DLL_DECL const char* ProgramInvocationName();        // argv0, or "UNKNOWN" if not set
+extern GFLAGS_DLL_DECL const char* ProgramInvocationShortName();   // basename(argv0)
+
+// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* ProgramUsage();                 // string set by SetUsageMessage()
+
+// VersionString() is thread-safe as long as SetVersionString() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* VersionString();                // string set by SetVersionString()
+
+
+
+// --------------------------------------------------------------------
+// Normally you access commandline flags by just saying "if (FLAGS_foo)"
+// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
+// commonly, via the DEFINE_foo macro).  But if you need a bit more
+// control, we have programmatic ways to get/set the flags as well.
+// These programmatic ways to access flags are thread-safe, but direct
+// access is only thread-compatible.
+
+// Return true iff the flagname was found.
+// OUTPUT is set to the flag's value, or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineOption(const char* name, std::string* OUTPUT);
+
+// Return true iff the flagname was found. OUTPUT is set to the flag's
+// CommandLineFlagInfo or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineFlagInfo(const char* name, CommandLineFlagInfo* OUTPUT);
+
+// Return the CommandLineFlagInfo of the flagname.  exit() if name not found.
+// Example usage, to check if a flag's value is currently the default value:
+//   if (GetCommandLineFlagInfoOrDie("foo").is_default) ...
+extern GFLAGS_DLL_DECL CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name);
+
+enum GFLAGS_DLL_DECL FlagSettingMode {
+  // update the flag's value (can call this multiple times).
+  SET_FLAGS_VALUE,
+  // update the flag's value, but *only if* it has not yet been updated
+  // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef".
+  SET_FLAG_IF_DEFAULT,
+  // set the flag's default value to this.  If the flag has not yet updated
+  // yet (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef")
+  // change the flag's current value to the new default value as well.
+  SET_FLAGS_DEFAULT
+};
+
+// Set a particular flag ("command line option").  Returns a string
+// describing the new value that the option has been set to.  The
+// return value API is not well-specified, so basically just depend on
+// it to be empty if the setting failed for some reason -- the name is
+// not a valid flag name, or the value is not a valid value -- and
+// non-empty else.
+
+// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case)
+extern GFLAGS_DLL_DECL std::string SetCommandLineOption        (const char* name, const char* value);
+extern GFLAGS_DLL_DECL std::string SetCommandLineOptionWithMode(const char* name, const char* value, FlagSettingMode set_mode);
+
+
+// --------------------------------------------------------------------
+// Saves the states (value, default value, whether the user has set
+// the flag, registered validators, etc) of all flags, and restores
+// them when the FlagSaver is destroyed.  This is very useful in
+// tests, say, when you want to let your tests change the flags, but
+// make sure that they get reverted to the original states when your
+// test is complete.
+//
+// Example usage:
+//   void TestFoo() {
+//     FlagSaver s1;
+//     FLAG_foo = false;
+//     FLAG_bar = "some value";
+//
+//     // test happens here.  You can return at any time
+//     // without worrying about restoring the FLAG values.
+//   }
+//
+// Note: This class is marked with GFLAGS_ATTRIBUTE_UNUSED because all
+// the work is done in the constructor and destructor, so in the standard
+// usage example above, the compiler would complain that it's an
+// unused variable.
+//
+// This class is thread-safe.  However, its destructor writes to
+// exactly the set of flags that have changed value during its
+// lifetime, so concurrent _direct_ access to those flags
+// (i.e. FLAGS_foo instead of {Get,Set}CommandLineOption()) is unsafe.
+
+class GFLAGS_DLL_DECL FlagSaver {
+ public:
+  FlagSaver();
+  ~FlagSaver();
+
+ private:
+  class FlagSaverImpl* impl_;   // we use pimpl here to keep API steady
+
+  FlagSaver(const FlagSaver&);  // no copying!
+  void operator=(const FlagSaver&);
+}__attribute((unused));
+
+// --------------------------------------------------------------------
+// Some deprecated or hopefully-soon-to-be-deprecated functions.
+
+// This is often used for logging.  TODO(csilvers): figure out a better way
+extern GFLAGS_DLL_DECL std::string CommandlineFlagsIntoString();
+// Usually where this is used, a FlagSaver should be used instead.
+extern GFLAGS_DLL_DECL
+bool ReadFlagsFromString(const std::string& flagfilecontents,
+                         const char* prog_name,
+                         bool errors_are_fatal);  // uses SET_FLAGS_VALUE
+
+// These let you manually implement --flagfile functionality.
+// DEPRECATED.
+extern GFLAGS_DLL_DECL bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name);
+extern GFLAGS_DLL_DECL bool ReadFromFlagsFile(const std::string& filename, const char* prog_name, bool errors_are_fatal);   // uses SET_FLAGS_VALUE
+
+
+// --------------------------------------------------------------------
+// Useful routines for initializing flags from the environment.
+// In each case, if 'varname' does not exist in the environment
+// return defval.  If 'varname' does exist but is not valid
+// (e.g., not a number for an int32 flag), abort with an error.
+// Otherwise, return the value.  NOTE: for booleans, for true use
+// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'.
+
+extern GFLAGS_DLL_DECL bool BoolFromEnv(const char *varname, bool defval);
+extern GFLAGS_DLL_DECL int32 Int32FromEnv(const char *varname, int32 defval);
+extern GFLAGS_DLL_DECL int64 Int64FromEnv(const char *varname, int64 defval);
+extern GFLAGS_DLL_DECL uint64 Uint64FromEnv(const char *varname, uint64 defval);
+extern GFLAGS_DLL_DECL double DoubleFromEnv(const char *varname, double defval);
+extern GFLAGS_DLL_DECL const char *StringFromEnv(const char *varname, const char *defval);
+
+
+// --------------------------------------------------------------------
+// The next two functions parse gflags from main():
+
+// Set the "usage" message for this program.  For example:
+//   string usage("This program does nothing.  Sample usage:\n");
+//   usage += argv[0] + " <uselessarg1> <uselessarg2>";
+//   SetUsageMessage(usage);
+// Do not include commandline flags in the usage: we do that for you!
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetUsageMessage(const std::string& usage);
+
+// Sets the version string, which is emitted with --version.
+// For instance: SetVersionString("1.3");
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetVersionString(const std::string& version);
+
+
+// Looks for flags in argv and parses them.  Rearranges argv to put
+// flags first, or removes them entirely if remove_flags is true.
+// If a flag is defined more than once in the command line or flag
+// file, the last definition is used.  Returns the index (into argv)
+// of the first non-flag argument.
+// See top-of-file for more details on this function.
+#ifndef SWIG   // In swig, use ParseCommandLineFlagsScript() instead.
+extern GFLAGS_DLL_DECL uint32 ParseCommandLineFlags(int *argc, char*** argv, bool remove_flags);
+#endif
+
+
+// Calls to ParseCommandLineNonHelpFlags and then to
+// HandleCommandLineHelpFlags can be used instead of a call to
+// ParseCommandLineFlags during initialization, in order to allow for
+// changing default values for some FLAGS (via
+// e.g. SetCommandLineOptionWithMode calls) between the time of
+// command line parsing and the time of dumping help information for
+// the flags as a result of command line parsing.  If a flag is
+// defined more than once in the command line or flag file, the last
+// definition is used.  Returns the index (into argv) of the first
+// non-flag argument.  (If remove_flags is true, will always return 1.)
+extern GFLAGS_DLL_DECL uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv, bool remove_flags);
+
+// This is actually defined in gflags_reporting.cc.
+// This function is misnamed (it also handles --version, etc.), but
+// it's too late to change that now. :-(
+extern GFLAGS_DLL_DECL void HandleCommandLineHelpFlags();   // in gflags_reporting.cc
+
+// Allow command line reparsing.  Disables the error normally
+// generated when an unknown flag is found, since it may be found in a
+// later parse.  Thread-hostile; meant to be called before any threads
+// are spawned.
+extern GFLAGS_DLL_DECL void AllowCommandLineReparsing();
+
+// Reparse the flags that have not yet been recognized.  Only flags
+// registered since the last parse will be recognized.  Any flag value
+// must be provided as part of the argument using "=", not as a
+// separate command line argument that follows the flag argument.
+// Intended for handling flags from dynamically loaded libraries,
+// since their flags are not registered until they are loaded.
+extern GFLAGS_DLL_DECL void ReparseCommandLineNonHelpFlags();
+
+// Clean up memory allocated by flags.  This is only needed to reduce
+// the quantity of "potentially leaked" reports emitted by memory
+// debugging tools such as valgrind.  It is not required for normal
+// operation, or for the google perftools heap-checker.  It must only
+// be called when the process is about to exit, and all threads that
+// might access flags are quiescent.  Referencing flags after this is
+// called will have unexpected consequences.  This is not safe to run
+// when multiple threads might be running: the function is
+// thread-hostile.
+extern GFLAGS_DLL_DECL void ShutDownCommandLineFlags();
+
+
+// --------------------------------------------------------------------
+// Now come the command line flag declaration/definition macros that
+// will actually be used.  They're kind of hairy.  A major reason
+// for this is initialization: we want people to be able to access
+// variables in global constructors and have that not crash, even if
+// their global constructor runs before the global constructor here.
+// (Obviously, we can't guarantee the flags will have the correct
+// default value in that case, but at least accessing them is safe.)
+// The only way to do that is have flags point to a static buffer.
+// So we make one, using a union to ensure proper alignment, and
+// then use placement-new to actually set up the flag with the
+// correct default value.  In the same vein, we have to worry about
+// flag access in global destructors, so FlagRegisterer has to be
+// careful never to destroy the flag-values it constructs.
+//
+// Note that when we define a flag variable FLAGS_<name>, we also
+// preemptively define a junk variable, FLAGS_no<name>.  This is to
+// cause a link-time error if someone tries to define 2 flags with
+// names like "logging" and "nologging".  We do this because a bool
+// flag FLAG can be set from the command line to true with a "-FLAG"
+// argument, and to false with a "-noFLAG" argument, and so this can
+// potentially avert confusion.
+//
+// We also put flags into their own namespace.  It is purposefully
+// named in an opaque way that people should have trouble typing
+// directly.  The idea is that DEFINE puts the flag in the weird
+// namespace, and DECLARE imports the flag from there into the current
+// namespace.  The net result is to force people to use DECLARE to get
+// access to a flag, rather than saying "extern GFLAGS_DLL_DECL bool FLAGS_whatever;"
+// or some such instead.  We want this so we can put extra
+// functionality (like sanity-checking) in DECLARE if we want, and
+// make sure it is picked up everywhere.
+//
+// We also put the type of the variable in the namespace, so that
+// people can't DECLARE_int32 something that they DEFINE_bool'd
+// elsewhere.
+
+class GFLAGS_DLL_DECL FlagRegisterer {
+ public:
+  FlagRegisterer(const char* name, const char* type,
+                 const char* help, const char* filename,
+                 void* current_storage, void* defvalue_storage);
+};
+
+// If your application #defines STRIP_FLAG_HELP to a non-zero value
+// before #including this file, we remove the help message from the
+// binary file. This can reduce the size of the resulting binary
+// somewhat, and may also be useful for security reasons.
+
+extern GFLAGS_DLL_DECL const char kStrippedFlagHelp[];
+
+
+} // namespace GFLAGS_NAMESPACE
+
+
+#ifndef SWIG  // In swig, ignore the main flag declarations
+
+#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0
+// Need this construct to avoid the 'defined but not used' warning.
+#define MAYBE_STRIPPED_HELP(txt) \
+   (false ? (txt) : GFLAGS_NAMESPACE::kStrippedFlagHelp)
+#else
+#define MAYBE_STRIPPED_HELP(txt) txt
+#endif
+
+// Each command-line flag has two variables associated with it: one
+// with the current value, and one with the default value.  However,
+// we have a third variable, which is where value is assigned; it's a
+// constant.  This guarantees that FLAG_##value is initialized at
+// static initialization time (e.g. before program-start) rather than
+// than global construction time (which is after program-start but
+// before main), at least when 'value' is a compile-time constant.  We
+// use a small trick for the "default value" variable, and call it
+// FLAGS_no<name>.  This serves the second purpose of assuring a
+// compile error if someone tries to define a flag named no<name>
+// which is illegal (--foo and --nofoo both affect the "foo" flag).
+#define DEFINE_VARIABLE(type, shorttype, name, value, help)             \
+  namespace fL##shorttype {                                             \
+    static const type FLAGS_nono##name = value;                         \
+    /* We always want to export defined variables, dll or no */         \
+    GFLAGS_DLL_DEFINE_FLAG type FLAGS_##name = FLAGS_nono##name;        \
+    type FLAGS_no##name = FLAGS_nono##name;                             \
+    static GFLAGS_NAMESPACE::FlagRegisterer o_##name(                   \
+      #name, #type, MAYBE_STRIPPED_HELP(help), __FILE__,                \
+      &FLAGS_##name, &FLAGS_no##name);                                  \
+  }                                                                     \
+  using fL##shorttype::FLAGS_##name
+
+// For DEFINE_bool, we want to do the extra check that the passed-in
+// value is actually a bool, and not a string or something that can be
+// coerced to a bool.  These declarations (no definition needed!) will
+// help us do that, and never evaluate From, which is important.
+// We'll use 'sizeof(IsBool(val))' to distinguish. This code requires
+// that the compiler have different sizes for bool & double. Since
+// this is not guaranteed by the standard, we check it with a
+// COMPILE_ASSERT.
+namespace fLB {
+struct CompileAssert {};
+typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[
+                      (sizeof(double) != sizeof(bool)) ? 1 : -1];
+template<typename From> double GFLAGS_DLL_DECL IsBoolFlag(const From& from);
+GFLAGS_DLL_DECL bool IsBoolFlag(bool from);
+}  // namespace fLB
+
+// Here are the actual DEFINE_*-macros. The respective DECLARE_*-macros
+// are in a separate include, gflags_declare.h, for reducing
+// the physical transitive size for DECLARE use.
+#define DEFINE_bool(name, val, txt)                                     \
+  namespace fLB {                                                       \
+    typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[     \
+            (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double))? 1: -1]; \
+  }                                                                     \
+  DEFINE_VARIABLE(bool, B, name, val, txt)
+
+#define DEFINE_int32(name, val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::int32, I, \
+                   name, val, txt)
+
+#define DEFINE_int64(name, val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::int64, I64, \
+                   name, val, txt)
+
+#define DEFINE_uint64(name,val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint64, U64, \
+                   name, val, txt)
+
+#define DEFINE_double(name, val, txt) \
+   DEFINE_VARIABLE(double, D, name, val, txt)
+
+// Strings are trickier, because they're not a POD, so we can't
+// construct them at static-initialization time (instead they get
+// constructed at global-constructor time, which is much later).  To
+// try to avoid crashes in that case, we use a char buffer to store
+// the string, which we can static-initialize, and then placement-new
+// into it later.  It's not perfect, but the best we can do.
+
+namespace fLS {
+
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const char *value) {
+  return new(stringspot) clstring(value);
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const clstring &value) {
+  return new(stringspot) clstring(value);
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           int value);
+}  // namespace fLS
+
+// We need to define a var named FLAGS_no##name so people don't define
+// --string and --nostring.  And we need a temporary place to put val
+// so we don't have to evaluate it twice.  Two great needs that go
+// great together!
+// The weird 'using' + 'extern' inside the fLS namespace is to work around
+// an unknown compiler bug/issue with the gcc 4.2.1 on SUSE 10.  See
+//    http://code.google.com/p/google-gflags/issues/detail?id=20
+#define DEFINE_string(name, val, txt)                                       \
+  namespace fLS {                                                           \
+    using ::fLS::clstring;                                                  \
+    static union { void* align; char s[sizeof(clstring)]; } s_##name[2];    \
+    clstring* const FLAGS_no##name = ::fLS::                                \
+                                   dont_pass0toDEFINE_string(s_##name[0].s, \
+                                                             val);          \
+    static GFLAGS_NAMESPACE::FlagRegisterer o_##name(                       \
+        #name, "string", MAYBE_STRIPPED_HELP(txt), __FILE__,                \
+        s_##name[0].s, new (s_##name[1].s) clstring(*FLAGS_no##name));      \
+    extern GFLAGS_DLL_DEFINE_FLAG clstring& FLAGS_##name;                   \
+    using fLS::FLAGS_##name;                                                \
+    clstring& FLAGS_##name = *FLAGS_no##name;                               \
+  }                                                                         \
+  using fLS::FLAGS_##name
+
+#endif  // SWIG
+
+
+// Import gflags library symbols into alternative/deprecated namespace(s)
+#include "gflags_gflags.h"
+
+
+#endif  // GFLAGS_GFLAGS_H_
--- a/media/libyuv/third_party/gflags/gen/posix/include/gflags/gflags_completions.h
+++ b/media/libyuv/third_party/gflags/gen/posix/include/gflags/gflags_completions.h
@ -0,0 +1,121 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+
+//
+// Implement helpful bash-style command line flag completions
+//
+// ** Functional API:
+// HandleCommandLineCompletions() should be called early during
+// program startup, but after command line flag code has been
+// initialized, such as the beginning of HandleCommandLineHelpFlags().
+// It checks the value of the flag --tab_completion_word.  If this
+// flag is empty, nothing happens here.  If it contains a string,
+// however, then HandleCommandLineCompletions() will hijack the
+// process, attempting to identify the intention behind this
+// completion.  Regardless of the outcome of this deduction, the
+// process will be terminated, similar to --helpshort flag
+// handling.
+//
+// ** Overview of Bash completions:
+// Bash can be told to programatically determine completions for the
+// current 'cursor word'.  It does this by (in this case) invoking a
+// command with some additional arguments identifying the command
+// being executed, the word being completed, and the previous word
+// (if any).  Bash then expects a sequence of output lines to be
+// printed to stdout.  If these lines all contain a common prefix
+// longer than the cursor word, bash will replace the cursor word
+// with that common prefix, and display nothing.  If there isn't such
+// a common prefix, bash will display the lines in pages using 'more'.
+//
+// ** Strategy taken for command line completions:
+// If we can deduce either the exact flag intended, or a common flag
+// prefix, we'll output exactly that.  Otherwise, if information
+// must be displayed to the user, we'll take the opportunity to add
+// some helpful information beyond just the flag name (specifically,
+// we'll include the default flag value and as much of the flag's
+// description as can fit on a single terminal line width, as specified
+// by the flag --tab_completion_columns).  Furthermore, we'll try to
+// make bash order the output such that the most useful or relevent
+// flags are the most likely to be shown at the top.
+//
+// ** Additional features:
+// To assist in finding that one really useful flag, substring matching
+// was implemented.  Before pressing a <TAB> to get completion for the
+// current word, you can append one or more '?' to the flag to do
+// substring matching.  Here's the semantics:
+//   --foo<TAB>     Show me all flags with names prefixed by 'foo'
+//   --foo?<TAB>    Show me all flags with 'foo' somewhere in the name
+//   --foo??<TAB>   Same as prior case, but also search in module
+//                  definition path for 'foo'
+//   --foo???<TAB>  Same as prior case, but also search in flag
+//                  descriptions for 'foo'
+// Finally, we'll trim the output to a relatively small number of
+// flags to keep bash quiet about the verbosity of output.  If one
+// really wanted to see all possible matches, appending a '+' to the
+// search word will force the exhaustive list of matches to be printed.
+//
+// ** How to have bash accept completions from a binary:
+// Bash requires that it be informed about each command that programmatic
+// completion should be enabled for.  Example addition to a .bashrc
+// file would be (your path to gflags_completions.sh file may differ):
+
+/*
+$ complete -o bashdefault -o default -o nospace -C                            \
+ '/home/build/eng/bash/bash_completions.sh --tab_completion_columns $COLUMNS' \
+  time  env  binary_name  another_binary  [...]
+*/
+
+// This would allow the following to work:
+//   $ /path/to/binary_name --vmodule<TAB>
+// Or:
+//   $ ./bin/path/another_binary --gfs_u<TAB>
+// (etc)
+//
+// Sadly, it appears that bash gives no easy way to force this behavior for
+// all commands.  That's where the "time" in the above example comes in.
+// If you haven't specifically added a command to the list of completion
+// supported commands, you can still get completions by prefixing the
+// entire command with "env".
+//   $ env /some/brand/new/binary --vmod<TAB>
+// Assuming that "binary" is a newly compiled binary, this should still
+// produce the expected completion output.
+
+
+#ifndef GFLAGS_COMPLETIONS_H_
+#define GFLAGS_COMPLETIONS_H_
+
+namespace google {
+
+extern void HandleCommandLineCompletions(void);
+
+}
+
+#endif  // GFLAGS_COMPLETIONS_H_
--- a/media/libyuv/third_party/gflags/gen/posix/include/gflags/gflags_declare.h
+++ b/media/libyuv/third_party/gflags/gen/posix/include/gflags/gflags_declare.h
@ -0,0 +1,141 @@
+// Copyright (c) 1999, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+//
+// Revamped and reorganized by Craig Silverstein
+//
+// This is the file that should be included by any file which declares
+// command line flag.
+
+#ifndef GFLAGS_DECLARE_H_
+#define GFLAGS_DECLARE_H_
+
+
+// ---------------------------------------------------------------------------
+// Namespace of gflags library symbols.
+#define GFLAGS_NAMESPACE google
+
+// ---------------------------------------------------------------------------
+// Windows DLL import/export.
+
+// We always want to import the symbols of the gflags library
+#ifndef GFLAGS_DLL_DECL
+#  if 0 && defined(_MSC_VER)
+#    define GFLAGS_DLL_DECL __declspec(dllimport)
+#  else
+#    define GFLAGS_DLL_DECL
+#  endif
+#endif
+
+// We always want to import variables declared in user code
+#ifndef GFLAGS_DLL_DECLARE_FLAG
+#  ifdef _MSC_VER
+#    define GFLAGS_DLL_DECLARE_FLAG __declspec(dllimport)
+#  else
+#    define GFLAGS_DLL_DECLARE_FLAG
+#  endif
+#endif
+
+// ---------------------------------------------------------------------------
+// Flag types
+#include <string>
+#if 1
+#  include <stdint.h>                   // the normal place uint32_t is defined
+#elif 1
+#  include <sys/types.h>                // the normal place u_int32_t is defined
+#elif 1
+#  include <inttypes.h>                 // a third place for uint32_t or u_int32_t
+#endif
+
+namespace GFLAGS_NAMESPACE {
+
+#if 1 // C99
+typedef int32_t          int32;
+typedef uint32_t         uint32;
+typedef int64_t          int64;
+typedef uint64_t         uint64;
+#elif 0 // BSD
+typedef int32_t          int32;
+typedef u_int32_t        uint32;
+typedef int64_t          int64;
+typedef u_int64_t        uint64;
+#elif 0 // Windows
+typedef __int32          int32;
+typedef unsigned __int32 uint32;
+typedef __int64          int64;
+typedef unsigned __int64 uint64;
+#else
+#  error Do not know how to define a 32-bit integer quantity on your system
+#endif
+
+} // namespace GFLAGS_NAMESPACE
+
+
+namespace fLS {
+
+// The meaning of "string" might be different between now and when the
+// macros below get invoked (e.g., if someone is experimenting with
+// other string implementations that get defined after this file is
+// included).  Save the current meaning now and use it in the macros.
+typedef std::string clstring;
+
+} // namespace fLS
+
+
+#define DECLARE_VARIABLE(type, shorttype, name) \
+  /* We always want to import declared variables, dll or no */ \
+  namespace fL##shorttype { extern GFLAGS_DLL_DECLARE_FLAG type FLAGS_##name; } \
+  using fL##shorttype::FLAGS_##name
+
+#define DECLARE_bool(name) \
+  DECLARE_VARIABLE(bool, B, name)
+
+#define DECLARE_int32(name) \
+  DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int32, I, name)
+
+#define DECLARE_int64(name) \
+  DECLARE_VARIABLE(::GFLAGS_NAMESPACE::int64, I64, name)
+
+#define DECLARE_uint64(name) \
+  DECLARE_VARIABLE(::GFLAGS_NAMESPACE::uint64, U64, name)
+
+#define DECLARE_double(name) \
+  DECLARE_VARIABLE(double, D, name)
+
+#define DECLARE_string(name) \
+  /* We always want to import declared variables, dll or no */ \
+  namespace fLS { \
+  using ::fLS::clstring; \
+  extern GFLAGS_DLL_DECLARE_FLAG ::fLS::clstring& FLAGS_##name; \
+  } \
+  using fLS::FLAGS_##name
+
+
+#endif  // GFLAGS_DECLARE_H_
--- a/media/libyuv/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h
+++ b/media/libyuv/third_party/gflags/gen/posix/include/gflags/gflags_gflags.h
@ -0,0 +1,101 @@
+// Copyright (c) 2014, Andreas Schuh
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// -----------------------------------------------------------------------------
+// Imports the gflags library symbols into an alternative/deprecated namespace.
+
+#ifndef GFLAGS_GFLAGS_H_
+#  error The internal header gflags_gflags.h may only be included by gflags.h
+#endif
+
+#ifndef GFLAGS_NS_GFLAGS_H_
+#define GFLAGS_NS_GFLAGS_H_
+
+
+namespace gflags {
+
+
+using GFLAGS_NAMESPACE::int32;
+using GFLAGS_NAMESPACE::uint32;
+using GFLAGS_NAMESPACE::int64;
+using GFLAGS_NAMESPACE::uint64;
+
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::CommandLineFlagInfo;
+using GFLAGS_NAMESPACE::GetAllFlags;
+using GFLAGS_NAMESPACE::ShowUsageWithFlags;
+using GFLAGS_NAMESPACE::ShowUsageWithFlagsRestrict;
+using GFLAGS_NAMESPACE::DescribeOneFlag;
+using GFLAGS_NAMESPACE::SetArgv;
+using GFLAGS_NAMESPACE::GetArgvs;
+using GFLAGS_NAMESPACE::GetArgv;
+using GFLAGS_NAMESPACE::GetArgv0;
+using GFLAGS_NAMESPACE::GetArgvSum;
+using GFLAGS_NAMESPACE::ProgramInvocationName;
+using GFLAGS_NAMESPACE::ProgramInvocationShortName;
+using GFLAGS_NAMESPACE::ProgramUsage;
+using GFLAGS_NAMESPACE::VersionString;
+using GFLAGS_NAMESPACE::GetCommandLineOption;
+using GFLAGS_NAMESPACE::GetCommandLineFlagInfo;
+using GFLAGS_NAMESPACE::GetCommandLineFlagInfoOrDie;
+using GFLAGS_NAMESPACE::FlagSettingMode;
+using GFLAGS_NAMESPACE::SET_FLAGS_VALUE;
+using GFLAGS_NAMESPACE::SET_FLAG_IF_DEFAULT;
+using GFLAGS_NAMESPACE::SET_FLAGS_DEFAULT;
+using GFLAGS_NAMESPACE::SetCommandLineOption;
+using GFLAGS_NAMESPACE::SetCommandLineOptionWithMode;
+using GFLAGS_NAMESPACE::FlagSaver;
+using GFLAGS_NAMESPACE::CommandlineFlagsIntoString;
+using GFLAGS_NAMESPACE::ReadFlagsFromString;
+using GFLAGS_NAMESPACE::AppendFlagsIntoFile;
+using GFLAGS_NAMESPACE::ReadFromFlagsFile;
+using GFLAGS_NAMESPACE::BoolFromEnv;
+using GFLAGS_NAMESPACE::Int32FromEnv;
+using GFLAGS_NAMESPACE::Int64FromEnv;
+using GFLAGS_NAMESPACE::Uint64FromEnv;
+using GFLAGS_NAMESPACE::DoubleFromEnv;
+using GFLAGS_NAMESPACE::StringFromEnv;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+using GFLAGS_NAMESPACE::SetVersionString;
+using GFLAGS_NAMESPACE::ParseCommandLineNonHelpFlags;
+using GFLAGS_NAMESPACE::HandleCommandLineHelpFlags;
+using GFLAGS_NAMESPACE::AllowCommandLineReparsing;
+using GFLAGS_NAMESPACE::ReparseCommandLineNonHelpFlags;
+using GFLAGS_NAMESPACE::ShutDownCommandLineFlags;
+using GFLAGS_NAMESPACE::FlagRegisterer;
+
+#ifndef SWIG
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+#endif
+
+
+} // namespace gflags
+
+
+#endif  // GFLAGS_NS_GFLAGS_H_
--- a/media/libyuv/third_party/gflags/gen/posix/include/private/config.h
+++ b/media/libyuv/third_party/gflags/gen/posix/include/private/config.h
@ -0,0 +1,112 @@
+/* Generated from config.h.in during build configuration using CMake. */
+
+// Note: This header file is only used internally. It is not part of public interface!
+
+// ---------------------------------------------------------------------------
+// System checks
+
+// Define if you build this library for a MS Windows OS.
+/* #undef OS_WINDOWS */
+
+// Define if you have the <stdint.h> header file.
+#define HAVE_STDINT_H
+
+// Define if you have the <sys/types.h> header file.
+#define HAVE_SYS_TYPES_H
+
+// Define if you have the <inttypes.h> header file.
+#define HAVE_INTTYPES_H
+
+// Define if you have the <sys/stat.h> header file.
+#define HAVE_SYS_STAT_H
+
+// Define if you have the <unistd.h> header file.
+#define HAVE_UNISTD_H
+
+// Define if you have the <fnmatch.h> header file.
+#define HAVE_FNMATCH_H
+
+// Define if you have the <shlwapi.h> header file (Windows 2000/XP).
+/* #undef HAVE_SHLWAPI_H */
+
+// Define if you have the strtoll function.
+#define HAVE_STRTOLL
+
+// Define if you have the strtoq function.
+/* #undef HAVE_STRTOQ */
+
+// Define if you have the <pthread.h> header file.
+#define HAVE_PTHREAD
+
+// Define if your pthread library defines the type pthread_rwlock_t
+#define HAVE_RWLOCK
+
+// gcc requires this to get PRId64, etc.
+#if defined(HAVE_INTTYPES_H) && !defined(__STDC_FORMAT_MACROS)
+#  define __STDC_FORMAT_MACROS 1
+#endif
+
+// ---------------------------------------------------------------------------
+// Package information
+
+// Name of package.
+#define PACKAGE gflags
+
+// Define to the full name of this package.
+#define PACKAGE_NAME gflags
+
+// Define to the full name and version of this package.
+#define PACKAGE_STRING gflags 2.2.0
+
+// Define to the one symbol short name of this package.
+#define PACKAGE_TARNAME gflags-2.2.0
+
+// Define to the version of this package.
+#define PACKAGE_VERSION 2.2.0
+
+// Version number of package.
+#define VERSION PACKAGE_VERSION
+
+// Define to the address where bug reports for this package should be sent.
+#define PACKAGE_BUGREPORT https://github.com/schuhschuh/gflags/issues
+
+// ---------------------------------------------------------------------------
+// Path separator
+#ifndef PATH_SEPARATOR
+#  ifdef OS_WINDOWS
+#    define PATH_SEPARATOR  '\\'
+#  else
+#    define PATH_SEPARATOR  '/'
+#  endif
+#endif
+
+// ---------------------------------------------------------------------------
+// Windows
+
+// Whether gflags library is a DLL.
+#ifndef GFLAGS_IS_A_DLL
+#  define GFLAGS_IS_A_DLL 0
+#endif
+
+// Always export symbols when compiling a shared library as this file is only
+// included by internal modules when building the gflags library itself.
+// The gflags_declare.h header file will set it to import these symbols otherwise.
+#ifndef GFLAGS_DLL_DECL
+#  if GFLAGS_IS_A_DLL && defined(_MSC_VER)
+#    define GFLAGS_DLL_DECL __declspec(dllexport)
+#  else
+#    define GFLAGS_DLL_DECL
+#  endif
+#endif
+// Flags defined by the gflags library itself must be exported
+#ifndef GFLAGS_DLL_DEFINE_FLAG
+#  define GFLAGS_DLL_DEFINE_FLAG GFLAGS_DLL_DECL
+#endif
+
+#ifdef OS_WINDOWS
+// The unittests import the symbols of the shared gflags library
+#  if GFLAGS_IS_A_DLL && defined(_MSC_VER)
+#    define GFLAGS_DLL_DECL_FOR_UNITTESTS __declspec(dllimport)
+#  endif
+#  include "windows_port.h"
+#endif
--- a/media/libyuv/third_party/gflags/gen/win/include/gflags/gflags.h
+++ b/media/libyuv/third_party/gflags/gen/win/include/gflags/gflags.h
@ -0,0 +1,573 @@
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Revamped and reorganized by Craig Silverstein
+//
+// This is the file that should be included by any file which declares
+// or defines a command line flag or wants to parse command line flags
+// or print a program usage message (which will include information about
+// flags).  Executive summary, in the form of an example foo.cc file:
+//
+//    #include "foo.h"         // foo.h has a line "DECLARE_int32(start);"
+//    #include "validators.h"  // hypothetical file defining ValidateIsFile()
+//
+//    DEFINE_int32(end, 1000, "The last record to read");
+//
+//    DEFINE_string(filename, "my_file.txt", "The file to read");
+//    // Crash if the specified file does not exist.
+//    static bool dummy = RegisterFlagValidator(&FLAGS_filename,
+//                                              &ValidateIsFile);
+//
+//    DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...)
+//
+//    void MyFunc() {
+//      if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end);
+//    }
+//
+//    Then, at the command-line:
+//       ./foo --noverbose --start=5 --end=100
+//
+// For more details, see
+//    doc/gflags.html
+//
+// --- A note about thread-safety:
+//
+// We describe many functions in this routine as being thread-hostile,
+// thread-compatible, or thread-safe.  Here are the meanings we use:
+//
+// thread-safe: it is safe for multiple threads to call this routine
+//   (or, when referring to a class, methods of this class)
+//   concurrently.
+// thread-hostile: it is not safe for multiple threads to call this
+//   routine (or methods of this class) concurrently.  In gflags,
+//   most thread-hostile routines are intended to be called early in,
+//   or even before, main() -- that is, before threads are spawned.
+// thread-compatible: it is safe for multiple threads to read from
+//   this variable (when applied to variables), or to call const
+//   methods of this class (when applied to classes), as long as no
+//   other thread is writing to the variable or calling non-const
+//   methods of this class.
+
+#ifndef GFLAGS_GFLAGS_H_
+#define GFLAGS_GFLAGS_H_
+
+#include <string>
+#include <vector>
+
+#include "gflags_declare.h" // IWYU pragma: export
+
+
+// We always want to export variables defined in user code
+#ifndef GFLAGS_DLL_DEFINE_FLAG
+#  ifdef _MSC_VER
+#    define GFLAGS_DLL_DEFINE_FLAG __declspec(dllexport)
+#  else
+#    define GFLAGS_DLL_DEFINE_FLAG
+#  endif
+#endif
+
+
+namespace GFLAGS_NAMESPACE {
+
+
+// --------------------------------------------------------------------
+// To actually define a flag in a file, use DEFINE_bool,
+// DEFINE_string, etc. at the bottom of this file.  You may also find
+// it useful to register a validator with the flag.  This ensures that
+// when the flag is parsed from the commandline, or is later set via
+// SetCommandLineOption, we call the validation function. It is _not_
+// called when you assign the value to the flag directly using the = operator.
+//
+// The validation function should return true if the flag value is valid, and
+// false otherwise. If the function returns false for the new setting of the
+// flag, the flag will retain its current value. If it returns false for the
+// default value, ParseCommandLineFlags() will die.
+//
+// This function is safe to call at global construct time (as in the
+// example below).
+//
+// Example use:
+//    static bool ValidatePort(const char* flagname, int32 value) {
+//       if (value > 0 && value < 32768)   // value is ok
+//         return true;
+//       printf("Invalid value for --%s: %d\n", flagname, (int)value);
+//       return false;
+//    }
+//    DEFINE_int32(port, 0, "What port to listen on");
+//    static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort);
+
+// Returns true if successfully registered, false if not (because the
+// first argument doesn't point to a command-line flag, or because a
+// validator is already registered for this flag).
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const bool*        flag, bool (*validate_fn)(const char*, bool));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int32*       flag, bool (*validate_fn)(const char*, int32));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const int64*       flag, bool (*validate_fn)(const char*, int64));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint64*      flag, bool (*validate_fn)(const char*, uint64));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const double*      flag, bool (*validate_fn)(const char*, double));
+extern GFLAGS_DLL_DECL bool RegisterFlagValidator(const std::string* flag, bool (*validate_fn)(const char*, const std::string&));
+
+// Convenience macro for the registration of a flag validator
+#define DEFINE_validator(name, validator) \
+    static const bool name##_validator_registered = \
+            GFLAGS_NAMESPACE::RegisterFlagValidator(&FLAGS_##name, validator)
+
+
+// --------------------------------------------------------------------
+// These methods are the best way to get access to info about the
+// list of commandline flags.  Note that these routines are pretty slow.
+//   GetAllFlags: mostly-complete info about the list, sorted by file.
+//   ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
+//   ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
+//
+// In addition to accessing flags, you can also access argv[0] (the program
+// name) and argv (the entire commandline), which we sock away a copy of.
+// These variables are static, so you should only set them once.
+//
+// No need to export this data only structure from DLL, avoiding VS warning 4251.
+struct CommandLineFlagInfo {
+  std::string name;            // the name of the flag
+  std::string type;            // the type of the flag: int32, etc
+  std::string description;     // the "help text" associated with the flag
+  std::string current_value;   // the current value, as a string
+  std::string default_value;   // the default value, as a string
+  std::string filename;        // 'cleaned' version of filename holding the flag
+  bool has_validator_fn;       // true if RegisterFlagValidator called on this flag
+  bool is_default;             // true if the flag has the default value and
+                               // has not been set explicitly from the cmdline
+                               // or via SetCommandLineOption
+  const void* flag_ptr;        // pointer to the flag's current value (i.e. FLAGS_foo)
+};
+
+// Using this inside of a validator is a recipe for a deadlock.
+// TODO(user) Fix locking when validators are running, to make it safe to
+// call validators during ParseAllFlags.
+// Also make sure then to uncomment the corresponding unit test in
+// gflags_unittest.sh
+extern GFLAGS_DLL_DECL void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
+// These two are actually defined in gflags_reporting.cc.
+extern GFLAGS_DLL_DECL void ShowUsageWithFlags(const char *argv0);  // what --help does
+extern GFLAGS_DLL_DECL void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
+
+// Create a descriptive string for a flag.
+// Goes to some trouble to make pretty line breaks.
+extern GFLAGS_DLL_DECL std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
+
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetArgv(int argc, const char** argv);
+
+// The following functions are thread-safe as long as SetArgv() is
+// only called before any threads start.
+extern GFLAGS_DLL_DECL const std::vector<std::string>& GetArgvs();
+extern GFLAGS_DLL_DECL const char* GetArgv();                      // all of argv as a string
+extern GFLAGS_DLL_DECL const char* GetArgv0();                     // only argv0
+extern GFLAGS_DLL_DECL uint32 GetArgvSum();                        // simple checksum of argv
+extern GFLAGS_DLL_DECL const char* ProgramInvocationName();        // argv0, or "UNKNOWN" if not set
+extern GFLAGS_DLL_DECL const char* ProgramInvocationShortName();   // basename(argv0)
+
+// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* ProgramUsage();                 // string set by SetUsageMessage()
+
+// VersionString() is thread-safe as long as SetVersionString() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* VersionString();                // string set by SetVersionString()
+
+
+
+// --------------------------------------------------------------------
+// Normally you access commandline flags by just saying "if (FLAGS_foo)"
+// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
+// commonly, via the DEFINE_foo macro).  But if you need a bit more
+// control, we have programmatic ways to get/set the flags as well.
+// These programmatic ways to access flags are thread-safe, but direct
+// access is only thread-compatible.
+
+// Return true iff the flagname was found.
+// OUTPUT is set to the flag's value, or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineOption(const char* name, std::string* OUTPUT);
+
+// Return true iff the flagname was found. OUTPUT is set to the flag's
+// CommandLineFlagInfo or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineFlagInfo(const char* name, CommandLineFlagInfo* OUTPUT);
+
+// Return the CommandLineFlagInfo of the flagname.  exit() if name not found.
+// Example usage, to check if a flag's value is currently the default value:
+//   if (GetCommandLineFlagInfoOrDie("foo").is_default) ...
+extern GFLAGS_DLL_DECL CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name);
+
+enum GFLAGS_DLL_DECL FlagSettingMode {
+  // update the flag's value (can call this multiple times).
+  SET_FLAGS_VALUE,
+  // update the flag's value, but *only if* it has not yet been updated
+  // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef".
+  SET_FLAG_IF_DEFAULT,
+  // set the flag's default value to this.  If the flag has not yet updated
+  // yet (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef")
+  // change the flag's current value to the new default value as well.
+  SET_FLAGS_DEFAULT
+};
+
+// Set a particular flag ("command line option").  Returns a string
+// describing the new value that the option has been set to.  The
+// return value API is not well-specified, so basically just depend on
+// it to be empty if the setting failed for some reason -- the name is
+// not a valid flag name, or the value is not a valid value -- and
+// non-empty else.
+
+// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case)
+extern GFLAGS_DLL_DECL std::string SetCommandLineOption        (const char* name, const char* value);
+extern GFLAGS_DLL_DECL std::string SetCommandLineOptionWithMode(const char* name, const char* value, FlagSettingMode set_mode);
+
+
+// --------------------------------------------------------------------
+// Saves the states (value, default value, whether the user has set
+// the flag, registered validators, etc) of all flags, and restores
+// them when the FlagSaver is destroyed.  This is very useful in
+// tests, say, when you want to let your tests change the flags, but
+// make sure that they get reverted to the original states when your
+// test is complete.
+//
+// Example usage:
+//   void TestFoo() {
+//     FlagSaver s1;
+//     FLAG_foo = false;
+//     FLAG_bar = "some value";
+//
+//     // test happens here.  You can return at any time
+//     // without worrying about restoring the FLAG values.
+//   }
+//
+// Note: This class is marked with GFLAGS_ATTRIBUTE_UNUSED because all
+// the work is done in the constructor and destructor, so in the standard
+// usage example above, the compiler would complain that it's an
+// unused variable.
+//
+// This class is thread-safe.  However, its destructor writes to
+// exactly the set of flags that have changed value during its
+// lifetime, so concurrent _direct_ access to those flags
+// (i.e. FLAGS_foo instead of {Get,Set}CommandLineOption()) is unsafe.
+
+class GFLAGS_DLL_DECL FlagSaver {
+ public:
+  FlagSaver();
+  ~FlagSaver();
+
+ private:
+  class FlagSaverImpl* impl_;   // we use pimpl here to keep API steady
+
+  FlagSaver(const FlagSaver&);  // no copying!
+  void operator=(const FlagSaver&);
+};
+
+// --------------------------------------------------------------------
+// Some deprecated or hopefully-soon-to-be-deprecated functions.
+
+// This is often used for logging.  TODO(csilvers): figure out a better way
+extern GFLAGS_DLL_DECL std::string CommandlineFlagsIntoString();
+// Usually where this is used, a FlagSaver should be used instead.
+extern GFLAGS_DLL_DECL
+bool ReadFlagsFromString(const std::string& flagfilecontents,
+                         const char* prog_name,
+                         bool errors_are_fatal);  // uses SET_FLAGS_VALUE
+
+// These let you manually implement --flagfile functionality.
+// DEPRECATED.
+extern GFLAGS_DLL_DECL bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name);
+extern GFLAGS_DLL_DECL bool ReadFromFlagsFile(const std::string& filename, const char* prog_name, bool errors_are_fatal);   // uses SET_FLAGS_VALUE
+
+
+// --------------------------------------------------------------------
+// Useful routines for initializing flags from the environment.
+// In each case, if 'varname' does not exist in the environment
+// return defval.  If 'varname' does exist but is not valid
+// (e.g., not a number for an int32 flag), abort with an error.
+// Otherwise, return the value.  NOTE: for booleans, for true use
+// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'.
+
+extern GFLAGS_DLL_DECL bool BoolFromEnv(const char *varname, bool defval);
+extern GFLAGS_DLL_DECL int32 Int32FromEnv(const char *varname, int32 defval);
+extern GFLAGS_DLL_DECL int64 Int64FromEnv(const char *varname, int64 defval);
+extern GFLAGS_DLL_DECL uint64 Uint64FromEnv(const char *varname, uint64 defval);
+extern GFLAGS_DLL_DECL double DoubleFromEnv(const char *varname, double defval);
+extern GFLAGS_DLL_DECL const char *StringFromEnv(const char *varname, const char *defval);
+
+
+// --------------------------------------------------------------------
+// The next two functions parse gflags from main():
+
+// Set the "usage" message for this program.  For example:
+//   string usage("This program does nothing.  Sample usage:\n");
+//   usage += argv[0] + " <uselessarg1> <uselessarg2>";
+//   SetUsageMessage(usage);
+// Do not include commandline flags in the usage: we do that for you!
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetUsageMessage(const std::string& usage);
+
+// Sets the version string, which is emitted with --version.
+// For instance: SetVersionString("1.3");
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetVersionString(const std::string& version);
+
+
+// Looks for flags in argv and parses them.  Rearranges argv to put
+// flags first, or removes them entirely if remove_flags is true.
+// If a flag is defined more than once in the command line or flag
+// file, the last definition is used.  Returns the index (into argv)
+// of the first non-flag argument.
+// See top-of-file for more details on this function.
+#ifndef SWIG   // In swig, use ParseCommandLineFlagsScript() instead.
+extern GFLAGS_DLL_DECL uint32 ParseCommandLineFlags(int *argc, char*** argv, bool remove_flags);
+#endif
+
+
+// Calls to ParseCommandLineNonHelpFlags and then to
+// HandleCommandLineHelpFlags can be used instead of a call to
+// ParseCommandLineFlags during initialization, in order to allow for
+// changing default values for some FLAGS (via
+// e.g. SetCommandLineOptionWithMode calls) between the time of
+// command line parsing and the time of dumping help information for
+// the flags as a result of command line parsing.  If a flag is
+// defined more than once in the command line or flag file, the last
+// definition is used.  Returns the index (into argv) of the first
+// non-flag argument.  (If remove_flags is true, will always return 1.)
+extern GFLAGS_DLL_DECL uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv, bool remove_flags);
+
+// This is actually defined in gflags_reporting.cc.
+// This function is misnamed (it also handles --version, etc.), but
+// it's too late to change that now. :-(
+extern GFLAGS_DLL_DECL void HandleCommandLineHelpFlags();   // in gflags_reporting.cc
+
+// Allow command line reparsing.  Disables the error normally
+// generated when an unknown flag is found, since it may be found in a
+// later parse.  Thread-hostile; meant to be called before any threads
+// are spawned.
+extern GFLAGS_DLL_DECL void AllowCommandLineReparsing();
+
+// Reparse the flags that have not yet been recognized.  Only flags
+// registered since the last parse will be recognized.  Any flag value
+// must be provided as part of the argument using "=", not as a
+// separate command line argument that follows the flag argument.
+// Intended for handling flags from dynamically loaded libraries,
+// since their flags are not registered until they are loaded.
+extern GFLAGS_DLL_DECL void ReparseCommandLineNonHelpFlags();
+
+// Clean up memory allocated by flags.  This is only needed to reduce
+// the quantity of "potentially leaked" reports emitted by memory
+// debugging tools such as valgrind.  It is not required for normal
+// operation, or for the google perftools heap-checker.  It must only
+// be called when the process is about to exit, and all threads that
+// might access flags are quiescent.  Referencing flags after this is
+// called will have unexpected consequences.  This is not safe to run
+// when multiple threads might be running: the function is
+// thread-hostile.
+extern GFLAGS_DLL_DECL void ShutDownCommandLineFlags();
+
+
+// --------------------------------------------------------------------
+// Now come the command line flag declaration/definition macros that
+// will actually be used.  They're kind of hairy.  A major reason
+// for this is initialization: we want people to be able to access
+// variables in global constructors and have that not crash, even if
+// their global constructor runs before the global constructor here.
+// (Obviously, we can't guarantee the flags will have the correct
+// default value in that case, but at least accessing them is safe.)
+// The only way to do that is have flags point to a static buffer.
+// So we make one, using a union to ensure proper alignment, and
+// then use placement-new to actually set up the flag with the
+// correct default value.  In the same vein, we have to worry about
+// flag access in global destructors, so FlagRegisterer has to be
+// careful never to destroy the flag-values it constructs.
+//
+// Note that when we define a flag variable FLAGS_<name>, we also
+// preemptively define a junk variable, FLAGS_no<name>.  This is to
+// cause a link-time error if someone tries to define 2 flags with
+// names like "logging" and "nologging".  We do this because a bool
+// flag FLAG can be set from the command line to true with a "-FLAG"
+// argument, and to false with a "-noFLAG" argument, and so this can
+// potentially avert confusion.
+//
+// We also put flags into their own namespace.  It is purposefully
+// named in an opaque way that people should have trouble typing
+// directly.  The idea is that DEFINE puts the flag in the weird
+// namespace, and DECLARE imports the flag from there into the current
+// namespace.  The net result is to force people to use DECLARE to get
+// access to a flag, rather than saying "extern GFLAGS_DLL_DECL bool FLAGS_whatever;"
+// or some such instead.  We want this so we can put extra
+// functionality (like sanity-checking) in DECLARE if we want, and
+// make sure it is picked up everywhere.
+//
+// We also put the type of the variable in the namespace, so that
+// people can't DECLARE_int32 something that they DEFINE_bool'd
+// elsewhere.
+
+class GFLAGS_DLL_DECL FlagRegisterer {
+ public:
+  FlagRegisterer(const char* name, const char* type,
+                 const char* help, const char* filename,
+                 void* current_storage, void* defvalue_storage);
+};
+
+// If your application #defines STRIP_FLAG_HELP to a non-zero value
+// before #including this file, we remove the help message from the
+// binary file. This can reduce the size of the resulting binary
+// somewhat, and may also be useful for security reasons.
+
+extern GFLAGS_DLL_DECL const char kStrippedFlagHelp[];
+
+
+} // namespace GFLAGS_NAMESPACE
+
+
+#ifndef SWIG  // In swig, ignore the main flag declarations
+
+#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0
+// Need this construct to avoid the 'defined but not used' warning.
+#define MAYBE_STRIPPED_HELP(txt) \
+   (false ? (txt) : GFLAGS_NAMESPACE::kStrippedFlagHelp)
+#else
+#define MAYBE_STRIPPED_HELP(txt) txt
+#endif
+
+// Each command-line flag has two variables associated with it: one
+// with the current value, and one with the default value.  However,
+// we have a third variable, which is where value is assigned; it's a
+// constant.  This guarantees that FLAG_##value is initialized at
+// static initialization time (e.g. before program-start) rather than
+// than global construction time (which is after program-start but
+// before main), at least when 'value' is a compile-time constant.  We
+// use a small trick for the "default value" variable, and call it
+// FLAGS_no<name>.  This serves the second purpose of assuring a
+// compile error if someone tries to define a flag named no<name>
+// which is illegal (--foo and --nofoo both affect the "foo" flag).
+#define DEFINE_VARIABLE(type, shorttype, name, value, help)             \
+  namespace fL##shorttype {                                             \
+    static const type FLAGS_nono##name = value;                         \
+    /* We always want to export defined variables, dll or no */         \
+    GFLAGS_DLL_DEFINE_FLAG type FLAGS_##name = FLAGS_nono##name;        \
+    type FLAGS_no##name = FLAGS_nono##name;                             \
+    static GFLAGS_NAMESPACE::FlagRegisterer o_##name(                   \
+      #name, #type, MAYBE_STRIPPED_HELP(help), __FILE__,                \
+      &FLAGS_##name, &FLAGS_no##name);                                  \
+  }                                                                     \
+  using fL##shorttype::FLAGS_##name
+
+// For DEFINE_bool, we want to do the extra check that the passed-in
+// value is actually a bool, and not a string or something that can be
+// coerced to a bool.  These declarations (no definition needed!) will
+// help us do that, and never evaluate From, which is important.
+// We'll use 'sizeof(IsBool(val))' to distinguish. This code requires
+// that the compiler have different sizes for bool & double. Since
+// this is not guaranteed by the standard, we check it with a
+// COMPILE_ASSERT.
+namespace fLB {
+struct CompileAssert {};
+typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[
+                      (sizeof(double) != sizeof(bool)) ? 1 : -1];
+template<typename From> double GFLAGS_DLL_DECL IsBoolFlag(const From& from);
+GFLAGS_DLL_DECL bool IsBoolFlag(bool from);
+}  // namespace fLB
+
+// Here are the actual DEFINE_*-macros. The respective DECLARE_*-macros
+// are in a separate include, gflags_declare.h, for reducing
+// the physical transitive size for DECLARE use.
+#define DEFINE_bool(name, val, txt)                                     \
+  namespace fLB {                                                       \
+    typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[     \
+            (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double))? 1: -1]; \
+  }                                                                     \
+  DEFINE_VARIABLE(bool, B, name, val, txt)
+
+#define DEFINE_int32(name, val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::int32, I, \
+                   name, val, txt)
+
+#define DEFINE_int64(name, val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::int64, I64, \
+                   name, val, txt)
+
+#define DEFINE_uint64(name,val, txt) \
+   DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint64, U64, \
+                   name, val, txt)
+
+#define DEFINE_double(name, val, txt) \
+   DEFINE_VARIABLE(double, D, name, val, txt)
+
+// Strings are trickier, because they're not a POD, so we can't
+// construct them at static-initialization time (instead they get
+// constructed at global-constructor time, which is much later).  To
+// try to avoid crashes in that case, we use a char buffer to store
+// the string, which we can static-initialize, and then placement-new
+// into it later.  It's not perfect, but the best we can do.
+
+namespace fLS {
+
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const char *value) {
+  return new(stringspot) clstring(value);
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const clstring &value) {
+  return new(stringspot) clstring(value);
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           int value);
+}  // namespace fLS
+
+// We need to define a var named FLAGS_no##name so people don't define
+// --string and --nostring.  And we need a temporary place to put val
+// so we don't have to evaluate it twice.  Two great needs that go
+// great together!
+// The weird 'using' + 'extern' inside the fLS namespace is to work around
+// an unknown compiler bug/issue with the gcc 4.2.1 on SUSE 10.  See
+//    http://code.google.com/p/google-gflags/issues/detail?id=20
+#define DEFINE_string(name, val, txt)                                       \
+  namespace fLS {                                                           \
+    using ::fLS::clstring;                                                  \
+    static union { void* align; char s[sizeof(clstring)]; } s_##name[2];    \
+    clstring* const FLAGS_no##name = ::fLS::                                \
+                                   dont_pass0toDEFINE_string(s_##name[0].s, \
+                                                             val);          \
+    static GFLAGS_NAMESPACE::FlagRegisterer o_##name(                       \
+        #name, "string", MAYBE_STRIPPED_HELP(txt), __FILE__,                \
+        s_##name[0].s, new (s_##name[1].s) clstring(*FLAGS_no##name));      \
+    extern GFLAGS_DLL_DEFINE_FLAG clstring& FLAGS_##name;                   \
+    using fLS::FLAGS_##name;                                                \
+    clstring& FLAGS_##name = *FLAGS_no##name;                               \
+  }                                                                         \
+  using fLS::FLAGS_##name
+
+#endif  // SWIG
+
+
+// Import gflags library symbols into alternative/deprecated namespace(s)
+#include "gflags_gflags.h"
+
+
+#endif  // GFLAGS_GFLAGS_H_
--- a/Показать больше
+++ b/Показать больше