Bug 1816576 - Update libyuv to 2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2 r=ng

Differential Revision: https://phabricator.services.mozilla.com/D185940
2023-10-11 16:22:32 +00:00 · 2023-10-11 16:22:32 +00:00 · 436a11e04d
--- a/media/libyuv/libyuv/README.chromium
+++ b/media/libyuv/libyuv/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1857
+Version: 1860
 License: BSD
 License File: LICENSE

--- a/media/libyuv/libyuv/include/libyuv/rotate_row.h
+++ b/media/libyuv/libyuv/include/libyuv/rotate_row.h
@ -232,6 +232,27 @@ void TransposeWx1_16_C(const uint16_t* src,
                       uint16_t* dst,
                       int dst_stride,
                       int width);
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+
+void Transpose4x4_32_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+
+// Transpose 32 bit values (ARGB)
+void Transpose8x8_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/media/libyuv/libyuv/include/libyuv/row.h
+++ b/media/libyuv/libyuv/include/libyuv/row.h
@ -177,9 +177,8 @@ extern "C" {

 // The following functions fail on gcc/clang 32 bit with fpic and framepointer.
 // caveat: clangcl uses row_win.cc which works.
-#if !defined(MOZ_PROFILING) && \
-     (defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
-      defined(_MSC_VER))
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+    defined(_MSC_VER)
 // TODO(fbarchard): fix build error on android_full_debug=1
 // https://code.google.com/p/libyuv/issues/detail?id=517
 #define HAS_I422ALPHATOARGBROW_SSSE3
@ -248,9 +247,8 @@ extern "C" {
 #define HAS_ARGBATTENUATEROW_AVX2
 #endif

-#if !defined(MOZ_PROFILING) && \
-  (defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
-   defined(_MSC_VER))
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+    defined(_MSC_VER)
 // TODO(fbarchard): fix build error on android_full_debug=1
 // https://code.google.com/p/libyuv/issues/detail?id=517
 #define HAS_I422ALPHATOARGBROW_AVX2
@ -404,9 +402,11 @@ extern "C" {
 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
 // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+// TODO(fbarchard): Port MERGEUV to assembly
 #if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512))
+    (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512) && !defined(_MSC_VER))
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
+#define HAS_MERGEUVROW_AVX512BW
 #endif

 // The following are available for AVX512 clang x64 platforms:
@ -2186,6 +2186,10 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width);
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_uv,
+                         int width);
 void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
@ -2206,6 +2210,10 @@ void MergeUVRow_Any_AVX2(const uint8_t* y_buf,
                         const uint8_t* uv_buf,
                         uint8_t* dst_ptr,
                         int width);
+void MergeUVRow_Any_AVX512BW(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
 void MergeUVRow_Any_NEON(const uint8_t* y_buf,
                         const uint8_t* uv_buf,
                         uint8_t* dst_ptr,
--- a/media/libyuv/libyuv/include/libyuv/version.h
+++ b/media/libyuv/libyuv/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1857
+#define LIBYUV_VERSION 1860

 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/media/libyuv/libyuv/libyuv.gyp
+++ b/media/libyuv/libyuv/libyuv.gyp
@ -52,7 +52,7 @@
        'optimize': 'max',  # enable O2 and ltcg.
      },
      # Allows libyuv.a redistributable library without external dependencies.
-      # 'standalone_static_library': 1,
+      'standalone_static_library': 1,
      'conditions': [
       # Disable -Wunused-parameter
        ['clang == 1', {
@ -70,11 +70,6 @@
            '-mfpu=vfpv3-d16',
            # '-mthumb',  # arm32 not thumb
          ],
-          'cflags_mozilla!': [
-            '-mfpu=vfp',
-            '-mfpu=vfpv3',
-            '-mfpu=vfpv3-d16',
-          ],
          'conditions': [
            # Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
            ['clang == 0 and use_lto == 1', {
@ -89,9 +84,6 @@
                '-mfpu=neon',
                # '-marm',  # arm32 not thumb
              ],
-              'cflags_mozilla': [
-                '-mfpu=neon',
-              ],
            }],
          ],
        }],
@ -100,15 +92,7 @@
            'LIBYUV_MSA',
          ],
        }],
-        ['build_with_mozilla == 1', {
-          'defines': [
-            'HAVE_JPEG'
-          ],
-          'cflags_mozilla': [
-            '$(MOZ_JPEG_CFLAGS)',
-          ],
-        }],
-        ['OS != "ios" and libyuv_disable_jpeg != 1 and build_with_mozilla != 1', {
+        ['OS != "ios" and libyuv_disable_jpeg != 1', {
          'defines': [
            'HAVE_JPEG'
          ],
--- a/media/libyuv/libyuv/source/convert.cc
+++ b/media/libyuv/libyuv/source/convert.cc
@ -924,6 +924,14 @@ int I422ToNV21(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeUVRow = MergeUVRow_Any_NEON;
--- a/media/libyuv/libyuv/source/convert_from_argb.cc
+++ b/media/libyuv/libyuv/source/convert_from_argb.cc
@ -389,6 +389,14 @@ int ARGBToNV12(const uint8_t* src_argb,
    }
  }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeUVRow_ = MergeUVRow_Any_NEON;
@ -559,6 +567,14 @@ int ARGBToNV21(const uint8_t* src_argb,
    }
  }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeUVRow_ = MergeUVRow_Any_NEON;
@ -726,6 +742,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
    }
  }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeUVRow_ = MergeUVRow_Any_NEON;
@ -894,6 +918,14 @@ int ABGRToNV21(const uint8_t* src_abgr,
    }
  }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeUVRow_ = MergeUVRow_Any_NEON;
@ -2921,6 +2953,14 @@ int RAWToJNV21(const uint8_t* src_raw,
    }
  }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeUVRow_ = MergeUVRow_Any_NEON;
--- a/media/libyuv/libyuv/source/mjpeg_decoder.cc
+++ b/media/libyuv/libyuv/source/mjpeg_decoder.cc
@ -79,9 +79,7 @@ MJpegDecoder::MJpegDecoder()
  decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
  // Override standard exit()-based error handler.
  error_mgr_->base.error_exit = &ErrorHandler;
-#ifndef DEBUG_MJPEG
  error_mgr_->base.output_message = &OutputHandler;
-#endif
 #endif
  decompress_struct_->client_data = NULL;
  source_mgr_->init_source = &init_source;
@ -465,12 +463,11 @@ void ErrorHandler(j_common_ptr cinfo) {
  longjmp(mgr->setjmp_buffer, 1);
 }

-#ifndef DEBUG_MJPEG
 // Suppress fprintf warnings.
 void OutputHandler(j_common_ptr cinfo) {
  (void)cinfo;
 }
-#endif
+
 #endif  // HAVE_SETJMP

 void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
--- a/media/libyuv/libyuv/source/planar_functions.cc
+++ b/media/libyuv/libyuv/source/planar_functions.cc
@ -599,6 +599,14 @@ void MergeUVPlane(const uint8_t* src_u,
    }
  }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 32)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeUVRow = MergeUVRow_Any_NEON;
--- a/media/libyuv/libyuv/source/rotate_common.cc
+++ b/media/libyuv/libyuv/source/rotate_common.cc
@ -166,6 +166,63 @@ void TransposeWxH_16_C(const uint16_t* src,
  }
 }

+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width) {
+  const uint8_t* src1 = src + src_stride;
+  const uint8_t* src2 = src1 + src_stride;
+  const uint8_t* src3 = src2 + src_stride;
+  uint8_t* dst1 = dst + dst_stride;
+  uint8_t* dst2 = dst1 + dst_stride;
+  uint8_t* dst3 = dst2 + dst_stride;
+  int i;
+  for (i = 0; i < width; i += 4) {
+    uint32_t p00 = ((uint32_t*)(src))[0];
+    uint32_t p10 = ((uint32_t*)(src))[1];
+    uint32_t p20 = ((uint32_t*)(src))[2];
+    uint32_t p30 = ((uint32_t*)(src))[3];
+    uint32_t p01 = ((uint32_t*)(src1))[0];
+    uint32_t p11 = ((uint32_t*)(src1))[1];
+    uint32_t p21 = ((uint32_t*)(src1))[2];
+    uint32_t p31 = ((uint32_t*)(src1))[3];
+    uint32_t p02 = ((uint32_t*)(src2))[0];
+    uint32_t p12 = ((uint32_t*)(src2))[1];
+    uint32_t p22 = ((uint32_t*)(src2))[2];
+    uint32_t p32 = ((uint32_t*)(src2))[3];
+    uint32_t p03 = ((uint32_t*)(src3))[0];
+    uint32_t p13 = ((uint32_t*)(src3))[1];
+    uint32_t p23 = ((uint32_t*)(src3))[2];
+    uint32_t p33 = ((uint32_t*)(src3))[3];
+    ((uint32_t*)(dst))[0] = p00;
+    ((uint32_t*)(dst))[1] = p01;
+    ((uint32_t*)(dst))[2] = p02;
+    ((uint32_t*)(dst))[3] = p03;
+    ((uint32_t*)(dst1))[0] = p10;
+    ((uint32_t*)(dst1))[1] = p11;
+    ((uint32_t*)(dst1))[2] = p12;
+    ((uint32_t*)(dst1))[3] = p13;
+    ((uint32_t*)(dst2))[0] = p20;
+    ((uint32_t*)(dst2))[1] = p21;
+    ((uint32_t*)(dst2))[2] = p22;
+    ((uint32_t*)(dst2))[3] = p23;
+    ((uint32_t*)(dst3))[0] = p30;
+    ((uint32_t*)(dst3))[1] = p31;
+    ((uint32_t*)(dst3))[2] = p32;
+    ((uint32_t*)(dst3))[3] = p33;
+    src += src_stride * 4;  // advance 4 rows
+    src1 += src_stride * 4;
+    src2 += src_stride * 4;
+    src3 += src_stride * 4;
+    dst += 4 * 4;  // advance 4 columns
+    dst1 += 4 * 4;
+    dst2 += 4 * 4;
+    dst3 += 4 * 4;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/media/libyuv/libyuv/source/rotate_neon64.cc
+++ b/media/libyuv/libyuv/source/rotate_neon64.cc
@ -435,6 +435,45 @@ void TransposeUVWx8_NEON(const uint8_t* src,
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
 }
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width) {
+  const uint8_t* src1 = src + src_stride;
+  const uint8_t* src2 = src1 + src_stride;
+  const uint8_t* src3 = src2 + src_stride;
+  uint8_t* dst1 = dst + dst_stride;
+  uint8_t* dst2 = dst1 + dst_stride;
+  uint8_t* dst3 = dst2 + dst_stride;
+  asm volatile(
+      // Main loop transpose 4x4.  Read a column, write a row.
+      "1:                                        \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n"
+      "subs        %w8, %w8, #4                  \n"  // w -= 4
+      "st1         {v0.4s}, [%4], 16             \n"
+      "st1         {v1.4s}, [%5], 16             \n"
+      "st1         {v2.4s}, [%6], 16             \n"
+      "st1         {v3.4s}, [%7], 16             \n"
+      "b.gt        1b                            \n"
+      : "+r"(src),                        // %0
+        "+r"(src1),                       // %1
+        "+r"(src2),                       // %2
+        "+r"(src3),                       // %3
+        "+r"(dst),                        // %4
+        "+r"(dst1),                       // %5
+        "+r"(dst2),                       // %6
+        "+r"(dst3),                       // %7
+        "+r"(width)                       // %8
+      : "r"((ptrdiff_t)(src_stride * 4))  // %9
+      : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

 #ifdef __cplusplus
--- a/media/libyuv/libyuv/source/row_any.cc
+++ b/media/libyuv/libyuv/source/row_any.cc
@ -571,6 +571,9 @@ ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
 #ifdef HAS_MERGEUVROW_AVX2
 ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
 #endif
+#ifdef HAS_MERGEUVROW_AVX512BW
+ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
+#endif
 #ifdef HAS_MERGEUVROW_NEON
 ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
 #endif
--- a/media/libyuv/libyuv/source/row_gcc.cc
+++ b/media/libyuv/libyuv/source/row_gcc.cc
@ -17,6 +17,8 @@ extern "C" {
 // This module is for GCC x86 and x64.
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

+#include <immintrin.h>
+
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

 // Constants for ARGB
@ -5142,6 +5144,25 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
 }
 #endif  // HAS_DETILESPLITUVROW_SSSE3

+#ifdef HAS_MERGEUVROW_AVX512BW
+__attribute__ ((target("avx512vl,avx512bw")))
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  do {
+    const __m512i u = _mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_u));
+    const __m512i v = _mm512_slli_epi64(_mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_v)), 8);
+    const __m512i uv = _mm512_or_si512(u, v);
+    _mm512_storeu_epi8(dst_uv, uv);
+    src_u += 32;
+    src_v += 32;
+    dst_uv += 64;
+    width -= 32;
+  } while (width > 0);
+}
+#endif  // HAS_MERGEUVROW_AVX512BW
+
 #ifdef HAS_MERGEUVROW_AVX2
 void MergeUVRow_AVX2(const uint8_t* src_u,
                     const uint8_t* src_v,
--- a/media/libyuv/libyuv/source/row_lasx.cc
+++ b/media/libyuv/libyuv/source/row_lasx.cc
@ -2036,8 +2036,8 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
                                  uint8_t* dst_y,
                                  int width,
                                  const struct RgbConstants* rgbconstants) {
-    int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-    asm volatile(
+  int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+  asm volatile(
      "xvldrepl.b      $xr0,  %3,    0             \n\t"  // load rgbconstants
      "xvldrepl.b      $xr1,  %3,    1             \n\t"  // load rgbconstants
      "xvldrepl.b      $xr2,  %3,    2             \n\t"  // load rgbconstants
@ -2047,19 +2047,21 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
      "xvld            $xr4,  %0,    0             \n\t"
      "xvld            $xr5,  %0,    32            \n\t"
      "xvld            $xr6,  %0,    64            \n\t"
-      "xvld            $xr7,  %0,    96            \n\t"  // load 32 pixels of ARGB
+      "xvld            $xr7,  %0,    96            \n\t"  // load 32 pixels of
+                                                          // ARGB
      "xvor.v          $xr12, $xr3,  $xr3          \n\t"
      "xvor.v          $xr13, $xr3,  $xr3          \n\t"
-      "addi.d          %2,    %2,    -32           \n\t"  // 32 processed per loop.
-      "xvpickev.b      $xr8,  $xr5,  $xr4          \n\t"  //BR
+      "addi.d          %2,    %2,    -32           \n\t"  // 32 processed per
+                                                          // loop.
+      "xvpickev.b      $xr8,  $xr5,  $xr4          \n\t"  // BR
      "xvpickev.b      $xr10, $xr7,  $xr6          \n\t"
-      "xvpickod.b      $xr9,  $xr5,  $xr4          \n\t"  //GA
+      "xvpickod.b      $xr9,  $xr5,  $xr4          \n\t"  // GA
      "xvpickod.b      $xr11, $xr7,  $xr6          \n\t"
-      "xvmaddwev.h.bu  $xr12, $xr8,  $xr0          \n\t"  //B
+      "xvmaddwev.h.bu  $xr12, $xr8,  $xr0          \n\t"  // B
      "xvmaddwev.h.bu  $xr13, $xr10, $xr0          \n\t"
-      "xvmaddwev.h.bu  $xr12, $xr9,  $xr1          \n\t"  //G
+      "xvmaddwev.h.bu  $xr12, $xr9,  $xr1          \n\t"  // G
      "xvmaddwev.h.bu  $xr13, $xr11, $xr1          \n\t"
-      "xvmaddwod.h.bu  $xr12, $xr8,  $xr2          \n\t"  //R
+      "xvmaddwod.h.bu  $xr12, $xr8,  $xr2          \n\t"  // R
      "xvmaddwod.h.bu  $xr13, $xr10, $xr2          \n\t"
      "addi.d          %0,    %0,    128           \n\t"
      "xvpickod.b      $xr10, $xr13, $xr12         \n\t"
@ -2067,13 +2069,11 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
      "xvst            $xr11, %1,    0             \n\t"
      "addi.d          %1,    %1,    32            \n\t"
      "bnez            %2,    1b                   \n\t"
-      : "+&r"(src_argb),    // %0
-        "+&r"(dst_y),       // %1
-        "+&r"(width)        // %2
-      : "r"(rgbconstants),
-        "r"(shuff)
-      : "memory"
-    );
+      : "+&r"(src_argb),  // %0
+        "+&r"(dst_y),     // %1
+        "+&r"(width)      // %2
+      : "r"(rgbconstants), "r"(shuff)
+      : "memory");
 }

 void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -2098,8 +2098,8 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
                                  uint8_t* dst_y,
                                  int width,
                                  const struct RgbConstants* rgbconstants) {
-    int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-    asm volatile(
+  int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+  asm volatile(
      "xvldrepl.b      $xr0,  %3,    0             \n\t"  // load rgbconstants
      "xvldrepl.b      $xr1,  %3,    1             \n\t"  // load rgbconstants
      "xvldrepl.b      $xr2,  %3,    2             \n\t"  // load rgbconstants
@ -2109,19 +2109,21 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
      "xvld            $xr4,  %0,    0             \n\t"
      "xvld            $xr5,  %0,    32            \n\t"
      "xvld            $xr6,  %0,    64            \n\t"
-      "xvld            $xr7,  %0,    96            \n\t"  // load 32 pixels of RGBA
+      "xvld            $xr7,  %0,    96            \n\t"  // load 32 pixels of
+                                                          // RGBA
      "xvor.v          $xr12, $xr3,  $xr3          \n\t"
      "xvor.v          $xr13, $xr3,  $xr3          \n\t"
-      "addi.d          %2,    %2,    -32           \n\t"  // 32 processed per loop.
-      "xvpickev.b      $xr8,  $xr5,  $xr4          \n\t"  //AG
+      "addi.d          %2,    %2,    -32           \n\t"  // 32 processed per
+                                                          // loop.
+      "xvpickev.b      $xr8,  $xr5,  $xr4          \n\t"  // AG
      "xvpickev.b      $xr10, $xr7,  $xr6          \n\t"
-      "xvpickod.b      $xr9,  $xr5,  $xr4          \n\t"  //BR
+      "xvpickod.b      $xr9,  $xr5,  $xr4          \n\t"  // BR
      "xvpickod.b      $xr11, $xr7,  $xr6          \n\t"
-      "xvmaddwev.h.bu  $xr12, $xr9,  $xr0          \n\t"  //B
+      "xvmaddwev.h.bu  $xr12, $xr9,  $xr0          \n\t"  // B
      "xvmaddwev.h.bu  $xr13, $xr11, $xr0          \n\t"
-      "xvmaddwod.h.bu  $xr12, $xr8,  $xr1          \n\t"  //G
+      "xvmaddwod.h.bu  $xr12, $xr8,  $xr1          \n\t"  // G
      "xvmaddwod.h.bu  $xr13, $xr10, $xr1          \n\t"
-      "xvmaddwod.h.bu  $xr12, $xr9,  $xr2          \n\t"  //R
+      "xvmaddwod.h.bu  $xr12, $xr9,  $xr2          \n\t"  // R
      "xvmaddwod.h.bu  $xr13, $xr11, $xr2          \n\t"
      "addi.d          %0,    %0,    128           \n\t"
      "xvpickod.b      $xr10, $xr13, $xr12         \n\t"
@ -2129,13 +2131,11 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
      "xvst            $xr11, %1,    0             \n\t"
      "addi.d          %1,    %1,    32            \n\t"
      "bnez            %2,    1b                   \n\t"
-      : "+&r"(src_rgba),    // %0
-        "+&r"(dst_y),       // %1
-        "+&r"(width)        // %2
-      : "r"(rgbconstants),
-        "r"(shuff)
-      : "memory"
-    );
+      : "+&r"(src_rgba),  // %0
+        "+&r"(dst_y),     // %1
+        "+&r"(width)      // %2
+      : "r"(rgbconstants), "r"(shuff)
+      : "memory");
 }

 void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@ -2151,18 +2151,19 @@ void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
 }

 static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
-                                  uint8_t* dst_y,
-                                  int width,
-                                  const struct RgbConstants* rgbconstants) {
-    int8_t shuff[128] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
-                         0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
-                         24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
-                         24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
-                         1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
-                         1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
-                         25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
-                         25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
-    asm volatile(
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct RgbConstants* rgbconstants) {
+  int8_t shuff[128] = {
+      0,  2,  3,  5,  6,  8, 9,  11, 12, 14, 15, 17, 18, 20, 21, 23,
+      0,  2,  3,  5,  6,  8, 9,  11, 12, 14, 15, 17, 18, 20, 21, 23,
+      24, 26, 27, 29, 30, 0, 1,  3,  4,  6,  7,  9,  10, 12, 13, 15,
+      24, 26, 27, 29, 30, 0, 1,  3,  4,  6,  7,  9,  10, 12, 13, 15,
+      1,  0,  4,  0,  7,  0, 10, 0,  13, 0,  16, 0,  19, 0,  22, 0,
+      1,  0,  4,  0,  7,  0, 10, 0,  13, 0,  16, 0,  19, 0,  22, 0,
+      25, 0,  28, 0,  31, 0, 2,  0,  5,  0,  8,  0,  11, 0,  14, 0,
+      25, 0,  28, 0,  31, 0, 2,  0,  5,  0,  8,  0,  11, 0,  14, 0};
+  asm volatile(
      "xvldrepl.b      $xr0,  %3,    0             \n\t"  // load rgbconstants
      "xvldrepl.b      $xr1,  %3,    1             \n\t"  // load rgbconstants
      "xvldrepl.b      $xr2,  %3,    2             \n\t"  // load rgbconstants
@ -2174,23 +2175,25 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
      "1:                                          \n\t"
      "xvld            $xr8,  %0,    0             \n\t"
      "xvld            $xr9,  %0,    32            \n\t"
-      "xvld            $xr10, %0,    64            \n\t"  // load 32 pixels of RGB
+      "xvld            $xr10, %0,    64            \n\t"  // load 32 pixels of
+                                                          // RGB
      "xvor.v          $xr12, $xr3,  $xr3          \n\t"
      "xvor.v          $xr13, $xr3,  $xr3          \n\t"
      "xvor.v          $xr11, $xr9,  $xr9          \n\t"
-      "addi.d          %2,    %2,    -32           \n\t"  // 32 processed per loop.
-      "xvpermi.q       $xr9,  $xr8,  0x30          \n\t"  //src0
-      "xvpermi.q       $xr8,  $xr10, 0x03          \n\t"  //src1
-      "xvpermi.q       $xr10, $xr11, 0x30          \n\t"  //src2
+      "addi.d          %2,    %2,    -32           \n\t"  // 32 processed per
+                                                          // loop.
+      "xvpermi.q       $xr9,  $xr8,  0x30          \n\t"  // src0
+      "xvpermi.q       $xr8,  $xr10, 0x03          \n\t"  // src1
+      "xvpermi.q       $xr10, $xr11, 0x30          \n\t"  // src2
      "xvshuf.b        $xr14, $xr8,  $xr9,  $xr4   \n\t"
      "xvshuf.b        $xr15, $xr8,  $xr10, $xr5   \n\t"
      "xvshuf.b        $xr16, $xr8,  $xr9,  $xr6   \n\t"
      "xvshuf.b        $xr17, $xr8,  $xr10, $xr7   \n\t"
-      "xvmaddwev.h.bu  $xr12, $xr16, $xr1          \n\t"  //G
+      "xvmaddwev.h.bu  $xr12, $xr16, $xr1          \n\t"  // G
      "xvmaddwev.h.bu  $xr13, $xr17, $xr1          \n\t"
-      "xvmaddwev.h.bu  $xr12, $xr14, $xr0          \n\t"  //B
+      "xvmaddwev.h.bu  $xr12, $xr14, $xr0          \n\t"  // B
      "xvmaddwev.h.bu  $xr13, $xr15, $xr0          \n\t"
-      "xvmaddwod.h.bu  $xr12, $xr14, $xr2          \n\t"  //R
+      "xvmaddwod.h.bu  $xr12, $xr14, $xr2          \n\t"  // R
      "xvmaddwod.h.bu  $xr13, $xr15, $xr2          \n\t"
      "addi.d          %0,    %0,    96            \n\t"
      "xvpickod.b      $xr10, $xr13, $xr12         \n\t"
@ -2202,8 +2205,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
        "+&r"(width)        // %2
      : "r"(rgbconstants),  // %3
        "r"(shuff)          // %4
-      : "memory"
-    );
+      : "memory");
 }

 void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
--- a/media/libyuv/libyuv/source/row_lsx.cc
+++ b/media/libyuv/libyuv/source/row_lsx.cc
@ -1679,7 +1679,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
                                 uint8_t* dst_y,
                                 int width,
                                 const struct RgbConstants* rgbconstants) {
-    asm volatile(
+  asm volatile(
      "vldrepl.b      $vr0,  %3,    0             \n\t"  // load rgbconstants
      "vldrepl.b      $vr1,  %3,    1             \n\t"  // load rgbconstants
      "vldrepl.b      $vr2,  %3,    2             \n\t"  // load rgbconstants
@ -1688,31 +1688,32 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
      "vld            $vr4,  %0,    0             \n\t"
      "vld            $vr5,  %0,    16            \n\t"
      "vld            $vr6,  %0,    32            \n\t"
-      "vld            $vr7,  %0,    48            \n\t"  // load 16 pixels of ARGB
+      "vld            $vr7,  %0,    48            \n\t"  // load 16 pixels of
+                                                         // ARGB
      "vor.v          $vr12, $vr3,  $vr3          \n\t"
      "vor.v          $vr13, $vr3,  $vr3          \n\t"
-      "addi.d         %2,    %2,    -16           \n\t"  // 16 processed per loop.
-      "vpickev.b      $vr8,  $vr5,  $vr4          \n\t"  //BR
+      "addi.d         %2,    %2,    -16           \n\t"  // 16 processed per
+                                                         // loop.
+      "vpickev.b      $vr8,  $vr5,  $vr4          \n\t"  // BR
      "vpickev.b      $vr10, $vr7,  $vr6          \n\t"
-      "vpickod.b      $vr9,  $vr5,  $vr4          \n\t"  //GA
+      "vpickod.b      $vr9,  $vr5,  $vr4          \n\t"  // GA
      "vpickod.b      $vr11, $vr7,  $vr6          \n\t"
-      "vmaddwev.h.bu  $vr12, $vr8,  $vr0          \n\t"  //B
+      "vmaddwev.h.bu  $vr12, $vr8,  $vr0          \n\t"  // B
      "vmaddwev.h.bu  $vr13, $vr10, $vr0          \n\t"
-      "vmaddwev.h.bu  $vr12, $vr9,  $vr1          \n\t"  //G
+      "vmaddwev.h.bu  $vr12, $vr9,  $vr1          \n\t"  // G
      "vmaddwev.h.bu  $vr13, $vr11, $vr1          \n\t"
-      "vmaddwod.h.bu  $vr12, $vr8,  $vr2          \n\t"  //R
+      "vmaddwod.h.bu  $vr12, $vr8,  $vr2          \n\t"  // R
      "vmaddwod.h.bu  $vr13, $vr10, $vr2          \n\t"
      "addi.d         %0,    %0,    64            \n\t"
      "vpickod.b      $vr10, $vr13, $vr12         \n\t"
      "vst            $vr10, %1,    0             \n\t"
      "addi.d         %1,    %1,    16            \n\t"
      "bnez           %2,    1b                   \n\t"
-      : "+&r"(src_argb),    // %0
-        "+&r"(dst_y),       // %1
-        "+&r"(width)        // %2
+      : "+&r"(src_argb),  // %0
+        "+&r"(dst_y),     // %1
+        "+&r"(width)      // %2
      : "r"(rgbconstants)
-      : "memory"
-    );
+      : "memory");
 }

 void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -1737,7 +1738,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
                                 uint8_t* dst_y,
                                 int width,
                                 const struct RgbConstants* rgbconstants) {
-    asm volatile(
+  asm volatile(
      "vldrepl.b      $vr0,  %3,    0             \n\t"  // load rgbconstants
      "vldrepl.b      $vr1,  %3,    1             \n\t"  // load rgbconstants
      "vldrepl.b      $vr2,  %3,    2             \n\t"  // load rgbconstants
@ -1746,31 +1747,32 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
      "vld            $vr4,  %0,    0             \n\t"
      "vld            $vr5,  %0,    16            \n\t"
      "vld            $vr6,  %0,    32            \n\t"
-      "vld            $vr7,  %0,    48            \n\t"  // load 16 pixels of RGBA
+      "vld            $vr7,  %0,    48            \n\t"  // load 16 pixels of
+                                                         // RGBA
      "vor.v          $vr12, $vr3,  $vr3          \n\t"
      "vor.v          $vr13, $vr3,  $vr3          \n\t"
-      "addi.d         %2,    %2,    -16           \n\t"  // 16 processed per loop.
-      "vpickev.b      $vr8,  $vr5,  $vr4          \n\t"  //AG
+      "addi.d         %2,    %2,    -16           \n\t"  // 16 processed per
+                                                         // loop.
+      "vpickev.b      $vr8,  $vr5,  $vr4          \n\t"  // AG
      "vpickev.b      $vr10, $vr7,  $vr6          \n\t"
-      "vpickod.b      $vr9,  $vr5,  $vr4          \n\t"  //BR
+      "vpickod.b      $vr9,  $vr5,  $vr4          \n\t"  // BR
      "vpickod.b      $vr11, $vr7,  $vr6          \n\t"
-      "vmaddwev.h.bu  $vr12, $vr9,  $vr0          \n\t"  //B
+      "vmaddwev.h.bu  $vr12, $vr9,  $vr0          \n\t"  // B
      "vmaddwev.h.bu  $vr13, $vr11, $vr0          \n\t"
-      "vmaddwod.h.bu  $vr12, $vr8,  $vr1          \n\t"  //G
+      "vmaddwod.h.bu  $vr12, $vr8,  $vr1          \n\t"  // G
      "vmaddwod.h.bu  $vr13, $vr10, $vr1          \n\t"
-      "vmaddwod.h.bu  $vr12, $vr9,  $vr2          \n\t"  //R
+      "vmaddwod.h.bu  $vr12, $vr9,  $vr2          \n\t"  // R
      "vmaddwod.h.bu  $vr13, $vr11, $vr2          \n\t"
      "addi.d         %0,    %0,    64            \n\t"
      "vpickod.b      $vr10, $vr13, $vr12         \n\t"
      "vst            $vr10, %1,    0             \n\t"
      "addi.d         %1,    %1,    16            \n\t"
      "bnez           %2,    1b                   \n\t"
-      : "+&r"(src_rgba),    // %0
-        "+&r"(dst_y),       // %1
-        "+&r"(width)        // %2
+      : "+&r"(src_rgba),  // %0
+        "+&r"(dst_y),     // %1
+        "+&r"(width)      // %2
      : "r"(rgbconstants)
-      : "memory"
-    );
+      : "memory");
 }

 void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@ -1789,11 +1791,12 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
                                uint8_t* dst_y,
                                int width,
                                const struct RgbConstants* rgbconstants) {
-    int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
-                        24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
-                        1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
-                        25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
-    asm volatile(
+  int8_t shuff[64] = {0,  2,  3,  5,  6,  8,  9,  11, 12, 14, 15, 17, 18,
+                      20, 21, 23, 24, 26, 27, 29, 30, 0,  1,  3,  4,  6,
+                      7,  9,  10, 12, 13, 15, 1,  0,  4,  0,  7,  0,  10,
+                      0,  13, 0,  16, 0,  19, 0,  22, 0,  25, 0,  28, 0,
+                      31, 0,  2,  0,  5,  0,  8,  0,  11, 0,  14, 0};
+  asm volatile(
      "vldrepl.b      $vr0,  %3,    0             \n\t"  // load rgbconstants
      "vldrepl.b      $vr1,  %3,    1             \n\t"  // load rgbconstants
      "vldrepl.b      $vr2,  %3,    2             \n\t"  // load rgbconstants
@ -1805,19 +1808,21 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
      "1:                                         \n\t"
      "vld            $vr8,  %0,    0             \n\t"
      "vld            $vr9,  %0,    16            \n\t"
-      "vld            $vr10, %0,    32            \n\t"  // load 16 pixels of RGB
+      "vld            $vr10, %0,    32            \n\t"  // load 16 pixels of
+                                                         // RGB
      "vor.v          $vr12, $vr3,  $vr3          \n\t"
      "vor.v          $vr13, $vr3,  $vr3          \n\t"
-      "addi.d         %2,    %2,    -16           \n\t"  // 16 processed per loop.
+      "addi.d         %2,    %2,    -16           \n\t"  // 16 processed per
+                                                         // loop.
      "vshuf.b        $vr14, $vr9,  $vr8,  $vr4   \n\t"
      "vshuf.b        $vr15, $vr9,  $vr10, $vr5   \n\t"
      "vshuf.b        $vr16, $vr9,  $vr8,  $vr6   \n\t"
      "vshuf.b        $vr17, $vr9,  $vr10, $vr7   \n\t"
-      "vmaddwev.h.bu  $vr12, $vr16, $vr1          \n\t"  //G
+      "vmaddwev.h.bu  $vr12, $vr16, $vr1          \n\t"  // G
      "vmaddwev.h.bu  $vr13, $vr17, $vr1          \n\t"
-      "vmaddwev.h.bu  $vr12, $vr14, $vr0          \n\t"  //B
+      "vmaddwev.h.bu  $vr12, $vr14, $vr0          \n\t"  // B
      "vmaddwev.h.bu  $vr13, $vr15, $vr0          \n\t"
-      "vmaddwod.h.bu  $vr12, $vr14, $vr2          \n\t"  //R
+      "vmaddwod.h.bu  $vr12, $vr14, $vr2          \n\t"  // R
      "vmaddwod.h.bu  $vr13, $vr15, $vr2          \n\t"
      "addi.d         %0,    %0,    48            \n\t"
      "vpickod.b      $vr10, $vr13, $vr12         \n\t"
@ -1829,8 +1834,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
        "+&r"(width)        // %2
      : "r"(rgbconstants),  // %3
        "r"(shuff)          // %4
-      : "memory"
-    );
+      : "memory");
 }

 void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
--- a/media/libyuv/libyuv/source/row_neon64.cc
+++ b/media/libyuv/libyuv/source/row_neon64.cc
@ -820,6 +820,28 @@ void MergeUVRow_NEON(const uint8_t* src_u,
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
 }
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON1(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.16b,v2.16b}, [%0], #32    \n"  // load U
+      "ld1         {v1.16b,v3.16b}, [%1], #32    \n"  // load V
+      "subs        %w3, %w3, #32                 \n"  // 32 processed per loop
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "st2         {v0.16b,v1.16b,v2.16b,v3.16b}, [%2], #64 \n"  // store 32 UV
+      "b.gt        1b                            \n"
+      : "+r"(src_u),                // %0
+        "+r"(src_v),                // %1
+        "+r"(dst_uv),               // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}

 void MergeUVRow_16_NEON(const uint16_t* src_u,
                        const uint16_t* src_v,
--- a/media/libyuv/libyuv/source/scale.cc
+++ b/media/libyuv/libyuv/source/scale.cc
@ -1065,10 +1065,10 @@ void ScalePlaneBilinearDown(int src_width,

  const int max_y = (src_height - 1) << 16;
  int j;
-  void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+  void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr,
                          int dst_width, int x, int dx) =
      (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
-  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+  void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) = InterpolateRow_C;
  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@ -1188,10 +1188,10 @@ void ScalePlaneBilinearDown_16(int src_width,

  const int max_y = (src_height - 1) << 16;
  int j;
-  void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+  void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr,
                          int dst_width, int x, int dx) =
      (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
-  void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+  void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) = InterpolateRow_16_C;
  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@ -1276,10 +1276,10 @@ void ScalePlaneBilinearUp(int src_width,
  int dx = 0;
  int dy = 0;
  const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+  void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) = InterpolateRow_C;
-  void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+  void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr,
                          int dst_width, int x, int dx) =
      filtering ? ScaleFilterCols_C : ScaleCols_C;
  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@ -1744,10 +1744,10 @@ void ScalePlaneBilinearUp_16(int src_width,
  int dx = 0;
  int dy = 0;
  const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+  void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) = InterpolateRow_16_C;
-  void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+  void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr,
                          int dst_width, int x, int dx) =
      filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@ -1872,7 +1872,7 @@ static void ScalePlaneSimple(int src_width,
                             const uint8_t* src_ptr,
                             uint8_t* dst_ptr) {
  int i;
-  void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width,
+  void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width,
                    int x, int dx) = ScaleCols_C;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
  int x = 0;
@ -1909,7 +1909,7 @@ static void ScalePlaneSimple_16(int src_width,
                                const uint16_t* src_ptr,
                                uint16_t* dst_ptr) {
  int i;
-  void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width,
+  void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width,
                    int x, int dx) = ScaleCols_16_C;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
  int x = 0;
--- a/media/libyuv/libyuv/source/scale_common.cc
+++ b/media/libyuv/libyuv/source/scale_common.cc
@ -1633,7 +1633,7 @@ void ScalePlaneVertical(int src_height,
                        enum FilterMode filtering) {
  // TODO(fbarchard): Allow higher bpp.
  int dst_width_bytes = dst_width * bpp;
-  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) = InterpolateRow_C;
  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
@ -1712,7 +1712,7 @@ void ScalePlaneVertical_16(int src_height,
                           enum FilterMode filtering) {
  // TODO(fbarchard): Allow higher wpp.
  int dst_width_words = dst_width * wpp;
-  void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb,
+  void (*InterpolateRow)(uint16_t* dst_argb, const uint16_t* src_argb,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) = InterpolateRow_16_C;
  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
@ -1791,7 +1791,7 @@ void ScalePlaneVertical_16To8(int src_height,
  // TODO(fbarchard): Allow higher wpp.
  int dst_width_words = dst_width * wpp;
  // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions.
-  void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb,
+  void (*InterpolateRow_16To8)(uint8_t* dst_argb, const uint16_t* src_argb,
                               ptrdiff_t src_stride, int scale, int dst_width,
                               int source_y_fraction) = InterpolateRow_16To8_C;
  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
--- a/media/libyuv/libyuv/unit_test/rotate_test.cc
+++ b/media/libyuv/libyuv/unit_test/rotate_test.cc
@ -14,6 +14,10 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/rotate.h"

+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/rotate_row.h"
+#endif
+
 namespace libyuv {

 #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
@ -858,4 +862,47 @@ TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
                 disable_cpu_flags_, benchmark_cpu_info_);
 }

+#if defined(ENABLE_ROW_TESTS)
+
+TEST_F(LibYUVRotateTest, Transpose4x4) {
+  // dst width and height
+  const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
+  const int height = 4;
+  align_buffer_page_end(src_pixels, height * width * 4);
+  align_buffer_page_end(dst_pixels_c, width * height * 4);
+  align_buffer_page_end(dst_pixels_opt, width * height * 4);
+
+  MemRandomize(src_pixels, height * width * 4);
+  memset(dst_pixels_c, 1, width * height * 4);
+  memset(dst_pixels_opt, 1, width * height * 4);
+
+  Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                    (uint8_t*)dst_pixels_c, width * 4, width);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+#if defined(__aarch64__)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else {
+      Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                        (uint8_t*)dst_pixels_opt, width * 4, width);
+    }
+#else
+    Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                      (uint8_t*)dst_pixels_opt, width * 4, width);
+#endif
+  }
+
+  //  for (int i = 0; i < width * height; ++i) {
+  //    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  //  }
+
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_c);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+}
+
+#endif  // ENABLE_ROW_TESTS
+
 }  // namespace libyuv
--- a/media/libyuv/moz.yaml
+++ b/media/libyuv/moz.yaml
@ -23,7 +23,7 @@ origin:

  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
-  revision: b2528b0be934de1918e20c85fc170d809eeb49ab
+  revision: 2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2

  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/