Bug 1816484 - Update dav1d to 7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb r=media-playback-reviewers,padenot

Differential Revision: https://phabricator.services.mozilla.com/D172702
2023-03-16 17:07:14 +00:00 · 2023-03-16 17:07:14 +00:00 · 8397595ed4
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@ -20,11 +20,11 @@ origin:

  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 188dfc89f21ed00e084e4a519a581df5f2ceb35d (2023-01-26T17:32:59.000+00:00).
+  release: 7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb (2023-03-13T15:19:35.000+00:00).

  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
-  revision: 188dfc89f21ed00e084e4a519a581df5f2ceb35d
+  revision: 7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb

  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "188dfc89f21ed00e084e4a519a581df5f2ceb35d"
+#define DAV1D_VERSION "7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb"
--- a/third_party/dav1d/NEWS
+++ b/third_party/dav1d/NEWS
@ -1,4 +1,20 @@
-Changes for 1.0.0 'Peregrine falcon':
+Changes for 1.1.0 'Arctic Peregrine Falcon':
+-------------------------------------------
+
+1.1.0 is an important release of dav1d, fixing numerous bugs, and adding SIMD optimizations.
+
+- New function dav1d_get_frame_delay to query the decoder frame delay
+- Numerous fixes for strict conformity to the specs and samples
+- NEON and AVX-512 misc fixes and improvements
+- Partial AVX2 12bpc transform implementations
+- AVX-512 high bit-depth cdef_filter, loopfilter, itx
+- NEON z1/z3 optimization for 8bpc
+- SSSE3 z1 optimization for 8bpc
+
+ "From VideoLAN with love"
+
+
+Changes for 1.0.0 'Peregrine Falcon':
 -------------------------------------

 1.0.0 is a major release of dav1d, adding important features and bug fixes.
@ -66,7 +82,7 @@ Details:
 - New API to signal events happening during the decoding process


-Changes for 0.8.2 'Eurasian hobby':
+Changes for 0.8.2 'Eurasian Hobby':
 -----------------------------------

 0.8.2 is a middle-size update of the 0.8.0 branch:
@ -87,7 +103,7 @@ Changes for 0.8.2 'Eurasian hobby':
 - Add a xxh3 muxer in the dav1d application


-Changes for 0.8.1 'Eurasian hobby':
+Changes for 0.8.1 'Eurasian Hobby':
 -----------------------------------

 0.8.1 is a minor update on 0.8.0:
@ -99,7 +115,7 @@ Changes for 0.8.1 'Eurasian hobby':
 - x86 optimizations for wiener in SSE2/SSSE3/AVX2


-Changes for 0.8.0 'Eurasian hobby':
+Changes for 0.8.0 'Eurasian Hobby':
 -----------------------------------

 0.8.0 is a major update for dav1d:
--- a/third_party/dav1d/README.md
+++ b/third_party/dav1d/README.md
@ -82,7 +82,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr

 1. Install [Meson](https://mesonbuild.com/) (0.49 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.14 or higher)
 2. Run `mkdir build && cd build` to create a build directory and enter it
-3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired
+3. Run `meson setup ..` to configure meson, add `--default-library=static` if static linking is desired
 4. Run `ninja` to compile

 ## Cross-Compilation for 32- or 64-bit Windows, 32-bit Linux
@ -90,13 +90,13 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
 If you're on a linux build machine trying to compile .exe for a Windows target/host machine, run

 ```
-meson build --cross-file=package/crossfiles/x86_64-w64-mingw32.meson
+meson setup build --cross-file=package/crossfiles/x86_64-w64-mingw32.meson
 ```

 or, for 32-bit:

 ```
-meson build --cross-file=package/crossfiles/i686-w64-mingw32.meson
+meson setup build --cross-file=package/crossfiles/i686-w64-mingw32.meson
 ```

 `mingw-w64` is a pre-requisite and should be installed on your linux machine via your preferred method or package manager. Note the binary name formats may differ between distributions. Verify the names, and use `alias` if certain binaries cannot be found.
@ -104,13 +104,13 @@ meson build --cross-file=package/crossfiles/i686-w64-mingw32.meson
 For 32-bit linux, run

 ```
-meson build --cross-file=package/crossfiles/i686-linux32.meson
+meson setup build --cross-file=package/crossfiles/i686-linux32.meson
 ```

 ## Build documentation

 1. Install [doxygen](https://www.doxygen.nl/) and [graphviz](https://www.graphviz.org/)
-2. Run `meson build -Denable_docs=true` to create the build directory
+2. Run `meson setup build -Denable_docs=true` to create the build directory
 3. Run `ninja -C build doc/html` to build the docs

 The result can be found in `build/doc/html/`. An online version built from master can be found [here](https://videolan.videolan.me/dav1d/).
--- a/third_party/dav1d/THANKS.md
+++ b/third_party/dav1d/THANKS.md
@ -1,7 +1,7 @@
 # The dav1d project and VideoLAN association would like to thank

 ## AOM
-The Alliance for Open Media (AOM) for funding this project.
+The Alliance for Open Media (AOM) for partially funding this project.

 ## Companies
 * Two Orioles LLC, for important coding effort
@ -17,17 +17,18 @@ The Alliance for Open Media (AOM) for funding this project.
 And all the dav1d Authors (git shortlog -sn), including:

 Martin Storsjö, Henrik Gramner, Ronald S. Bultje, Janne Grunau, James Almer,
-Victorien Le Couviour--Tuffet, Matthias Dressel, Marvin Scholz, Luc Trudeau,
-Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Niklas Haas, Konstantin Pavlov,
-David Michael Barr, Steve Lhomme, Nathan E. Egge, Kyle Siefring, Raphaël Zumer,
-B Krishnan Iyer, Francois Cartegnie, Liwei Wang, Derek Buitenhuis,
-Michael Bradshaw, Wan-Teh Chang, Xuefeng Jiang, Luca Barbato, Jan Beich,
-Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis, Emmanuel Gil Peyrot,
-Rupert Swarbrick, Thierry Foucu, Thomas Daede, Colin Lee, Jonathan Wright,
-Lynne, Michail Alvanos, Nico Weber, Salome Thirot, SmilingWolf, Tristan Laurent,
-Vittorio Giovara, Yannis Guyon, André Kempe, Anisse Astier, Anton Mitrofanov,
-Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, Joe Drago,
-Mark Shuttleworth, Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli,
-Pablo Stebler, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvain BERTRAND,
+Victorien Le Couviour--Tuffet, Matthias Dressel, Marvin Scholz,
+Jean-Baptiste Kempf, Luc Trudeau, Hugo Beauzée-Luyssen, Konstantin Pavlov,
+Niklas Haas, David Michael Barr, Steve Lhomme, Nathan E. Egge, Wan-Teh Chang,
+Kyle Siefring, B Krishnan Iyer, Francois Cartegnie, Liwei Wang, Luca Barbato,
+David Conrad, Derek Buitenhuis, Jan Beich, Michael Bradshaw, Raphaël Zumer,
+Xuefeng Jiang, Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis,
+Emmanuel Gil Peyrot, Raphael Zumer, Rupert Swarbrick, Thierry Foucu,
+Thomas Daede, Colin Lee, Jonathan Wright, Lynne, Michail Alvanos, Nico Weber,
+Salome Thirot, SmilingWolf, Tristan Laurent, Vittorio Giovara, Yannis Guyon,
+André Kempe, Anisse Astier, Anton Mitrofanov, Charlie Hayden, Dmitriy Sychov,
+Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, Joe Drago, Mark Shuttleworth,
+Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli, Pablo Stebler, Rostislav
+Pehlivanov, Sebastian Dröge, Shiz, Steinar Midtskogen, Sylvain BERTRAND,
 Sylvestre Ledru, Timo Gurr, Tristan Matthews, Vibhoothi, Xavier Claessens,
-Xu Guangxin, kossh1 and skal
+Xu Guangxin, kossh1 and skal.
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@ -23,7 +23,7 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 project('dav1d', ['c'],
-    version: '1.0.0',
+    version: '1.1.0',
    default_options: ['c_std=c99',
                      'warning_level=2',
                      'buildtype=release',
@ -89,7 +89,7 @@ test_args = []
 optional_arguments = []
 optional_link_arguments = []

-if host_machine.system() in ['linux', 'gnu']
+if host_machine.system() in ['linux', 'gnu', 'emscripten']
    test_args += '-D_GNU_SOURCE'
    add_project_arguments('-D_GNU_SOURCE', language: 'c')
 endif
@ -134,7 +134,7 @@ if host_machine.system() == 'windows'
    rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
    rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
    rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
-    rc_data.set('COPYRIGHT_YEARS', '2022')
+    rc_data.set('COPYRIGHT_YEARS', '2018-2023')
 else
    thread_dependency = dependency('threads')
    thread_compat_dep = []
@ -254,6 +254,9 @@ pthread_np_prefix = '''
 if cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
    cdata.set('HAVE_PTHREAD_GETAFFINITY_NP', 1)
 endif
+if cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
+    cdata.set('HAVE_PTHREAD_SETAFFINITY_NP', 1)
+endif

 if cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)
    cdata.set('HAVE_C11_GENERIC', 1)
--- a/third_party/dav1d/src/arm/64/ipred.S
+++ b/third_party/dav1d/src/arm/64/ipred.S
--- a/third_party/dav1d/src/arm/ipred.h
+++ b/third_party/dav1d/src/arm/ipred.h
@ -50,6 +50,138 @@ decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));

 decl_pal_pred_fn(BF(dav1d_pal_pred, neon));

+#if ARCH_AARCH64 && BITDEPTH == 8
+void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz,
+                                            const pixel *const in,
+                                            const int end);
+void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz,
+                                          const pixel *const in,
+                                          const int end, const int strength);
+void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride,
+                                    const pixel *const top, const int width,
+                                    const int height, const int dx,
+                                    const int max_base_x);
+void BF(dav1d_ipred_z1_fill2, neon)(pixel *dst, ptrdiff_t stride,
+                                    const pixel *const top, const int width,
+                                    const int height, const int dx,
+                                    const int max_base_x);
+
+static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride,
+                          const pixel *const topleft_in,
+                          const int width, const int height, int angle,
+                          const int max_width, const int max_height
+                          HIGHBD_DECL_SUFFIX)
+{
+    const int is_sm = (angle >> 9) & 0x1; // bit 9 of the packed angle argument
+    const int enable_intra_edge_filter = angle >> 10; // bits 10 and above of the packed angle argument
+    angle &= 511; // keep only the low 9 bits; they index the derivative table below
+    int dx = dav1d_dr_intra_derivative[angle >> 1];
+    pixel top_out[64 + 64 + (64+15)*2]; // prepared top edge: written by one of the three branches below, then padded
+    int max_base_x;
+    const int upsample_above = enable_intra_edge_filter ?
+        get_upsample(width + height, 90 - angle, is_sm) : 0;
+    if (upsample_above) {
+        BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, width + height,
+                                               topleft_in,
+                                               width + imin(width, height));
+        max_base_x = 2 * (width + height) - 2;
+        dx <<= 1; // the step is doubled together with max_base_x in the upsampled case
+    } else {
+        const int filter_strength = enable_intra_edge_filter ?
+            get_filter_strength(width + height, 90 - angle, is_sm) : 0;
+        if (filter_strength) {
+            BF(dav1d_ipred_z1_filter_edge, neon)(top_out, width + height,
+                                                 topleft_in,
+                                                 width + imin(width, height),
+                                                 filter_strength);
+            max_base_x = width + height - 1;
+        } else {
+            max_base_x = width + imin(width, height) - 1;
+            memcpy(top_out, &topleft_in[1], (max_base_x + 1) * sizeof(pixel)); // no upsampling or filtering: plain copy starting at topleft_in[1]
+        }
+    }
+    const int base_inc = 1 + upsample_above;
+    int pad_pixels = width + 15; // max(dx >> 6) == 15
+    pixel_set(&top_out[max_base_x + 1], top_out[max_base_x], pad_pixels * base_inc); // pads past max_base_x with top_out[max_base_x] (assumes pixel_set is a fill - TODO confirm)
+    if (upsample_above) // fill2 handles the upsampled edge, fill1 the non-upsampled one
+        BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height,
+                                       dx, max_base_x);
+    else
+        BF(dav1d_ipred_z1_fill1, neon)(dst, stride, top_out, width, height,
+                                       dx, max_base_x);
+}
+
+void BF(dav1d_ipred_z3_fill1, neon)(pixel *dst, ptrdiff_t stride,
+                                    const pixel *const left, const int width,
+                                    const int height, const int dy,
+                                    const int max_base_y);
+void BF(dav1d_ipred_z3_fill2, neon)(pixel *dst, ptrdiff_t stride,
+                                    const pixel *const left, const int width,
+                                    const int height, const int dy,
+                                    const int max_base_y);
+
+void BF(dav1d_ipred_reverse, neon)(pixel *dst, const pixel *const src,
+                                   const int n);
+
+static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride,
+                          const pixel *const topleft_in,
+                          const int width, const int height, int angle,
+                          const int max_width, const int max_height
+                          HIGHBD_DECL_SUFFIX)
+{
+    const int is_sm = (angle >> 9) & 0x1; // bit 9 of the packed angle argument
+    const int enable_intra_edge_filter = angle >> 10; // bits 10 and above of the packed angle argument
+    angle &= 511; // keep only the low 9 bits
+    assert(angle > 180);
+    int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
+    pixel flipped[64 + 64 + 16]; // staging buffer handed to the z1 edge helpers as their input
+    pixel left_out[64 + 64 + (64+15)*2]; // prepared left edge: written by one of the three branches below, then padded
+    int max_base_y;
+    const int upsample_left = enable_intra_edge_filter ?
+        get_upsample(width + height, angle - 180, is_sm) : 0;
+    if (upsample_left) {
+            flipped[0] = topleft_in[0];
+        BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], // presumably stores the left edge in reversed order - confirm against ipred.S
+                                      height + imax(width, height));
+        BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, width + height,
+                                               flipped,
+                                               height + imin(width, height));
+        max_base_y = 2 * (width + height) - 2;
+        dy <<= 1; // the step is doubled together with max_base_y in the upsampled case
+    } else {
+        const int filter_strength = enable_intra_edge_filter ?
+            get_filter_strength(width + height, angle - 180, is_sm) : 0;
+
+        if (filter_strength) {
+            flipped[0] = topleft_in[0];
+            BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+                                          height + imax(width, height));
+            BF(dav1d_ipred_z1_filter_edge, neon)(left_out, width + height,
+                                                 flipped,
+                                                 height + imin(width, height),
+                                                 filter_strength);
+            max_base_y = width + height - 1;
+        } else {
+            BF(dav1d_ipred_reverse, neon)(left_out, &topleft_in[0], // no upsampling or filtering: written straight into left_out, flipped[] is unused
+                                          height + imin(width, height));
+            max_base_y = height + imin(width, height) - 1;
+        }
+    }
+    const int base_inc = 1 + upsample_left;
+    // The tbx based implementation needs left[] to have 64 bytes initialized,
+    // the other implementation can read height + max(dy >> 6) past the end.
+    int pad_pixels = imax(64 - max_base_y - 1, height + 15);
+
+    pixel_set(&left_out[max_base_y + 1], left_out[max_base_y], pad_pixels * base_inc); // pads past max_base_y with left_out[max_base_y] (assumes pixel_set is a fill - TODO confirm)
+    if (upsample_left) // fill2 handles the upsampled edge, fill1 the non-upsampled one
+        BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height,
+                                       dy, max_base_y);
+    else
+        BF(dav1d_ipred_z3_fill1, neon)(dst, stride, left_out, width, height,
+                                       dy, max_base_y);
+}
+#endif
+
 static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

@ -65,6 +197,10 @@ static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *cons
    c->intra_pred[SMOOTH_PRED]   = BF(dav1d_ipred_smooth, neon);
    c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
    c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
+#if ARCH_AARCH64 && BITDEPTH == 8
+    c->intra_pred[Z1_PRED]       = ipred_z1_neon;
+    c->intra_pred[Z3_PRED]       = ipred_z3_neon;
+#endif
    c->intra_pred[FILTER_PRED]   = BF(dav1d_ipred_filter, neon);

    c->cfl_pred[DC_PRED]         = BF(dav1d_ipred_cfl, neon);
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@ -2829,9 +2829,9 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) {
    if (ts->msac.cnt < -15) return 1;

    if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) {
-        dav1d_refmvs_load_tmvs(&f->rf, ts->tiling.row,
-                               ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
-                               t->by >> 1, (t->by + sb_step) >> 1);
+        f->c->refmvs_dsp.load_tmvs(&f->rf, ts->tiling.row,
+                                   ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
+                                   t->by >> 1, (t->by + sb_step) >> 1);
    }
    memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv));
    const int sb128y = t->by >> 5;
@ -2914,7 +2914,7 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) {
    }

    if (f->seq_hdr->ref_frame_mvs && f->c->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
-        dav1d_refmvs_save_tmvs(&t->rt,
+        dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt,
                               ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
                               t->by >> 1, (t->by + sb_step) >> 1);
    }
@ -3394,15 +3394,16 @@ int dav1d_decode_frame_main(Dav1dFrameContext *const f) {
            t->by = sby << (4 + f->seq_hdr->sb128);
            const int by_end = (t->by + f->sb_step) >> 1;
            if (f->frame_hdr->use_ref_frame_mvs) {
-                dav1d_refmvs_load_tmvs(&f->rf, tile_row,
-                                       0, f->bw >> 1, t->by >> 1, by_end);
+                f->c->refmvs_dsp.load_tmvs(&f->rf, tile_row,
+                                           0, f->bw >> 1, t->by >> 1, by_end);
            }
            for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
                t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
                if (dav1d_decode_tile_sbrow(t)) goto error;
            }
            if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
-                dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end);
+                dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt,
+                                       0, f->bw >> 1, t->by >> 1, by_end);
            }

            // loopfilter + cdef + restoration
--- a/third_party/dav1d/src/fg_apply_tmpl.c
+++ b/third_party/dav1d/src/fg_apply_tmpl.c
@ -37,6 +37,7 @@
 #include "common/bitdepth.h"

 #include "src/fg_apply.h"
+#include "src/ref.h"

 static void generate_scaling(const int bitdepth,
                             const uint8_t points[][2], const int num,
@ -125,36 +126,32 @@ void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp,
    if (data->num_uv_points[1])
        generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);

-    // Copy over the non-modified planes
-    // TODO: eliminate in favor of per-plane refs
+    // Create new references for the non-modified planes
    assert(out->stride[0] == in->stride[0]);
    if (!data->num_y_points) {
-        const ptrdiff_t stride = out->stride[0];
-        const ptrdiff_t sz = out->p.h * stride;
-        if (sz < 0)
-            memcpy((uint8_t*) out->data[0] + sz - stride,
-                   (uint8_t*) in->data[0] + sz - stride, -sz);
-        else
-            memcpy(out->data[0], in->data[0], sz);
+        struct Dav1dRef **out_plane_ref = out->ref->user_data;
+        struct Dav1dRef **in_plane_ref = in->ref->user_data;
+        dav1d_ref_dec(&out_plane_ref[0]);
+        out_plane_ref[0] = in_plane_ref[0];
+        dav1d_ref_inc(out_plane_ref[0]);
+        out->data[0] = in->data[0];
    }

    if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) {
        assert(out->stride[1] == in->stride[1]);
-        const int ss_ver = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
-        const ptrdiff_t stride = out->stride[1];
-        const ptrdiff_t sz = ((out->p.h + ss_ver) >> ss_ver) * stride;
-        if (sz < 0) {
-            if (!data->num_uv_points[0])
-                memcpy((uint8_t*) out->data[1] + sz - stride,
-                       (uint8_t*) in->data[1] + sz - stride, -sz);
-            if (!data->num_uv_points[1])
-                memcpy((uint8_t*) out->data[2] + sz - stride,
-                       (uint8_t*) in->data[2] + sz - stride, -sz);
-        } else {
-            if (!data->num_uv_points[0])
-                memcpy(out->data[1], in->data[1], sz);
-            if (!data->num_uv_points[1])
-                memcpy(out->data[2], in->data[2], sz);
+        struct Dav1dRef **out_plane_ref = out->ref->user_data;
+        struct Dav1dRef **in_plane_ref = in->ref->user_data;
+        if (!data->num_uv_points[0]) {
+            dav1d_ref_dec(&out_plane_ref[1]);
+            out_plane_ref[1] = in_plane_ref[1];
+            dav1d_ref_inc(out_plane_ref[1]);
+            out->data[1] = in->data[1];
+        }
+        if (!data->num_uv_points[1]) {
+            dav1d_ref_dec(&out_plane_ref[2]);
+            out_plane_ref[2] = in_plane_ref[2];
+            dav1d_ref_inc(out_plane_ref[2]);
+            out->data[2] = in->data[2];
        }
    }
 }
--- a/third_party/dav1d/src/lib.c
+++ b/third_party/dav1d/src/lib.c
@ -397,6 +397,7 @@ static int output_picture_ready(Dav1dContext *const c, const int drain) {

 static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
    unsigned drain_count = 0;
+    int drained = 0;
    do {
        const unsigned next = c->frame_thread.next;
        Dav1dFrameContext *const f = &c->fc[next];
@ -406,17 +407,23 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
                              &f->task_thread.ttd->lock);
        Dav1dThreadPicture *const out_delayed =
            &c->frame_thread.out_delayed[next];
+        if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
+            unsigned first = atomic_load(&c->task_thread.first);
+            if (first + 1U < c->n_fc)
+                atomic_fetch_add(&c->task_thread.first, 1U);
+            else
+                atomic_store(&c->task_thread.first, 0);
+            atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
+                                           &first, UINT_MAX);
+            if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
+                c->task_thread.cur--;
+            drained = 1;
+        } else if (drained) {
+            pthread_mutex_unlock(&c->task_thread.lock);
+            break;
+        }
        if (++c->frame_thread.next == c->n_fc)
            c->frame_thread.next = 0;
-        unsigned first = atomic_load(&c->task_thread.first);
-        if (first + 1U < c->n_fc)
-            atomic_fetch_add(&c->task_thread.first, 1U);
-        else
-            atomic_store(&c->task_thread.first, 0);
-        atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
-                                       &first, UINT_MAX);
-        if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
-            c->task_thread.cur--;
        pthread_mutex_unlock(&c->task_thread.lock);
        const int error = f->task_thread.retval;
        if (error) {
--- a/third_party/dav1d/src/picture.c
+++ b/third_party/dav1d/src/picture.c
@ -87,19 +87,33 @@ void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) {
 }

 struct pic_ctx_context {
+    struct Dav1dRef *plane_ref[3]; /* MUST BE FIRST */
+    enum Dav1dPixelLayout layout;
+    void *extra_ptr; /* MUST BE AT THE END */
+};
+
+struct plane_ctx_context {
    Dav1dPicAllocator allocator;
    Dav1dPicture pic;
-    void *extra_ptr; /* MUST BE AT THE END */
 };

 static void free_buffer(const uint8_t *const data, void *const user_data) {
    struct pic_ctx_context *pic_ctx = user_data;
+    const int planes = pic_ctx->layout != DAV1D_PIXEL_LAYOUT_I400 ? 3 : 1;

-    pic_ctx->allocator.release_picture_callback(&pic_ctx->pic,
-                                                pic_ctx->allocator.cookie);
+    for (int i = 0; i < planes; i++)
+        dav1d_ref_dec(&pic_ctx->plane_ref[i]);
    free(pic_ctx);
 }

+static void free_plane_buffer(const uint8_t *const data, void *const user_data) { // passed to dav1d_ref_wrap() for plane_ref[0]; data is not referenced here
+    struct plane_ctx_context *plane_ctx = user_data; // user_data is the plane_ctx_context given to dav1d_ref_wrap()
+
+    plane_ctx->allocator.release_picture_callback(&plane_ctx->pic, // hand the saved picture back through the saved allocator's release callback
+                                                  plane_ctx->allocator.cookie);
+    free(plane_ctx); // the context itself is freed only after the callback has run
+}
+
 static int picture_alloc_with_edges(Dav1dContext *const c,
                                    Dav1dPicture *const p,
                                    const int w, const int h,
@ -122,6 +136,7 @@ static int picture_alloc_with_edges(Dav1dContext *const c,
    struct pic_ctx_context *pic_ctx = malloc(extra + sizeof(struct pic_ctx_context));
    if (pic_ctx == NULL)
        return DAV1D_ERR(ENOMEM);
+    memset(pic_ctx, 0, sizeof(struct pic_ctx_context));

    p->p.w = w;
    p->p.h = h;
@ -139,8 +154,7 @@ static int picture_alloc_with_edges(Dav1dContext *const c,
        return res;
    }

-    pic_ctx->allocator = *p_allocator;
-    pic_ctx->pic = *p;
+    pic_ctx->layout = p->p.layout;

    if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) {
        p_allocator->release_picture_callback(p, p_allocator->cookie);
@ -149,6 +163,31 @@ static int picture_alloc_with_edges(Dav1dContext *const c,
        return DAV1D_ERR(ENOMEM);
    }

+    struct plane_ctx_context *plane_ctx = malloc(sizeof(struct plane_ctx_context));
+    if (plane_ctx == NULL){
+        dav1d_ref_dec(&p->ref);
+        p_allocator->release_picture_callback(p, p_allocator->cookie);
+        return DAV1D_ERR(ENOMEM);
+    }
+
+    plane_ctx->allocator = *p_allocator;
+    plane_ctx->pic = *p;
+
+    pic_ctx->plane_ref[0] = dav1d_ref_wrap(p->data[0], free_plane_buffer, plane_ctx);
+    if (!pic_ctx->plane_ref[0]) {
+        dav1d_ref_dec(&p->ref);
+        p_allocator->release_picture_callback(p, p_allocator->cookie);
+        free(plane_ctx);
+        dav1d_log(c, "Failed to wrap picture plane: %s\n", strerror(errno));
+        return DAV1D_ERR(ENOMEM);
+    }
+
+    const int planes = p->p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 3 : 1;
+    for (int i = 1; i < planes; i++) {
+        pic_ctx->plane_ref[i] = pic_ctx->plane_ref[0];
+        dav1d_ref_inc(pic_ctx->plane_ref[i]);
+    }
+
    p->seq_hdr_ref = seq_hdr_ref;
    if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref);

@ -214,13 +253,14 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con
                             const Dav1dPicture *const src)
 {
    struct pic_ctx_context *const pic_ctx = src->ref->user_data;
+    struct plane_ctx_context *const plane_ctx = pic_ctx->plane_ref[0]->user_data;
    const int res = picture_alloc_with_edges(c, dst, w, src->p.h,
                                             src->seq_hdr, src->seq_hdr_ref,
                                             src->frame_hdr, src->frame_hdr_ref,
                                             src->content_light, src->content_light_ref,
                                             src->mastering_display, src->mastering_display_ref,
                                             src->itut_t35, src->itut_t35_ref,
-                                             src->p.bpc, &src->m, &pic_ctx->allocator,
+                                             src->p.bpc, &src->m, &plane_ctx->allocator,
                                             0, NULL);
    return res;
 }
--- a/third_party/dav1d/src/refmvs.c
+++ b/third_party/dav1d/src/refmvs.c
@ -687,9 +687,9 @@ void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *con
    rt->tile_col.end = imin(tile_col_end4, rf->iw4);
 }

-void dav1d_refmvs_load_tmvs(const refmvs_frame *const rf, int tile_row_idx,
-                            const int col_start8, const int col_end8,
-                            const int row_start8, int row_end8)
+static void load_tmvs_c(const refmvs_frame *const rf, int tile_row_idx,
+                        const int col_start8, const int col_end8,
+                        const int row_start8, int row_end8)
 {
    if (rf->n_tile_threads == 1) tile_row_idx = 0;
    assert(row_start8 >= 0);
@ -760,22 +760,14 @@ void dav1d_refmvs_load_tmvs(const refmvs_frame *const rf, int tile_row_idx,
    }
 }

-void dav1d_refmvs_save_tmvs(const refmvs_tile *const rt,
-                            const int col_start8, int col_end8,
-                            const int row_start8, int row_end8)
+static void save_tmvs_c(refmvs_temporal_block *rp, const ptrdiff_t stride,
+                        refmvs_block *const *const rr,
+                        const uint8_t *const ref_sign,
+                        const int col_end8, const int row_end8,
+                        const int col_start8, const int row_start8)
 {
-    const refmvs_frame *const rf = rt->rf;
-
-    assert(row_start8 >= 0);
-    assert((unsigned) (row_end8 - row_start8) <= 16U);
-    row_end8 = imin(row_end8, rf->ih8);
-    col_end8 = imin(col_end8, rf->iw8);
-
-    const ptrdiff_t stride = rf->rp_stride;
-    const uint8_t *const ref_sign = rf->mfmv_sign;
-    refmvs_temporal_block *rp = &rf->rp[row_start8 * stride];
    for (int y = row_start8; y < row_end8; y++) {
-        const refmvs_block *const b = rt->r[6 + (y & 15) * 2];
+        const refmvs_block *const b = rr[(y & 15) * 2];

        for (int x = col_start8; x < col_end8;) {
            const refmvs_block *const cand_b = &b[x * 2 + 1];
@ -794,8 +786,10 @@ void dav1d_refmvs_save_tmvs(const refmvs_tile *const rt,
                    rp[x] = (refmvs_temporal_block) { .mv = cand_b->mv.mv[0],
                                                      .ref = cand_b->ref.ref[0] };
            } else {
-                for (int n = 0; n < bw8; n++, x++)
+                for (int n = 0; n < bw8; n++, x++) {
+                    rp[x].mv.n = 0;
                    rp[x].ref = 0; // "invalid"
+                }
            }
        }
        rp += stride;
@ -932,6 +926,8 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,

 COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
 {
+    c->load_tmvs = load_tmvs_c;
+    c->save_tmvs = save_tmvs_c;
    c->splat_mv = splat_mv_c;

 #if HAVE_ASM
--- a/third_party/dav1d/src/refmvs.h
+++ b/third_party/dav1d/src/refmvs.h
@ -39,10 +39,10 @@

 #define INVALID_MV 0x80008000

-typedef struct refmvs_temporal_block {
+PACKED(typedef struct refmvs_temporal_block {
    mv mv;
    int8_t ref;
-} refmvs_temporal_block;
+}) refmvs_temporal_block;

 typedef union refmvs_refpair {
    int8_t ref[2]; // [0] = 0: intra=1, [1] = -1: comp=0
@ -96,11 +96,28 @@ typedef struct refmvs_candidate {
    int weight;
 } refmvs_candidate;

+// initialize temporal MVs; this can be done in any configuration, e.g. one
+// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or
+// it can just be for the whole frame's sbrow, where col_{start,end}8 are the
+// frame boundaries. row_{start,end}8 are the superblock row boundaries.
+#define decl_load_tmvs_fn(name) \
+void (name)(const refmvs_frame *rf, int tile_row_idx, \
+            int col_start8, int col_end8, int row_start8, int row_end8)
+typedef decl_load_tmvs_fn(*load_tmvs_fn);
+
+#define decl_save_tmvs_fn(name) \
+void (name)(refmvs_temporal_block *rp, const ptrdiff_t stride, \
+            refmvs_block *const *const rr, const uint8_t *const ref_sign, \
+            int col_end8, int row_end8, int col_start8, int row_start8)
+typedef decl_save_tmvs_fn(*save_tmvs_fn);
+
 #define decl_splat_mv_fn(name) \
 void (name)(refmvs_block **rr, const refmvs_block *rmv, int bx4, int bw4, int bh4)
 typedef decl_splat_mv_fn(*splat_mv_fn);

 typedef struct Dav1dRefmvsDSPContext {
+    load_tmvs_fn load_tmvs;
+    save_tmvs_fn save_tmvs;
    splat_mv_fn splat_mv;
 } Dav1dRefmvsDSPContext;

@ -118,19 +135,27 @@ int dav1d_refmvs_init_frame(refmvs_frame *rf,
                            /*const*/ refmvs_temporal_block *const rp_ref[7],
                            int n_tile_threads, int n_frame_threads);

-// initialize temporal MVs; this can be done in any configuration, e.g. one
-// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or
-// it can just be for the whole frame's sbrow, where col_{start,end}8 are the
-// frame boundaries. row_{start,end}8 are the superblock row boundaries.
-void dav1d_refmvs_load_tmvs(const refmvs_frame *rf, int tile_row_idx,
-                            int col_start8, int col_end8,
-                            int row_start8, int row_end8);
-
 // cache the current tile/sbrow (or frame/sbrow)'s projectable motion vectors
 // into buffers for use in future frame's temporal MV prediction
-void dav1d_refmvs_save_tmvs(const refmvs_tile *rt,
-                            int col_start8, int col_end8,
-                            int row_start8, int row_end8);
+static inline void dav1d_refmvs_save_tmvs(const Dav1dRefmvsDSPContext *const dsp,
+                                          const refmvs_tile *const rt,
+                                          const int col_start8, int col_end8,
+                                          const int row_start8, int row_end8)
+{
+    const refmvs_frame *const rf = rt->rf;
+
+    assert(row_start8 >= 0);
+    assert((unsigned) (row_end8 - row_start8) <= 16U);
+    row_end8 = imin(row_end8, rf->ih8);
+    col_end8 = imin(col_end8, rf->iw8);
+
+    const ptrdiff_t stride = rf->rp_stride;
+    const uint8_t *const ref_sign = rf->mfmv_sign;
+    refmvs_temporal_block *rp = &rf->rp[row_start8 * stride];
+
+    dsp->save_tmvs(rp, stride, rt->r + 6, ref_sign,
+                   col_end8, row_end8, col_start8, row_start8);
+}

 // initialize tile boundaries and refmvs_block pointers for one tile/sbrow
 void dav1d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf,
--- a/third_party/dav1d/src/thread_task.c
+++ b/third_party/dav1d/src/thread_task.c
@ -795,6 +795,7 @@ void *dav1d_worker_task(void *data) {
                    atomic_load(&f->task_thread.done[0]) &&
                    (!uses_2pass || atomic_load(&f->task_thread.done[1])))
                {
+                    error = atomic_load(&f->task_thread.error);
                    dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
                                            error ? DAV1D_ERR(ENOMEM) : 0);
                    f->n_tile_data = 0;
@ -891,6 +892,7 @@ void *dav1d_worker_task(void *data) {
            if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
                atomic_load(&f->task_thread.done[1]))
            {
+                error = atomic_load(&f->task_thread.error);
                dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
                                        error ? DAV1D_ERR(ENOMEM) : 0);
                f->n_tile_data = 0;
@ -920,6 +922,7 @@ void *dav1d_worker_task(void *data) {
        if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
            (!uses_2pass || atomic_load(&f->task_thread.done[1])))
        {
+            error = atomic_load(&f->task_thread.error);
            dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
                                    error ? DAV1D_ERR(ENOMEM) : 0);
            f->n_tile_data = 0;
--- a/third_party/dav1d/src/x86/ipred.h
+++ b/third_party/dav1d/src/x86/ipred.h
@ -83,6 +83,10 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons
    init_angular_ipred_fn(SMOOTH_PRED,   ipred_smooth,   ssse3);
    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
+#if BITDEPTH == 8
+    init_angular_ipred_fn(Z1_PRED,       ipred_z1,       ssse3);
+    init_angular_ipred_fn(Z3_PRED,       ipred_z3,       ssse3);
+#endif
    init_angular_ipred_fn(FILTER_PRED,   ipred_filter,   ssse3);

    init_cfl_pred_fn(DC_PRED,      ipred_cfl,      ssse3);
--- a/third_party/dav1d/src/x86/ipred_sse.asm
+++ b/third_party/dav1d/src/x86/ipred_sse.asm
--- a/third_party/dav1d/src/x86/refmvs.asm
+++ b/third_party/dav1d/src/x86/refmvs.asm
@ -38,19 +38,228 @@ SECTION_RODATA 64
    %endrep
 %endmacro

+%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
+    %rep %1
+        db %2*3
+        db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
+           mangle(private_prefix %+ _save_tmvs_%3).write1
+    %endrep
+%endmacro
+
 %if ARCH_X86_64
 splat_mv_shuf: db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
               db  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7
               db  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
+%endif
+save_pack0:    db  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0
+               db  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1
+save_pack1:    db  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2
+               db  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3
+save_ref_shuf: db  0, -1, -1, -1,  1, -1, -1, -1,  8, -1, -1, -1,  9, -1, -1, -1
+save_cond0:    db  0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
+save_cond1:    db  0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
+pb_128:        times 16 db 128
+
+save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
+                       SAVE_TMVS_TABLE 4,  8, ssse3
+                       SAVE_TMVS_TABLE 4,  4, ssse3
+                       SAVE_TMVS_TABLE 5,  2, ssse3
+                       SAVE_TMVS_TABLE 7,  1, ssse3
+
+%if ARCH_X86_64
+save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
+                      SAVE_TMVS_TABLE 4,  8, avx2
+                      SAVE_TMVS_TABLE 4,  4, avx2
+                      SAVE_TMVS_TABLE 5,  2, avx2
+                      SAVE_TMVS_TABLE 7,  1, avx2

 JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
 JMP_TABLE splat_mv_avx2,      1, 2, 4, 8, 16, 32
 %endif
+
 JMP_TABLE splat_mv_sse2,      1, 2, 4, 8, 16, 32

 SECTION .text

+%macro movif32 2
+%if ARCH_X86_32
+    mov             %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+%if ARCH_X86_64
+cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
+                             xend, yend, xstart, ystart
+%define base_reg r12
+%else
+cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
+                            xend, yend, xstart, ystart
+    movq            m5, [ref_signq]
+    lea        strided, [strided*5]
+    mov        stridem, strided
+    mov             r3, xstartm
+    mov             r1, ystartm
+ DEFINE_ARGS b, ystart, rr, cand, xend, x
+%define stridemp r1m
+%define m8  [base+pb_128]
+%define m9  [base+save_pack0+ 0]
+%define m10 [base+save_pack0+16]
+%define base_reg r6
+%endif
+%define base base_reg-.write1
+    LEA       base_reg, .write1
+%if ARCH_X86_64
+    movifnidn    xendd, xendm
+    movifnidn    yendd, yendm
+    mov        xstartd, xstartm
+    mov        ystartd, ystartm
+    movq            m5, [ref_signq]
+%endif
+    movu            m4, [base+save_ref_shuf]
+    movddup         m6, [base+save_cond0]
+    movddup         m7, [base+save_cond1]
+%if ARCH_X86_64
+    mova            m8, [base+pb_128]
+    mova            m9, [base+save_pack0+ 0]
+    mova           m10, [base+save_pack0+16]
+%endif
+    psllq           m5, 8
+%if ARCH_X86_64
+    lea            r9d, [xendq*5]
+    lea        xstartd, [xstartq*5]
+    sub          yendd, ystartd
+    add        ystartd, ystartd
+    lea        strideq, [strideq*5]
+    sub        xstartq, r9
+    add          xendd, r9d
+    add            rpq, r9
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+%else
+    lea             r0, [xendd*5]   ; xend5
+    lea             r3, [r3*5]      ; xstart5
+    sub             r3, r0          ; -w5
+    mov            r6m, r3
+%define xstartq r6m
+    add          xendd, r0          ; xend6
+    add            r0m, r0          ; rp+xend5
+    mov          xendm, xendd
+    sub             r5, r1          ; h
+    add             r1, r1
+    mov            r7m, r1
+    mov            r5m, r5
+%define hd r5mp
+    jmp .loop_y_noload
+%endif
+.loop_y:
+    movif32    ystartd, r7m
+    movif32      xendd, xendm
+.loop_y_noload:
+    and        ystartd, 30
+    mov             xq, xstartq
+    mov             bq, [rrq+ystartq*gprsize]
+    add        ystartd, 2
+    movif32        r7m, ystartd
+    lea             bq, [bq+xendq*4]
+.loop_x:
+%if ARCH_X86_32
+%define rpq  r3
+%define r10  r1
+%define r10d r1
+%define r10w r1w
+%define r10b r1b
+%define r11  r4
+%define r11d r4
+%endif
+    imul         candq, xq, 0x9999  ; x / 5 * 3
+    sar          candq, 16
+    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
+    movu            m0, [bq+candq*8+12]      ; cand_b
+    movzx         r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
+    movzx         r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
+    add            r10, base_reg
+    add          candq, r11
+    jge .calc
+    movu            m1, [bq+candq*8+12]
+    movzx         r11d, byte [bq+candq*8+22]
+    movzx         r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
+    add            r11, base_reg
+.calc:
+    movif32        rpq, r0m
+    ; ref check
+    punpckhqdq      m2, m0, m1
+    pshufb          m2, m4      ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
+    pshufb          m3, m5, m2  ; ref > 0 && ref_sign[ref - 1]
+    ; mv check
+    punpcklqdq      m2, m0, m1  ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
+    pabsw           m2, m2
+    psrlw           m2, 12      ; (abs(mv.x) | abs(mv.y)) < 4096
+    ; res
+    pcmpgtd         m3, m2
+    pshufd          m2, m3, q2301
+    pand            m3, m6      ; b0c0 b0c1 b1c0 b1c1 | ...
+    pand            m2, m7      ; b0c1 b0c0 b1c1 b1c0 | ...
+    por             m3, m2      ; b0.shuf b1.shuf | ...
+    pxor            m3, m8      ; if cond0|cond1 == 0 => zero out
+    pshufb          m0, m3
+    pshufb          m1, m3
+    call           r10
+    jge .next_line
+    pshufd          m0, m1, q3232
+    call           r11
+    jl .loop_x
+.next_line:
+    add            rpq, stridemp
+    movif32        r0m, rpq
+    dec             hd
+    jg .loop_y
+    RET
+.write1:
+    movd    [rpq+xq+0], m0
+    psrlq           m0, 8
+    movd    [rpq+xq+1], m0
+    add             xq, 5*1
+    ret
+.write2:
+    movq    [rpq+xq+0], m0
+    psrlq           m0, 8
+    movd    [rpq+xq+6], m0
+    add             xq, 5*2
+    ret
+.write4:
+    pshufb          m0, m9
+    movu   [rpq+xq+ 0], m0
+    psrlq           m0, 8
+    movd   [rpq+xq+16], m0
+    add             xq, 5*4
+    ret
+.write8:
+    pshufb          m2, m0, m9
+    movu   [rpq+xq+ 0], m2
+    pshufb          m0, m10
+    movu   [rpq+xq+16], m0
+    psrldq          m2, 2
+    movq   [rpq+xq+32], m2
+    add             xq, 5*8
+    ret
+.write16:
+    pshufb          m2, m0, m9
+    movu   [rpq+xq+ 0], m2
+    pshufb          m0, m10
+    movu   [rpq+xq+16], m0
+    shufps          m2, m0, q1032
+    movu   [rpq+xq+48], m2
+    shufps          m2, m0, q2121
+    movu   [rpq+xq+32], m2
+    shufps          m0, m2, q1032
+    movu   [rpq+xq+64], m0
+    add             xq, 5*16
+    ret
+
 INIT_XMM sse2
 ; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
 cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
@ -116,6 +325,113 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4

 %if ARCH_X86_64
 INIT_YMM avx2
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
+                              xend, yend, xstart, ystart
+%define base r12-.write1
+    lea            r12, [.write1]
+    movifnidn    xendd, xendm
+    movifnidn    yendd, yendm
+    mov        xstartd, xstartm
+    mov        ystartd, ystartm
+    vpbroadcastq    m4, [ref_signq]
+    vpbroadcastq    m3, [base+save_ref_shuf+8]
+    vpbroadcastq    m5, [base+save_cond0]
+    vpbroadcastq    m6, [base+save_cond1]
+    vpbroadcastd    m7, [base+pb_128]
+    mova            m8, [base+save_pack0]
+    mova            m9, [base+save_pack1]
+    psllq           m4, 8
+    lea            r9d, [xendq*5]
+    lea        xstartd, [xstartq*5]
+    sub          yendd, ystartd
+    add        ystartd, ystartd
+    lea        strideq, [strideq*5]
+    sub        xstartq, r9
+    add          xendd, r9d
+    add            rpq, r9
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+.loop_y:
+    and        ystartd, 30
+    mov             xq, xstartq
+    mov             bq, [rrq+ystartq*8]
+    add        ystartd, 2
+    lea             bq, [bq+xendq*4]
+.loop_x:
+    imul         candq, xq, 0x9999
+    sar          candq, 16                   ; x / 5 * 3
+    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
+    movu           xm0, [bq+candq*8+12]      ; cand_b
+    movzx         r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
+    movzx         r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
+    add            r10, r12
+    add          candq, r11
+    jge .calc
+    vinserti128     m0, [bq+candq*8+12], 1
+    movzx         r11d, byte [bq+candq*8+22]
+    movzx         r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
+    add            r11, r12
+.calc:
+    pshufb          m1, m0, m3
+    pabsw           m2, m0
+    pshufb          m1, m4, m1  ; ref > 0 && ref_sign[ref - 1]
+    psrlw           m2, 12      ; (abs(mv.x) | abs(mv.y)) < 4096
+    pcmpgtd         m1, m2
+    pshufd          m2, m1, q2301
+    pand            m1, m5      ; b0.cond0 b1.cond0
+    pand            m2, m6      ; b0.cond1 b1.cond1
+    por             m1, m2      ; b0.shuf b1.shuf
+    pxor            m1, m7      ; if cond0|cond1 == 0 => zero out
+    pshufb          m0, m1
+    call           r10
+    jge .next_line
+    vextracti128   xm0, m0, 1
+    call           r11
+    jl .loop_x
+.next_line:
+    add            rpq, strideq
+    dec             hd
+    jg .loop_y
+    RET
+.write1:
+    movd   [rpq+xq+ 0], xm0
+    pextrb [rpq+xq+ 4], xm0, 4
+    add             xq, 5*1
+    ret
+.write2:
+    movq    [rpq+xq+0], xm0
+    psrlq          xm1, xm0, 8
+    movd    [rpq+xq+6], xm1
+    add             xq, 5*2
+    ret
+.write4:
+    pshufb         xm1, xm0, xm8
+    movu   [rpq+xq+ 0], xm1
+    psrlq          xm1, 8
+    movd   [rpq+xq+16], xm1
+    add             xq, 5*4
+    ret
+.write8:
+    vinserti128     m1, m0, xm0, 1
+    pshufb          m1, m8
+    movu   [rpq+xq+ 0], m1
+    psrldq         xm1, 2
+    movq   [rpq+xq+32], xm1
+    add             xq, 5*8
+    ret
+.write16:
+    vinserti128     m1, m0, xm0, 1
+    pshufb          m2, m1, m8
+    movu   [rpq+xq+ 0], m2
+    pshufb          m1, m9
+    movu   [rpq+xq+32], m1
+    shufps         xm2, xm1, q1021
+    movu   [rpq+xq+64], xm2
+    add             xq, 5*16
+    ret
+
 cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add           bx4d, bw4d
    tzcnt         bw4d, bw4d
--- a/third_party/dav1d/src/x86/refmvs.h
+++ b/third_party/dav1d/src/x86/refmvs.h
@ -28,6 +28,9 @@
 #include "src/cpu.h"
 #include "src/refmvs.h"

+decl_save_tmvs_fn(dav1d_save_tmvs_ssse3);
+decl_save_tmvs_fn(dav1d_save_tmvs_avx2);
+
 decl_splat_mv_fn(dav1d_splat_mv_sse2);
 decl_splat_mv_fn(dav1d_splat_mv_avx2);
 decl_splat_mv_fn(dav1d_splat_mv_avx512icl);
@ -39,9 +42,14 @@ static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {

    c->splat_mv = dav1d_splat_mv_sse2;

+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+    c->save_tmvs = dav1d_save_tmvs_ssse3;
+
 #if ARCH_X86_64
    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

+    c->save_tmvs = dav1d_save_tmvs_avx2;
    c->splat_mv = dav1d_splat_mv_avx2;

    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
--- a/third_party/dav1d/tests/checkasm/checkasm.c
+++ b/third_party/dav1d/tests/checkasm/checkasm.c
@ -42,6 +42,10 @@
 #include <unistd.h>
 #include <signal.h>
 #include <time.h>
+#include <pthread.h>
+#ifdef HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
 #ifdef __APPLE__
 #include <mach/mach_time.h>
 #endif
@ -123,7 +127,7 @@ static struct {
    const char *current_test_name;
    int num_checked;
    int num_failed;
-    int nop_time;
+    double nop_time;
    unsigned cpu_flag;
    const char *cpu_flag_name;
    const char *test_pattern;
@ -134,6 +138,8 @@ static struct {
    int verbose;
    int function_listing;
    int catch_signals;
+    int suffix_length;
+    int max_function_name_length;
 #if ARCH_X86_64
    void (*simd_warmup)(void);
 #endif
@ -321,7 +327,7 @@ static int cmp_nop(const void *a, const void *b) {
 }

 /* Measure the overhead of the timing code (in decicycles) */
-static int measure_nop_time(void) {
+static double measure_nop_time(void) {
    uint16_t nops[10000];
    int nop_sum = 0;

@ -334,7 +340,16 @@ static int measure_nop_time(void) {
    for (int i = 2500; i < 7500; i++)
        nop_sum += nops[i];

-    return nop_sum / 500;
+    return nop_sum / 5000.0;
+}
+
+static double avg_cycles_per_call(const CheckasmFuncVersion *const v) {
+    if (v->iterations) {
+        const double cycles = (double)v->cycles / v->iterations - state.nop_time;
+        if (cycles > 0.0)
+            return cycles / 4.0; /* 4 calls per iteration */
+    }
+    return 0.0;
 }

 /* Print benchmark results */
@ -343,15 +358,15 @@ static void print_benchs(const CheckasmFunc *const f) {
        print_benchs(f->child[0]);

        /* Only print functions with at least one assembly version */
-        if (state.bench_c || f->versions.cpu || f->versions.next) {
-            const CheckasmFuncVersion *v = &f->versions;
+        const CheckasmFuncVersion *v = &f->versions;
+        if ((state.bench_c || v->cpu || v->next) && v->iterations) {
+            const double baseline = avg_cycles_per_call(v);
            do {
-                if (v->iterations) {
-                    const int decicycles = (int) (10*v->cycles/v->iterations -
-                                                  state.nop_time) / 4;
-                    printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu),
-                           decicycles/10, decicycles%10);
-                }
+                const int pad_length = 10 + state.max_function_name_length -
+                    printf("%s_%s:", f->name, cpu_suffix(v->cpu));
+                const double cycles = avg_cycles_per_call(v);
+                const double ratio = cycles ? baseline / cycles : 0.0;
+                printf("%*.1f (%5.2fx)\n", imax(pad_length, 0), cycles, ratio);
            } while ((v = v->next));
        }

@ -363,7 +378,11 @@ static void print_benchs(const CheckasmFunc *const f) {
 static void print_functions(const CheckasmFunc *const f) {
    if (f) {
        print_functions(f->child[0]);
-        printf("%s\n", f->name);
+        const CheckasmFuncVersion *v = &f->versions;
+        printf("%s (%s", f->name, cpu_suffix(v->cpu));
+        while ((v = v->next))
+            printf(", %s", cpu_suffix(v->cpu));
+        printf(")\n");
        print_functions(f->child[1]);
    }
 }
@ -515,6 +534,7 @@ static void check_cpu_flag(const char *const name, unsigned flag) {

    if (!flag || state.cpu_flag != old_cpu_flag) {
        state.cpu_flag_name = name;
+        state.suffix_length = (int)strlen(cpu_suffix(flag)) + 1;
        for (int i = 0; tests[i].func; i++) {
            if (state.test_pattern && wildstrcmp(tests[i].name, state.test_pattern))
                continue;
@ -556,13 +576,14 @@ int main(int argc, char *argv[]) {
                    "checkasm [options] <random seed>\n"
                    "    <random seed>              Numeric value to seed the rng\n"
                    "Options:\n"
+                    "    --affinity=<cpu>           Run the process on CPU <cpu>\n"
                    "    --test=<pattern>           Test only <pattern>\n"
                    "    --function=<pattern> -f    Test only the functions matching <pattern>\n"
                    "    --bench -b                 Benchmark the tested functions\n"
                    "    --list-functions           List available functions\n"
                    "    --list-tests               List available tests\n"
                    "    --bench-c -c               Benchmark the C-only functions\n"
-                    "    --verbose -v               Print failures verbosely\n");
+                    "    --verbose -v               Print verbose output\n");
            return 0;
        } else if (!strcmp(argv[1], "--bench-c") || !strcmp(argv[1], "-c")) {
            state.bench_c = 1;
@ -593,6 +614,43 @@ int main(int argc, char *argv[]) {
            return 0;
        } else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
            state.verbose = 1;
+        } else if (!strncmp(argv[1], "--affinity=", 11)) {
+            unsigned long affinity = strtoul(argv[1] + 11, NULL, 16);
+#ifdef _WIN32
+            BOOL (WINAPI *spdcs)(HANDLE, const ULONG*, ULONG) =
+                (void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"), "SetProcessDefaultCpuSets");
+            HANDLE process = GetCurrentProcess();
+            int affinity_err;
+            if (spdcs) {
+                affinity_err = !spdcs(process, (ULONG[]){ affinity + 256 }, 1);
+            } else {
+                if (affinity < sizeof(DWORD_PTR) * 8)
+                    affinity_err = !SetProcessAffinityMask(process, (DWORD_PTR)1 << affinity);
+                else
+                    affinity_err = 1;
+            }
+            if (affinity_err) {
+                fprintf(stderr, "checkasm: invalid cpu affinity (%lu)\n", affinity);
+                return 1;
+            } else {
+                fprintf(stderr, "checkasm: running on cpu %lu\n", affinity);
+            }
+#elif defined(HAVE_PTHREAD_SETAFFINITY_NP) && defined(CPU_SET)
+            cpu_set_t set;
+            CPU_ZERO(&set);
+            CPU_SET(affinity, &set);
+            if (pthread_setaffinity_np(pthread_self(), sizeof(set), &set)) {
+                fprintf(stderr, "checkasm: invalid cpu affinity (%lu)\n", affinity);
+                return 1;
+            } else {
+                fprintf(stderr, "checkasm: running on cpu %lu\n", affinity);
+            }
+#else
+            (void)affinity;
+            fprintf(stderr,
+                    "checkasm: --affinity is not supported on your system\n");
+            return 1;
+#endif
        } else {
            state.seed = (unsigned) strtoul(argv[1], NULL, 10);
        }
@ -665,28 +723,28 @@ int main(int argc, char *argv[]) {
    }

    check_cpu_flag(NULL, 0);
+    for (int i = 0; cpus[i].flag; i++)
+        check_cpu_flag(cpus[i].name, cpus[i].flag);

    if (state.function_listing) {
        print_functions(state.funcs);
+    } else if (state.num_failed) {
+        fprintf(stderr, "checkasm: %d of %d tests failed\n",
+                state.num_failed, state.num_checked);
+        ret = 1;
    } else {
-        for (int i = 0; cpus[i].flag; i++)
-            check_cpu_flag(cpus[i].name, cpus[i].flag);
-        if (!state.num_checked) {
-            fprintf(stderr, "checkasm: no tests to perform\n");
-        } else if (state.num_failed) {
-            fprintf(stderr, "checkasm: %d of %d tests have failed\n",
-                    state.num_failed, state.num_checked);
-            ret = 1;
-        } else {
+        if (state.num_checked)
            fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
+        else
+            fprintf(stderr, "checkasm: no tests to perform\n");
 #ifdef readtime
-            if (state.bench) {
-                state.nop_time = measure_nop_time();
-                printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
-                print_benchs(state.funcs);
-            }
-#endif
+        if (state.bench && state.max_function_name_length) {
+            state.nop_time = measure_nop_time();
+            if (state.verbose)
+                printf("nop:%*.1f\n", state.max_function_name_length + 6, state.nop_time);
+            print_benchs(state.funcs);
        }
+#endif
    }

    destroy_func_tree(state.funcs);
@ -701,7 +759,7 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
    va_list arg;

    va_start(arg, name);
-    const int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg);
+    int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg);
    va_end(arg);

    if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf) ||
@ -712,9 +770,6 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {

    state.current_func = get_func(&state.funcs, name_buf);

-    if (state.function_listing) /* Save function names without running tests */
-        return NULL;
-
    state.funcs->color = 1;
    CheckasmFuncVersion *v = &state.current_func->versions;
    void *ref = func;
@ -735,13 +790,20 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
        v = prev->next = checkasm_malloc(sizeof(CheckasmFuncVersion));
    }

+    name_length += state.suffix_length;
+    if (name_length > state.max_function_name_length)
+        state.max_function_name_length = name_length;
+
    v->func = func;
    v->ok = 1;
    v->cpu = state.cpu_flag;
    state.current_func_ver = v;
+    if (state.function_listing) /* Save function names without running tests */
+        return NULL;
+
    xor128_srand(state.seed);

-    if (state.cpu_flag || state.bench_c)
+    if (state.cpu_flag)
        state.num_checked++;

    return ref;
--- a/third_party/dav1d/tests/checkasm/ipred.c
+++ b/third_party/dav1d/tests/checkasm/ipred.c
@ -90,51 +90,54 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
                        (mode == FILTER_PRED ? 32 : 64)); h <<= 1)
                    {
                        const ptrdiff_t stride = c_dst_stride;
+                        int nb_iters = (mode >= Z1_PRED && mode <= Z3_PRED) ? 5 : 1;

-                        int a = 0, maxw = 0, maxh = 0;
-                        if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */
-                            a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
-                                (rnd() & 0x600);
-                            if (mode == Z2_PRED) {
-                                maxw = rnd(), maxh = rnd();
-                                maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
-                                maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
+                        for (int iter = 0; iter < nb_iters; iter++) {
+                            int a = 0, maxw = 0, maxh = 0;
+                            if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */
+                                a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
+                                    (rnd() & 0x600);
+                                if (mode == Z2_PRED) {
+                                    maxw = rnd(), maxh = rnd();
+                                    maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
+                                    maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
+                                }
+                            } else if (mode == FILTER_PRED) /* filter_idx */
+                                a = (rnd() % 5) | (rnd() & ~511);
+
+                            int bitdepth_max;
+                            if (bpc == 16)
+                                bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+                            else
+                                bitdepth_max = (1 << bpc) - 1;
+
+                            for (int i = -h * 2; i <= w * 2; i++)
+                                topleft[i] = rnd() & bitdepth_max;
+
+                            CLEAR_PIXEL_RECT(c_dst);
+                            CLEAR_PIXEL_RECT(a_dst);
+                            call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh
+                                     HIGHBD_TAIL_SUFFIX);
+                            call_new(a_dst, stride, topleft, w, h, a, maxw, maxh
+                                     HIGHBD_TAIL_SUFFIX);
+                            if (checkasm_check_pixel_padded(c_dst, stride,
+                                                            a_dst, stride,
+                                                            w, h, "dst"))
+                            {
+                                if (mode == Z1_PRED || mode == Z3_PRED)
+                                    fprintf(stderr, "angle = %d (0x%03x)\n",
+                                            a & 0x1ff, a & 0x600);
+                                else if (mode == Z2_PRED)
+                                    fprintf(stderr, "angle = %d (0x%03x), "
+                                            "max_width = %d, max_height = %d\n",
+                                            a & 0x1ff, a & 0x600, maxw, maxh);
+                                else if (mode == FILTER_PRED)
+                                    fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
                            }
-                        } else if (mode == FILTER_PRED) /* filter_idx */
-                            a = (rnd() % 5) | (rnd() & ~511);

-                        int bitdepth_max;
-                        if (bpc == 16)
-                            bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
-                        else
-                            bitdepth_max = (1 << bpc) - 1;
-
-                        for (int i = -h * 2; i <= w * 2; i++)
-                            topleft[i] = rnd() & bitdepth_max;
-
-                        CLEAR_PIXEL_RECT(c_dst);
-                        CLEAR_PIXEL_RECT(a_dst);
-                        call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh
-                                 HIGHBD_TAIL_SUFFIX);
-                        call_new(a_dst, stride, topleft, w, h, a, maxw, maxh
-                                 HIGHBD_TAIL_SUFFIX);
-                        if (checkasm_check_pixel_padded(c_dst, stride,
-                                                        a_dst, stride,
-                                                        w, h, "dst"))
-                        {
-                            if (mode == Z1_PRED || mode == Z3_PRED)
-                                fprintf(stderr, "angle = %d (0x%03x)\n",
-                                        a & 0x1ff, a & 0x600);
-                            else if (mode == Z2_PRED)
-                                fprintf(stderr, "angle = %d (0x%03x), "
-                                        "max_width = %d, max_height = %d\n",
-                                        a & 0x1ff, a & 0x600, maxw, maxh);
-                            else if (mode == FILTER_PRED)
-                                fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
+                            bench_new(a_dst, stride, topleft, w, h, a, 128, 128
+                                      HIGHBD_TAIL_SUFFIX);
                        }
-
-                        bench_new(a_dst, stride, topleft, w, h, a, 128, 128
-                                  HIGHBD_TAIL_SUFFIX);
                    }
                }
    }
--- a/third_party/dav1d/tests/checkasm/refmvs.c
+++ b/third_party/dav1d/tests/checkasm/refmvs.c
@ -27,6 +27,84 @@
 #include "tests/checkasm/checkasm.h"
 #include "src/refmvs.h"

+#include <stdio.h>
+
+/*
+ * Checks c->save_tmvs: runs call_ref and call_new on the same randomly
+ * generated block rows over a random [row_start8, row_end8) x
+ * [col_start8, col_end8) window, compares the two temporal-block outputs
+ * entry by entry, then benchmarks the new function once per block size.
+ */
+static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) {
+    refmvs_block *rr[31];
+    refmvs_block r[31 * 256];
+    ALIGN_STK_64(refmvs_temporal_block, c_rp, 128 * 16,);
+    ALIGN_STK_64(refmvs_temporal_block, a_rp, 128 * 16,);
+    uint8_t ref_sign[7];
+
+    /* rr[] is the table of row pointers handed to the function; each row
+     * owns 256 consecutive entries of r[]. */
+    for (int i = 0; i < 31; i++)
+        rr[i] = &r[i * 256];
+
+    declare_func(void, refmvs_temporal_block *rp, const ptrdiff_t stride,
+                 refmvs_block *const *const rr, const uint8_t *const ref_sign,
+                 int col_end8, int row_end8, int col_start8, int row_start8);
+
+    if (check_func(c->save_tmvs, "save_tmvs")) {
+        const int row_start8 = rnd() & 7;
+        const int row_end8 = 8 + (rnd() & 7);
+        const int col_start8 = rnd() & 31;
+        const int col_end8 = 96 + (rnd() & 31);
+
+        for (int i = 0; i < 7; i++)
+            ref_sign[i] = rnd() & 1;
+
+        for (int i = row_start8; i < row_end8; i++)
+            for (int j = col_start8; j < col_end8;) {
+                int bs = rnd() % N_BS_SIZES;
+                /* Advance bs until the block's width (in 8px units) no
+                 * longer extends past col_end8. */
+                while (j + ((dav1d_block_dimensions[bs][0] + 1) >> 1) > col_end8)
+                    bs++;
+                rr[i * 2][j * 2 + 1] = (refmvs_block) {
+                    .mv.mv[0].x = -(rnd() & 1) * (rnd() & 8191),
+                    .mv.mv[0].y = -(rnd() & 1) * (rnd() & 8191),
+                    .mv.mv[1].x = -(rnd() & 1) * (rnd() & 8191),
+                    .mv.mv[1].y = -(rnd() & 1) * (rnd() & 8191),
+                    .ref.ref = { (rnd() % 9) - 1, (rnd() % 9) - 1 },
+                    .bs = bs
+                };
+                /* Poison BOTH output buffers over the span of this block.
+                 * If only c_rp were poisoned, an entry that call_new fails
+                 * to store would be compared against indeterminate stack
+                 * contents and the miss might go unnoticed. */
+                for (int k = 0; k < (dav1d_block_dimensions[bs][0] + 1) >> 1; k++, j++) {
+                    c_rp[i * 128 + j].mv.n = 0xdeadbeef;
+                    c_rp[i * 128 + j].ref = 0xdd;
+                    a_rp[i * 128 + j].mv.n = 0xdeadbeef;
+                    a_rp[i * 128 + j].ref = 0xdd;
+                }
+            }
+
+        call_ref(c_rp + row_start8 * 128, 128, rr, ref_sign,
+                 col_end8, row_end8, col_start8, row_start8);
+        call_new(a_rp + row_start8 * 128, 128, rr, ref_sign,
+                 col_end8, row_end8, col_start8, row_start8);
+        for (int i = row_start8; i < row_end8; i++)
+            for (int j = col_start8; j < col_end8; j++)
+                if (c_rp[i * 128 + j].mv.n != a_rp[i * 128 + j].mv.n ||
+                    c_rp[i * 128 + j].ref != a_rp[i * 128 + j].ref)
+                {
+                    if (fail()) {
+                        fprintf(stderr, "[%d][%d] c_rp.mv.x = 0x%x a_rp.mv.x = 0x%x\n",
+                                i, j, c_rp[i * 128 + j].mv.x, a_rp[i * 128 + j].mv.x);
+                        fprintf(stderr, "[%d][%d] c_rp.mv.y = 0x%x a_rp.mv.y = 0x%x\n",
+                                i, j, c_rp[i * 128 + j].mv.y, a_rp[i * 128 + j].mv.y);
+                        fprintf(stderr, "[%d][%d] c_rp.ref = %u a_rp.ref = %u\n",
+                                i, j, c_rp[i * 128 + j].ref, a_rp[i * 128 + j].ref);
+                    }
+                }
+
+        /* Benchmark a full 128x16 area once for every block size, refreshing
+         * the refs and block size of each block-start entry beforehand. */
+        for (int bs = BS_4x4; bs < N_BS_SIZES; bs++) {
+            const int bw8 = (dav1d_block_dimensions[bs][0] + 1) >> 1;
+            for (int i = 0; i < 16; i++)
+                for (int j = 0; j < 128; j += bw8) {
+                    rr[i * 2][j * 2 + 1].ref.ref[0] = (rnd() % 9) - 1;
+                    rr[i * 2][j * 2 + 1].ref.ref[1] = (rnd() % 9) - 1;
+                    rr[i * 2][j * 2 + 1].bs = bs;
+                }
+            bench_new(alternate(c_rp, a_rp), 128, rr, ref_sign, 128, 16, 0, 0);
+        }
+    }
+
+    report("save_tmvs");
+}
+
 static void check_splat_mv(const Dav1dRefmvsDSPContext *const c) {
    ALIGN_STK_64(refmvs_block, c_buf, 32 * 32,);
    ALIGN_STK_64(refmvs_block, a_buf, 32 * 32,);
@ -74,5 +152,6 @@ void checkasm_check_refmvs(void) {
    Dav1dRefmvsDSPContext c;
    dav1d_refmvs_dsp_init(&c);

+    check_save_tmvs(&c);
    check_splat_mv(&c);
 }
--- a/third_party/dav1d/tests/dav1d_argon.bash
+++ b/third_party/dav1d/tests/dav1d_argon.bash
@ -0,0 +1,175 @@
+#!/usr/bin/env bash
+
+# Default settings; the option parser below may override each of them.
+DAV1D=tools/dav1d
+ARGON_DIR=.
+FILMGRAIN=1
+CPUMASK=-1
+THREADS=0
+JOBS=1
+
+# Print the help text and terminate the script with status 1.
+# Everything the function writes is redirected to stderr by the redirection
+# attached to the function body.
+usage() {
+    NAME=$(basename "$0")
+    printf "Usage:   %s [-d dav1d] [-a argondir] [-g \$filmgrain] [-c \$cpumask] [-t threads] [-j jobs] [DIRECTORY]...\n" "$NAME"
+    printf "Example: %s -d /path/to/dav1d -a /path/to/argon/ -g 0 -c avx2 profile0_core\n" "$NAME"
+    printf "Used to verify that dav1d can decode the Argon AV1 test vectors correctly.\n\n"
+    printf " DIRECTORY one or more dirs in the argon folder to check against\n"
+    printf "             (default: everything except large scale tiles and stress files)\n"
+    printf " -d dav1d  path to dav1d executable (default: tools/dav1d)\n"
+    printf " -a dir    path to argon dir (default: 'tests/argon' if found; '.' otherwise)\n"
+    printf " -g \$num   enable filmgrain (default: 1)\n"
+    printf " -c \$mask  use restricted cpumask (default: -1)\n"
+    printf " -t \$num   number of threads per dav1d (default: 0)\n"
+    printf " -j \$num   number of parallel dav1d processes (default: 1)\n\n"
+    exit 1
+} >&2
+
+error() {
+    printf "\033[1;91m%s\033[0m\n" "$*" >&2
+    exit 1
+}
+
+# Report a mismatch for the file named in $1 and bump the global failure
+# counter. The leading "\033[1K\r" is emitted before the message text.
+fail() {
+    local stream=$1
+    printf "\033[1K\rMismatch in %s\n" "$stream"
+    (( failed++ ))
+}
+
+# Split the global pids array by probing each PID with `kill -0`:
+#   done_pids - PIDs for which the probe failed
+#   pids      - PIDs for which the probe succeeded (replaces the old list)
+check_pids() {
+    done_pids=()
+    new_pids=()
+    for p in "${pids[@]}"; do
+        if ! kill -0 "$p" 2>/dev/null; then
+            done_pids+=("$p")
+            continue
+        fi
+        new_pids+=("$p")
+    done
+    pids=("${new_pids[@]}")
+}
+
+# Wait for every PID given as an argument. When `wait` returns non-zero for a
+# PID, look up the variable named "file<PID>" and report its value via fail().
+wait_pids() {
+    local file_varname
+    pid_list=("$@")
+    for p in "${pid_list[@]}"; do
+        wait "$p" && continue
+        file_varname="file$p"
+        fail "${!file_varname}"
+    done
+}
+
+# Block while the number of tracked PIDs is at least $JOBS.
+# Each iteration refreshes pids/done_pids via check_pids; if nothing has
+# finished it sleeps briefly, otherwise it collects the finished PIDs with
+# wait_pids.
+block_pids() {
+    while [ "${#pids[@]}" -ge "$JOBS" ]; do
+        check_pids
+        # Use the element count: ${#done_pids} without [@] would be the
+        # string length of the first element instead.
+        if [ "${#done_pids[@]}" -eq 0 ]; then
+            sleep 0.2
+        else
+            wait_pids "${done_pids[@]}"
+        fi
+    done
+}
+
+# Hand every PID still listed in the global pids array to wait_pids.
+wait_all_pids() { wait_pids "${pids[@]}"; }
+
+# find tests/argon
+tests_dir=$(dirname "$(readlink -f "$0")")
+if [ -d "$tests_dir/argon" ]; then
+    ARGON_DIR="$tests_dir/argon"
+fi
+
+# Parse command-line options; every option takes an argument. Unknown options
+# print an error before the usage text; any other getopts result just prints
+# the usage text. Parsed options are then shifted away.
+while getopts ":d:a:g:c:t:j:" opt; do
+    case "$opt" in
+        d) DAV1D=$OPTARG ;;
+        a) ARGON_DIR=$OPTARG ;;
+        g) FILMGRAIN=$OPTARG ;;
+        c) CPUMASK=$OPTARG ;;
+        t) THREADS=$OPTARG ;;
+        j) JOBS=$OPTARG ;;
+        \?)
+            printf "Error! Invalid option: -%s\n" "$OPTARG" >&2
+            usage
+            ;;
+        *) usage ;;
+    esac
+done
+shift "$((OPTIND - 1))"
+
+# Build the list of directories to check: a fixed default set when no
+# DIRECTORY arguments were given, otherwise the given names under ARGON_DIR,
+# sorted and de-duplicated.
+if [ "$#" -eq 0 ]; then
+    # Everything except large scale tiles and stress files.
+    dirs=("$ARGON_DIR/profile0_core"       "$ARGON_DIR/profile0_core_special"
+          "$ARGON_DIR/profile0_not_annexb" "$ARGON_DIR/profile0_not_annexb_special"
+          "$ARGON_DIR/profile1_core"       "$ARGON_DIR/profile1_core_special"
+          "$ARGON_DIR/profile1_not_annexb" "$ARGON_DIR/profile1_not_annexb_special"
+          "$ARGON_DIR/profile2_core"       "$ARGON_DIR/profile2_core_special"
+          "$ARGON_DIR/profile2_not_annexb" "$ARGON_DIR/profile2_not_annexb_special"
+          "$ARGON_DIR/profile_switching")
+else
+    # Join ARGON_DIR and each argument as plain data. ARGON_DIR must not be
+    # part of the printf format string, or a '%' or '\' in the path would be
+    # interpreted as a format directive or escape.
+    requested_dirs=()
+    for name in "$@"; do
+        requested_dirs+=("$ARGON_DIR/$name")
+    done
+    mapfile -t dirs < <(printf '%s\n' "${requested_dirs[@]}" | sort -u)
+fi
+
+ver_info="dav1d $("$DAV1D" -v 2>&1) filmgrain=$FILMGRAIN cpumask=$CPUMASK" || error "Error! Can't run $DAV1D"
+files=()
+
+for d in "${dirs[@]}"; do
+    if [ -d "$d/streams" ]; then
+        files+=("${d/%\//}"/streams/*.obu)
+    fi
+done
+
+if [ ${#files[@]} -eq 0 ]; then
+    error "Error! No files found at ${dirs[*]}"
+fi
+
+# Verify every collected stream against its reference md5, either one at a
+# time or with up to $JOBS dav1d processes running in the background.
+failed=0
+pids=()
+for i in "${!files[@]}"; do
+    f="${files[i]}"
+    # The md5 file mirrors the stream path, with "/streams/" replaced by the
+    # md5 directory for the selected film grain mode and the trailing "obu"
+    # replaced by "md5".
+    if [ "$FILMGRAIN" -eq 0 ]; then
+        md5_file=${f/\/streams\//\/md5_no_film_grain\/}
+    else
+        md5_file=${f/\/streams\//\/md5_ref\/}
+    fi
+    md5_file=${md5_file/%obu/md5}
+    # Keep the path in its own variable: a failed $(<...) still assigns its
+    # (empty) result, so reusing one variable would blank the path in the
+    # error message.
+    md5=$(<"$md5_file") || error "Error! Can't read md5 ${md5_file} for file ${f}"
+    # Keep only the text before the first space.
+    md5=${md5/ */}
+
+    printf "\033[1K\r[%3d%% %d/%d] Verifying %s" "$(((i+1)*100/${#files[@]}))" "$((i+1))" "${#files[@]}" "$f"
+    cmd=("$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q)
+    if [ "$JOBS" -gt 1 ]; then
+        "${cmd[@]}" 2>/dev/null &
+        p=$!
+        pids+=("$p")
+        # Remember which file this PID is decoding; wait_pids reads it back
+        # through the "file<PID>" variable name.
+        declare "file$p=$f"
+        block_pids
+    else
+        if ! "${cmd[@]}" 2>/dev/null; then
+            fail "$f"
+        fi
+    fi
+done
+
+# Collect any background jobs that are still outstanding.
+wait_all_pids
+
+# Print the overall result, the elapsed time and the decoder version info.
+if [ "$failed" -ne 0 ]; then
+    printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "${#files[@]}"
+else
+    printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "${#files[@]}"
+fi
+printf " in %dm%ds (%s)\n" "$((SECONDS/60))" "$((SECONDS%60))" "$ver_info"
+
+# The exit status is the failure count, clamped to 255. An exit status only
+# keeps its low 8 bits, so an unclamped count of 256 would be reported to the
+# caller as 0 (success).
+exit "$(( failed > 255 ? 255 : failed ))"
--- a/third_party/dav1d/tests/meson.build
+++ b/third_party/dav1d/tests/meson.build
@ -54,6 +54,7 @@ if is_asm_enabled
            'checkasm_bitdepth_@0@'.format(bitdepth),
            checkasm_tmpl_sources,
            include_directories: dav1d_inc_dirs,
+            dependencies : [stdatomic_dependencies],
            c_args: ['-DBITDEPTH=@0@'.format(bitdepth)],
            install: false,
            build_by_default: false,