Bug 1509327 - Update dav1d from upstream to 36b807a. r=TD-Linux

Differential Revision: https://phabricator.services.mozilla.com/D13426

--HG--
extra : moz-landing-system : lando
Alex Chronopoulos 2018-12-01 21:59:40 +00:00
parent 5d689219f2
commit 04fe7aa697
69 changed files with 6807 additions and 1897 deletions

@ -21,4 +21,4 @@ To update to a fork, use
The last update was pulled from https://code.videolan.org/videolan/dav1d
The git commit ID used was 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0 (2018-10-25T16:51:31.000Z).
The git commit ID used was 197a19ad702d5e7472852efcde98feeb07f373e0 (2018-11-26T12:15:41.000Z).

@ -20,7 +20,7 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0
release: commit 36b807afe75040d9953bf63f68b67e6cd2fe4fc0
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

@ -9,7 +9,7 @@ build-debian:
- debian
- amd64
script:
- env CFLAGS='-Werror' meson build --buildtype release
- meson build --buildtype release --werror
- ninja -C build
- cd build && meson test -v
@ -20,7 +20,7 @@ build-debian-static:
- debian
- amd64
script:
- env CFLAGS='-Werror' meson build --buildtype release --default-library static
- meson build --buildtype release --default-library static --werror
- ninja -C build
- cd build && meson test -v
@ -30,12 +30,12 @@ build-win32:
tags:
- win32
script:
- env CFLAGS='-Werror'
meson build --buildtype release
--libdir lib
--prefix "$(pwd)/build/dav1d_install"
--cross-file /opt/crossfiles/i686-w64-mingw32.meson
-Ddefault_library=both
- meson build --buildtype release
--werror
--libdir lib
--prefix "$(pwd)/build/dav1d_install"
--cross-file /opt/crossfiles/i686-w64-mingw32.meson
-Ddefault_library=both
- ninja -C build
- ninja -C build install
artifacts:
@ -50,12 +50,12 @@ build-win64:
tags:
- win64
script:
- env CFLAGS='-Werror'
meson build --buildtype release
--libdir lib
--prefix "$(pwd)/build/dav1d_install"
--cross-file /opt/crossfiles/x86_64-w64-mingw32.meson
-Ddefault_library=both
- meson build --buildtype release
--werror
--libdir lib
--prefix "$(pwd)/build/dav1d_install"
--cross-file /opt/crossfiles/x86_64-w64-mingw32.meson
-Ddefault_library=both
- ninja -C build
- ninja -C build install
artifacts:
@ -66,19 +66,20 @@ build-win64:
build-debian-aarch64:
stage: build
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
tags:
- aarch64
- debian
script:
- env CFLAGS='-Werror' meson build --buildtype release
- meson build --buildtype release --werror
- ninja -C build
- cd build && meson test -v
build-debian-aarch64-clang-5:
stage: build
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
tags:
- aarch64
- clang5
- debian
script:
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
@ -90,18 +91,18 @@ build-macos:
tags:
- macos
script:
- env CFLAGS='-Werror' meson build --buildtype release -Ddefault_library=both
- meson build --buildtype release -Ddefault_library=both --werror
- ninja -C build
- cd build && meson test -v
build-debian-werror:
image: dav1d-debian-aarch64:201810240631
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
stage: build
tags:
- aarch64
- debian
script:
- env CC='clang-7' CFLAGS='-Werror' meson build -Dbuild_tests=false
- env CC='clang-7' meson build --buildtype debug --werror
- ninja -C build
test-debian:
@ -122,3 +123,66 @@ test-debian:
- meson build --buildtype release -Dtestdata_tests=true
- ninja -C build
- cd build && time meson test -v
test-debian-asan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
stage: test
tags:
- debian
- amd64
cache:
key: testdata.git
paths:
- cache/dav1d-test-data.git/
variables:
ASAN_OPTIONS: 'detect_leaks=0'
script:
- test -d cache || mkdir cache
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=address -Dbuild_asm=false
- ninja -C build
- cd build && time meson test -v --setup=sanitizer
test-debian-msan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
stage: test
tags:
- debian
- amd64
cache:
key: testdata.git
paths:
- cache/dav1d-test-data.git/
variables:
MSAN_OPTIONS: 'exitcode=1'
script:
- test -d cache || mkdir cache
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=memory -Db_lundef=false -Dbuild_asm=false
- ninja -C build
- cd build && time meson test -v --setup=sanitizer
test-debian-ubsan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
stage: test
tags:
- debian
- amd64
cache:
key: testdata.git
paths:
- cache/dav1d-test-data.git/
variables:
UBSAN_OPTIONS: 'print_stacktrace=1:halt_on_error=1'
script:
- test -d cache || mkdir cache
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=undefined -Db_lundef=false -Dbuild_asm=false
- ninja -C build
- cd build && time meson test -v --setup=sanitizer

third_party/dav1d/doc/Doxyfile.in (new file)

@ -0,0 +1,19 @@
PROJECT_NAME = dav1d
OUTPUT_DIRECTORY = @DOXYGEN_OUTPUT@
STRIP_FROM_PATH = @DOXYGEN_STRIP@
OUTPUT_LANGUAGE = English
TAB_SIZE = 4
EXTRACT_ALL = YES
OPTIMIZE_OUTPUT_FOR_C = YES
DOXYFILE_ENCODING = UTF-8
TYPEDEF_HIDES_STRUCT = YES
QUIET = YES
WARNINGS = YES
WARN_IF_UNDOCUMENTED = YES
INPUT = @DOXYGEN_INPUT@
FILE_PATTERNS = *.h
GENERATE_HTML = YES
GENERATE_LATEX = NO

third_party/dav1d/doc/meson.build (new file)

@ -0,0 +1,42 @@
# Copyright © 2018, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
doxygen = find_program('doxygen', required: false)
if doxygen.found()
conf_data = configuration_data()
conf_data.set('DOXYGEN_INPUT', join_paths(meson.source_root(), 'include/dav1d'))
conf_data.set('DOXYGEN_STRIP', join_paths(meson.source_root(), 'include'))
conf_data.set('DOXYGEN_OUTPUT', meson.current_build_dir())
doxyfile = configure_file(input: 'Doxyfile.in',
output: 'Doxyfile',
configuration: conf_data)
custom_target('doc',
build_by_default: false,
command: [doxygen, doxyfile],
output: ['html']
)
endif

@ -25,8 +25,11 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __COMMON_H__
#define __COMMON_H__
#ifndef __DAV1D_COMMON_H__
#define __DAV1D_COMMON_H__
#include <stddef.h>
#include <stdint.h>
#ifndef DAV1D_API
#if defined _WIN32
@ -40,4 +43,19 @@
#endif
#endif
#endif // __COMMON_H__
/**
* Input packet metadata, copied from the input data used to decode each
* image into the matching structure of the output image returned to the
* user. Since these are metadata fields, they can be used for purposes
* other than the documented ones; they are passed from input data to
* output picture without being used internally.
*/
typedef struct Dav1dDataProps {
int64_t timestamp; ///< container timestamp of input data, INT64_MIN if unknown (default)
int64_t duration; ///< container duration of input data, 0 if unknown (default)
int64_t offset; ///< stream offset of input data, -1 if unknown (default)
size_t size; ///< packet size, default Dav1dData.sz
} Dav1dDataProps;
#endif // __DAV1D_COMMON_H__
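
As an illustrative caller-side sketch (not from the patch itself): values set on the input Dav1dData's `m` member come back on the `m` member of every picture decoded from that data. dav1d_data_create() and dav1d_send_data() are the entry points declared in data.h and dav1d.h further down in this patch; the packet pointer, size and timestamp are assumed to come from the caller's demuxer.

#include <errno.h>
#include <string.h>
#include "dav1d/dav1d.h"

static int make_tagged_data(Dav1dData *data, const uint8_t *pkt,
                            size_t pkt_sz, int64_t pts) {
    uint8_t *buf = dav1d_data_create(data, pkt_sz); // also sets data->m.size
    if (!buf) return -ENOMEM;
    memcpy(buf, pkt, pkt_sz);
    data->m.timestamp = pts; // stays INT64_MIN if left untouched
    return 0;
}
// Once this data is consumed via dav1d_send_data(), every Dav1dPicture
// produced from it carries the same values in its `m` member.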

@ -37,6 +37,7 @@ typedef struct Dav1dData {
const uint8_t *data; ///< data pointer
size_t sz; ///< data size
struct Dav1dRef *ref; ///< allocation origin
Dav1dDataProps m;
} Dav1dData;
/**
@ -45,7 +46,7 @@ typedef struct Dav1dData {
* @param data Input context.
* @param sz Size of the data that should be allocated.
*
* @return Pointer to the allocated bufferon success. NULL on error.
* @return Pointer to the allocated buffer on success. NULL on error.
*/
DAV1D_API uint8_t * dav1d_data_create(Dav1dData *data, size_t sz);

@ -41,10 +41,16 @@ extern "C" {
typedef struct Dav1dContext Dav1dContext;
typedef struct Dav1dRef Dav1dRef;
#define DAV1D_MAX_FRAME_THREADS 256
#define DAV1D_MAX_TILE_THREADS 64
typedef struct Dav1dSettings {
int n_frame_threads;
int n_tile_threads;
Dav1dPicAllocator allocator;
int apply_grain;
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
int all_layers; ///< output all spatial layers of a scalable AV1 bitstream
} Dav1dSettings;
/**
@ -73,6 +79,22 @@ DAV1D_API void dav1d_default_settings(Dav1dSettings *s);
*/
DAV1D_API int dav1d_open(Dav1dContext **c_out, const Dav1dSettings *s);
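
As a usage sketch (not part of the patch), the new settings members are filled in by dav1d_default_settings() and can be overridden before dav1d_open(); the values below are illustrative, not the library defaults.

Dav1dSettings s;
dav1d_default_settings(&s);
s.n_frame_threads = 4;
s.n_tile_threads  = 2;
s.apply_grain     = 1;  // synthesize film grain when writing out pictures
s.operating_point = 0;  // 0-31, operating point to decode in scalable streams
s.all_layers      = 1;  // output all spatial layers of a scalable bitstream

Dav1dContext *c = NULL;
if (dav1d_open(&c, &s) < 0)
    abort();  // placeholder error handling
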
/**
* Parse a Sequence Header OBU from bitstream data.
*
* @param out Output Sequence Header.
* @param buf The data to be parsed.
* @param sz Size of the data.
*
* @return 0 on success, or < 0 (a negative errno code) on error.
*
* @note It is safe to feed this function data containing OBUs other than a
* Sequence Header, as they will simply be ignored. If there is more than
* one Sequence Header OBU present, only the last will be returned.
*/
DAV1D_API int dav1d_parse_sequence_header(Dav1dSequenceHeader *out,
const uint8_t *buf, const size_t sz);
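
A short probing sketch (not from the patch): buf/buf_sz are assumed to be a caller-provided byte buffer holding bitstream data.

Dav1dSequenceHeader seq;
if (dav1d_parse_sequence_header(&seq, buf, buf_sz) < 0) {
    // no valid Sequence Header OBU found in buf
} else {
    // e.g. negotiate output: seq.max_width x seq.max_height,
    // pixel layout seq.layout, bitdepth 8 + 2 * seq.hbd
}
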
/**
* Feed bitstream data to the decoder.
*
@ -106,6 +128,39 @@ DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);
*
* @note To drain buffered frames from the decoder (i.e. on end of stream),
* call this function until it returns -EAGAIN.
*
* @code{.c}
* Dav1dData data = { 0 };
* Dav1dPicture p = { 0 };
* int res;
*
* read_data(&data);
* do {
* res = dav1d_send_data(c, &data);
* // Keep going even if the function can't consume the current data
* //   packet. It eventually will after one or more frames have been
* //   returned in this loop.
* if (res < 0 && res != -EAGAIN)
* free_and_abort();
* res = dav1d_get_picture(c, &p);
* if (res < 0) {
* if (res != -EAGAIN)
* free_and_abort();
* } else
* output_and_unref_picture(&p);
* // Stay in the loop as long as there's data to consume.
* } while (data.sz || read_data(&data) == SUCCESS);
*
* // Handle EOS by draining all buffered frames.
* do {
* res = dav1d_get_picture(c, &p);
* if (res < 0) {
* if (res != -EAGAIN)
* free_and_abort();
* } else
* output_and_unref_picture(&p);
* } while (res == 0);
* @endcode
*/
DAV1D_API int dav1d_get_picture(Dav1dContext *c, Dav1dPicture *out);
@ -117,9 +172,14 @@ DAV1D_API int dav1d_get_picture(Dav1dContext *c, Dav1dPicture *out);
DAV1D_API void dav1d_close(Dav1dContext **c_out);
/**
* Flush all delayed frames in decoder, to be used when seeking.
* Flush all delayed frames in decoder and clear internal decoder state,
* to be used when seeking.
*
* @param c Input decoder instance.
*
* @note Decoding will start only after a valid sequence header OBU is
* delivered to dav1d_send_data().
*
*/
DAV1D_API void dav1d_flush(Dav1dContext *c);
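
A seek, in sketch form (not from the patch): drop everything buffered in the decoder, then resume feeding packets from a keyframe. `pending` is assumed to be the Dav1dData the caller was in the middle of sending; seek_to_keyframe() is a hypothetical demuxer-side helper.

dav1d_flush(c);                         // discard delayed frames and internal state
dav1d_data_unref(&pending);             // drop any packet we were still trying to send
seek_to_keyframe(demuxer, target_pts);  // hypothetical demuxer-side helper
// then continue with the dav1d_send_data()/dav1d_get_picture() loop as above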

@ -0,0 +1,385 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DAV1D_HEADERS_H__
#define __DAV1D_HEADERS_H__
// Constants from Section 3. "Symbols and abbreviated terms"
#define DAV1D_MAX_CDEF_STRENGTHS 8
#define DAV1D_MAX_OPERATING_POINTS 32
#define DAV1D_MAX_TILE_COLS 64
#define DAV1D_MAX_TILE_ROWS 64
#define DAV1D_MAX_SEGMENTS 8
#define DAV1D_NUM_REF_FRAMES 8
#define DAV1D_PRIMARY_REF_NONE 7
#define DAV1D_REFS_PER_FRAME 7
#define DAV1D_TOTAL_REFS_PER_FRAME (DAV1D_REFS_PER_FRAME + 1)
enum Dav1dTxfmMode {
DAV1D_TX_4X4_ONLY,
DAV1D_TX_LARGEST,
DAV1D_TX_SWITCHABLE,
DAV1D_N_TX_MODES,
};
enum Dav1dFilterMode {
DAV1D_FILTER_8TAP_REGULAR,
DAV1D_FILTER_8TAP_SMOOTH,
DAV1D_FILTER_8TAP_SHARP,
DAV1D_N_SWITCHABLE_FILTERS,
DAV1D_FILTER_BILINEAR = DAV1D_N_SWITCHABLE_FILTERS,
DAV1D_N_FILTERS,
DAV1D_FILTER_SWITCHABLE = DAV1D_N_FILTERS,
};
enum Dav1dAdaptiveBoolean {
DAV1D_OFF = 0,
DAV1D_ON = 1,
DAV1D_ADAPTIVE = 2,
};
enum Dav1dRestorationType {
DAV1D_RESTORATION_NONE,
DAV1D_RESTORATION_SWITCHABLE,
DAV1D_RESTORATION_WIENER,
DAV1D_RESTORATION_SGRPROJ,
};
enum Dav1dWarpedMotionType {
DAV1D_WM_TYPE_IDENTITY,
DAV1D_WM_TYPE_TRANSLATION,
DAV1D_WM_TYPE_ROT_ZOOM,
DAV1D_WM_TYPE_AFFINE,
};
typedef struct Dav1dWarpedMotionParams {
enum Dav1dWarpedMotionType type;
int32_t matrix[6];
union {
struct {
int16_t alpha, beta, gamma, delta;
};
int16_t abcd[4];
};
} Dav1dWarpedMotionParams;
enum Dav1dPixelLayout {
DAV1D_PIXEL_LAYOUT_I400, ///< monochrome
DAV1D_PIXEL_LAYOUT_I420, ///< 4:2:0 planar
DAV1D_PIXEL_LAYOUT_I422, ///< 4:2:2 planar
DAV1D_PIXEL_LAYOUT_I444, ///< 4:4:4 planar
};
enum Dav1dFrameType {
DAV1D_FRAME_TYPE_KEY = 0, ///< Key Intra frame
DAV1D_FRAME_TYPE_INTER = 1, ///< Inter frame
DAV1D_FRAME_TYPE_INTRA = 2, ///< Non key Intra frame
DAV1D_FRAME_TYPE_SWITCH = 3, ///< Switch Inter frame
};
enum Dav1dColorPrimaries {
DAV1D_COLOR_PRI_BT709 = 1,
DAV1D_COLOR_PRI_UNKNOWN = 2,
DAV1D_COLOR_PRI_BT470M = 4,
DAV1D_COLOR_PRI_BT470BG = 5,
DAV1D_COLOR_PRI_BT601 = 6,
DAV1D_COLOR_PRI_SMPTE240 = 7,
DAV1D_COLOR_PRI_FILM = 8,
DAV1D_COLOR_PRI_BT2020 = 9,
DAV1D_COLOR_PRI_XYZ = 10,
DAV1D_COLOR_PRI_SMPTE431 = 11,
DAV1D_COLOR_PRI_SMPTE432 = 12,
DAV1D_COLOR_PRI_EBU3213 = 22,
};
enum Dav1dTransferCharacteristics {
DAV1D_TRC_BT709 = 1,
DAV1D_TRC_UNKNOWN = 2,
DAV1D_TRC_BT470M = 4,
DAV1D_TRC_BT470BG = 5,
DAV1D_TRC_BT601 = 6,
DAV1D_TRC_SMPTE240 = 7,
DAV1D_TRC_LINEAR = 8,
DAV1D_TRC_LOG100 = 9, ///< logarithmic (100:1 range)
DAV1D_TRC_LOG100_SQRT10 = 10, ///< logarithmic (100*sqrt(10):1 range)
DAV1D_TRC_IEC61966 = 11,
DAV1D_TRC_BT1361 = 12,
DAV1D_TRC_SRGB = 13,
DAV1D_TRC_BT2020_10BIT = 14,
DAV1D_TRC_BT2020_12BIT = 15,
DAV1D_TRC_SMPTE2084 = 16, ///< PQ
DAV1D_TRC_SMPTE428 = 17,
DAV1D_TRC_HLG = 18, ///< hybrid log/gamma (BT.2100 / ARIB STD-B67)
};
enum Dav1dMatrixCoefficients {
DAV1D_MC_IDENTITY = 0,
DAV1D_MC_BT709 = 1,
DAV1D_MC_UNKNOWN = 2,
DAV1D_MC_FCC = 4,
DAV1D_MC_BT470BG = 5,
DAV1D_MC_BT601 = 6,
DAV1D_MC_SMPTE240 = 7,
DAV1D_MC_SMPTE_YCGCO = 8,
DAV1D_MC_BT2020_NCL = 9,
DAV1D_MC_BT2020_CL = 10,
DAV1D_MC_SMPTE2085 = 11,
DAV1D_MC_CHROMAT_NCL = 12, ///< Chromaticity-derived
DAV1D_MC_CHROMAT_CL = 13,
DAV1D_MC_ICTCP = 14,
};
enum Dav1dChromaSamplePosition {
DAV1D_CHR_UNKNOWN = 0,
DAV1D_CHR_VERTICAL = 1, ///< Horizontally co-located with luma(0, 0)
///< sample, between two vertical samples
DAV1D_CHR_COLOCATED = 2, ///< Co-located with luma(0, 0) sample
};
typedef struct Dav1dSequenceHeader {
/**
* Stream profile, 0 for 8-10 bits/component 4:2:0 or monochrome;
* 1 for 8-10 bits/component 4:4:4; 2 for 4:2:2 at any bits/component,
* or 12 bits/component at any chroma subsampling.
*/
int profile;
/**
* Maximum dimensions for this stream. In non-scalable streams, these
* are often the actual dimensions of the stream, although that is not
* a normative requirement.
*/
int max_width, max_height;
enum Dav1dPixelLayout layout; ///< format of the picture
enum Dav1dColorPrimaries pri; ///< color primaries (av1)
enum Dav1dTransferCharacteristics trc; ///< transfer characteristics (av1)
enum Dav1dMatrixCoefficients mtrx; ///< matrix coefficients (av1)
enum Dav1dChromaSamplePosition chr; ///< chroma sample position (av1)
/**
* Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of
* MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma).
*/
int color_range;
int num_operating_points;
struct Dav1dSequenceHeaderOperatingPoint {
int major_level, minor_level;
int initial_display_delay;
int idc;
int tier;
int decoder_model_param_present;
int decoder_buffer_delay;
int encoder_buffer_delay;
int low_delay_mode;
int display_model_param_present;
} operating_points[DAV1D_MAX_OPERATING_POINTS];
int still_picture;
int reduced_still_picture_header;
int timing_info_present;
int num_units_in_tick;
int time_scale;
int equal_picture_interval;
int num_ticks_per_picture;
int decoder_model_info_present;
int encoder_decoder_buffer_delay_length;
int num_units_in_decoding_tick;
int buffer_removal_delay_length;
int frame_presentation_delay_length;
int display_model_info_present;
int width_n_bits, height_n_bits;
int frame_id_numbers_present;
int delta_frame_id_n_bits;
int frame_id_n_bits;
int sb128;
int filter_intra;
int intra_edge_filter;
int inter_intra;
int masked_compound;
int warped_motion;
int dual_filter;
int order_hint;
int jnt_comp;
int ref_frame_mvs;
enum Dav1dAdaptiveBoolean screen_content_tools;
enum Dav1dAdaptiveBoolean force_integer_mv;
int order_hint_n_bits;
int super_res;
int cdef;
int restoration;
/**
* 0, 1 and 2 mean 8, 10 or 12 bits/component, respectively. This is not
* exactly the same as 'hbd' from the spec; the spec's hbd distinguishes
* between 8 (0) and 10-12 (1) bits/component, and another element
* (twelve_bit) to distinguish between 10 and 12 bits/component. To get
* the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2.
*/
int hbd;
int ss_hor, ss_ver, monochrome;
int color_description_present;
int separate_uv_delta_q;
int film_grain_present;
} Dav1dSequenceHeader;
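
A small sketch (not from the patch) restating the hbd mapping described above as code, with seq_hdr pointing at a Dav1dSequenceHeader:

const int bitdepth   = 8 + 2 * seq_hdr->hbd;  // 8, 10 or 12 bits/component
const int spec_hbd   = !!seq_hdr->hbd;        // the spec's high_bitdepth flag
const int twelve_bit = seq_hdr->hbd == 2;     // the spec's twelve_bit flag
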
typedef struct Dav1dSegmentationData {
int delta_q;
int delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
int ref;
int skip;
int globalmv;
} Dav1dSegmentationData;
typedef struct Dav1dSegmentationDataSet {
Dav1dSegmentationData d[DAV1D_MAX_SEGMENTS];
int preskip;
int last_active_segid;
} Dav1dSegmentationDataSet;
typedef struct Dav1dLoopfilterModeRefDeltas {
int mode_delta[2 /* is_zeromv */];
int ref_delta[DAV1D_TOTAL_REFS_PER_FRAME];
} Dav1dLoopfilterModeRefDeltas;
typedef struct Dav1dFilmGrainData {
uint16_t seed;
int num_y_points;
uint8_t y_points[14][2 /* value, scaling */];
int chroma_scaling_from_luma;
int num_uv_points[2];
uint8_t uv_points[2][10][2 /* value, scaling */];
int scaling_shift;
int ar_coeff_lag;
int8_t ar_coeffs_y[24];
int8_t ar_coeffs_uv[2][25];
int ar_coeff_shift;
int grain_scale_shift;
int uv_mult[2];
int uv_luma_mult[2];
int uv_offset[2];
int overlap_flag;
int clip_to_restricted_range;
} Dav1dFilmGrainData;
typedef struct Dav1dFrameHeader {
enum Dav1dFrameType frame_type; ///< type of the picture
int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
int frame_offset; ///< frame number
struct {
int present, update;
Dav1dFilmGrainData data;
} film_grain; ///< film grain parameters
int temporal_id, spatial_id; ///< spatial and temporal id of the frame for SVC
int show_existing_frame;
int existing_frame_idx;
int frame_id;
int frame_presentation_delay;
int show_frame;
int showable_frame;
int error_resilient_mode;
int disable_cdf_update;
int allow_screen_content_tools;
int force_integer_mv;
int frame_size_override;
int primary_ref_frame;
int buffer_removal_time_present;
struct Dav1dFrameHeaderOperatingPoint {
int buffer_removal_time;
} operating_points[DAV1D_MAX_OPERATING_POINTS];
int refresh_frame_flags;
int render_width, render_height;
struct {
int width_scale_denominator;
int enabled;
} super_res;
int have_render_size;
int allow_intrabc;
int frame_ref_short_signaling;
int refidx[DAV1D_REFS_PER_FRAME];
int hp;
enum Dav1dFilterMode subpel_filter_mode;
int switchable_motion_mode;
int use_ref_frame_mvs;
int refresh_context;
struct {
int uniform;
unsigned n_bytes;
int min_log2_cols, max_log2_cols, log2_cols, cols;
int min_log2_rows, max_log2_rows, log2_rows, rows;
uint16_t col_start_sb[DAV1D_MAX_TILE_COLS + 1];
uint16_t row_start_sb[DAV1D_MAX_TILE_ROWS + 1];
int update;
} tiling;
struct {
int yac;
int ydc_delta;
int udc_delta, uac_delta, vdc_delta, vac_delta;
int qm, qm_y, qm_u, qm_v;
} quant;
struct {
int enabled, update_map, temporal, update_data;
Dav1dSegmentationDataSet seg_data;
int lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS];
} segmentation;
struct {
struct {
int present;
int res_log2;
} q;
struct {
int present;
int res_log2;
int multi;
} lf;
} delta;
int all_lossless;
struct {
int level_y[2 /* dir */];
int level_u, level_v;
int mode_ref_delta_enabled;
int mode_ref_delta_update;
Dav1dLoopfilterModeRefDeltas mode_ref_deltas;
int sharpness;
} loopfilter;
struct {
int damping;
int n_bits;
int y_strength[DAV1D_MAX_CDEF_STRENGTHS];
int uv_strength[DAV1D_MAX_CDEF_STRENGTHS];
} cdef;
struct {
enum Dav1dRestorationType type[3 /* plane */];
int unit_size[2 /* y, uv */];
} restoration;
enum Dav1dTxfmMode txfm_mode;
int switchable_comp_refs;
int skip_mode_allowed, skip_mode_enabled, skip_mode_refs[2];
int warp_motion;
int reduced_txtp_set;
Dav1dWarpedMotionParams gmv[DAV1D_REFS_PER_FRAME];
} Dav1dFrameHeader;
#endif /* __DAV1D_HEADERS_H__ */

@ -32,100 +32,18 @@
#include <stdint.h>
#include "common.h"
enum Dav1dPixelLayout {
DAV1D_PIXEL_LAYOUT_I400, ///< monochrome
DAV1D_PIXEL_LAYOUT_I420, ///< 4:2:0 planar
DAV1D_PIXEL_LAYOUT_I422, ///< 4:2:2 planar
DAV1D_PIXEL_LAYOUT_I444, ///< 4:4:4 planar
};
enum Dav1dFrameType {
DAV1D_FRAME_TYPE_KEY = 0, ///< Key Intra frame
DAV1D_FRAME_TYPE_INTER = 1, ///< Inter frame
DAV1D_FRAME_TYPE_INTRA = 2, ///< Non key Intra frame
DAV1D_FRAME_TYPE_SWITCH = 3, ///< Switch Inter frame
};
enum Dav1dColorPrimaries {
DAV1D_COLOR_PRI_BT709 = 1,
DAV1D_COLOR_PRI_UNKNOWN = 2,
DAV1D_COLOR_PRI_BT470M = 4,
DAV1D_COLOR_PRI_BT470BG = 5,
DAV1D_COLOR_PRI_BT601 = 6,
DAV1D_COLOR_PRI_SMPTE240 = 7,
DAV1D_COLOR_PRI_FILM = 8,
DAV1D_COLOR_PRI_BT2020 = 9,
DAV1D_COLOR_PRI_XYZ = 10,
DAV1D_COLOR_PRI_SMPTE431 = 11,
DAV1D_COLOR_PRI_SMPTE432 = 12,
DAV1D_COLOR_PRI_EBU3213 = 22,
};
enum Dav1dTransferCharacteristics {
DAV1D_TRC_BT709 = 1,
DAV1D_TRC_UNKNOWN = 2,
DAV1D_TRC_BT470M = 4,
DAV1D_TRC_BT470BG = 5,
DAV1D_TRC_BT601 = 6,
DAV1D_TRC_SMPTE240 = 7,
DAV1D_TRC_LINEAR = 8,
DAV1D_TRC_LOG100 = 9, ///< logarithmic (100:1 range)
DAV1D_TRC_LOG100_SQRT10 = 10, ///< lograithmic (100*sqrt(10):1 range)
DAV1D_TRC_IEC61966 = 11,
DAV1D_TRC_BT1361 = 12,
DAV1D_TRC_SRGB = 13,
DAV1D_TRC_BT2020_10BIT = 14,
DAV1D_TRC_BT2020_12BIT = 15,
DAV1D_TRC_SMPTE2084 = 16, ///< PQ
DAV1D_TRC_SMPTE428 = 17,
DAV1D_TRC_HLG = 18, ///< hybrid log/gamma (BT.2100 / ARIB STD-B67)
};
enum Dav1dMatrixCoefficients {
DAV1D_MC_IDENTITY = 0,
DAV1D_MC_BT709 = 1,
DAV1D_MC_UNKNOWN = 2,
DAV1D_MC_FCC = 4,
DAV1D_MC_BT470BG = 5,
DAV1D_MC_BT601 = 6,
DAV1D_MC_SMPTE240 = 7,
DAV1D_MC_SMPTE_YCGCO = 8,
DAV1D_MC_BT2020_NCL = 9,
DAV1D_MC_BT2020_CL = 10,
DAV1D_MC_SMPTE2085 = 11,
DAV1D_MC_CHROMAT_NCL = 12, ///< Chromaticity-derived
DAV1D_MC_CHROMAT_CL = 13,
DAV1D_MC_ICTCP = 14,
};
enum Dav1dChromaSamplePosition {
DAV1D_CHR_UNKNOWN = 0,
DAV1D_CHR_VERTICAL = 1, ///< Horizontally co-located with luma(0, 0)
///< sample, between two vertical samples
DAV1D_CHR_COLOCATED = 2, ///< Co-located with luma(0, 0) sample
};
#include "headers.h"
typedef struct Dav1dPictureParameters {
int w; ///< width (in pixels)
int h; ///< height (in pixels)
enum Dav1dPixelLayout layout; ///< format of the picture
enum Dav1dFrameType type; ///< type of the picture
int bpc; ///< bits per pixel component (8 or 10)
enum Dav1dColorPrimaries pri; ///< color primaries (av1)
enum Dav1dTransferCharacteristics trc; ///< transfer characteristics (av1)
enum Dav1dMatrixCoefficients mtrx; ///< matrix coefficients (av1)
enum Dav1dChromaSamplePosition chr; ///< chroma sample position (av1)
/**
* Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of
* MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma).
*/
int fullrange;
} Dav1dPictureParameters;
typedef struct Dav1dPicture {
int poc; ///< frame number
Dav1dSequenceHeader *seq_hdr;
Dav1dFrameHeader *frame_hdr;
/**
* Pointers to planar image data (Y is [0], U is [1], V is [2]). The data
@ -135,7 +53,6 @@ typedef struct Dav1dPicture {
* zero'ed out.
*/
void *data[3];
struct Dav1dRef *ref; ///< allocation origin
/**
* Number of bytes between 2 lines in data[] for luma [0] or chroma [1].
@ -143,6 +60,8 @@ typedef struct Dav1dPicture {
ptrdiff_t stride[2];
Dav1dPictureParameters p;
Dav1dDataProps m;
struct Dav1dRef *frame_hdr_ref, *seq_hdr_ref, *ref; ///< allocation origins
void *allocator_data; ///< pointer managed by the allocator
} Dav1dPicture;
@ -152,7 +71,7 @@ typedef struct Dav1dPicAllocator {
/**
* Allocate the picture buffer based on the Dav1dPictureParameters.
*
* The data[0], data[1] and data[2] must be 32 bits aligned and with a
* The data[0], data[1] and data[2] must be 32 byte aligned and with a
* pixel width/height multiple of 128 pixels.
* data[1] and data[2] must share the same stride[1].
*
@ -170,14 +89,10 @@ typedef struct Dav1dPicAllocator {
/**
* Release the picture buffer.
*
* @param buf The buffer that was returned by
* alloc_picture_callback().
* @param allocator_tag The Dav1dPicture.allocator_data that was filled by
* alloc_picture_callback()
* @param cookie Custom pointer passed to all calls.
* @param pic The picture that was filled by alloc_picture_callback().
* @param cookie Custom pointer passed to all calls.
*/
void (*release_picture_callback)(uint8_t *buf, void *allocator_data,
void *cookie);
void (*release_picture_callback)(Dav1dPicture *pic, void *cookie);
} Dav1dPicAllocator;
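
For illustration, a minimal malloc-based allocator sketch (not from this patch). It assumes the alloc callback mirrors the (Dav1dPicture *, void *) shape of the release callback shown above, and it follows the documented constraints: 32-byte aligned planes, dimensions padded to a multiple of 128 pixels, and a shared chroma stride.

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include "dav1d/picture.h"

static int my_alloc_picture(Dav1dPicture *p, void *cookie) {
    (void) cookie;
    const int hbd        = p->p.bpc > 8;            // 2 bytes per pixel above 8 bpc
    const int aligned_w  = (p->p.w + 127) & ~127;   // pad to a multiple of 128
    const int aligned_h  = (p->p.h + 127) & ~127;
    const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
    const int ss_ver     = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor     = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;

    p->stride[0] = (ptrdiff_t) aligned_w << hbd;
    p->stride[1] = has_chroma ? (ptrdiff_t) (aligned_w >> ss_hor) << hbd : 0;
    const size_t y_sz  = (size_t) p->stride[0] * aligned_h;
    const size_t uv_sz = (size_t) p->stride[1] * (aligned_h >> ss_ver);

    uint8_t *buf = aligned_alloc(32, y_sz + 2 * uv_sz);  // 32-byte aligned planes
    if (!buf) return -ENOMEM;
    p->data[0] = buf;
    p->data[1] = has_chroma ? buf + y_sz         : NULL;
    p->data[2] = has_chroma ? buf + y_sz + uv_sz : NULL;
    p->allocator_data = buf;   // handed back to the release callback below
    return 0;
}

static void my_release_picture(Dav1dPicture *p, void *cookie) {
    (void) cookie;
    free(p->allocator_data);
}
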
/**

@ -324,6 +324,8 @@ endif
subdir('include')
subdir('doc')
subdir('src')
subdir('tools')

@ -0,0 +1,627 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_neon, export=1
mov w8, w5
ld1 {v0.8h}, [x4]
mov w9, #(1 << 14) - (1 << 2)
dup v30.8h, w9
movi v31.8h, #8, lsl #8
// Calculate mid_stride
add w10, w5, #7
bic w10, w10, #7
lsl w10, w10, #1
// Clear the last unused element of v0, to allow filtering a single
// pixel with one plain mul+addv.
ins v0.h[7], wzr
// Set up pointers for reading/writing alternate rows
add x12, x0, x10
lsl w10, w10, #1
add x13, x2, x3
lsl x3, x3, #1
// Subtract the width from mid_stride
sub x10, x10, w5, uxtw #1
// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
cmp w5, #8
add w11, w5, #13
bic w11, w11, #7
b.ge 1f
mov w11, #16
1:
sub x3, x3, w11, uxtw
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 2f
// LR_HAVE_LEFT
cbnz x1, 0f
// left == NULL
sub x2, x2, #3
sub x13, x13, #3
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 3 pixels from the src pointer,
// but shift it as if we had done that.
add x3, x3, #3
1: // Loop vertically
ld1 {v3.16b}, [x2], #16
ld1 {v5.16b}, [x13], #16
tst w7, #1 // LR_HAVE_LEFT
b.eq 0f
cbz x1, 2f
// LR_HAVE_LEFT, left != NULL
ld1 {v2.s}[3], [x1], #4
// Move x2/x13 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub x2, x2, #3
sub x13, x13, #3
ld1 {v4.s}[3], [x1], #4
ext v3.16b, v2.16b, v3.16b, #13
ext v5.16b, v4.16b, v5.16b, #13
b 2f
0:
// !LR_HAVE_LEFT, fill v2 with the leftmost byte
// and shift v3 to have 3x the first byte at the front.
dup v2.16b, v3.b[0]
dup v4.16b, v5.b[0]
// Move x2 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub x2, x2, #3
sub x13, x13, #3
ext v3.16b, v2.16b, v3.16b, #13
ext v5.16b, v4.16b, v5.16b, #13
2:
uxtl v2.8h, v3.8b
uxtl2 v3.8h, v3.16b
uxtl v4.8h, v5.8b
uxtl2 v5.8h, v5.16b
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub w9, w5, #14
ldr b28, [x2, w9, sxtw]
ldr b29, [x13, w9, sxtw]
// Fill v28/v29 with the right padding pixel
dup v28.8b, v28.b[0]
dup v29.8b, v29.b[0]
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
cmp w5, #11
b.ge 4f // If w >= 11, all used input pixels are valid
cmp w5, #7
b.ge 5f // If w >= 7, we can filter 4 pixels
b 6f
4: // Loop horizontally
.macro filter wd
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
ext v16.16b, v2.16b, v3.16b, #2
ext v17.16b, v2.16b, v3.16b, #4
ext v18.16b, v2.16b, v3.16b, #6
ext v19.16b, v2.16b, v3.16b, #8
ext v20.16b, v2.16b, v3.16b, #10
ext v21.16b, v2.16b, v3.16b, #12
mul v6\wd, v2\wd, v0.h[0]
mla v6\wd, v16\wd, v0.h[1]
mla v6\wd, v17\wd, v0.h[2]
mla v6\wd, v18\wd, v0.h[3]
mla v6\wd, v19\wd, v0.h[4]
mla v6\wd, v20\wd, v0.h[5]
mla v6\wd, v21\wd, v0.h[6]
ext v22.16b, v4.16b, v5.16b, #2
ext v23.16b, v4.16b, v5.16b, #4
ext v24.16b, v4.16b, v5.16b, #6
ext v25.16b, v4.16b, v5.16b, #8
ext v26.16b, v4.16b, v5.16b, #10
ext v27.16b, v4.16b, v5.16b, #12
mul v7\wd, v4\wd, v0.h[0]
mla v7\wd, v22\wd, v0.h[1]
mla v7\wd, v23\wd, v0.h[2]
mla v7\wd, v24\wd, v0.h[3]
mla v7\wd, v25\wd, v0.h[4]
mla v7\wd, v26\wd, v0.h[5]
mla v7\wd, v27\wd, v0.h[6]
shl v18\wd, v18\wd, #7
shl v24\wd, v24\wd, #7
sub v18\wd, v18\wd, v30\wd
sub v24\wd, v24\wd, v30\wd
sqadd v6\wd, v6\wd, v18\wd
sqadd v7\wd, v7\wd, v24\wd
sshr v6\wd, v6\wd, #3
sshr v7\wd, v7\wd, #3
add v6\wd, v6\wd, v31\wd
add v7\wd, v7\wd, v31\wd
.endm
filter .8h
st1 {v6.8h}, [x0], #16
st1 {v7.8h}, [x12], #16
subs w5, w5, #8
b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
mov v2.16b, v3.16b
mov v4.16b, v5.16b
ld1 {v3.8b}, [x2], #8
ld1 {v5.8b}, [x13], #8
uxtl v3.8h, v3.8b
uxtl v5.8h, v5.8b
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
5: // Filter 4 pixels, 7 <= w < 11
filter .4h
st1 {v6.4h}, [x0], #8
st1 {v7.4h}, [x12], #8
subs w5, w5, #4 // 3 <= w < 7
ext v2.16b, v2.16b, v3.16b, #8
ext v3.16b, v3.16b, v3.16b, #8
ext v4.16b, v4.16b, v5.16b, #8
ext v5.16b, v5.16b, v5.16b, #8
6: // Pad the right edge and filter the last few pixels.
// w < 7, w+3 pixels valid in v2-v3
cmp w5, #5
b.lt 7f
b.gt 8f
// w == 5, 8 pixels valid in v2, v3 invalid
mov v3.16b, v28.16b
mov v5.16b, v29.16b
b 88f
7: // 1 <= w < 5, 4-7 pixels valid in v2
sub w9, w5, #1
// w9 = (pixels valid - 4)
adr x11, L(variable_shift_tbl)
ldrh w9, [x11, w9, uxtw #1]
sub x11, x11, w9, uxth
mov v3.16b, v28.16b
mov v5.16b, v29.16b
br x11
// Shift v2 right, shifting out invalid pixels,
// shift v2 left to the original offset, shifting in padding pixels.
44: // 4 pixels valid
ext v2.16b, v2.16b, v2.16b, #8
ext v2.16b, v2.16b, v3.16b, #8
ext v4.16b, v4.16b, v4.16b, #8
ext v4.16b, v4.16b, v5.16b, #8
b 88f
55: // 5 pixels valid
ext v2.16b, v2.16b, v2.16b, #10
ext v2.16b, v2.16b, v3.16b, #6
ext v4.16b, v4.16b, v4.16b, #10
ext v4.16b, v4.16b, v5.16b, #6
b 88f
66: // 6 pixels valid
ext v2.16b, v2.16b, v2.16b, #12
ext v2.16b, v2.16b, v3.16b, #4
ext v4.16b, v4.16b, v4.16b, #12
ext v4.16b, v4.16b, v5.16b, #4
b 88f
77: // 7 pixels valid
ext v2.16b, v2.16b, v2.16b, #14
ext v2.16b, v2.16b, v3.16b, #2
ext v4.16b, v4.16b, v4.16b, #14
ext v4.16b, v4.16b, v5.16b, #2
b 88f
L(variable_shift_tbl):
.hword L(variable_shift_tbl) - 44b
.hword L(variable_shift_tbl) - 55b
.hword L(variable_shift_tbl) - 66b
.hword L(variable_shift_tbl) - 77b
8: // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3
ins v28.h[0], v3.h[0]
ins v29.h[0], v5.h[0]
mov v3.16b, v28.16b
mov v5.16b, v29.16b
88:
// w < 7, v2-v3 padded properly
cmp w5, #4
b.lt 888f
// w >= 4, filter 4 pixels
filter .4h
st1 {v6.4h}, [x0], #8
st1 {v7.4h}, [x12], #8
subs w5, w5, #4 // 0 <= w < 4
ext v2.16b, v2.16b, v3.16b, #8
ext v4.16b, v4.16b, v5.16b, #8
b.eq 9f
888: // 1 <= w < 4, filter 1 pixel at a time
mul v6.8h, v2.8h, v0.8h
mul v7.8h, v4.8h, v0.8h
addv h6, v6.8h
addv h7, v7.8h
dup v16.4h, v2.h[3]
dup v17.4h, v4.h[3]
shl v16.4h, v16.4h, #7
shl v17.4h, v17.4h, #7
sub v16.4h, v16.4h, v30.4h
sub v17.4h, v17.4h, v30.4h
sqadd v6.4h, v6.4h, v16.4h
sqadd v7.4h, v7.4h, v17.4h
sshr v6.4h, v6.4h, #3
sshr v7.4h, v7.4h, #3
add v6.4h, v6.4h, v31.4h
add v7.4h, v7.4h, v31.4h
st1 {v6.h}[0], [x0], #2
st1 {v7.h}[0], [x12], #2
subs w5, w5, #1
ext v2.16b, v2.16b, v3.16b, #2
ext v4.16b, v4.16b, v5.16b, #2
b.gt 888b
9:
subs w6, w6, #2
b.le 0f
// Jump to the next row and loop horizontally
add x0, x0, x10
add x12, x12, x10
add x2, x2, x3
add x13, x13, x3
mov w5, w8
b 1b
0:
ret
.purgem filter
endfunc
// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_neon, export=1
mov w8, w4
ld1 {v0.8h}, [x5]
mov w9, #128
dup v1.8h, w9
add v1.8h, v1.8h, v0.8h
// Calculate the number of rows to move back when looping vertically
mov w11, w4
tst w6, #4 // LR_HAVE_TOP
b.eq 0f
sub x2, x2, x7, lsl #1
add w11, w11, #2
0:
tst w6, #8 // LR_HAVE_BOTTOM
b.eq 1f
add w11, w11, #2
1: // Start of horizontal loop; start one vertical filter slice.
// Load rows into v16-v19 and pad properly.
tst w6, #4 // LR_HAVE_TOP
ld1 {v16.8h}, [x2], x7
b.eq 2f
// LR_HAVE_TOP
ld1 {v18.8h}, [x2], x7
mov v17.16b, v16.16b
ld1 {v19.8h}, [x2], x7
b 3f
2: // !LR_HAVE_TOP
mov v17.16b, v16.16b
mov v18.16b, v16.16b
mov v19.16b, v16.16b
3:
cmp w4, #4
b.lt 5f
// Start filtering normally; fill in v20-v22 with unique rows.
ld1 {v20.8h}, [x2], x7
ld1 {v21.8h}, [x2], x7
ld1 {v22.8h}, [x2], x7
4:
.macro filter compare
subs w4, w4, #1
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
smull v2.4s, v16.4h, v0.h[0]
smlal v2.4s, v17.4h, v0.h[1]
smlal v2.4s, v18.4h, v0.h[2]
smlal v2.4s, v19.4h, v1.h[3]
smlal v2.4s, v20.4h, v0.h[4]
smlal v2.4s, v21.4h, v0.h[5]
smlal v2.4s, v22.4h, v0.h[6]
smull2 v3.4s, v16.8h, v0.h[0]
smlal2 v3.4s, v17.8h, v0.h[1]
smlal2 v3.4s, v18.8h, v0.h[2]
smlal2 v3.4s, v19.8h, v1.h[3]
smlal2 v3.4s, v20.8h, v0.h[4]
smlal2 v3.4s, v21.8h, v0.h[5]
smlal2 v3.4s, v22.8h, v0.h[6]
sqrshrun v2.4h, v2.4s, #11
sqrshrun2 v2.8h, v3.4s, #11
sqxtun v2.8b, v2.8h
st1 {v2.8b}, [x0], x1
.if \compare
cmp w4, #4
.else
b.le 9f
.endif
mov v16.16b, v17.16b
mov v17.16b, v18.16b
mov v18.16b, v19.16b
mov v19.16b, v20.16b
mov v20.16b, v21.16b
mov v21.16b, v22.16b
.endm
filter 1
b.lt 7f
ld1 {v22.8h}, [x2], x7
b 4b
5: // Less than 4 rows in total; not all of v20-v21 are filled yet.
tst w6, #8 // LR_HAVE_BOTTOM
b.eq 6f
// LR_HAVE_BOTTOM
cmp w4, #2
// We load at least 2 rows in all cases.
ld1 {v20.8h}, [x2], x7
ld1 {v21.8h}, [x2], x7
b.gt 53f // 3 rows in total
b.eq 52f // 2 rows in total
51: // 1 row in total, v19 already loaded, load edge into v20-v22.
mov v22.16b, v21.16b
b 8f
52: // 2 rows in total, v19 already loaded, load v20 with content data
// and 2 rows of edge.
ld1 {v22.8h}, [x2], x7
mov v23.16b, v22.16b
b 8f
53:
// 3 rows in total, v19 already loaded, load v20 and v21 with content
// and 2 rows of edge.
ld1 {v22.8h}, [x2], x7
ld1 {v23.8h}, [x2], x7
mov v24.16b, v23.16b
b 8f
6:
// !LR_HAVE_BOTTOM
cmp w4, #2
b.gt 63f // 3 rows in total
b.eq 62f // 2 rows in total
61: // 1 row in total, v19 already loaded, pad that into v20-v22.
mov v20.16b, v19.16b
mov v21.16b, v19.16b
mov v22.16b, v19.16b
b 8f
62: // 2 rows in total, v19 already loaded, load v20 and pad that into v20-v23.
ld1 {v20.8h}, [x2], x7
mov v21.16b, v20.16b
mov v22.16b, v20.16b
mov v23.16b, v20.16b
b 8f
63:
// 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24.
ld1 {v20.8h}, [x2], x7
ld1 {v21.8h}, [x2], x7
mov v22.16b, v21.16b
mov v23.16b, v21.16b
mov v24.16b, v21.16b
b 8f
7:
// All registers up to v21 are filled already, 3 valid rows left.
// < 4 valid rows left; fill in padding and filter the last
// few rows.
tst w6, #8 // LR_HAVE_BOTTOM
b.eq 71f
// LR_HAVE_BOTTOM; load 2 rows of edge.
ld1 {v22.8h}, [x2], x7
ld1 {v23.8h}, [x2], x7
mov v24.16b, v23.16b
b 8f
71:
// !LR_HAVE_BOTTOM, pad 3 rows
mov v22.16b, v21.16b
mov v23.16b, v21.16b
mov v24.16b, v21.16b
8: // At this point, all registers up to v22-v24 are loaded with
// edge/padding (depending on how many rows are left).
filter 0 // This branches to 9f when done
mov v22.16b, v23.16b
mov v23.16b, v24.16b
b 8b
9: // End of one vertical slice.
subs w3, w3, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
msub x0, x1, x8, x0
msub x2, x7, x11, x2
add x0, x0, #8
add x2, x2, #16
mov w4, w8
b 1b
0:
ret
.purgem filter
endfunc
// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_neon, export=1
adr x5, L(copy_narrow_tbl)
ldrh w6, [x5, w3, uxtw #1]
sub x5, x5, w6, uxth
br x5
10:
add x7, x0, x1
lsl x1, x1, #1
18:
cmp w4, #8
b.lt 110f
subs w4, w4, #8
ld1 {v0.8b}, [x2], #8
st1 {v0.b}[0], [x0], x1
st1 {v0.b}[1], [x7], x1
st1 {v0.b}[2], [x0], x1
st1 {v0.b}[3], [x7], x1
st1 {v0.b}[4], [x0], x1
st1 {v0.b}[5], [x7], x1
st1 {v0.b}[6], [x0], x1
st1 {v0.b}[7], [x7], x1
b.le 0f
b 18b
110:
asr x1, x1, #1
11:
subs w4, w4, #1
ld1 {v0.b}[0], [x2], #1
st1 {v0.b}[0], [x0], x1
b.ge 11b
0:
ret
20:
add x7, x0, x1
lsl x1, x1, #1
24:
cmp w4, #4
b.lt 210f
subs w4, w4, #4
ld1 {v0.4h}, [x2], #8
st1 {v0.h}[0], [x0], x1
st1 {v0.h}[1], [x7], x1
st1 {v0.h}[2], [x0], x1
st1 {v0.h}[3], [x7], x1
b.le 0f
b 24b
210:
asr x1, x1, #1
22:
subs w4, w4, #1
ld1 {v0.h}[0], [x2], #2
st1 {v0.h}[0], [x0], x1
b.ge 22b
0:
ret
30:
ldrh w5, [x2]
ldrb w6, [x2, #2]
add x2, x2, #3
subs w4, w4, #1
strh w5, [x0]
strb w6, [x0, #2]
add x0, x0, x1
b.gt 30b
ret
40:
add x7, x0, x1
lsl x1, x1, #1
42:
cmp w4, #2
b.lt 41f
subs w4, w4, #2
ld1 {v0.2s}, [x2], #8
st1 {v0.s}[0], [x0], x1
st1 {v0.s}[1], [x7], x1
b.le 0f
b 42b
41:
ld1 {v0.s}[0], [x2]
st1 {v0.s}[0], [x0]
0:
ret
50:
ldr w5, [x2]
ldrb w6, [x2, #4]
add x2, x2, #5
subs w4, w4, #1
str w5, [x0]
strb w6, [x0, #4]
add x0, x0, x1
b.gt 50b
ret
60:
ldr w5, [x2]
ldrh w6, [x2, #4]
add x2, x2, #6
subs w4, w4, #1
str w5, [x0]
strh w6, [x0, #4]
add x0, x0, x1
b.gt 60b
ret
70:
ldr w5, [x2]
ldrh w6, [x2, #4]
ldrb w7, [x2, #6]
add x2, x2, #7
subs w4, w4, #1
str w5, [x0]
strh w6, [x0, #4]
strb w7, [x0, #6]
add x0, x0, x1
b.gt 70b
ret
L(copy_narrow_tbl):
.hword 0
.hword L(copy_narrow_tbl) - 10b
.hword L(copy_narrow_tbl) - 20b
.hword L(copy_narrow_tbl) - 30b
.hword L(copy_narrow_tbl) - 40b
.hword L(copy_narrow_tbl) - 50b
.hword L(copy_narrow_tbl) - 60b
.hword L(copy_narrow_tbl) - 70b
endfunc

File diff suppressed because it is too large.

@ -54,6 +54,14 @@
#endif
#endif
#if !defined(PIC)
#if defined(__PIC__)
#define PIC __PIC__
#elif defined(__pic__)
#define PIC __pic__
#endif
#endif
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif
@ -121,4 +129,6 @@ EXTERN\name:
#define L(x) .L ## x
#endif
#define X(x) CONCAT(EXTERN, x)
#endif /* __DAV1D_SRC_ARM_ASM_S__ */

@ -0,0 +1,106 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/looprestoration.h"
#include "common/attributes.h"
#include "common/intops.h"
#include "src/tables.h"
#if BITDEPTH == 8 && ARCH_AARCH64
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
// int16_t sum = 0;
// for (int i = 0; i < 7; i++)
// sum += src[idx] * fh[i];
// int16_t sum2 = (src[x] << 7) - (1 << (BITDEPTH + 6)) + rounding_off_h;
// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
// sum += 2048;
void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
const pixel *src, ptrdiff_t stride,
const int16_t fh[7], const intptr_t w,
int h, enum LrEdgeFlags edges);
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
// fv[3] += 128;
// int32_t sum = 0;
// for (int i = 0; i < 7; i++)
// sum += mid[idx] * fv[i];
// sum = (sum + rounding_off_v) >> round_bits_v;
// This function assumes that the width is a multiple of 8.
void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
const int16_t *mid, int w, int h,
const int16_t fv[7], enum LrEdgeFlags edges,
ptrdiff_t mid_stride);
void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
const pixel *src, int w, int h);
static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int16_t fh[7],
const int16_t fv[7], const enum LrEdgeFlags edges)
{
ALIGN_STK_32(int16_t, mid, 68 * 384,);
int mid_stride = (w + 7) & ~7;
// Horizontal filter
dav1d_wiener_filter_h_neon(&mid[2 * mid_stride], left, dst, dst_stride,
fh, w, h, edges);
if (edges & LR_HAVE_TOP)
dav1d_wiener_filter_h_neon(mid, NULL, lpf, lpf_stride,
fh, w, 2, edges);
if (edges & LR_HAVE_BOTTOM)
dav1d_wiener_filter_h_neon(&mid[(2 + h) * mid_stride], NULL,
lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
fh, w, 2, edges);
// Vertical filter
if (w >= 8)
dav1d_wiener_filter_v_neon(dst, dst_stride, &mid[2*mid_stride],
w & ~7, h, fv, edges, mid_stride * sizeof(*mid));
if (w & 7) {
// For uneven widths, do a full 8 pixel wide filtering into a temp
// buffer and copy out the narrow slice of pixels separately into dest.
ALIGN_STK_16(pixel, tmp, 64 * 8,);
dav1d_wiener_filter_v_neon(tmp, w & 7, &mid[2*mid_stride + (w & ~7)],
w & 7, h, fv, edges, mid_stride * sizeof(*mid));
dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
}
}
#endif
void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 && ARCH_AARCH64
c->wiener = wiener_filter_neon;
#endif
}

@ -30,16 +30,66 @@
#include "src/mc.h"
#include "src/cpu.h"
decl_mc_fn(dav1d_put_8tap_regular_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_regular_smooth_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_regular_sharp_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_smooth_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_smooth_regular_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_smooth_sharp_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_sharp_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_sharp_regular_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_sharp_smooth_8bpc_neon);
decl_mc_fn(dav1d_put_bilin_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_regular_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_smooth_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_sharp_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_8bpc_neon);
decl_mct_fn(dav1d_prep_bilin_8bpc_neon);
decl_avg_fn(dav1d_avg_8bpc_neon);
decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
decl_mask_fn(dav1d_mask_8bpc_neon);
void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
c->mc[type] = dav1d_put_##name##_8bpc_##suffix
#define init_mct_fn(type, name, suffix) \
c->mct[type] = dav1d_prep_##name##_8bpc_##suffix
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8
#if ARCH_AARCH64
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
#endif
c->avg = dav1d_avg_8bpc_neon;
c->w_avg = dav1d_w_avg_8bpc_neon;
c->mask = dav1d_mask_8bpc_neon;

@ -88,8 +88,8 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
pixel *ptrs[3] = { p[0], p[1], p[2] };
const int sbsz = 16;
const int sb64w = f->sb128w << 1;
const int damping = f->frame_hdr.cdef.damping + BITDEPTH - 8;
const enum Dav1dPixelLayout layout = f->cur.p.p.layout;
const int damping = f->frame_hdr->cdef.damping + BITDEPTH - 8;
const enum Dav1dPixelLayout layout = f->cur.p.layout;
const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
@ -106,7 +106,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
if (edges & HAVE_BOTTOM) {
// backup pre-filter data for next iteration
backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.p.stride,
backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.stride,
8, f->bw * 4, layout);
}
@ -119,15 +119,15 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
if (cdef_idx == -1 ||
(!f->frame_hdr.cdef.y_strength[cdef_idx] &&
!f->frame_hdr.cdef.uv_strength[cdef_idx]))
(!f->frame_hdr->cdef.y_strength[cdef_idx] &&
!f->frame_hdr->cdef.uv_strength[cdef_idx]))
{
last_skip = 1;
goto next_sb;
}
const int y_lvl = f->frame_hdr.cdef.y_strength[cdef_idx];
const int uv_lvl = f->frame_hdr.cdef.uv_strength[cdef_idx];
const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
bx += 2, edges |= HAVE_LEFT)
@ -148,11 +148,11 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
if (last_skip && edges & HAVE_LEFT) {
// we didn't backup the prefilter data because it wasn't
// there, so do it here instead
backup2x8(lr_bak[bit], bptrs, f->cur.p.stride, 0, layout);
backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout);
}
if (edges & HAVE_RIGHT) {
// backup pre-filter data for next iteration
backup2x8(lr_bak[!bit], bptrs, f->cur.p.stride, 8, layout);
backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout);
}
// the actual filter
@ -165,10 +165,10 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
uv_sec_lvl += uv_sec_lvl == 3;
uv_sec_lvl <<= BITDEPTH - 8;
unsigned variance;
const int dir = dsp->cdef.dir(bptrs[0], f->cur.p.stride[0],
const int dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
&variance);
if (y_lvl) {
dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0], lr_bak[bit][0],
dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
(pixel *const [2]) {
&f->lf.cdef_line_ptr[tf][0][0][bx * 4],
&f->lf.cdef_line_ptr[tf][0][1][bx * 4],
@ -179,10 +179,10 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
}
if (uv_lvl && has_chroma) {
const int uvdir =
f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
for (int pl = 1; pl <= 2; pl++) {
dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.p.stride[1],
dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
lr_bak[bit][pl],
(pixel *const [2]) {
&f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
@ -209,9 +209,9 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
iptrs[2] += sbsz * 4 >> ss_hor;
}
ptrs[0] += 8 * PXSTRIDE(f->cur.p.stride[0]);
ptrs[1] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
ptrs[2] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]);
ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
ptrs[2] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
f->lf.top_pre_cdef_toggle ^= 1;
}
}

@ -4072,7 +4072,7 @@ void dav1d_init_states(CdfThreadContext *const cdf, const int qidx) {
dav1d_cdf_thread_ref(cdf, &cdf_init[qcat]);
}
void dav1d_update_tile_cdf(const Av1FrameHeader *const hdr,
void dav1d_update_tile_cdf(const Dav1dFrameHeader *const hdr,
CdfContext *const dst,
const CdfContext *const src)
{
@ -4138,7 +4138,7 @@ void dav1d_update_tile_cdf(const Av1FrameHeader *const hdr,
update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 4, coef.base_tok);
update_bit_2d(2, 3, coef.dc_sign);
update_cdf_4d(4, 2, 21, 4, coef.br_tok);
update_cdf_2d(3, NUM_SEGMENTS, m.seg_id);
update_cdf_2d(3, DAV1D_MAX_SEGMENTS, m.seg_id);
update_cdf_1d(8, m.cfl_sign);
update_cdf_2d(6, 16, m.cfl_alpha);
update_bit_0d(m.restore_wiener);
@ -4171,7 +4171,7 @@ void dav1d_update_tile_cdf(const Av1FrameHeader *const hdr,
update_bit_1d(3, m.skip_mode);
update_cdf_2d(4, N_INTRA_PRED_MODES, m.y_mode);
update_cdf_3d(2, 8, N_SWITCHABLE_FILTERS, m.filter);
update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS, m.filter);
update_bit_1d(6, m.newmv_mode);
update_bit_1d(2, m.globalmv_mode);
update_bit_1d(6, m.refmv_mode);

@ -40,7 +40,7 @@ typedef struct CdfModeContext {
uint16_t filter_intra[5 + 1];
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1];
uint16_t angle_delta[8][8];
uint16_t filter[2][8][N_SWITCHABLE_FILTERS + 1];
uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
uint16_t newmv_mode[6][2];
uint16_t globalmv_mode[2][2];
uint16_t refmv_mode[6][2];
@ -68,7 +68,7 @@ typedef struct CdfModeContext {
uint16_t skip_mode[3][2];
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1];
uint16_t seg_pred[3][2];
uint16_t seg_id[3][NUM_SEGMENTS + 1];
uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
uint16_t cfl_sign[8 + 1];
uint16_t cfl_alpha[6][16 + 1];
uint16_t restore_wiener[2];
@ -132,7 +132,7 @@ typedef struct CdfThreadContext {
} CdfThreadContext;
void dav1d_init_states(CdfThreadContext *cdf, int qidx);
void dav1d_update_tile_cdf(const Av1FrameHeader *hdr, CdfContext *dst,
void dav1d_update_tile_cdf(const Dav1dFrameHeader *hdr, CdfContext *dst,
const CdfContext *src);
void dav1d_cdf_thread_alloc(CdfThreadContext *cdf, struct thread_data *t);

@ -30,6 +30,8 @@
#include "config.h"
#include "dav1d/common.h"
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/cpu.h"
#elif ARCH_X86
@ -37,6 +39,6 @@
#endif
unsigned dav1d_get_cpu_flags(void);
void dav1d_set_cpu_flags_mask(const unsigned mask);
DAV1D_API void dav1d_set_cpu_flags_mask(const unsigned mask);
#endif /* __DAV1D_SRC_CPU_H__ */

@ -28,6 +28,7 @@
#include "config.h"
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -44,7 +45,10 @@ uint8_t * dav1d_data_create(Dav1dData *const buf, const size_t sz) {
buf->ref = dav1d_ref_create(sz);
if (!buf->ref) return NULL;
buf->data = buf->ref->const_data;
buf->sz = sz;
buf->sz = buf->m.size = sz;
buf->m.timestamp = INT64_MIN;
buf->m.duration = 0;
buf->m.offset = -1;
return buf->ref->data;
}
@ -60,7 +64,10 @@ int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr, const size_t
buf->ref = dav1d_ref_wrap(ptr, free_callback, user_data);
if (!buf->ref) return -ENOMEM;
buf->data = ptr;
buf->sz = sz;
buf->sz = buf->m.size = sz;
buf->m.timestamp = INT64_MIN;
buf->m.duration = 0;
buf->m.offset = -1;
return 0;
}
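A minimal caller-side sketch of the wrapped-data path above (editorial illustration, not part of the upstream diff; the helper and buffer names are hypothetical and it assumes the usual dav1d/data.h declarations):
// Editor's sketch: wrapping a caller-owned buffer; dav1d_data_wrap() now also
// fills in default packet metadata.
static void my_free(const uint8_t *data, void *user_data) {
    (void)user_data;
    free((void *)data);   // hypothetical: the buffer was malloc'd by the caller
}
static int wrap_example(Dav1dData *buf, uint8_t *obu_bytes, size_t obu_size) {
    const int res = dav1d_data_wrap(buf, obu_bytes, obu_size, my_free, NULL);
    if (res < 0) return res;   // -ENOMEM if the wrapping ref could not be created
    // at this point buf->sz == buf->m.size == obu_size, buf->m.timestamp ==
    // INT64_MIN, buf->m.duration == 0 and buf->m.offset == -1
    return 0;
}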

File diff suppressed because it is too large


@ -91,45 +91,48 @@ static inline int get_partition_ctx(const BlockContext *const a,
}
static inline unsigned cdf_element_prob(const uint16_t *const cdf, const int e) {
return (e > 0 ? cdf[e - 1] : 32768) - cdf[e];
assert(e > 0);
return cdf[e - 1] - cdf[e];
}
static inline unsigned gather_left_partition_prob(const uint16_t *const in,
const enum BlockLevel bl)
{
unsigned out = 32768;
out -= cdf_element_prob(in, PARTITION_H);
unsigned out = 0;
out += cdf_element_prob(in, PARTITION_H);
if (bl != BL_128X128)
out -= cdf_element_prob(in, PARTITION_H4);
out -= cdf_element_prob(in, PARTITION_SPLIT);
out -= cdf_element_prob(in, PARTITION_T_TOP_SPLIT);
out -= cdf_element_prob(in, PARTITION_T_BOTTOM_SPLIT);
out -= cdf_element_prob(in, PARTITION_T_LEFT_SPLIT);
return 32768 - out;
out += cdf_element_prob(in, PARTITION_H4);
// Exploit the fact that cdfs for PARTITION_SPLIT, PARTITION_T_TOP_SPLIT,
// PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
out += in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT];
return out;
}
static inline unsigned gather_top_partition_prob(const uint16_t *const in,
const enum BlockLevel bl)
{
unsigned out = 32768;
out -= cdf_element_prob(in, PARTITION_V);
unsigned out = 0;
if (bl != BL_128X128)
out -= cdf_element_prob(in, PARTITION_V4);
out -= cdf_element_prob(in, PARTITION_SPLIT);
out -= cdf_element_prob(in, PARTITION_T_TOP_SPLIT);
out -= cdf_element_prob(in, PARTITION_T_LEFT_SPLIT);
out -= cdf_element_prob(in, PARTITION_T_RIGHT_SPLIT);
return 32768 - out;
out += cdf_element_prob(in, PARTITION_V4);
// Exploit the fact that cdfs for PARTITION_T_LEFT_SPLIT and PARTITION_T_RIGHT_SPLIT,
// and PARTITION_V, PARTITION_SPLIT and PARTITION_T_TOP_SPLIT are neighbors.
out += in[PARTITION_T_LEFT_SPLIT - 1] - in[PARTITION_T_RIGHT_SPLIT];
out += in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
return out;
}
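As the old cdf_element_prob() above shows, cdf[e] holds the probability mass remaining after element e in 1/32768 units, so the probability of a run of consecutive elements telescopes into a single subtraction. A minimal sketch of the identity the two "neighbors" shortcuts rely on (editorial illustration, not part of the diff):
// sum of cdf_element_prob(cdf, e) for e = first .. last (first >= 1), i.e.
// (cdf[first-1] - cdf[first]) + ... + (cdf[last-1] - cdf[last])
static unsigned run_prob(const uint16_t *const cdf, const int first, const int last) {
    return cdf[first - 1] - cdf[last];
}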
static inline enum TxfmTypeSet get_ext_txtp_set(const enum RectTxfmSize tx,
const int inter,
const Av1FrameHeader *const hdr,
const Dav1dFrameHeader *const hdr,
const int seg_id)
{
if (hdr->segmentation.lossless[seg_id]) {
assert(tx == (int) TX_4X4);
return TXTP_SET_LOSSLESS;
if (!hdr->segmentation.qidx[seg_id]) {
if (hdr->segmentation.lossless[seg_id]) {
assert(tx == (int) TX_4X4);
return TXTP_SET_LOSSLESS;
} else {
return TXTP_SET_DCT;
}
}
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
@ -153,7 +156,7 @@ static inline enum TxfmTypeSet get_ext_txtp_set(const enum RectTxfmSize tx,
static inline enum TxfmType get_uv_intra_txtp(const enum IntraPredMode uv_mode,
const enum RectTxfmSize tx,
const Av1FrameHeader *const hdr,
const Dav1dFrameHeader *const hdr,
const int seg_id)
{
if (hdr->segmentation.lossless[seg_id]) {
@ -168,7 +171,7 @@ static inline enum TxfmType get_uv_intra_txtp(const enum IntraPredMode uv_mode,
static inline enum TxfmType get_uv_inter_txtp(const TxfmInfo *const uvt_dim,
const enum TxfmType ytxtp,
const Av1FrameHeader *const hdr,
const Dav1dFrameHeader *const hdr,
const int seg_id)
{
if (hdr->segmentation.lossless[seg_id]) {
@ -194,18 +197,18 @@ static inline int get_filter_ctx(const BlockContext *const a,
const int yb4, const int xb4)
{
const int a_filter = (a->ref[0][xb4] == ref || a->ref[1][xb4] == ref) ?
a->filter[dir][xb4] : N_SWITCHABLE_FILTERS;
a->filter[dir][xb4] : DAV1D_N_SWITCHABLE_FILTERS;
const int l_filter = (l->ref[0][yb4] == ref || l->ref[1][yb4] == ref) ?
l->filter[dir][yb4] : N_SWITCHABLE_FILTERS;
l->filter[dir][yb4] : DAV1D_N_SWITCHABLE_FILTERS;
if (a_filter == l_filter) {
return comp * 4 + a_filter;
} else if (a_filter == N_SWITCHABLE_FILTERS) {
} else if (a_filter == DAV1D_N_SWITCHABLE_FILTERS) {
return comp * 4 + l_filter;
} else if (l_filter == N_SWITCHABLE_FILTERS) {
} else if (l_filter == DAV1D_N_SWITCHABLE_FILTERS) {
return comp * 4 + a_filter;
} else {
return comp * 4 + N_SWITCHABLE_FILTERS;
return comp * 4 + DAV1D_N_SWITCHABLE_FILTERS;
}
}
@ -716,18 +719,18 @@ static inline int get_br_ctx(const uint8_t *const levels,
return mag + 14;
}
static inline mv get_gmv_2d(const WarpedMotionParams *const gmv,
static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv,
const int bx4, const int by4,
const int bw4, const int bh4,
const Av1FrameHeader *const hdr)
const Dav1dFrameHeader *const hdr)
{
switch (gmv->type) {
case WM_TYPE_ROT_ZOOM:
case DAV1D_WM_TYPE_ROT_ZOOM:
assert(gmv->matrix[5] == gmv->matrix[2]);
assert(gmv->matrix[4] == -gmv->matrix[3]);
// fall-through
default:
case WM_TYPE_AFFINE: {
case DAV1D_WM_TYPE_AFFINE: {
const int x = bx4 * 4 + bw4 * 2 - 1;
const int y = by4 * 4 + bh4 * 2 - 1;
const int xc = (gmv->matrix[2] - (1 << 16)) * x +
@ -741,12 +744,12 @@ static inline mv get_gmv_2d(const WarpedMotionParams *const gmv,
.x = apply_sign(((abs(xc) + round) >> shift) << !hdr->hp, xc),
};
}
case WM_TYPE_TRANSLATION:
case DAV1D_WM_TYPE_TRANSLATION:
return (mv) {
.y = gmv->matrix[0] >> 13,
.x = gmv->matrix[1] >> 13,
};
case WM_TYPE_IDENTITY:
case DAV1D_WM_TYPE_IDENTITY:
return (mv) { .x = 0, .y = 0 };
}
}

third_party/dav1d/src/film_grain.h (vendored, new file, 39 lines)

@ -0,0 +1,39 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DAV1D_SRC_FILM_GRAIN_H__
#define __DAV1D_SRC_FILM_GRAIN_H__
#include "dav1d/dav1d.h"
void dav1d_apply_grain_8bpc(Dav1dPicture *const out,
const Dav1dPicture *const in);
void dav1d_apply_grain_10bpc(Dav1dPicture *const out,
const Dav1dPicture *const in);
#endif /* __DAV1D_SRC_FILM_GRAIN_H__ */

third_party/dav1d/src/film_grain_tmpl.c (vendored, new file, 512 lines)

@ -0,0 +1,512 @@
/*
* Copyright © 2018, Niklas Haas
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <assert.h>
#include <stdint.h>
#include "common.h"
#include "common/intops.h"
#include "common/bitdepth.h"
#include "tables.h"
#include "film_grain.h"
#if BITDEPTH == 8
typedef int8_t entry;
#else
typedef int16_t entry;
#endif
enum {
GRAIN_WIDTH = 82,
GRAIN_HEIGHT = 73,
SUB_GRAIN_WIDTH = 44,
SUB_GRAIN_HEIGHT = 38,
SUB_GRAIN_OFFSET = 6,
BLOCK_SIZE = 32,
SCALING_SIZE = 1 << BITDEPTH,
};
static inline int get_random_number(const int bits, unsigned *state) {
const int r = *state;
unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
*state = (r >> 1) | (bit << 15);
return (*state >> (16 - bits)) & ((1 << bits) - 1);
}
static inline int round2(const int x, const int shift) {
return (x + ((1 << shift) >> 1)) >> shift;
}
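get_random_number() above is the 16-bit shift-register PRNG used by the film grain synthesis (feedback taken from bits 0, 1, 3 and 12 of the state); a small usage sketch with a made-up seed (editorial, not part of the diff):
static void prng_demo(void) {
    unsigned state = 0x1234;                      // hypothetical seed value
    const int a = get_random_number(11, &state);  // 11-bit draw, as used for grain values
    const int b = get_random_number(8, &state);   // 8-bit draw, as used for block offsets
    (void)a; (void)b;                             // each call also advances `state`
}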
enum {
GRAIN_CENTER = 128 << (BITDEPTH - 8),
GRAIN_MIN = -GRAIN_CENTER,
GRAIN_MAX = (256 << (BITDEPTH - 8)) - 1 - GRAIN_CENTER,
};
static void generate_grain_y(const Dav1dPicture *const in,
entry buf[GRAIN_HEIGHT][GRAIN_WIDTH])
{
const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;
unsigned seed = data->seed;
const int shift = 12 - BITDEPTH + data->grain_scale_shift;
for (int y = 0; y < GRAIN_HEIGHT; y++) {
for (int x = 0; x < GRAIN_WIDTH; x++) {
const int value = get_random_number(11, &seed);
buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
}
}
const int ar_pad = 3;
const int ar_lag = data->ar_coeff_lag;
for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
const int8_t *coeff = data->ar_coeffs_y;
int sum = 0;
for (int dy = -ar_lag; dy <= 0; dy++) {
for (int dx = -ar_lag; dx <= ar_lag; dx++) {
if (!dx && !dy)
break;
sum += *(coeff++) * buf[y + dy][x + dx];
}
}
int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
buf[y][x] = iclip(grain, GRAIN_MIN, GRAIN_MAX);
}
}
}
static void generate_grain_uv(const Dav1dPicture *const in, int uv,
entry buf[GRAIN_HEIGHT][GRAIN_WIDTH],
entry buf_y[GRAIN_HEIGHT][GRAIN_WIDTH])
{
const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;
unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);
const int shift = 12 - BITDEPTH + data->grain_scale_shift;
const int subx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
for (int y = 0; y < chromaH; y++) {
for (int x = 0; x < chromaW; x++) {
const int value = get_random_number(11, &seed);
buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
}
}
const int ar_pad = 3;
const int ar_lag = data->ar_coeff_lag;
for (int y = ar_pad; y < chromaH; y++) {
for (int x = ar_pad; x < chromaW - ar_pad; x++) {
const int8_t *coeff = data->ar_coeffs_uv[uv];
int sum = 0;
for (int dy = -ar_lag; dy <= 0; dy++) {
for (int dx = -ar_lag; dx <= ar_lag; dx++) {
// For the final (current) pixel, we need to add in the
// contribution from the luma grain texture
if (!dx && !dy) {
if (!data->num_y_points)
break;
int luma = 0;
const int lumaX = ((x - ar_pad) << subx) + ar_pad;
const int lumaY = ((y - ar_pad) << suby) + ar_pad;
for (int i = 0; i <= suby; i++) {
for (int j = 0; j <= subx; j++) {
luma += buf_y[lumaY + i][lumaX + j];
}
}
luma = round2(luma, subx + suby);
sum += luma * (*coeff);
break;
}
sum += *(coeff++) * buf[y + dy][x + dx];
}
}
const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
buf[y][x] = iclip(grain, GRAIN_MIN, GRAIN_MAX);
}
}
}
static void generate_scaling(const uint8_t points[][2], int num,
uint8_t scaling[SCALING_SIZE])
{
const int shift_x = BITDEPTH - 8;
// Fill up the preceding entries with the initial value
for (int i = 0; i < points[0][0] << shift_x; i++)
scaling[i] = points[0][1];
// Linearly interpolate the values in the middle
for (int i = 0; i < num - 1; i++) {
const int bx = points[i][0] << shift_x;
const int by = points[i][1];
const int ex = points[i+1][0] << shift_x;
const int ey = points[i+1][1];
const int dx = ex - bx;
const int dy = ey - by;
const int delta = dy * ((0xFFFF + (dx >> 1))) / dx;
for (int x = 0; x < dx; x++) {
const int v = by + ((x * delta + 0x8000) >> 16);
scaling[bx + x] = v;
}
}
// Fill up the remaining entries with the final value
for (int i = points[num - 1][0] << shift_x; i < SCALING_SIZE; i++)
scaling[i] = points[num - 1][1];
}
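generate_scaling() builds a piecewise-linear lookup table: constant before the first point, linear ramps between points, constant after the last point. A tiny sketch with made-up points, assuming BITDEPTH == 8 (editorial, not part of the diff):
static void scaling_demo(void) {
    const uint8_t pts[2][2] = { { 0, 20 }, { 255, 60 } };  // hypothetical (value, scaling) pairs
    uint8_t lut[SCALING_SIZE];
    generate_scaling(pts, 2, lut);
    // lut[0] == 20, lut[255] == 60, and everything in between is a single
    // linear ramp; additional points would simply add more ramp segments
    (void)lut;
}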
// samples from the correct block of a grain LUT, while taking into account the
// offsets provided by the offsets cache
static inline entry sample_lut(entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
int offsets[2][2], int subx, int suby,
int bx, int by, int x, int y)
{
const int randval = offsets[bx][by];
const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by]
[offx + x + (BLOCK_SIZE >> subx) * bx];
}
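Editorial note on the offsets decoded above (not part of the diff): each 8-bit offset packs the x offset in its high nibble and the y offset in its low nibble, so in the unsubsampled case the block origin reduces to
//   offx = 9 + 2 * (randval >> 4),   offy = 9 + 2 * (randval & 0xF)
// which keeps every access of a 32x32 block (including the neighbouring-block
// lookups used for overlap) inside the 82x73 grain LUT.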
static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in,
entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
uint8_t scaling[SCALING_SIZE], int row_num)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
const int rows = 1 + (data->overlap_flag && row_num > 0);
int min_value, max_value;
if (data->clip_to_restricted_range) {
min_value = 16 << (BITDEPTH - 8);
max_value = 235 << (BITDEPTH - 8);
} else {
min_value = 0;
max_value = (1 << BITDEPTH) - 1;
}
// seed[0] contains the current row, seed[1] contains the previous
unsigned seed[2];
for (int i = 0; i < rows; i++) {
seed[i] = data->seed;
seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
}
const ptrdiff_t stride = out->stride[0];
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
assert(stride == in->stride[0]);
pixel *const src_row = (pixel *) in->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;
pixel *const dst_row = (pixel *) out->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks
const int bh = imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE);
for (int bx = 0; bx < out->p.w; bx += BLOCK_SIZE) {
const int bw = imin(BLOCK_SIZE, out->p.w - bx);
if (data->overlap_flag && bx) {
// shift previous offsets left
for (int i = 0; i < rows; i++)
offsets[1][i] = offsets[0][i];
}
// update current offsets
for (int i = 0; i < rows; i++)
offsets[0][i] = get_random_number(8, &seed[i]);
// x/y block offsets to compensate for overlapped regions
const int ystart = data->overlap_flag && row_num ? 2 : 0;
const int xstart = data->overlap_flag && bx ? 2 : 0;
static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
#define add_noise_y(x, y, grain) \
pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
*dst = iclip(*src + noise, min_value, max_value);
for (int y = ystart; y < bh; y++) {
// Non-overlapped image region (straightforward)
for (int x = xstart; x < bw; x++) {
int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
add_noise_y(x, y, grain);
}
// Special case for overlapped column
for (int x = 0; x < xstart; x++) {
int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
int old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
grain = round2(old * w[x][0] + grain * w[x][1], 5);
grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
add_noise_y(x, y, grain);
}
}
for (int y = 0; y < ystart; y++) {
// Special case for overlapped row (sans corner)
for (int x = xstart; x < bw; x++) {
int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
int old = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
grain = round2(old * w[y][0] + grain * w[y][1], 5);
grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
add_noise_y(x, y, grain);
}
// Special case for doubly-overlapped corner
for (int x = 0; x < xstart; x++) {
// Blend the top pixel with the top left block
int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y);
top = round2(old * w[x][0] + top * w[x][1], 5);
top = iclip(top, GRAIN_MIN, GRAIN_MAX);
// Blend the current pixel with the left block
int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
grain = round2(old * w[x][0] + grain * w[x][1], 5);
grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
// Mix the two rows together and apply grain
grain = round2(top * w[y][0] + grain * w[y][1], 5);
grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
add_noise_y(x, y, grain);
}
}
}
}
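The overlapped-column and overlapped-row cases above both reduce to a fixed 5-bit weighted blend of the neighbouring block's grain with the current block's grain; a small sketch of that step (editorial, not part of the diff; 27/17 are the luma weights from `w[]` above):
static int blend_luma_overlap(const int old_grain, const int cur_grain,
                              const int pos /* 0 or 1 within the 2-sample overlap */)
{
    static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
    // 27 + 17 > 32, so the blend can leave the grain range and is re-clipped
    const int g = round2(old_grain * w[pos][0] + cur_grain * w[pos][1], 5);
    return iclip(g, GRAIN_MIN, GRAIN_MAX);
}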
static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const in,
entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
uint8_t scaling[SCALING_SIZE], int uv, int row_num)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
const int rows = 1 + (data->overlap_flag && row_num > 0);
int min_value, max_value;
if (data->clip_to_restricted_range) {
min_value = 16 << (BITDEPTH - 8);
if (out->seq_hdr->mtrx == DAV1D_MC_IDENTITY) {
max_value = 235 << (BITDEPTH - 8);
} else {
max_value = 240 << (BITDEPTH - 8);
}
} else {
min_value = 0;
max_value = (1 << BITDEPTH) - 1;
}
const int sx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int sy = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
// seed[0] contains the current row, seed[1] contains the previous
unsigned seed[2];
for (int i = 0; i < rows; i++) {
seed[i] = data->seed;
seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
}
const ptrdiff_t stride = out->stride[1];
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
assert(stride == in->stride[1]);
const int by = row_num * (BLOCK_SIZE >> sy);
pixel *const dst_row = (pixel *) out->data[1 + uv] + PXSTRIDE(stride) * by;
pixel *const src_row = (pixel *) in->data[1 + uv] + PXSTRIDE(stride) * by;
pixel *const luma_row = (pixel *) out->data[0] + PXSTRIDE(out->stride[0]) * row_num * BLOCK_SIZE;
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks (subsampled)
const int bh = (imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE) + sy) >> sy;
for (int bx = 0; bx < (out->p.w + sx) >> sx; bx += BLOCK_SIZE >> sx) {
const int bw = (imin(BLOCK_SIZE, out->p.w - (bx << sx)) + sx) >> sx;
if (data->overlap_flag && bx) {
// shift previous offsets left
for (int i = 0; i < rows; i++)
offsets[1][i] = offsets[0][i];
}
// update current offsets
for (int i = 0; i < rows; i++)
offsets[0][i] = get_random_number(8, &seed[i]);
// x/y block offsets to compensate for overlapped regions
const int ystart = data->overlap_flag && row_num ? (2 >> sy) : 0;
const int xstart = data->overlap_flag && bx ? (2 >> sx) : 0;
static const int w[2 /* sub */][2 /* off */][2] = {
{ { 27, 17 }, { 17, 27 } },
{ { 23, 22 } },
};
#define add_noise_uv(x, y, grain) \
const int lx = (bx + x) << sx; \
const int ly = y << sy; \
pixel *luma = luma_row + ly * PXSTRIDE(out->stride[0]) + lx; \
pixel avg = luma[0]; \
if (sx && lx + 1 < out->p.w) \
avg = (avg + luma[1] + 1) >> 1; \
\
pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
int val = avg; \
if (!data->chroma_scaling_from_luma) { \
int combined = avg * data->uv_luma_mult[uv] + \
*src * data->uv_mult[uv]; \
val = iclip_pixel( (combined >> 6) + \
(data->uv_offset[uv] * (1 << (BITDEPTH - 8))) ); \
} \
\
int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
*dst = iclip(*src + noise, min_value, max_value);
for (int y = ystart; y < bh; y++) {
// Non-overlapped image region (straightforward)
for (int x = xstart; x < bw; x++) {
int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
add_noise_uv(x, y, grain);
}
// Special case for overlapped column
for (int x = 0; x < xstart; x++) {
int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5;
grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
add_noise_uv(x, y, grain);
}
}
for (int y = 0; y < ystart; y++) {
// Special case for overlapped row (sans corner)
for (int x = xstart; x < bw; x++) {
int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
grain = (old * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5;
grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
add_noise_uv(x, y, grain);
}
// Special case for doubly-overlapped corner
for (int x = 0; x < xstart; x++) {
// Blend the top pixel with the top left block
int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y);
top = (old * w[sx][x][0] + top * w[sx][x][1] + 16) >> 5;
top = iclip(top, GRAIN_MIN, GRAIN_MAX);
// Blend the current pixel with the left block
int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5;
grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
// Mix the two rows together and apply to image
grain = (top * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5;
grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
add_noise_uv(x, y, grain);
}
}
}
}
void bitfn(dav1d_apply_grain)(Dav1dPicture *const out,
const Dav1dPicture *const in)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
entry grain_lut[3][GRAIN_HEIGHT][GRAIN_WIDTH];
uint8_t scaling[3][SCALING_SIZE];
// Generate grain LUTs as needed
generate_grain_y(out, grain_lut[0]); // always needed
if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
generate_grain_uv(out, 0, grain_lut[1], grain_lut[0]);
if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
generate_grain_uv(out, 1, grain_lut[2], grain_lut[0]);
// Generate scaling LUTs as needed
if (data->num_y_points)
generate_scaling(data->y_points, data->num_y_points, scaling[0]);
if (data->num_uv_points[0])
generate_scaling(data->uv_points[0], data->num_uv_points[0], scaling[1]);
if (data->num_uv_points[1])
generate_scaling(data->uv_points[1], data->num_uv_points[1], scaling[2]);
// Synthesize grain for the affected planes
int rows = (out->p.h + 31) >> 5;
for (int row = 0; row < rows; row++) {
if (data->num_y_points)
apply_to_row_y(out, in, grain_lut[0], scaling[0], row);
if (data->chroma_scaling_from_luma) {
apply_to_row_uv(out, in, grain_lut[1], scaling[0], 0, row);
apply_to_row_uv(out, in, grain_lut[2], scaling[0], 1, row);
} else {
if (data->num_uv_points[0])
apply_to_row_uv(out, in, grain_lut[1], scaling[1], 0, row);
if (data->num_uv_points[1])
apply_to_row_uv(out, in, grain_lut[2], scaling[2], 1, row);
}
}
// Copy over the non-modified planes
// TODO: eliminate in favor of per-plane refs
if (!data->num_y_points) {
assert(out->stride[0] == in->stride[0]);
memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
}
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
for (int i = 0; i < 2; i++) {
if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
assert(out->stride[1] == in->stride[1]);
memcpy(out->data[1+i], in->data[1+i],
(out->p.h >> suby) * out->stride[1]);
}
}
}
}


@ -75,10 +75,12 @@ struct Dav1dContext {
Dav1dData data;
int start, end;
} tile[256];
int n_tile_data, have_seq_hdr, have_frame_hdr;
int n_tile_data;
int n_tiles;
Av1SequenceHeader seq_hdr; // FIXME make ref?
Av1FrameHeader frame_hdr; // FIXME make ref?
Dav1dRef *seq_hdr_ref;
Dav1dSequenceHeader *seq_hdr;
Dav1dRef *frame_hdr_ref;
Dav1dFrameHeader *frame_hdr;
// decoded output picture queue
Dav1dData in;
@ -86,19 +88,18 @@ struct Dav1dContext {
struct {
Dav1dThreadPicture *out_delayed;
unsigned next;
// dummy is a pointer to prevent compiler errors about atomic_load()
// not taking const arguments; the const attribute is not taken
// from pointers
atomic_int flush_mem, *flush;
} frame_thread;
// reference/entropy state
struct {
Dav1dThreadPicture p;
Dav1dRef *segmap;
Av1SegmentationDataSet seg_data;
Dav1dRef *refmvs;
unsigned refpoc[7];
WarpedMotionParams gmv[7];
Av1LoopfilterModeRefDeltas lf_mode_ref_deltas;
Av1FilmGrainData film_grain;
uint8_t qidx;
} refs[8];
CdfThreadContext cdf[8];
@ -114,12 +115,20 @@ struct Dav1dContext {
} intra_edge;
Dav1dPicAllocator allocator;
int apply_grain;
int operating_point;
unsigned operating_point_idc;
int all_layers;
};
struct Dav1dFrameContext {
Av1SequenceHeader seq_hdr;
Av1FrameHeader frame_hdr;
Dav1dThreadPicture refp[7], cur;
Dav1dRef *seq_hdr_ref;
Dav1dSequenceHeader *seq_hdr;
Dav1dRef *frame_hdr_ref;
Dav1dFrameHeader *frame_hdr;
Dav1dThreadPicture refp[7];
Dav1dPicture cur; // during block coding / reconstruction
Dav1dThreadPicture sr_cur; // after super-resolution upscaling
Dav1dRef *mvs_ref;
refmvs *mvs, *ref_mvs[7];
Dav1dRef *ref_mvs_ref[7];
@ -127,6 +136,7 @@ struct Dav1dFrameContext {
uint8_t *cur_segmap;
const uint8_t *prev_segmap;
unsigned refpoc[7], refrefpoc[7][7];
uint8_t gmv_warp_allowed[7];
CdfThreadContext in_cdf, out_cdf;
struct {
Dav1dData data;
@ -139,6 +149,7 @@ struct Dav1dFrameContext {
int scale; // if no scaling, this is 0
int step;
} svc[7][2 /* x, y */];
int resize_step[2 /* y, uv */], resize_start[2 /* y, uv */];
const Dav1dContext *c;
Dav1dTileContext *tc;
@ -157,8 +168,8 @@ struct Dav1dFrameContext {
int ipred_edge_sz;
pixel *ipred_edge[3];
ptrdiff_t b4_stride;
int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step;
uint16_t dq[NUM_SEGMENTS][3 /* plane */][2 /* dc/ac */];
int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w;
uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */];
BlockContext *a;
int a_sz /* w*tile_rows */;
@ -188,8 +199,9 @@ struct Dav1dFrameContext {
struct {
uint8_t (*level)[4];
Av1Filter *mask;
Av1Restoration *lr_mask;
int top_pre_cdef_toggle;
int mask_sz /* w*h */, line_sz /* w */, re_sz /* h */;
int mask_sz /* w*h */, lr_mask_sz, line_sz /* w */, lr_line_sz, re_sz /* h */;
Av1FilterLUT lim_lut;
int last_sharpness;
uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
@ -201,7 +213,7 @@ struct Dav1dFrameContext {
// in-loop filter per-frame state keeping
int tile_row; // for carry-over at tile row edges
pixel *p[3];
pixel *p[3], *sr_p[3];
Av1Filter *mask_ptr, *prev_mask_ptr;
} lf;
@ -212,7 +224,7 @@ struct Dav1dFrameContext {
pthread_cond_t cond, icond;
int tasks_left, num_tasks;
int (*task_idx_to_sby_and_tile_idx)[2];
int titsati_sz, titsati_init[2];
int titsati_sz, titsati_init[3];
} tile_thread;
};
@ -235,7 +247,7 @@ struct Dav1dTileState {
coef *cf;
} frame_thread;
uint16_t dqmem[NUM_SEGMENTS][3 /* plane */][2 /* dc/ac */];
uint16_t dqmem[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
const uint16_t (*dq)[3][2];
int last_qidx;
@ -259,7 +271,7 @@ struct Dav1dTileContext {
uint16_t pal[3 /* plane */][8 /* palette_idx */];
uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
uint8_t txtp_map[32 * 32]; // inter-only
WarpedMotionParams warpmv;
Dav1dWarpedMotionParams warpmv;
union {
void *mem;
uint8_t *pal_idx;


@ -28,6 +28,7 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include "src/intra_edge.h"
#include "src/levels.h"


@ -28,6 +28,8 @@
#ifndef __DAV1D_SRC_IPRED_H__
#define __DAV1D_SRC_IPRED_H__
#include <stddef.h>
#include "common/bitdepth.h"
#include "src/levels.h"


@ -28,6 +28,8 @@
#ifndef __DAV1D_SRC_ITX_H__
#define __DAV1D_SRC_ITX_H__
#include <stddef.h>
#include "common/bitdepth.h"
#include "src/levels.h"

File diff suppressed because it is too large


@ -40,7 +40,7 @@
#include "src/itx_1d.c"
typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
coef *out, ptrdiff_t out_s);
coef *out, ptrdiff_t out_s, const int range);
static void NOINLINE
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
@ -54,6 +54,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
// Maximum value for h and w is 64
coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
const int is_rect2 = w * 2 == h || h * 2 == w;
const int row_clip_max = (1 << (BITDEPTH + 8 - 1)) - 1;
const int col_clip_max = (1 << (imax(BITDEPTH + 6, 16) - 1)) -1;
const int col_clip_min = -col_clip_max - 1;
if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
const int rnd1 = (1 << shift1) >> 1;
@ -64,18 +67,19 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
if (is_rect2)
in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
}
first_1d_fn(in_mem, 1, &tmp[i * w], 1);
first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);
} else {
first_1d_fn(&coeff[i], sh, &tmp[i * w], 1);
first_1d_fn(&coeff[i], sh, &tmp[i * w], 1, row_clip_max);
}
for (j = 0; j < w; j++)
tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
tmp[i * w + j] = iclip((tmp[i * w + j] + (rnd1)) >> shift1,
col_clip_min, col_clip_max);
}
if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
const int rnd2 = (1 << shift2) >> 1;
for (i = 0; i < w; i++) {
second_1d_fn(&tmp[i], w, out, 1);
second_1d_fn(&tmp[i], w, out, 1, col_clip_max);
for (j = 0; j < h; j++)
dst[i + j * PXSTRIDE(stride)] =
iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
@ -145,15 +149,18 @@ inv_txfm_fn64(64, 64, 2, 4)
static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
coef *const coeff, const int eob)
{
int i, j;
const int col_clip_max = (1 << (imax(BITDEPTH + 6, 16) - 1)) -1;
const int col_clip_min = -col_clip_max - 1;
coef tmp[4 * 4], out[4];
for (i = 0; i < 4; i++)
for (int i = 0; i < 4; i++)
inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
for (int k = 0; k < 4 * 4; k++)
tmp[k] = iclip(tmp[k], col_clip_min, col_clip_max);
for (i = 0; i < 4; i++) {
for (int i = 0; i < 4; i++) {
inv_wht4_1d(&tmp[i], 4, out, 1, 1);
for (j = 0; j < 4; j++)
for (int j = 0; j < 4; j++)
dst[i + j * PXSTRIDE(stride)] =
iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
}
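Editorial note (not part of the diff) working out the new clip bounds for the two configured bit depths:
//   8 bpc:  row_clip_max = (1 << 15) - 1 = 32767,   col_clip_max = 32767
//  10 bpc:  row_clip_max = (1 << 17) - 1 = 131071,  col_clip_max = 32767
// i.e. the first (row) pass keeps BITDEPTH + 8 signed bits of headroom and
// the second (column) pass clamps intermediates to at least 16 signed bits.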


@ -28,7 +28,9 @@
#ifndef __DAV1D_SRC_LEVELS_H__
#define __DAV1D_SRC_LEVELS_H__
#include "dav1d/picture.h"
#include <stdint.h>
#include "dav1d/headers.h"
enum ObuType {
OBU_SEQ_HDR = 1,
@ -41,10 +43,6 @@ enum ObuType {
OBU_PADDING = 15,
};
// Constants from Section 3. "Symbols and abbreviated terms"
#define MAX_TILE_COLS 64
#define MAX_TILE_ROWS 64
enum TxfmSize {
TX_4X4,
TX_8X8,
@ -63,13 +61,6 @@ enum BlockLevel {
N_BL_LEVELS,
};
enum TxfmMode {
TX_4X4_ONLY,
TX_LARGEST,
TX_SWITCHABLE,
N_TX_MODES,
};
enum RectTxfmSize {
RTX_4X8 = N_TX_SIZES,
RTX_8X4,
@ -203,16 +194,6 @@ enum BlockSize {
N_BS_SIZES,
};
enum FilterMode {
FILTER_8TAP_REGULAR,
FILTER_8TAP_SMOOTH,
FILTER_8TAP_SHARP,
N_SWITCHABLE_FILTERS,
FILTER_BILINEAR = N_SWITCHABLE_FILTERS,
N_FILTERS,
FILTER_SWITCHABLE = N_FILTERS,
};
enum Filter2d { // order is horizontal, vertical
FILTER_2D_8TAP_REGULAR,
FILTER_2D_8TAP_REGULAR_SMOOTH,
@ -269,242 +250,16 @@ enum InterIntraType {
INTER_INTRA_WEDGE,
};
enum AdaptiveBoolean {
OFF = 0,
ON = 1,
ADAPTIVE = 2,
};
enum RestorationType {
RESTORATION_NONE,
RESTORATION_SWITCHABLE,
RESTORATION_WIENER,
RESTORATION_SGRPROJ,
};
typedef struct mv {
int16_t y, x;
} mv;
enum WarpedMotionType {
WM_TYPE_IDENTITY,
WM_TYPE_TRANSLATION,
WM_TYPE_ROT_ZOOM,
WM_TYPE_AFFINE,
};
typedef struct WarpedMotionParams {
enum WarpedMotionType type;
int32_t matrix[6];
union {
struct {
int16_t alpha, beta, gamma, delta;
};
int16_t abcd[4];
};
} WarpedMotionParams;
enum MotionMode {
MM_TRANSLATION,
MM_OBMC,
MM_WARP,
};
typedef struct Av1SequenceHeader {
int profile;
int still_picture;
int reduced_still_picture_header;
int timing_info_present;
int num_units_in_tick;
int time_scale;
int equal_picture_interval;
int num_ticks_per_picture;
int decoder_model_info_present;
int encoder_decoder_buffer_delay_length;
int num_units_in_decoding_tick;
int buffer_removal_delay_length;
int frame_presentation_delay_length;
int display_model_info_present;
int num_operating_points;
struct Av1SequenceHeaderOperatingPoint {
int idc;
int major_level, minor_level;
int tier;
int decoder_model_param_present;
int decoder_buffer_delay;
int encoder_buffer_delay;
int low_delay_mode;
int display_model_param_present;
int initial_display_delay;
} operating_points[32];
int max_width, max_height, width_n_bits, height_n_bits;
int frame_id_numbers_present;
int delta_frame_id_n_bits;
int frame_id_n_bits;
int sb128;
int filter_intra;
int intra_edge_filter;
int inter_intra;
int masked_compound;
int warped_motion;
int dual_filter;
int order_hint;
int jnt_comp;
int ref_frame_mvs;
enum AdaptiveBoolean screen_content_tools;
enum AdaptiveBoolean force_integer_mv;
int order_hint_n_bits;
int super_res;
int cdef;
int restoration;
int bpc;
int hbd;
int color_description_present;
enum Dav1dPixelLayout layout;
enum Dav1dColorPrimaries pri;
enum Dav1dTransferCharacteristics trc;
enum Dav1dMatrixCoefficients mtrx;
enum Dav1dChromaSamplePosition chr;
int color_range;
int separate_uv_delta_q;
int film_grain_present;
} Av1SequenceHeader;
#define NUM_SEGMENTS 8
typedef struct Av1SegmentationData {
int delta_q;
int delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
int ref;
int skip;
int globalmv;
} Av1SegmentationData;
typedef struct Av1SegmentationDataSet {
Av1SegmentationData d[NUM_SEGMENTS];
int preskip;
int last_active_segid;
} Av1SegmentationDataSet;
typedef struct Av1LoopfilterModeRefDeltas {
int mode_delta[2];
int ref_delta[8];
} Av1LoopfilterModeRefDeltas;
typedef struct Av1FilmGrainData {
int num_y_points;
uint8_t y_points[14][2 /* value, scaling */];
int chroma_scaling_from_luma;
int num_uv_points[2];
uint8_t uv_points[2][10][2 /* value, scaling */];
int scaling_shift;
int ar_coeff_lag;
int8_t ar_coeffs_y[24];
int8_t ar_coeffs_uv[2][25];
int ar_coeff_shift;
int grain_scale_shift;
int uv_mult[2];
int uv_luma_mult[2];
int uv_offset[2];
int overlap_flag;
int clip_to_restricted_range;
} Av1FilmGrainData;
typedef struct Av1FrameHeader {
int show_existing_frame;
int existing_frame_idx;
int frame_id;
int frame_presentation_delay;
enum Dav1dFrameType frame_type;
int show_frame;
int showable_frame;
int error_resilient_mode;
int disable_cdf_update;
int allow_screen_content_tools;
int force_integer_mv;
int frame_size_override;
#define PRIMARY_REF_NONE 7
int primary_ref_frame;
int buffer_removal_time_present;
struct Av1FrameHeaderOperatingPoint {
int buffer_removal_time;
} operating_points[32];
int frame_offset;
int refresh_frame_flags;
int width, height;
int render_width, render_height;
int super_res;
int have_render_size;
int allow_intrabc;
int frame_ref_short_signaling;
int refidx[7];
int hp;
enum FilterMode subpel_filter_mode;
int switchable_motion_mode;
int use_ref_frame_mvs;
int refresh_context;
struct {
int uniform;
unsigned n_bytes;
int min_log2_cols, max_log2_cols, log2_cols, cols;
int min_log2_rows, max_log2_rows, log2_rows, rows;
uint16_t col_start_sb[MAX_TILE_COLS + 1];
uint16_t row_start_sb[MAX_TILE_ROWS + 1];
int update;
} tiling;
struct {
int yac;
int ydc_delta;
int udc_delta, uac_delta, vdc_delta, vac_delta;
int qm, qm_y, qm_u, qm_v;
} quant;
struct {
int enabled, update_map, temporal, update_data;
Av1SegmentationDataSet seg_data;
int lossless[NUM_SEGMENTS], qidx[NUM_SEGMENTS];
} segmentation;
struct {
struct {
int present;
int res_log2;
} q;
struct {
int present;
int res_log2;
int multi;
} lf;
} delta;
int all_lossless;
struct {
int level_y[2];
int level_u, level_v;
int mode_ref_delta_enabled;
int mode_ref_delta_update;
Av1LoopfilterModeRefDeltas mode_ref_deltas;
int sharpness;
} loopfilter;
struct {
int damping;
int n_bits;
int y_strength[8];
int uv_strength[8];
} cdef;
struct {
enum RestorationType type[3];
int unit_size[2];
} restoration;
enum TxfmMode txfm_mode;
int switchable_comp_refs;
int skip_mode_allowed, skip_mode_enabled, skip_mode_refs[2];
int warp_motion;
int reduced_txtp_set;
WarpedMotionParams gmv[7];
struct {
int present, update, seed;
Av1FilmGrainData data;
} film_grain;
} Av1FrameHeader;
#define QINDEX_RANGE 256
typedef struct Av1Block {


@ -178,13 +178,13 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
int x, have_left;
// Don't filter outside the frame
const int have_top = sby > 0;
const int is_sb64 = !f->seq_hdr.sb128;
const int is_sb64 = !f->seq_hdr->sb128;
const int starty4 = (sby & is_sb64) << 4;
const int sbsz = 32 >> is_sb64;
const int sbl2 = 5 - is_sb64;
const int halign = (f->bh + 31) & ~31;
const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
const unsigned vmax = 1U << vmask, hmax = 1U << hmask;
const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
@ -194,7 +194,7 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
for (int tile_col = 1;; tile_col++) {
x = f->frame_hdr.tiling.col_start_sb[tile_col];
x = f->frame_hdr->tiling.col_start_sb[tile_col];
if ((x << sbl2) >= f->bw) break;
const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
x >>= is_sb64;
@ -211,7 +211,7 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
}
if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
y++, uv_mask <<= 1)
@ -247,7 +247,7 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
}
if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
const unsigned cw = (w + ss_hor) >> ss_hor;
uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) {
@ -268,18 +268,18 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
x++, have_left = 1, ptr += 128, level_ptr += 32)
{
filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],
lflvl[x].filter_y[0], ptr, f->cur.stride[0],
imin(32, f->w4 - x * 32), starty4, endy4);
}
level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
lflvl[x].filter_y[1], ptr, f->cur.p.stride[0],
lflvl[x].filter_y[1], ptr, f->cur.stride[0],
imin(32, f->w4 - x * 32), starty4, endy4);
}
if (!f->frame_hdr.loopfilter.level_u && !f->frame_hdr.loopfilter.level_v)
if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
return;
ptrdiff_t uv_off;
@ -289,7 +289,7 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
{
filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
lflvl[x].filter_uv[0],
&p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
&p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
(imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
starty4 >> ss_ver, uv_endy4, ss_ver);
}
@ -300,7 +300,7 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
{
filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
lflvl[x].filter_uv[1],
&p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
&p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
(imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
starty4 >> ss_ver, uv_endy4, ss_hor);
}


@ -287,7 +287,7 @@ static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
uint8_t (*const level_cache)[4],
const ptrdiff_t b4_stride,
const Av1FrameHeader *const hdr,
const Dav1dFrameHeader *const hdr,
const uint8_t (*filter_level)[8][2],
const int bx, const int by,
const int iw, const int ih,
@ -351,7 +351,7 @@ void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
uint8_t (*const level_cache)[4],
const ptrdiff_t b4_stride,
const Av1FrameHeader *const hdr,
const Dav1dFrameHeader *const hdr,
const uint8_t (*filter_level)[8][2],
const int bx, const int by,
const int iw, const int ih,
@ -435,7 +435,7 @@ void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) {
static inline void calc_lf_value(uint8_t (*const lflvl_values)[2],
const int is_chroma, const int base_lvl,
const int lf_delta, const int seg_delta,
const Av1LoopfilterModeRefDeltas *const mr_delta)
const Dav1dLoopfilterModeRefDeltas *const mr_delta)
{
const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63);
@ -458,7 +458,7 @@ static inline void calc_lf_value(uint8_t (*const lflvl_values)[2],
}
void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
const Av1FrameHeader *const hdr,
const Dav1dFrameHeader *const hdr,
const int8_t lf_delta[4])
{
const int n_seg = hdr->segmentation.enabled ? 8 : 1;
@ -468,11 +468,11 @@ void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
return;
}
const Av1LoopfilterModeRefDeltas *const mr_deltas =
const Dav1dLoopfilterModeRefDeltas *const mr_deltas =
hdr->loopfilter.mode_ref_delta_enabled ?
&hdr->loopfilter.mode_ref_deltas : NULL;
for (int s = 0; s < n_seg; s++) {
const Av1SegmentationData *const segd =
const Dav1dSegmentationData *const segd =
hdr->segmentation.enabled ? &hdr->segmentation.seg_data.d[s] : NULL;
calc_lf_value(lflvl_values[s][0], 0, hdr->loopfilter.level_y[0],


@ -40,26 +40,30 @@ typedef struct Av1FilterLUT {
} Av1FilterLUT;
typedef struct Av1RestorationUnit {
enum RestorationType type;
enum Dav1dRestorationType type;
int16_t filter_h[3];
int16_t filter_v[3];
uint8_t sgr_idx;
int16_t sgr_weights[2];
} Av1RestorationUnit;
// each struct describes one 128x128 area (1 or 4 SBs)
// each struct describes one 128x128 area (1 or 4 SBs), pre-superres-scaling
typedef struct Av1Filter {
// each bit is 1 col
uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2];
uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2];
int8_t cdef_idx[4]; // -1 means "unset"
uint16_t noskip_mask[32][2];
Av1RestorationUnit lr[3][4];
} Av1Filter;
// each struct describes one 128x128 area (1 or 4 SBs), post-superres-scaling
typedef struct Av1Restoration {
Av1RestorationUnit lr[3][4];
} Av1Restoration;
void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
const ptrdiff_t b4_stride,
const Av1FrameHeader *hdr,
const Dav1dFrameHeader *hdr,
const uint8_t (*level)[8][2], int bx, int by,
int iw, int ih, enum BlockSize bs,
enum RectTxfmSize ytx, enum RectTxfmSize uvtx,
@ -67,7 +71,7 @@ void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
uint8_t *ly, uint8_t *auv, uint8_t *luv);
void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
const ptrdiff_t b4_stride,
const Av1FrameHeader *hdr,
const Dav1dFrameHeader *hdr,
const uint8_t (*level)[8][2], int bx, int by,
int iw, int ih, int skip_inter,
enum BlockSize bs, const uint16_t *tx_mask,
@ -75,7 +79,7 @@ void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
enum Dav1dPixelLayout layout, uint8_t *ay,
uint8_t *ly, uint8_t *auv, uint8_t *luv);
void dav1d_calc_eih(Av1FilterLUT *lim_lut, int filter_sharpness);
void dav1d_calc_lf_values(uint8_t (*values)[4][8][2], const Av1FrameHeader *hdr,
void dav1d_calc_lf_values(uint8_t (*values)[4][8][2], const Dav1dFrameHeader *hdr,
const int8_t lf_delta[4]);
#endif /* __DAV1D_SRC_LF_MASK_H__ */


@ -43,6 +43,7 @@
#include "src/ref.h"
#include "src/thread_task.h"
#include "src/wedge.h"
#include "src/film_grain.h"
static void init_internal(void) {
dav1d_init_wedge_masks();
@ -57,9 +58,12 @@ const char *dav1d_version(void) {
void dav1d_default_settings(Dav1dSettings *const s) {
s->n_frame_threads = 1;
s->n_tile_threads = 1;
s->apply_grain = 1;
s->allocator.cookie = NULL;
s->allocator.alloc_picture_callback = default_picture_allocator;
s->allocator.release_picture_callback = default_picture_release;
s->operating_point = 0;
s->all_layers = 1; // just until the tests are adjusted
}
int dav1d_open(Dav1dContext **const c_out,
@ -71,19 +75,26 @@ int dav1d_open(Dav1dContext **const c_out,
validate_input_or_ret(c_out != NULL, -EINVAL);
validate_input_or_ret(s != NULL, -EINVAL);
validate_input_or_ret(s->n_tile_threads >= 1 &&
s->n_tile_threads <= 64, -EINVAL);
s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, -EINVAL);
validate_input_or_ret(s->n_frame_threads >= 1 &&
s->n_frame_threads <= 256, -EINVAL);
s->n_frame_threads <= DAV1D_MAX_FRAME_THREADS, -EINVAL);
validate_input_or_ret(s->allocator.alloc_picture_callback != NULL,
-EINVAL);
validate_input_or_ret(s->allocator.release_picture_callback != NULL,
-EINVAL);
validate_input_or_ret(s->operating_point >= 0 &&
s->operating_point <= 31, -EINVAL);
Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32);
if (!c) goto error;
memset(c, 0, sizeof(*c));
c->allocator = s->allocator;
c->apply_grain = s->apply_grain;
c->operating_point = s->operating_point;
c->all_layers = s->all_layers;
c->frame_thread.flush = &c->frame_thread.flush_mem;
atomic_init(c->frame_thread.flush, 0);
c->n_fc = s->n_frame_threads;
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
if (!c->fc) goto error;
@ -157,6 +168,54 @@ error:
return -ENOMEM;
}
static void dummy_free(const uint8_t *const data, void *const user_data) {
assert(data && !user_data);
}
int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
const uint8_t *const ptr, const size_t sz)
{
Dav1dData buf = { 0 };
int res;
validate_input_or_ret(out != NULL, -EINVAL);
Dav1dSettings s;
dav1d_default_settings(&s);
Dav1dContext *c;
res = dav1d_open(&c, &s);
if (res < 0) return res;
if (ptr) {
res = dav1d_data_wrap(&buf, ptr, sz, dummy_free, NULL);
if (res < 0) goto error;
}
while (buf.sz > 0) {
res = dav1d_parse_obus(c, &buf, 1);
if (res < 0) goto error;
assert((size_t)res <= buf.sz);
buf.sz -= res;
buf.data += res;
}
if (!c->seq_hdr) {
res = -EINVAL;
goto error;
}
memcpy(out, c->seq_hdr, sizeof(*out));
res = 0;
error:
dav1d_data_unref(&buf);
dav1d_close(&c);
return res;
}
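A hedged caller-side sketch of the new dav1d_parse_sequence_header() entry point (editorial, not part of the diff; it assumes Dav1dSequenceHeader keeps the fields of the old Av1SequenceHeader shown further down):
static int probe_stream(const uint8_t *const buf, const size_t buf_size) {
    Dav1dSequenceHeader seq;
    const int res = dav1d_parse_sequence_header(&seq, buf, buf_size);
    if (res < 0)
        return res;   // -EINVAL if the buffer contained no sequence header
    // seq.max_width, seq.max_height, seq.layout, seq.bpc etc. can now be used
    // to size buffers or pick an operating point before dav1d_open()
    return 0;
}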
int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in)
{
validate_input_or_ret(c != NULL, -EINVAL);
@ -170,6 +229,52 @@ int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in)
return 0;
}
static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
Dav1dPicture *const in)
{
const Dav1dFilmGrainData *fgdata = &in->frame_hdr->film_grain.data;
int has_grain = fgdata->num_y_points || fgdata->num_uv_points[0] ||
fgdata->num_uv_points[1];
// skip lower spatial layers
if (c->operating_point_idc && !c->all_layers) {
const int max_spatial_id = ulog2(c->operating_point_idc >> 8);
if (max_spatial_id > in->frame_hdr->spatial_id) {
dav1d_picture_unref(in);
return 0;
}
}
// If there is nothing to be done, skip the allocation/copy
if (!c->apply_grain || !has_grain) {
dav1d_picture_move_ref(out, in);
return 0;
}
// Apply film grain to a new copy of the image to avoid corrupting refs
int res = dav1d_picture_alloc_copy(out, in->p.w, in);
if (res < 0)
return res;
switch (out->p.bpc) {
#if CONFIG_8BPC
case 8:
dav1d_apply_grain_8bpc(out, in);
break;
#endif
#if CONFIG_10BPC
case 10:
dav1d_apply_grain_10bpc(out, in);
break;
#endif
default:
assert(0);
}
dav1d_picture_unref(in);
return 0;
}
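Editorial note on the spatial-layer check above (not part of the diff): operating_point_idc follows the AV1 bitstream layout, with bits 0..7 flagging the temporal layers and bits 8..11 the spatial layers of the selected operating point, e.g.:
//   operating_point_idc = 0x303  ->  idc >> 8 = 0x3  ->  ulog2(0x3) = 1
// so max_spatial_id == 1 and, with all_layers == 0, frames with spatial_id 0
// (the lower layer) are dropped and only the top spatial layer is returned.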
int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
{
int res;
@ -197,21 +302,20 @@ int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
if (++c->frame_thread.next == c->n_fc)
c->frame_thread.next = 0;
if (out_delayed->p.data[0]) {
if (out_delayed->visible && !out_delayed->flushed) {
dav1d_picture_ref(out, &out_delayed->p);
}
const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
memory_order_relaxed);
if (out_delayed->visible && progress != FRAME_ERROR)
dav1d_picture_ref(&c->out, &out_delayed->p);
dav1d_thread_picture_unref(out_delayed);
if (out->data[0]) {
return 0;
}
// else continue
if (c->out.data[0])
return output_image(c, out, &c->out);
}
} while (++flush_count < c->n_fc);
return -EAGAIN;
}
while (in->sz > 0) {
if ((res = dav1d_parse_obus(c, in)) < 0) {
if ((res = dav1d_parse_obus(c, in, 0)) < 0) {
dav1d_data_unref(in);
return res;
}
@ -220,16 +324,12 @@ int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
in->sz -= res;
in->data += res;
if (!in->sz) dav1d_data_unref(in);
if (c->out.data[0]) {
dav1d_picture_move_ref(out, &c->out);
return 0;
}
if (c->out.data[0])
break;
}
if (c->out.data[0]) {
dav1d_picture_move_ref(out, &c->out);
return 0;
}
if (c->out.data[0])
return output_image(c, out, &c->out);
return -EAGAIN;
}
@ -239,8 +339,39 @@ void dav1d_flush(Dav1dContext *const c) {
if (c->n_fc == 1) return;
for (unsigned n = 0; n < c->n_fc; n++)
c->frame_thread.out_delayed[n].flushed = 1;
// mark each currently-running frame as flushing, so that we
// exit out as quickly as the running thread checks this flag
atomic_store(c->frame_thread.flush, 1);
for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
if (next == c->n_fc) next = 0;
Dav1dFrameContext *const f = &c->fc[next];
pthread_mutex_lock(&f->frame_thread.td.lock);
if (f->n_tile_data > 0) {
while (f->n_tile_data > 0)
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
assert(!f->cur.data[0]);
}
pthread_mutex_unlock(&f->frame_thread.td.lock);
Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0])
dav1d_thread_picture_unref(out_delayed);
}
atomic_store(c->frame_thread.flush, 0);
for (int i = 0; i < 8; i++) {
if (c->refs[i].p.p.data[0])
dav1d_thread_picture_unref(&c->refs[i].p);
dav1d_ref_dec(&c->refs[i].segmap);
dav1d_ref_dec(&c->refs[i].refmvs);
if (c->cdf[i].cdf)
dav1d_cdf_thread_unref(&c->cdf[i]);
}
c->frame_hdr = NULL;
c->seq_hdr = NULL;
dav1d_ref_dec(&c->seq_hdr_ref);
c->frame_thread.next = 0;
}
void dav1d_close(Dav1dContext **const c_out) {
@ -249,6 +380,7 @@ void dav1d_close(Dav1dContext **const c_out) {
Dav1dContext *const c = *c_out;
if (!c) return;
dav1d_flush(c);
for (unsigned n = 0; n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];
@ -259,22 +391,6 @@ void dav1d_close(Dav1dContext **const c_out) {
pthread_cond_signal(&f->frame_thread.td.cond);
pthread_mutex_unlock(&f->frame_thread.td.lock);
pthread_join(f->frame_thread.td.thread, NULL);
// free references from dav1d_submit_frame() usually freed by
// dav1d_decode_frame
for (int i = 0; i < 7; i++) {
if (f->refp[i].p.data[0])
dav1d_thread_picture_unref(&f->refp[i]);
dav1d_ref_dec(&f->ref_mvs_ref[i]);
}
dav1d_thread_picture_unref(&f->cur);
dav1d_cdf_thread_unref(&f->in_cdf);
if (f->frame_hdr.refresh_context)
dav1d_cdf_thread_unref(&f->out_cdf);
dav1d_ref_dec(&f->cur_segmap_ref);
dav1d_ref_dec(&f->prev_segmap_ref);
dav1d_ref_dec(&f->mvs_ref);
for (int i = 0; i < f->n_tile_data; i++)
dav1d_data_unref(&f->tile[i].data);
freep(&f->frame_thread.b);
dav1d_freep_aligned(&f->frame_thread.pal_idx);
dav1d_freep_aligned(&f->frame_thread.cf);
@ -324,6 +440,7 @@ void dav1d_close(Dav1dContext **const c_out) {
dav1d_free_aligned(f->ipred_edge[0]);
free(f->a);
free(f->lf.mask);
free(f->lf.lr_mask);
free(f->lf.level);
free(f->lf.tx_lpf_right_edge[0]);
av1_free_ref_mv_common(f->libaom_cm);
@ -348,5 +465,8 @@ void dav1d_close(Dav1dContext **const c_out) {
dav1d_ref_dec(&c->refs[n].refmvs);
dav1d_ref_dec(&c->refs[n].segmap);
}
dav1d_ref_dec(&c->seq_hdr_ref);
dav1d_ref_dec(&c->frame_hdr_ref);
dav1d_freep_aligned(c_out);
}


@ -74,6 +74,8 @@ typedef struct Dav1dLoopRestorationDSPContext {
void dav1d_loop_restoration_dsp_init_8bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_10bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_arm_8bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_arm_10bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_x86_8bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_x86_10bpc(Dav1dLoopRestorationDSPContext *c);


@ -573,7 +573,11 @@ void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *cons
c->wiener = wiener_c;
c->selfguided = selfguided_c;
#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
bitfn(dav1d_loop_restoration_dsp_init_arm)(c);
#elif ARCH_X86
bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
#endif
#endif
}


@ -33,7 +33,6 @@
#include "src/lr_apply.h"
enum LrRestorePlanes {
LR_RESTORE_Y = 1 << 0,
LR_RESTORE_U = 1 << 1,
@ -44,13 +43,14 @@ enum LrRestorePlanes {
// contain at most 2 stripes. Each stripe requires 4 rows of pixels (2 above
// and 2 below); the final 4 rows are used to swap the bottom of the last
// stripe with the top of the next super block row.
static void backup_lpf(pixel *dst, ptrdiff_t dst_stride,
const pixel *src, ptrdiff_t src_stride,
static void backup_lpf(const Dav1dFrameContext *const f,
pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const int ss_ver, const int sb128,
int row, const int row_h, const int w)
int row, const int row_h, const int src_w, const int ss_hor)
{
src_stride = PXSTRIDE(src_stride);
dst_stride = PXSTRIDE(dst_stride);
const int dst_w = f->frame_hdr->super_res.enabled ?
(f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = (64 - 8 * !row) >> ss_ver;
@ -59,23 +59,40 @@ static void backup_lpf(pixel *dst, ptrdiff_t dst_stride,
const int top = 4 << sb128;
// Copy the top part of the stored loop filtered pixels from the
// previous sb row needed above the first stripe of this sb row.
pixel_copy(&dst[dst_stride * 0], &dst[dst_stride * top], w);
pixel_copy(&dst[dst_stride * 1], &dst[dst_stride * (top + 1)], w);
pixel_copy(&dst[dst_stride * 2], &dst[dst_stride * (top + 2)], w);
pixel_copy(&dst[dst_stride * 3], &dst[dst_stride * (top + 3)], w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
&dst[PXSTRIDE(dst_stride) * top], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
&dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
&dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
&dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
}
dst += 4 * dst_stride;
src += (stripe_h - 2) * src_stride;
dst += 4 * PXSTRIDE(dst_stride);
src += (stripe_h - 2) * PXSTRIDE(src_stride);
for (; row + stripe_h <= row_h; row += stripe_h) {
for (int i = 0; i < 4; i++) {
pixel_copy(dst, src, w);
dst += dst_stride;
src += src_stride;
if (f->frame_hdr->super_res.enabled) {
while (row + stripe_h <= row_h) {
f->dsp->mc.resize(dst, dst_stride, src, src_stride,
dst_w, src_w, 4, f->resize_step[ss_hor],
f->resize_start[ss_hor]);
row += stripe_h; // unmodified stripe_h for the 1st stripe
stripe_h = 64 >> ss_ver;
src += stripe_h * PXSTRIDE(src_stride);
dst += 4 * PXSTRIDE(dst_stride);
}
} else {
while (row + stripe_h <= row_h) {
for (int i = 0; i < 4; i++) {
pixel_copy(dst, src, src_w);
dst += PXSTRIDE(dst_stride);
src += PXSTRIDE(src_stride);
}
row += stripe_h; // unmodified stripe_h for the 1st stripe
stripe_h = 64 >> ss_ver;
src += (stripe_h - 4) * PXSTRIDE(src_stride);
}
stripe_h = 64 >> ss_ver;
src += (stripe_h - 4) * src_stride;
}
}
@ -83,47 +100,47 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
/*const*/ pixel *const src[3], const int sby)
{
const ptrdiff_t offset = 8 * !!sby;
const ptrdiff_t *const src_stride = f->cur.p.stride;
const ptrdiff_t *const src_stride = f->cur.stride;
const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);
// TODO Also check block level restore type to reduce copying.
const int restore_planes =
((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
if (restore_planes & LR_RESTORE_Y) {
const int h = f->bh << 2;
const int w = f->bw << 2;
const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset;
backup_lpf(f->lf.lr_lpf_line_ptr[0], sizeof(pixel) * f->b4_stride * 4,
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 4);
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
backup_lpf(f, f->lf.lr_lpf_line_ptr[0], lr_stride,
src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
0, f->seq_hdr.sb128, y_stripe, row_h, w);
0, f->seq_hdr->sb128, y_stripe, row_h, w, 0);
}
if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h = f->bh << (2 - ss_ver);
const int w = f->bw << (2 - ss_hor);
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 4);
const ptrdiff_t offset_uv = offset >> ss_ver;
const int y_stripe =
(sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
(sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
if (restore_planes & LR_RESTORE_U) {
backup_lpf(f->lf.lr_lpf_line_ptr[1], sizeof(pixel) * f->b4_stride * 4,
backup_lpf(f, f->lf.lr_lpf_line_ptr[1], lr_stride,
src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, ss_hor);
}
if (restore_planes & LR_RESTORE_V) {
backup_lpf(f->lf.lr_lpf_line_ptr[2], sizeof(pixel) * f->b4_stride * 4,
backup_lpf(f, f->lf.lr_lpf_line_ptr[2], lr_stride,
src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, ss_hor);
}
}
}
static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
const pixel (*left)[4], int x, int y,
const int plane, const int unit_w, const int row_h,
@ -131,18 +148,18 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
{
const Dav1dDSPContext *const dsp = f->dsp;
const int chroma = !!plane;
const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
const pixel *lpf = f->lf.lr_lpf_line_ptr[plane] + x;
const ptrdiff_t p_stride = f->cur.p.stride[chroma];
const ptrdiff_t lpf_stride = sizeof(pixel) * f->b4_stride * 4;
const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31);
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
// FIXME [8] might be easier for SIMD
int16_t filterh[7], filterv[7];
if (lr->type == RESTORATION_WIENER) {
if (lr->type == DAV1D_RESTORATION_WIENER) {
filterh[0] = filterh[6] = lr->filter_h[0];
filterh[1] = filterh[5] = lr->filter_h[1];
filterh[2] = filterh[4] = lr->filter_h[2];
@ -161,11 +178,11 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
} else {
edges |= LR_HAVE_BOTTOM;
}
if (lr->type == RESTORATION_WIENER) {
if (lr->type == DAV1D_RESTORATION_WIENER) {
dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
filterh, filterv, edges);
} else {
assert(lr->type == RESTORATION_SGRPROJ);
assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
lr->sgr_idx, lr->sgr_weights, edges);
}
@ -192,11 +209,11 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
const int w, const int h, const int row_h, const int plane)
{
const int chroma = !!plane;
const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
const int ss_hor = chroma & (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
const ptrdiff_t p_stride = f->cur.p.stride[chroma];
const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
const int ss_hor = chroma & (f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
const int unit_size_log2 = f->frame_hdr.restoration.unit_size[!!plane];
const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!plane];
const int unit_size = 1 << unit_size_log2;
const int half_unit_size = unit_size >> 1;
const int max_unit_size = unit_size + half_unit_size;
@ -211,42 +228,38 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
// with a 4:2:0 chroma subsampling, do we store the filter information at
// the AV1Filter unit located at (128,128) or (256,256)
// TODO Support chroma subsampling.
const int shift_ver = 7 - ss_ver;
const int shift_hor = 7 - ss_hor;
int ruy = (row_y >> unit_size_log2);
// Merge last restoration unit if its height is < half_unit_size
if (ruy > 0) ruy -= (ruy << unit_size_log2) + half_unit_size > h;
pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
int unit_w = unit_size, bit = 0;
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) |
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT |
(row_h < h ? LR_HAVE_BOTTOM : 0);
for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT, bit ^= 1) {
// TODO Clean up this if statement.
int aligned_unit_pos = row_y & ~(unit_size - 1);
if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h)
aligned_unit_pos -= unit_size;
aligned_unit_pos <<= ss_ver;
const int sb_idx = (aligned_unit_pos >> 7) * f->sr_sb128w;
const int unit_idx = ((aligned_unit_pos >> 6) & 1) << 1;
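// Presumably: aligned_unit_pos is mapped back to luma rows (<< ss_ver), so
// aligned_unit_pos >> 7 is the 128x128-superblock row and sb_idx the row-major
// index into lr_mask; unit_idx picks the top/bottom 64-row half and u_idx
// (below) adds the left/right half, i.e. one of the four 64x64 quadrants.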
for (int x = 0; x < w; x += unit_w, edges |= LR_HAVE_LEFT, bit ^= 1) {
if (x + max_unit_size > w) {
unit_w = w - x;
edges &= ~LR_HAVE_RIGHT;
} else {
edges |= LR_HAVE_RIGHT;
}
// Based on the position of the restoration unit, find the corresponding
// AV1Filter unit.
const int unit_idx = ((ruy & 16) >> 3) + ((rux & 16) >> 4);
const int u_idx = unit_idx + ((x >> (shift_hor - 1)) & 1);
const Av1RestorationUnit *const lr =
&f->lf.mask[(((ruy << (unit_size_log2)) >> shift_ver) * f->sb128w) +
(x >> shift_hor)].lr[plane][unit_idx];
&f->lf.lr_mask[sb_idx + (x >> shift_hor)].lr[plane][u_idx];
// FIXME Don't backup if the next restoration unit is RESTORE_NONE
// This also requires not restoring in the same conditions.
if (edges & LR_HAVE_RIGHT) {
backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, row_h - y);
}
if (lr->type != RESTORATION_NONE) {
if (lr->type != DAV1D_RESTORATION_NONE) {
lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
}
p += unit_w;
@ -257,30 +270,30 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
const int sby)
{
const ptrdiff_t offset_y = 8 * !!sby;
const ptrdiff_t *const dst_stride = f->cur.p.stride;
const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
const int restore_planes =
((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
if (restore_planes & LR_RESTORE_Y) {
const int h = f->cur.p.p.h;
const int w = f->cur.p.p.w;
const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset_y;
const int h = f->sr_cur.p.p.h;
const int w = f->sr_cur.p.p.w;
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h);
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y;
lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
h, row_h, 0);
}
if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h = (f->cur.p.p.h + ss_ver) >> ss_ver;
const int w = (f->cur.p.p.w + ss_hor) >> ss_hor;
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver;
const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h);
const ptrdiff_t offset_uv = offset_y >> ss_ver;
const int y_stripe =
(sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
(sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
if (restore_planes & LR_RESTORE_U)
lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
w, h, row_h, 1);

View File

@ -105,6 +105,12 @@ void (name)(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intp
pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride)
typedef decl_emu_edge_fn(*emu_edge_fn);
#define decl_resize_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const pixel *src, ptrdiff_t src_stride, \
int dst_w, int src_w, int h, int dx, int mx)
typedef decl_resize_fn(*resize_fn);
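// dst_w/src_w are the output and input row widths in pixels; judging from the
// C implementation (resize_c), dx is the per-output-pixel source step and mx
// the initial source position, both 14-bit fixed-point fractions whose top
// bits select one of 64 filter phases.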
typedef struct Dav1dMCDSPContext {
mc_fn mc[N_2D_FILTERS];
mc_scaled_fn mc_scaled[N_2D_FILTERS];
@ -120,6 +126,7 @@ typedef struct Dav1dMCDSPContext {
warp8x8_fn warp8x8;
warp8x8t_fn warp8x8t;
emu_edge_fn emu_edge;
resize_fn resize;
} Dav1dMCDSPContext;
void dav1d_mc_dsp_init_8bpc(Dav1dMCDSPContext *c);

View File

@ -72,11 +72,11 @@ prep_c(coef *tmp, const pixel *src, const ptrdiff_t src_stride,
F[6] * src[x + +3 * stride] + \
F[7] * src[x + +4 * stride])
#define FILTER_8TAP_RND(src, x, F, stride, sh) \
#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
((FILTER_8TAP(src, x, F, stride) + ((1 << sh) >> 1)) >> sh)
#define FILTER_8TAP_CLIP(src, x, F, stride, sh) \
iclip_pixel(FILTER_8TAP_RND(src, x, F, stride, sh))
#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))
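// The ((1 << sh) >> 1) term rounds to nearest before shifting. The filter
// coefficients evidently sum to 64 (the unfiltered fallback below scales by
// << 4 to match), so the horizontal pass (sh == 2) leaves the intermediate
// scaled by 16 and the second pass (sh == 10, or 6 for a single pass) brings
// it back to pixel range.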
#define GET_H_FILTER(mx) \
const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
@ -110,7 +110,7 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
src -= src_stride * 3;
do {
for (int x = 0; x < w; x++)
mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, 2);
mid_ptr += 128;
src += src_stride;
@ -119,7 +119,7 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
mid_ptr = mid + 128 * 3;
do {
for (int x = 0; x < w; x++)
dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10);
dst[x] = DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10);
mid_ptr += 128;
dst += dst_stride;
@ -127,7 +127,7 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
} else {
do {
for (int x = 0; x < w; x++) {
const int px = FILTER_8TAP_RND(src, x, fh, 1, 2);
const int px = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, 2);
dst[x] = iclip_pixel((px + 8) >> 4);
}
@ -138,7 +138,7 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
} else if (fv) {
do {
for (int x = 0; x < w; x++)
dst[x] = FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
dst[x] = DAV1D_FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
dst += dst_stride;
src += src_stride;
@ -164,7 +164,7 @@ put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
for (x = 0; x < w; x++) {
GET_H_FILTER(imx >> 6);
mid_ptr[x] = fh ? FILTER_8TAP_RND(src, ioff, fh, 1, 2) : src[ioff] << 4;
mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1, 2) : src[ioff] << 4;
imx += dx;
ioff += imx >> 10;
imx &= 0x3ff;
@ -180,7 +180,7 @@ put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
GET_V_FILTER(my >> 6);
for (x = 0; x < w; x++)
dst[x] = fv ? FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10) :
dst[x] = fv ? DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10) :
iclip_pixel((mid_ptr[x] + 8) >> 4);
my += dy;
@ -206,7 +206,7 @@ prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
src -= src_stride * 3;
do {
for (int x = 0; x < w; x++)
mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, 2);
mid_ptr += 128;
src += src_stride;
@ -215,7 +215,7 @@ prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
mid_ptr = mid + 128 * 3;
do {
for (int x = 0; x < w; x++)
tmp[x] = FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6);
tmp[x] = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6);
mid_ptr += 128;
tmp += w;
@ -223,7 +223,7 @@ prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
} else {
do {
for (int x = 0; x < w; x++)
tmp[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, 2);
tmp += w;
src += src_stride;
@ -232,7 +232,7 @@ prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
} else if (fv) {
do {
for (int x = 0; x < w; x++)
tmp[x] = FILTER_8TAP_RND(src, x, fv, src_stride, 2);
tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fv, src_stride, 2);
tmp += w;
src += src_stride;
@ -257,7 +257,7 @@ prep_8tap_scaled_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
for (x = 0; x < w; x++) {
GET_H_FILTER(imx >> 6);
mid_ptr[x] = fh ? FILTER_8TAP_RND(src, ioff, fh, 1, 2) : src[ioff] << 4;
mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1, 2) : src[ioff] << 4;
imx += dx;
ioff += imx >> 10;
imx &= 0x3ff;
@ -273,7 +273,7 @@ prep_8tap_scaled_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
GET_V_FILTER(my >> 6);
for (x = 0; x < w; x++)
tmp[x] = fv ? FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) : mid_ptr[x];
tmp[x] = fv ? DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) : mid_ptr[x];
my += dy;
mid_ptr += (my >> 10) * 128;
@ -324,15 +324,15 @@ static void prep_8tap_##type##_scaled_c(coef *const tmp, \
type_h | (type_v << 2)); \
}
filter_fns(regular, FILTER_8TAP_REGULAR, FILTER_8TAP_REGULAR)
filter_fns(regular_sharp, FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP)
filter_fns(regular_smooth, FILTER_8TAP_REGULAR, FILTER_8TAP_SMOOTH)
filter_fns(smooth, FILTER_8TAP_SMOOTH, FILTER_8TAP_SMOOTH)
filter_fns(smooth_regular, FILTER_8TAP_SMOOTH, FILTER_8TAP_REGULAR)
filter_fns(smooth_sharp, FILTER_8TAP_SMOOTH, FILTER_8TAP_SHARP)
filter_fns(sharp, FILTER_8TAP_SHARP, FILTER_8TAP_SHARP)
filter_fns(sharp_regular, FILTER_8TAP_SHARP, FILTER_8TAP_REGULAR)
filter_fns(sharp_smooth, FILTER_8TAP_SHARP, FILTER_8TAP_SMOOTH)
filter_fns(regular, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR)
filter_fns(regular_sharp, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP)
filter_fns(regular_smooth, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH)
filter_fns(smooth, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SMOOTH)
filter_fns(smooth_regular, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_REGULAR)
filter_fns(smooth_sharp, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SHARP)
filter_fns(sharp, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SHARP)
filter_fns(sharp_regular, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR)
filter_fns(sharp_smooth, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH)
#define FILTER_BILIN(src, x, mxy, stride) \
(16 * src[x] + ((mxy) * (src[x + stride] - src[x])))
@ -782,6 +782,34 @@ static void emu_edge_c(const intptr_t bw, const intptr_t bh,
}
}
static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const int dst_w, const int src_w, int h,
const int dx, const int mx0)
{
do {
int mx = mx0, src_x = -1;
for (int x = 0; x < dst_w; x++) {
const int16_t *const F = dav1d_resize_filter[mx >> 8];
dst[x] = iclip_pixel((F[0] * src[iclip(src_x - 3, 0, src_w - 1)] +
F[1] * src[iclip(src_x - 2, 0, src_w - 1)] +
F[2] * src[iclip(src_x - 1, 0, src_w - 1)] +
F[3] * src[iclip(src_x + 0, 0, src_w - 1)] +
F[4] * src[iclip(src_x + 1, 0, src_w - 1)] +
F[5] * src[iclip(src_x + 2, 0, src_w - 1)] +
F[6] * src[iclip(src_x + 3, 0, src_w - 1)] +
F[7] * src[iclip(src_x + 4, 0, src_w - 1)] +
64) >> 7);
mx += dx;
src_x += mx >> 14;
mx &= 0x3fff;
}
dst += PXSTRIDE(dst_stride);
src += PXSTRIDE(src_stride);
} while (--h);
}
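// resize_c() walks each destination row in 14-bit fixed point: mx >> 8 selects
// one of 64 8-tap phases from dav1d_resize_filter, src_x advances by whole
// source pixels as the fraction overflows, and the taps are clamped to
// [0, src_w - 1] at the frame edges. The +64 / >> 7 rounding implies the
// coefficients sum to 128.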
void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
#define init_mc_fns(type, name) do { \
c->mc [type] = put_##name##_c; \
@ -813,6 +841,7 @@ void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
c->warp8x8 = warp_affine_8x8_c;
c->warp8x8t = warp_affine_8x8t_c;
c->emu_edge = emu_edge_c;
c->resize = resize_c;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM

View File

@ -62,7 +62,8 @@ libdav1d_tmpl_sources = files(
'cdef_tmpl.c',
'lr_apply_tmpl.c',
'looprestoration_tmpl.c',
'recon_tmpl.c'
'recon_tmpl.c',
'film_grain_tmpl.c',
)
# libdav1d entrypoint source files
@ -83,10 +84,12 @@ if is_asm_enabled
'arm/cpu.c',
)
libdav1d_tmpl_sources += files(
'arm/looprestoration_init_tmpl.c',
'arm/mc_init_tmpl.c',
)
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
'arm/64/looprestoration.S',
'arm/64/mc.S',
)
elif host_machine.cpu_family().startswith('arm')
@ -118,6 +121,7 @@ if is_asm_enabled
'x86/loopfilter.asm',
'x86/looprestoration.asm',
'x86/mc.asm',
'x86/mc_ssse3.asm',
)
# Compile the ASM sources with NASM

View File

@ -41,10 +41,10 @@
#include "src/levels.h"
#include "src/obu.h"
#include "src/ref.h"
#include "src/warpmv.h"
#include "src/thread_task.h"
static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
Av1SequenceHeader *const hdr)
Dav1dSequenceHeader *const hdr)
{
#define DEBUG_SEQ_HDR 0
@ -105,7 +105,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
hdr->display_model_info_present = dav1d_get_bits(gb, 1);
hdr->num_operating_points = dav1d_get_bits(gb, 5) + 1;
for (int i = 0; i < hdr->num_operating_points; i++) {
struct Av1SequenceHeaderOperatingPoint *const op =
struct Dav1dSequenceHeaderOperatingPoint *const op =
&hdr->operating_points[i];
op->idc = dav1d_get_bits(gb, 12);
op->major_level = 2 + dav1d_get_bits(gb, 3);
@ -126,6 +126,10 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
op->initial_display_delay = dav1d_get_bits(gb, 4) + 1;
}
}
if (c->operating_point < hdr->num_operating_points)
c->operating_point_idc = hdr->operating_points[c->operating_point].idc;
else
c->operating_point_idc = hdr->operating_points[0].idc;
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-operating-points: off=%ld\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
@ -163,8 +167,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
hdr->jnt_comp = 0;
hdr->ref_frame_mvs = 0;
hdr->order_hint_n_bits = 0;
hdr->screen_content_tools = ADAPTIVE;
hdr->force_integer_mv = ADAPTIVE;
hdr->screen_content_tools = DAV1D_ADAPTIVE;
hdr->force_integer_mv = DAV1D_ADAPTIVE;
} else {
hdr->inter_intra = dav1d_get_bits(gb, 1);
hdr->masked_compound = dav1d_get_bits(gb, 1);
@ -179,13 +183,13 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
hdr->ref_frame_mvs = 0;
hdr->order_hint_n_bits = 0;
}
hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? ADAPTIVE : dav1d_get_bits(gb, 1);
hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1);
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-screentools: off=%ld\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
hdr->force_integer_mv = hdr->screen_content_tools ?
dav1d_get_bits(gb, 1) ? ADAPTIVE : dav1d_get_bits(gb, 1) : 2;
dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1) : 2;
if (hdr->order_hint)
hdr->order_hint_n_bits = dav1d_get_bits(gb, 3) + 1;
}
@ -197,10 +201,9 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
const int hbd = dav1d_get_bits(gb, 1);
hdr->bpc = hdr->profile == 2 && hbd ? 10U + 2 * dav1d_get_bits(gb, 1) : 8U + 2 * hbd;
hdr->hbd = hdr->bpc > 8;
const int monochrome = hdr->profile != 1 ? dav1d_get_bits(gb, 1) : 0;
hdr->hbd = dav1d_get_bits(gb, 1);
if (hdr->profile == 2 && hdr->hbd) hdr->hbd += dav1d_get_bits(gb, 1);
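// hbd now carries the bit depth directly: 0, 1 and 2 presumably correspond to
// 8, 10 and 12 bits per component (the extra bit is only coded for profile 2).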
hdr->monochrome = hdr->profile != 1 ? dav1d_get_bits(gb, 1) : 0;
hdr->color_description_present = dav1d_get_bits(gb, 1);
if (hdr->color_description_present) {
hdr->pri = dav1d_get_bits(gb, 8);
@ -211,9 +214,10 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
hdr->trc = DAV1D_TRC_UNKNOWN;
hdr->mtrx = DAV1D_MC_UNKNOWN;
}
if (monochrome) {
if (hdr->monochrome) {
hdr->color_range = dav1d_get_bits(gb, 1);
hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
hdr->ss_hor = hdr->ss_ver = 0;
hdr->chr = DAV1D_CHR_UNKNOWN;
hdr->separate_uv_delta_q = 0;
} else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
@ -221,26 +225,35 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
hdr->mtrx == DAV1D_MC_IDENTITY)
{
hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
hdr->ss_hor = hdr->ss_ver = 1;
hdr->color_range = 1;
if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->bpc == 12))
if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2))
goto error;
} else {
hdr->color_range = dav1d_get_bits(gb, 1);
switch (hdr->profile) {
case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420; break;
case 1: hdr->layout = DAV1D_PIXEL_LAYOUT_I444; break;
case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420;
hdr->ss_hor = hdr->ss_ver = 1;
break;
case 1: hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
hdr->ss_hor = hdr->ss_ver = 0;
break;
case 2:
if (hdr->bpc == 12) {
hdr->layout = dav1d_get_bits(gb, 1) ?
dav1d_get_bits(gb, 1) ? DAV1D_PIXEL_LAYOUT_I420 :
DAV1D_PIXEL_LAYOUT_I422 :
DAV1D_PIXEL_LAYOUT_I444;
} else
hdr->layout = DAV1D_PIXEL_LAYOUT_I422;
if (hdr->hbd == 2) {
hdr->ss_hor = dav1d_get_bits(gb, 1);
hdr->ss_ver = hdr->ss_hor && dav1d_get_bits(gb, 1);
} else {
hdr->ss_hor = 1;
hdr->ss_ver = 0;
}
hdr->layout = hdr->ss_hor ?
hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 :
DAV1D_PIXEL_LAYOUT_I422 :
DAV1D_PIXEL_LAYOUT_I444;
break;
}
if (hdr->layout == DAV1D_PIXEL_LAYOUT_I420)
hdr->chr = dav1d_get_bits(gb, 2);
hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ?
dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
hdr->separate_uv_delta_q = dav1d_get_bits(gb, 1);
}
#if DEBUG_SEQ_HDR
@ -270,39 +283,54 @@ error:
static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
const int use_ref)
{
const Av1SequenceHeader *const seqhdr = &c->seq_hdr;
Av1FrameHeader *const hdr = &c->frame_hdr;
const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
Dav1dFrameHeader *const hdr = c->frame_hdr;
if (use_ref) {
for (int i = 0; i < 7; i++) {
if (dav1d_get_bits(gb, 1)) {
Dav1dThreadPicture *const ref =
&c->refs[c->frame_hdr.refidx[i]].p;
&c->refs[c->frame_hdr->refidx[i]].p;
if (!ref->p.data[0]) return -1;
// FIXME render_* may be wrong
hdr->render_width = hdr->width = ref->p.p.w;
hdr->render_width = hdr->width[1] = ref->p.p.w;
hdr->render_height = hdr->height = ref->p.p.h;
hdr->super_res = 0; // FIXME probably wrong
hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1);
if (hdr->super_res.enabled) {
const int d = hdr->super_res.width_scale_denominator =
9 + dav1d_get_bits(gb, 3);
hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d,
imin(16, hdr->width[1]));
} else {
hdr->super_res.width_scale_denominator = 8;
hdr->width[0] = hdr->width[1];
}
return 0;
}
}
}
if (hdr->frame_size_override) {
hdr->width = dav1d_get_bits(gb, seqhdr->width_n_bits) + 1;
hdr->width[1] = dav1d_get_bits(gb, seqhdr->width_n_bits) + 1;
hdr->height = dav1d_get_bits(gb, seqhdr->height_n_bits) + 1;
} else {
hdr->width = seqhdr->max_width;
hdr->width[1] = seqhdr->max_width;
hdr->height = seqhdr->max_height;
}
hdr->super_res = seqhdr->super_res && dav1d_get_bits(gb, 1);
if (hdr->super_res) return -1; // FIXME
hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1);
if (hdr->super_res.enabled) {
const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3);
hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d, imin(16, hdr->width[1]));
} else {
hdr->super_res.width_scale_denominator = 8;
hdr->width[0] = hdr->width[1];
}
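// Worked example: with width[1] == 1920 and a coded denominator d == 16,
// width[0] == imax((1920 * 8 + 8) / 16, imin(16, 1920)) == 960, i.e. the frame
// is decoded at half width and later upscaled back to 1920.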
hdr->have_render_size = dav1d_get_bits(gb, 1);
if (hdr->have_render_size) {
hdr->render_width = dav1d_get_bits(gb, 16) + 1;
hdr->render_height = dav1d_get_bits(gb, 16) + 1;
} else {
hdr->render_width = hdr->width;
hdr->render_width = hdr->width[1];
hdr->render_height = hdr->height;
}
return 0;
@ -314,7 +342,7 @@ static inline int tile_log2(int sz, int tgt) {
return k;
}
static const Av1LoopfilterModeRefDeltas default_mode_ref_deltas = {
static const Dav1dLoopfilterModeRefDeltas default_mode_ref_deltas = {
.mode_delta = { 0, 0 },
.ref_delta = { 1, 0, 0, 0, -1, 0, -1, -1 },
};
@ -325,8 +353,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
#if DEBUG_FRAME_HDR
const uint8_t *const init_ptr = gb->ptr;
#endif
const Av1SequenceHeader *const seqhdr = &c->seq_hdr;
Av1FrameHeader *const hdr = &c->frame_hdr;
const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
Dav1dFrameHeader *const hdr = c->frame_hdr;
int res;
hdr->show_existing_frame =
@ -360,10 +388,10 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->disable_cdf_update = dav1d_get_bits(gb, 1);
hdr->allow_screen_content_tools = seqhdr->screen_content_tools == ADAPTIVE ?
hdr->allow_screen_content_tools = seqhdr->screen_content_tools == DAV1D_ADAPTIVE ?
dav1d_get_bits(gb, 1) : seqhdr->screen_content_tools;
if (hdr->allow_screen_content_tools)
hdr->force_integer_mv = seqhdr->force_integer_mv == ADAPTIVE ?
hdr->force_integer_mv = seqhdr->force_integer_mv == DAV1D_ADAPTIVE ?
dav1d_get_bits(gb, 1) : seqhdr->force_integer_mv;
else
hdr->force_integer_mv = 0;
@ -383,18 +411,18 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->frame_offset = seqhdr->order_hint ?
dav1d_get_bits(gb, seqhdr->order_hint_n_bits) : 0;
hdr->primary_ref_frame = !hdr->error_resilient_mode && hdr->frame_type & 1 ?
dav1d_get_bits(gb, 3) : PRIMARY_REF_NONE;
dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE;
if (seqhdr->decoder_model_info_present) {
hdr->buffer_removal_time_present = dav1d_get_bits(gb, 1);
if (hdr->buffer_removal_time_present) {
for (int i = 0; i < c->seq_hdr.num_operating_points; i++) {
const struct Av1SequenceHeaderOperatingPoint *const seqop = &seqhdr->operating_points[i];
struct Av1FrameHeaderOperatingPoint *const op = &hdr->operating_points[i];
for (int i = 0; i < c->seq_hdr->num_operating_points; i++) {
const struct Dav1dSequenceHeaderOperatingPoint *const seqop = &seqhdr->operating_points[i];
struct Dav1dFrameHeaderOperatingPoint *const op = &hdr->operating_points[i];
if (seqop->decoder_model_param_present) {
int in_temporal_layer = (seqop->idc >> 0 /* FIXME: temporal_id */ ) & 1;
int in_spatial_layer = (seqop->idc >> (0 /* FIXME: spatial_id */ + 8)) & 1;
if (!seqop->idc || in_temporal_layer || in_spatial_layer)
int in_temporal_layer = (seqop->idc >> hdr->temporal_id) & 1;
int in_spatial_layer = (seqop->idc >> (hdr->spatial_id + 8)) & 1;
if (!seqop->idc || (in_temporal_layer && in_spatial_layer))
op->buffer_removal_time = dav1d_get_bits(gb, seqhdr->buffer_removal_delay_length);
}
}
@ -411,7 +439,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
if ((res = read_frame_size(c, gb, 0)) < 0) goto error;
hdr->allow_intrabc = hdr->allow_screen_content_tools &&
/* FIXME: no superres scaling && */ dav1d_get_bits(gb, 1);
!hdr->super_res.enabled && dav1d_get_bits(gb, 1);
hdr->use_ref_frame_mvs = 0;
} else {
hdr->allow_intrabc = 0;
@ -422,9 +450,99 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
hdr->frame_ref_short_signaling =
seqhdr->order_hint && dav1d_get_bits(gb, 1);
if (hdr->frame_ref_short_signaling) goto error; // FIXME
if (hdr->frame_ref_short_signaling) { // FIXME: Nearly verbatim copy from section 7.8
hdr->refidx[0] = dav1d_get_bits(gb, 3);
hdr->refidx[1] = hdr->refidx[2] = -1;
hdr->refidx[3] = dav1d_get_bits(gb, 3);
hdr->refidx[4] = hdr->refidx[5] = hdr->refidx[6] = -1;
int shifted_frame_offset[8];
const int current_frame_offset = 1 << (seqhdr->order_hint_n_bits - 1);
for (int i = 0; i < 8; i++) {
if (!c->refs[i].p.p.frame_hdr) goto error;
shifted_frame_offset[i] = current_frame_offset +
get_poc_diff(seqhdr->order_hint_n_bits,
c->refs[i].p.p.frame_hdr->frame_offset,
hdr->frame_offset);
}
int used_frame[8] = { 0 };
used_frame[hdr->refidx[0]] = 1;
used_frame[hdr->refidx[3]] = 1;
int latest_frame_offset = -1;
for (int i = 0; i < 8; i++) {
int hint = shifted_frame_offset[i];
if (!used_frame[i] && hint >= current_frame_offset &&
hint >= latest_frame_offset)
{
hdr->refidx[6] = i;
latest_frame_offset = hint;
}
}
if (latest_frame_offset != -1)
used_frame[hdr->refidx[6]] = 1;
int earliest_frame_offset = INT_MAX;
for (int i = 0; i < 8; i++) {
int hint = shifted_frame_offset[i];
if (!used_frame[i] && hint >= current_frame_offset &&
hint < earliest_frame_offset)
{
hdr->refidx[4] = i;
earliest_frame_offset = hint;
}
}
if (earliest_frame_offset != INT_MAX)
used_frame[hdr->refidx[4]] = 1;
earliest_frame_offset = INT_MAX;
for (int i = 0; i < 8; i++) {
int hint = shifted_frame_offset[i];
if (!used_frame[i] && hint >= current_frame_offset &&
(hint < earliest_frame_offset))
{
hdr->refidx[5] = i;
earliest_frame_offset = hint;
}
}
if (earliest_frame_offset != INT_MAX)
used_frame[hdr->refidx[5]] = 1;
for (int i = 1; i < 7; i++) {
if (hdr->refidx[i] < 0) {
latest_frame_offset = -1;
for (int j = 0; j < 8; j++) {
int hint = shifted_frame_offset[j];
if (!used_frame[j] && hint < current_frame_offset &&
hint >= latest_frame_offset)
{
hdr->refidx[i] = j;
latest_frame_offset = hint;
}
}
if (latest_frame_offset != -1)
used_frame[hdr->refidx[i]] = 1;
}
}
earliest_frame_offset = INT_MAX;
int ref = -1;
for (int i = 0; i < 8; i++) {
int hint = shifted_frame_offset[i];
if (hint < earliest_frame_offset) {
ref = i;
earliest_frame_offset = hint;
}
}
for (int i = 0; i < 7; i++) {
if (hdr->refidx[i] < 0)
hdr->refidx[i] = ref;
}
}
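// Summary of the derivation above (section 7.8): only LAST (refidx[0]) and
// GOLDEN (refidx[3]) are coded explicitly; ALTREF takes the most distant
// forward reference, BWDREF and ALTREF2 the two nearest forward references,
// the remaining slots are filled with the most recent unused past references,
// and anything still unset falls back to the overall oldest frame.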
for (int i = 0; i < 7; i++) {
hdr->refidx[i] = dav1d_get_bits(gb, 3);
if (!hdr->frame_ref_short_signaling)
hdr->refidx[i] = dav1d_get_bits(gb, 3);
if (seqhdr->frame_id_numbers_present)
dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits);
}
@ -432,7 +550,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->frame_size_override;
if ((res = read_frame_size(c, gb, use_ref)) < 0) goto error;
hdr->hp = !hdr->force_integer_mv && dav1d_get_bits(gb, 1);
hdr->subpel_filter_mode = dav1d_get_bits(gb, 1) ? FILTER_SWITCHABLE :
hdr->subpel_filter_mode = dav1d_get_bits(gb, 1) ? DAV1D_FILTER_SWITCHABLE :
dav1d_get_bits(gb, 2);
hdr->switchable_motion_mode = dav1d_get_bits(gb, 1);
hdr->use_ref_frame_mvs = !hdr->error_resilient_mode &&
@ -455,13 +573,13 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->tiling.uniform = dav1d_get_bits(gb, 1);
const int sbsz_min1 = (64 << seqhdr->sb128) - 1;
int sbsz_log2 = 6 + seqhdr->sb128;
int sbw = (hdr->width + sbsz_min1) >> sbsz_log2;
int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2;
int sbh = (hdr->height + sbsz_min1) >> sbsz_log2;
int max_tile_width_sb = 4096 >> sbsz_log2;
int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2);
hdr->tiling.min_log2_cols = tile_log2(max_tile_width_sb, sbw);
hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, MAX_TILE_COLS));
hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, MAX_TILE_ROWS));
hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, DAV1D_MAX_TILE_COLS));
hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, DAV1D_MAX_TILE_ROWS));
int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh),
hdr->tiling.min_log2_cols);
if (hdr->tiling.uniform) {
@ -485,7 +603,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
} else {
hdr->tiling.cols = 0;
int widest_tile = 0, max_tile_area_sb = sbw * sbh;
for (int sbx = 0; sbx < sbw && hdr->tiling.cols < MAX_TILE_COLS; hdr->tiling.cols++) {
for (int sbx = 0; sbx < sbw && hdr->tiling.cols < DAV1D_MAX_TILE_COLS; hdr->tiling.cols++) {
const int tile_width_sb = imin(sbw - sbx, max_tile_width_sb);
const int tile_w = (tile_width_sb > 1) ?
1 + dav1d_get_uniform(gb, tile_width_sb) :
@ -499,7 +617,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1);
hdr->tiling.rows = 0;
for (int sby = 0; sby < sbh && hdr->tiling.rows < MAX_TILE_ROWS; hdr->tiling.rows++) {
for (int sby = 0; sby < sbh && hdr->tiling.rows < DAV1D_MAX_TILE_ROWS; hdr->tiling.rows++) {
const int tile_height_sb = imin(sbh - sby, max_tile_height_sb);
const int tile_h = (tile_height_sb > 1) ?
1 + dav1d_get_uniform(gb, tile_height_sb) :
@ -528,7 +646,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
// quant data
hdr->quant.yac = dav1d_get_bits(gb, 8);
hdr->quant.ydc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
if (seqhdr->layout != DAV1D_PIXEL_LAYOUT_I400) {
if (!seqhdr->monochrome) {
// If the sequence header says that delta_q might be different
// for U, V, we must check whether it actually is for this
// frame.
@ -563,7 +681,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
// segmentation data
hdr->segmentation.enabled = dav1d_get_bits(gb, 1);
if (hdr->segmentation.enabled) {
if (hdr->primary_ref_frame == PRIMARY_REF_NONE) {
if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
hdr->segmentation.update_map = 1;
hdr->segmentation.temporal = 0;
hdr->segmentation.update_data = 1;
@ -577,8 +695,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
if (hdr->segmentation.update_data) {
hdr->segmentation.seg_data.preskip = 0;
hdr->segmentation.seg_data.last_active_segid = -1;
for (int i = 0; i < NUM_SEGMENTS; i++) {
Av1SegmentationData *const seg =
for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
Dav1dSegmentationData *const seg =
&hdr->segmentation.seg_data.d[i];
if (dav1d_get_bits(gb, 1)) {
seg->delta_q = dav1d_get_sbits(gb, 8);
@ -629,13 +747,15 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
} else {
// segmentation.update_data was false so we should copy
// segmentation data from the reference frame.
assert(hdr->primary_ref_frame != PRIMARY_REF_NONE);
assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
hdr->segmentation.seg_data = c->refs[pri_ref].seg_data;
if (!c->refs[pri_ref].p.p.frame_hdr) return -EINVAL;
hdr->segmentation.seg_data =
c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data;
}
} else {
memset(&hdr->segmentation.seg_data, 0, sizeof(Av1SegmentationDataSet));
for (int i = 0; i < NUM_SEGMENTS; i++)
memset(&hdr->segmentation.seg_data, 0, sizeof(Dav1dSegmentationDataSet));
for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++)
hdr->segmentation.seg_data.d[i].ref = -1;
}
#if DEBUG_FRAME_HDR
@ -659,7 +779,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
const int delta_lossless = !hdr->quant.ydc_delta && !hdr->quant.udc_delta &&
!hdr->quant.uac_delta && !hdr->quant.vdc_delta && !hdr->quant.vac_delta;
hdr->all_lossless = 1;
for (int i = 0; i < NUM_SEGMENTS; i++) {
for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
hdr->segmentation.qidx[i] = hdr->segmentation.enabled ?
iclip_u8(hdr->quant.yac + hdr->segmentation.seg_data.d[i].delta_q) :
hdr->quant.yac;
@ -679,7 +799,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
} else {
hdr->loopfilter.level_y[0] = dav1d_get_bits(gb, 6);
hdr->loopfilter.level_y[1] = dav1d_get_bits(gb, 6);
if (seqhdr->layout != DAV1D_PIXEL_LAYOUT_I400 &&
if (!seqhdr->monochrome &&
(hdr->loopfilter.level_y[0] || hdr->loopfilter.level_y[1]))
{
hdr->loopfilter.level_u = dav1d_get_bits(gb, 6);
@ -687,11 +807,13 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
}
hdr->loopfilter.sharpness = dav1d_get_bits(gb, 3);
if (hdr->primary_ref_frame == PRIMARY_REF_NONE) {
if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
} else {
const int ref = hdr->refidx[hdr->primary_ref_frame];
hdr->loopfilter.mode_ref_deltas = c->refs[ref].lf_mode_ref_deltas;
if (!c->refs[ref].p.p.frame_hdr) return -EINVAL;
hdr->loopfilter.mode_ref_deltas =
c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas;
}
hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bits(gb, 1);
if (hdr->loopfilter.mode_ref_delta_enabled) {
@ -719,7 +841,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->cdef.n_bits = dav1d_get_bits(gb, 2);
for (int i = 0; i < (1 << hdr->cdef.n_bits); i++) {
hdr->cdef.y_strength[i] = dav1d_get_bits(gb, 6);
if (seqhdr->layout != DAV1D_PIXEL_LAYOUT_I400)
if (!seqhdr->monochrome)
hdr->cdef.uv_strength[i] = dav1d_get_bits(gb, 6);
}
} else {
@ -733,14 +855,16 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
#endif
// restoration
if (!hdr->all_lossless && seqhdr->restoration && !hdr->allow_intrabc) {
if ((!hdr->all_lossless || hdr->super_res.enabled) &&
seqhdr->restoration && !hdr->allow_intrabc)
{
hdr->restoration.type[0] = dav1d_get_bits(gb, 2);
if (seqhdr->layout != DAV1D_PIXEL_LAYOUT_I400) {
if (!seqhdr->monochrome) {
hdr->restoration.type[1] = dav1d_get_bits(gb, 2);
hdr->restoration.type[2] = dav1d_get_bits(gb, 2);
} else {
hdr->restoration.type[1] =
hdr->restoration.type[2] = RESTORATION_NONE;
hdr->restoration.type[2] = DAV1D_RESTORATION_NONE;
}
if (hdr->restoration.type[0] || hdr->restoration.type[1] ||
@ -755,7 +879,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
}
hdr->restoration.unit_size[1] = hdr->restoration.unit_size[0];
if ((hdr->restoration.type[1] || hdr->restoration.type[2]) &&
seqhdr->layout == DAV1D_PIXEL_LAYOUT_I420)
seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1)
{
hdr->restoration.unit_size[1] -= dav1d_get_bits(gb, 1);
}
@ -763,17 +887,17 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->restoration.unit_size[0] = 8;
}
} else {
hdr->restoration.type[0] = RESTORATION_NONE;
hdr->restoration.type[1] = RESTORATION_NONE;
hdr->restoration.type[2] = RESTORATION_NONE;
hdr->restoration.type[0] = DAV1D_RESTORATION_NONE;
hdr->restoration.type[1] = DAV1D_RESTORATION_NONE;
hdr->restoration.type[2] = DAV1D_RESTORATION_NONE;
}
#if DEBUG_FRAME_HDR
printf("HDR: post-restoration: off=%ld\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->txfm_mode = hdr->all_lossless ? TX_4X4_ONLY :
dav1d_get_bits(gb, 1) ? TX_SWITCHABLE : TX_LARGEST;
hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY :
dav1d_get_bits(gb, 1) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST;
#if DEBUG_FRAME_HDR
printf("HDR: post-txfmmode: off=%ld\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
@ -790,7 +914,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
int off_after = -1;
int off_before_idx[2], off_after_idx;
for (int i = 0; i < 7; i++) {
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.poc;
if (!c->refs[hdr->refidx[i]].p.p.data[0]) return -EINVAL;
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc);
if (diff > 0) {
@ -854,21 +979,26 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
if (hdr->frame_type & 1) {
for (int i = 0; i < 7; i++) {
hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? WM_TYPE_IDENTITY :
dav1d_get_bits(gb, 1) ? WM_TYPE_ROT_ZOOM :
dav1d_get_bits(gb, 1) ? WM_TYPE_TRANSLATION :
WM_TYPE_AFFINE;
hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_IDENTITY :
dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_ROT_ZOOM :
dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_TRANSLATION :
DAV1D_WM_TYPE_AFFINE;
if (hdr->gmv[i].type == WM_TYPE_IDENTITY) continue;
if (hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY) continue;
const WarpedMotionParams *const ref_gmv =
hdr->primary_ref_frame == PRIMARY_REF_NONE ? &dav1d_default_wm_params :
&c->refs[hdr->refidx[hdr->primary_ref_frame]].gmv[i];
const Dav1dWarpedMotionParams *ref_gmv;
if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
ref_gmv = &dav1d_default_wm_params;
} else {
const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
if (!c->refs[pri_ref].p.p.frame_hdr) return -EINVAL;
ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i];
}
int32_t *const mat = hdr->gmv[i].matrix;
const int32_t *const ref_mat = ref_gmv->matrix;
int bits, shift;
if (hdr->gmv[i].type >= WM_TYPE_ROT_ZOOM) {
if (hdr->gmv[i].type >= DAV1D_WM_TYPE_ROT_ZOOM) {
mat[2] = (1 << 16) + 2 *
dav1d_get_bits_subexp(gb, (ref_mat[2] - (1 << 16)) >> 1, 12);
mat[3] = 2 * dav1d_get_bits_subexp(gb, ref_mat[3] >> 1, 12);
@ -880,7 +1010,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
shift = 13 + !hdr->hp;
}
if (hdr->gmv[i].type == WM_TYPE_AFFINE) {
if (hdr->gmv[i].type == DAV1D_WM_TYPE_AFFINE) {
mat[4] = 2 * dav1d_get_bits_subexp(gb, ref_mat[4] >> 1, 12);
mat[5] = (1 << 16) + 2 *
dav1d_get_bits_subexp(gb, (ref_mat[5] - (1 << 16)) >> 1, 12);
@ -891,9 +1021,6 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
mat[0] = dav1d_get_bits_subexp(gb, ref_mat[0] >> shift, bits) * (1 << shift);
mat[1] = dav1d_get_bits_subexp(gb, ref_mat[1] >> shift, bits) * (1 << shift);
if (dav1d_get_shear_params(&hdr->gmv[i]))
hdr->gmv[i].type = WM_TYPE_TRANSLATION;
}
}
#if DEBUG_FRAME_HDR
@ -905,7 +1032,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
(hdr->show_frame || hdr->showable_frame) &&
dav1d_get_bits(gb, 1);
if (hdr->film_grain.present) {
hdr->film_grain.seed = dav1d_get_bits(gb, 16);
const unsigned seed = dav1d_get_bits(gb, 16);
hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bits(gb, 1);
if (!hdr->film_grain.update) {
const int refidx = dav1d_get_bits(gb, 3);
@ -913,10 +1040,12 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
for (i = 0; i < 7; i++)
if (hdr->refidx[i] == refidx)
break;
if (i == 7) goto error;
hdr->film_grain.data = c->refs[refidx].film_grain;
if (i == 7 || !c->refs[refidx].p.p.frame_hdr) goto error;
hdr->film_grain.data = c->refs[refidx].p.p.frame_hdr->film_grain.data;
hdr->film_grain.data.seed = seed;
} else {
Av1FilmGrainData *const fgd = &hdr->film_grain.data;
Dav1dFilmGrainData *const fgd = &hdr->film_grain.data;
fgd->seed = seed;
fgd->num_y_points = dav1d_get_bits(gb, 4);
if (fgd->num_y_points > 14) goto error;
@ -928,10 +1057,9 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
}
fgd->chroma_scaling_from_luma =
seqhdr->layout != DAV1D_PIXEL_LAYOUT_I400 && dav1d_get_bits(gb, 1);
if (seqhdr->layout == DAV1D_PIXEL_LAYOUT_I400 ||
fgd->chroma_scaling_from_luma ||
(seqhdr->layout == DAV1D_PIXEL_LAYOUT_I420 && !fgd->num_y_points))
!seqhdr->monochrome && dav1d_get_bits(gb, 1);
if (seqhdr->monochrome || fgd->chroma_scaling_from_luma ||
(seqhdr->ss_ver == 1 && seqhdr->ss_hor == 1 && !fgd->num_y_points))
{
fgd->num_uv_points[0] = fgd->num_uv_points[1] = 0;
} else for (int pl = 0; pl < 2; pl++) {
@ -945,7 +1073,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
}
}
if (seqhdr->layout == DAV1D_PIXEL_LAYOUT_I420 &&
if (seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1 &&
!!fgd->num_uv_points[0] != !!fgd->num_uv_points[1])
{
goto error;
@ -967,9 +1095,9 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
fgd->grain_scale_shift = dav1d_get_bits(gb, 2);
for (int pl = 0; pl < 2; pl++)
if (fgd->num_uv_points[pl]) {
fgd->uv_mult[pl] = dav1d_get_bits(gb, 8);
fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8);
fgd->uv_offset[pl] = dav1d_get_bits(gb, 9);
fgd->uv_mult[pl] = dav1d_get_bits(gb, 8) - 128;
fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8) - 128;
fgd->uv_offset[pl] = dav1d_get_bits(gb, 9) - 256;
}
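// uv_mult, uv_luma_mult and uv_offset above are now stored as signed values:
// [-128, 127] for the multipliers and [-256, 255] for the offset, instead of
// the raw coded range.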
fgd->overlap_flag = dav1d_get_bits(gb, 1);
fgd->clip_to_restricted_range = dav1d_get_bits(gb, 1);
@ -991,13 +1119,13 @@ error:
static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
int have_tile_pos = 0;
const int n_tiles = c->frame_hdr.tiling.cols * c->frame_hdr.tiling.rows;
const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows;
if (n_tiles > 1)
have_tile_pos = dav1d_get_bits(gb, 1);
if (have_tile_pos) {
const int n_bits = c->frame_hdr.tiling.log2_cols +
c->frame_hdr.tiling.log2_rows;
const int n_bits = c->frame_hdr->tiling.log2_cols +
c->frame_hdr->tiling.log2_rows;
c->tile[c->n_tile_data].start = dav1d_get_bits(gb, n_bits);
c->tile[c->n_tile_data].end = dav1d_get_bits(gb, n_bits);
} else {
@ -1031,7 +1159,7 @@ check_for_overrun(GetBits *const gb, unsigned init_bit_pos, unsigned obu_len)
return 0;
}
int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
GetBits gb;
int res;
@ -1043,9 +1171,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
const int has_extension = dav1d_get_bits(&gb, 1);
const int has_length_field = dav1d_get_bits(&gb, 1);
dav1d_get_bits(&gb, 1); // reserved
int temporal_id = 0, spatial_id = 0;
if (has_extension) {
dav1d_get_bits(&gb, 3); // temporal_layer_id
dav1d_get_bits(&gb, 2); // enhancement_layer_id
temporal_id = dav1d_get_bits(&gb, 3);
spatial_id = dav1d_get_bits(&gb, 2);
dav1d_get_bits(&gb, 3); // reserved
}
@ -1083,19 +1213,35 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
// rest of the OBU.
if (len > in->sz - init_byte_pos) goto error;
// skip obu not belonging to the selected temporal/spatial layer
if (type != OBU_SEQ_HDR && type != OBU_TD &&
has_extension && c->operating_point_idc != 0)
{
const int in_temporal_layer = (c->operating_point_idc >> temporal_id) & 1;
const int in_spatial_layer = (c->operating_point_idc >> (spatial_id + 8)) & 1;
if (!in_temporal_layer || !in_spatial_layer)
return len + init_byte_pos;
}
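// operating_point_idc is a 12-bit mask: bits 0-7 flag the temporal layers and
// bits 8-11 the spatial layers belonging to the selected operating point, so
// an OBU whose layer bit is unset is skipped here.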
switch (type) {
case OBU_SEQ_HDR: {
Av1SequenceHeader hdr, *const hdr_ptr = c->have_seq_hdr ? &hdr : &c->seq_hdr;
memset(hdr_ptr, 0, sizeof(*hdr_ptr));
c->have_frame_hdr = 0;
if ((res = parse_seq_hdr(c, &gb, hdr_ptr)) < 0)
Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dSequenceHeader));
if (!ref) return -ENOMEM;
Dav1dSequenceHeader *seq_hdr = ref->data;
memset(seq_hdr, 0, sizeof(*seq_hdr));
c->frame_hdr = NULL;
if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) {
dav1d_ref_dec(&ref);
return res;
if (check_for_overrun(&gb, init_bit_pos, len))
}
if (check_for_overrun(&gb, init_bit_pos, len)) {
dav1d_ref_dec(&ref);
return -EINVAL;
}
// If we have read a sequence header which is different from
// the old one, this is a new video sequence and can't use any
// previous state. Free that state.
if (c->have_seq_hdr && memcmp(&hdr, &c->seq_hdr, sizeof(hdr))) {
if (c->seq_hdr && memcmp(seq_hdr, c->seq_hdr, sizeof(*seq_hdr))) {
for (int i = 0; i < 8; i++) {
if (c->refs[i].p.p.data[0])
dav1d_thread_picture_unref(&c->refs[i].p);
@ -1104,21 +1250,33 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
if (c->cdf[i].cdf)
dav1d_cdf_thread_unref(&c->cdf[i]);
}
c->seq_hdr = hdr;
}
c->have_seq_hdr = 1;
dav1d_ref_dec(&c->seq_hdr_ref);
c->seq_hdr_ref = ref;
c->seq_hdr = seq_hdr;
break;
}
case OBU_REDUNDANT_FRAME_HDR:
if (c->have_frame_hdr) break;
if (c->frame_hdr) break;
// fall-through
case OBU_FRAME:
case OBU_FRAME_HDR:
c->have_frame_hdr = 0;
if (!c->have_seq_hdr) goto error;
if ((res = parse_frame_hdr(c, &gb)) < 0)
if (global) break;
if (!c->seq_hdr) goto error;
if (!c->frame_hdr_ref) {
c->frame_hdr_ref = dav1d_ref_create(sizeof(Dav1dFrameHeader));
if (!c->frame_hdr_ref) return -ENOMEM;
}
// ensure that the reference is writable
assert(dav1d_ref_is_writable(c->frame_hdr_ref));
c->frame_hdr = c->frame_hdr_ref->data;
memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
c->frame_hdr->temporal_id = temporal_id;
c->frame_hdr->spatial_id = spatial_id;
if ((res = parse_frame_hdr(c, &gb)) < 0) {
c->frame_hdr = NULL;
return res;
c->have_frame_hdr = 1;
}
for (int n = 0; n < c->n_tile_data; n++)
dav1d_data_unref(&c->tile[n].data);
c->n_tile_data = 0;
@ -1127,13 +1285,18 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
// This is actually a frame header OBU so read the
// trailing bit and check for overrun.
dav1d_get_bits(&gb, 1);
if (check_for_overrun(&gb, init_bit_pos, len))
if (check_for_overrun(&gb, init_bit_pos, len)) {
c->frame_hdr = NULL;
return -EINVAL;
}
break;
}
// OBU_FRAMEs shouldn't be signalled with show_existing_frame
if (c->frame_hdr.show_existing_frame) goto error;
if (c->frame_hdr->show_existing_frame) {
c->frame_hdr = NULL;
goto error;
}
// This is the frame header at the start of a frame OBU.
// There's no trailing bit at the end to skip, but we do need
@ -1141,7 +1304,8 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
dav1d_bytealign_get_bits(&gb);
// fall-through
case OBU_TILE_GRP: {
if (!c->have_frame_hdr) goto error;
if (global) break;
if (!c->frame_hdr) goto error;
if (c->n_tile_data >= 256) goto error;
parse_tile_hdr(c, &gb);
// Align to the next byte boundary and check for overrun.
@ -1156,6 +1320,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
assert(pkt_bytelen >= (bit_pos >> 3));
dav1d_ref_inc(in->ref);
c->tile[c->n_tile_data].data.ref = in->ref;
c->tile[c->n_tile_data].data.m = in->m;
c->tile[c->n_tile_data].data.data = in->data + (bit_pos >> 3);
c->tile[c->n_tile_data].data.sz = pkt_bytelen - (bit_pos >> 3);
// ensure tile groups are in order and sane, see 6.10.1
@ -1183,72 +1348,67 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
return -EINVAL;
}
if (c->have_seq_hdr && c->have_frame_hdr &&
c->n_tiles == c->frame_hdr.tiling.cols * c->frame_hdr.tiling.rows)
{
if (!c->n_tile_data)
return -EINVAL;
if ((res = dav1d_submit_frame(c)) < 0)
return res;
assert(!c->n_tile_data);
c->have_frame_hdr = 0;
c->n_tiles = 0;
} else if (c->have_seq_hdr && c->have_frame_hdr &&
c->frame_hdr.show_existing_frame)
{
if (c->n_fc == 1) {
dav1d_picture_ref(&c->out,
&c->refs[c->frame_hdr.existing_frame_idx].p.p);
} else {
// need to append this to the frame output queue
const unsigned next = c->frame_thread.next++;
if (c->frame_thread.next == c->n_fc)
c->frame_thread.next = 0;
if (c->seq_hdr && c->frame_hdr) {
if (c->frame_hdr->show_existing_frame) {
if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return -EINVAL;
if (c->n_fc == 1) {
dav1d_picture_ref(&c->out,
&c->refs[c->frame_hdr->existing_frame_idx].p.p);
c->out.m = in->m;
} else {
// need to append this to the frame output queue
const unsigned next = c->frame_thread.next++;
if (c->frame_thread.next == c->n_fc)
c->frame_thread.next = 0;
Dav1dFrameContext *const f = &c->fc[next];
pthread_mutex_lock(&f->frame_thread.td.lock);
while (f->n_tile_data > 0)
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0]) {
if (out_delayed->visible && !out_delayed->flushed)
dav1d_picture_ref(&c->out, &out_delayed->p);
dav1d_thread_picture_unref(out_delayed);
Dav1dFrameContext *const f = &c->fc[next];
pthread_mutex_lock(&f->frame_thread.td.lock);
while (f->n_tile_data > 0)
pthread_cond_wait(&f->frame_thread.td.cond,
&f->frame_thread.td.lock);
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0]) {
const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
memory_order_relaxed);
if (out_delayed->visible && progress != FRAME_ERROR)
dav1d_picture_ref(&c->out, &out_delayed->p);
dav1d_thread_picture_unref(out_delayed);
}
dav1d_thread_picture_ref(out_delayed,
&c->refs[c->frame_hdr->existing_frame_idx].p);
out_delayed->visible = 1;
out_delayed->p.m = in->m;
pthread_mutex_unlock(&f->frame_thread.td.lock);
}
dav1d_thread_picture_ref(out_delayed,
&c->refs[c->frame_hdr.existing_frame_idx].p);
out_delayed->visible = 1;
out_delayed->flushed = 0;
pthread_mutex_unlock(&f->frame_thread.td.lock);
}
c->have_frame_hdr = 0;
if (c->refs[c->frame_hdr.existing_frame_idx].p.p.p.type == DAV1D_FRAME_TYPE_KEY) {
const int r = c->frame_hdr.existing_frame_idx;
for (int i = 0; i < 8; i++) {
if (i == c->frame_hdr.existing_frame_idx) continue;
if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
const int r = c->frame_hdr->existing_frame_idx;
for (int i = 0; i < 8; i++) {
if (i == c->frame_hdr->existing_frame_idx) continue;
if (c->refs[i].p.p.data[0])
dav1d_thread_picture_unref(&c->refs[i].p);
dav1d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p);
if (c->refs[i].p.p.data[0])
dav1d_thread_picture_unref(&c->refs[i].p);
dav1d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p);
if (c->cdf[i].cdf) dav1d_cdf_thread_unref(&c->cdf[i]);
dav1d_init_states(&c->cdf[i], c->refs[r].qidx);
if (c->cdf[i].cdf) dav1d_cdf_thread_unref(&c->cdf[i]);
dav1d_init_states(&c->cdf[i], c->refs[r].p.p.frame_hdr->quant.yac);
c->refs[i].lf_mode_ref_deltas = c->refs[r].lf_mode_ref_deltas;
c->refs[i].seg_data = c->refs[r].seg_data;
for (int j = 0; j < 7; j++)
c->refs[i].gmv[j] = dav1d_default_wm_params;
c->refs[i].film_grain = c->refs[r].film_grain;
dav1d_ref_dec(&c->refs[i].segmap);
c->refs[i].segmap = c->refs[r].segmap;
if (c->refs[r].segmap)
dav1d_ref_inc(c->refs[r].segmap);
dav1d_ref_dec(&c->refs[i].refmvs);
c->refs[i].qidx = c->refs[r].qidx;
dav1d_ref_dec(&c->refs[i].segmap);
c->refs[i].segmap = c->refs[r].segmap;
if (c->refs[r].segmap)
dav1d_ref_inc(c->refs[r].segmap);
dav1d_ref_dec(&c->refs[i].refmvs);
}
}
c->frame_hdr = NULL;
} else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) {
if (!c->n_tile_data)
return -EINVAL;
if ((res = dav1d_submit_frame(c)) < 0)
return res;
assert(!c->n_tile_data);
c->frame_hdr = NULL;
c->n_tiles = 0;
}
}

View File

@ -31,6 +31,6 @@
#include "dav1d/data.h"
#include "src/internal.h"
int dav1d_parse_obus(Dav1dContext *c, Dav1dData *in);
int dav1d_parse_obus(Dav1dContext *c, Dav1dData *in, int global);
#endif /* __DAV1D_SRC_OBU_H__ */

View File

@ -29,6 +29,7 @@
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -74,29 +75,24 @@ int default_picture_allocator(Dav1dPicture *const p, void *cookie) {
return 0;
}
void default_picture_release(uint8_t *const data, void *const allocator_data,
void *cookie)
{
void default_picture_release(Dav1dPicture *const p, void *cookie) {
assert(cookie == NULL);
#ifndef NDEBUG /* safety check */
assert(allocator_data == data);
assert(p->allocator_data == p->data[0]);
#endif
dav1d_free_aligned(data);
dav1d_free_aligned(p->data[0]);
}
struct pic_ctx_context {
Dav1dPicAllocator allocator;
void *allocator_data;
uint8_t *data;
Dav1dPicture pic;
void *extra_ptr; /* MUST BE AT THE END */
};
static void free_buffer(const uint8_t *data, void *user_data)
{
static void free_buffer(const uint8_t *const data, void *const user_data) {
struct pic_ctx_context *pic_ctx = user_data;
pic_ctx->allocator.release_picture_callback(pic_ctx->data,
pic_ctx->allocator_data,
pic_ctx->allocator.release_picture_callback(&pic_ctx->pic,
pic_ctx->allocator.cookie);
free(pic_ctx);
}
@ -121,10 +117,9 @@ static int picture_alloc_with_edges(Dav1dPicture *const p,
p->p.w = w;
p->p.h = h;
p->p.pri = DAV1D_COLOR_PRI_UNKNOWN;
p->p.trc = DAV1D_TRC_UNKNOWN;
p->p.mtrx = DAV1D_MC_UNKNOWN;
p->p.chr = DAV1D_CHR_UNKNOWN;
p->m.timestamp = INT64_MIN;
p->m.duration = 0;
p->m.offset = -1;
p->p.layout = layout;
p->p.bpc = bpc;
int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
@ -134,12 +129,10 @@ static int picture_alloc_with_edges(Dav1dPicture *const p,
}
pic_ctx->allocator = *p_allocator;
pic_ctx->allocator_data = p->allocator_data;
pic_ctx->data = p->data[0];
pic_ctx->pic = *p;
if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) {
p_allocator->release_picture_callback(p->data[0], p->allocator_data,
p_allocator->cookie);
p_allocator->release_picture_callback(p, p_allocator->cookie);
fprintf(stderr, "Failed to wrap picture: %s\n", strerror(errno));
return -ENOMEM;
}
@ -165,7 +158,6 @@ int dav1d_thread_picture_alloc(Dav1dThreadPicture *const p,
if (res) return res;
p->visible = visible;
p->flushed = 0;
if (t) {
atomic_init(&p->progress[0], 0);
atomic_init(&p->progress[1], 0);
@ -173,6 +165,29 @@ int dav1d_thread_picture_alloc(Dav1dThreadPicture *const p,
return res;
}
int dav1d_picture_alloc_copy(Dav1dPicture *const dst, const int w,
const Dav1dPicture *const src)
{
struct pic_ctx_context *const pic_ctx = src->ref->user_data;
const int res = picture_alloc_with_edges(dst, w, src->p.h, src->p.layout,
src->p.bpc, &pic_ctx->allocator,
0, NULL);
if (!res) {
dst->p = src->p;
dst->m = src->m;
dst->p.w = w;
dst->frame_hdr = src->frame_hdr;
dst->frame_hdr_ref = src->frame_hdr_ref;
if (dst->frame_hdr_ref) dav1d_ref_inc(dst->frame_hdr_ref);
dst->seq_hdr = src->seq_hdr;
dst->seq_hdr_ref = src->seq_hdr_ref;
if (dst->seq_hdr_ref) dav1d_ref_inc(dst->seq_hdr_ref);
}
return res;
}
void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
validate_input(dst != NULL);
validate_input(dst->data[0] == NULL);
@ -181,6 +196,8 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
if (src->ref) {
validate_input(src->data[0] != NULL);
dav1d_ref_inc(src->ref);
if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref);
}
*dst = *src;
}
@ -204,7 +221,6 @@ void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
dst->t = src->t;
dst->visible = src->visible;
dst->progress = src->progress;
dst->flushed = src->flushed;
}
void dav1d_picture_unref(Dav1dPicture *const p) {
@ -213,6 +229,8 @@ void dav1d_picture_unref(Dav1dPicture *const p) {
if (p->ref) {
validate_input(p->data[0] != NULL);
dav1d_ref_dec(&p->ref);
dav1d_ref_dec(&p->seq_hdr_ref);
dav1d_ref_dec(&p->frame_hdr_ref);
}
memset(p, 0, sizeof(*p));
}
@ -260,8 +278,10 @@ void dav1d_thread_picture_signal(const Dav1dThreadPicture *const p,
return;
pthread_mutex_lock(&p->t->lock);
if (plane_type != PLANE_TYPE_Y) atomic_store(&p->progress[0], y);
if (plane_type != PLANE_TYPE_BLOCK) atomic_store(&p->progress[1], y);
if (plane_type != PLANE_TYPE_Y)
atomic_store(&p->progress[0], y);
if (plane_type != PLANE_TYPE_BLOCK)
atomic_store(&p->progress[1], y);
pthread_cond_broadcast(&p->t->cond);
pthread_mutex_unlock(&p->t->lock);
}


@ -44,7 +44,7 @@ enum PlaneType {
typedef struct Dav1dThreadPicture {
Dav1dPicture p;
int visible, flushed;
int visible;
struct thread_data *t;
// [0] block data (including segmentation map and motion vectors)
// [1] pixel data
@ -59,6 +59,16 @@ int dav1d_thread_picture_alloc(Dav1dThreadPicture *p, int w, int h,
struct thread_data *t, int visible,
Dav1dPicAllocator *);
/**
* Allocate a picture with identical metadata to an existing picture.
* The width is a separate argument so this function can be used for
* super-res, where the width changes, but everything else is the same.
* For the more typical use case of allocating a new image of the same
* dimensions, use src->p.w as width.
*/
int dav1d_picture_alloc_copy(Dav1dPicture *dst, const int w,
const Dav1dPicture *src);
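/* Illustrative sketch (not part of the patch): one way this helper could be
 * used for super-res, assuming a fully decoded picture `cur` and a
 * hypothetical post-upscale width `upscaled_w`:
 *
 *     Dav1dPicture sr = { 0 };
 *     if (!dav1d_picture_alloc_copy(&sr, upscaled_w, &cur)) {
 *         // upscale cur into sr here, then drop the reference when done
 *         dav1d_picture_unref(&sr);
 *     }
 */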
/**
* Create a copy of a picture.
*/
@ -98,6 +108,6 @@ void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y,
enum PlaneType plane_type);
int default_picture_allocator(Dav1dPicture *, void *cookie);
void default_picture_release(uint8_t *, void *allocator_data, void *cookie);
void default_picture_release(Dav1dPicture *, void *cookie);
#endif /* __DAV1D_SRC_PICTURE_H__ */


@ -32,7 +32,7 @@
#include "src/levels.h"
#define DEBUG_BLOCK_INFO 0 && \
f->frame_hdr.frame_offset == 2 && t->by >= 0 && t->by < 4 && \
f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
t->bx >= 8 && t->bx < 12
#define DEBUG_B_PIXELS 0


@ -72,7 +72,7 @@ static int decode_coefs(Dav1dTileContext *const t,
if (dbg) printf("Start: r=%d\n", ts->msac.rng);
// does this block have any non-zero coefficients
const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.p.layout);
const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
const int all_skip =
msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.skip[t_dim->ctx][sctx]);
if (dbg)
@ -80,7 +80,7 @@ static int decode_coefs(Dav1dTileContext *const t,
t_dim->ctx, sctx, all_skip, ts->msac.rng);
if (all_skip) {
*res_ctx = 0x40;
*txtp = f->frame_hdr.segmentation.lossless[b->seg_id] ? WHT_WHT :
*txtp = f->frame_hdr->segmentation.lossless[b->seg_id] ? WHT_WHT :
DCT_DCT;
return -1;
}
@ -88,14 +88,14 @@ static int decode_coefs(Dav1dTileContext *const t,
// transform type (chroma: derived, luma: explicitly coded)
if (chroma) {
if (intra) {
*txtp = get_uv_intra_txtp(b->uv_mode, tx, &f->frame_hdr, b->seg_id);
*txtp = get_uv_intra_txtp(b->uv_mode, tx, f->frame_hdr, b->seg_id);
} else {
const enum TxfmType y_txtp = *txtp;
*txtp = get_uv_inter_txtp(t_dim, y_txtp, &f->frame_hdr, b->seg_id);
*txtp = get_uv_inter_txtp(t_dim, y_txtp, f->frame_hdr, b->seg_id);
}
} else {
const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
&f->frame_hdr, b->seg_id);
f->frame_hdr, b->seg_id);
const unsigned set_cnt = dav1d_tx_type_count[set];
unsigned idx;
if (set_cnt == 1) {
@ -289,7 +289,7 @@ static void read_coef_tree(Dav1dTileContext *const t,
t->by += txsh;
if (txh >= txw && t->by < f->bh) {
if (dst)
dst += 4 * txsh * PXSTRIDE(f->cur.p.stride[0]);
dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
x_off * 2 + 0, y_off * 2 + 1, dst);
t->bx += txsw;
@ -349,9 +349,9 @@ static void read_coef_tree(Dav1dTileContext *const t,
if (eob >= 0) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.p.stride[0], cf, eob);
dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
hex_dump(dst, f->cur.p.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
}
}
}
@ -361,14 +361,14 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
const enum BlockSize bs, const Av1Block *const b)
{
const Dav1dFrameContext *const f = t->f;
const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int bx4 = t->bx & 31, by4 = t->by & 31;
const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
const int bw4 = b_dim[0], bh4 = b_dim[1];
const int cbw4 = (bw4 + 1) >> ss_hor, cbh4 = (bh4 + 1) >> ss_ver;
const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
(bw4 > ss_hor || t->bx & 1) &&
(bh4 > ss_ver || t->by & 1);
@ -501,27 +501,27 @@ static int mc(Dav1dTileContext *const t,
{
assert((dst8 != NULL) ^ (dst16 != NULL));
const Dav1dFrameContext *const f = t->f;
const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
const int mvx = mv.x, mvy = mv.y;
const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
ptrdiff_t ref_stride = refp->p.stride[!!pl];
const pixel *ref;
if (refp->p.p.w == f->cur.p.p.w && refp->p.p.h == f->cur.p.p.h) {
if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
const int dy = by * v_mul + (mvy >> (3 + ss_ver));
int w, h;
if (refp != &f->cur) { // i.e. not for intrabc
if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
if (dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,
PLANE_TYPE_Y + !!pl))
{
return -1;
}
w = (f->cur.p.p.w + ss_hor) >> ss_hor;
h = (f->cur.p.p.h + ss_ver) >> ss_ver;
w = (f->cur.p.w + ss_hor) >> ss_hor;
h = (f->cur.p.h + ss_ver) >> ss_ver;
} else {
w = f->bw * 4 >> ss_hor;
h = f->bh * 4 >> ss_ver;
@ -548,7 +548,7 @@ static int mc(Dav1dTileContext *const t,
bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
}
} else {
assert(refp != &f->cur);
assert(refp != &f->sr_cur);
int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
@ -567,8 +567,13 @@ static int mc(Dav1dTileContext *const t,
const int bottom =
((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;
if (dav1d_thread_picture_wait(refp, bottom, PLANE_TYPE_Y + !!pl))
if (dav1d_thread_picture_wait(refp, bottom + 4, PLANE_TYPE_Y + !!pl))
return -1;
if (DEBUG_BLOCK_INFO)
printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
right-left, bottom-top,
f->svc[refidx][0].step, f->svc[refidx][1].step);
const int w = (refp->p.p.w + ss_hor) >> ss_hor;
const int h = (refp->p.p.h + ss_ver) >> ss_ver;
@ -579,6 +584,7 @@ static int mc(Dav1dTileContext *const t,
refp->p.data[pl], ref_stride);
ref = &t->emu_edge[320 * 3 + 3];
ref_stride = 320 * sizeof(pixel);
if (DEBUG_BLOCK_INFO) printf("Emu\n");
} else {
ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
}
@ -610,8 +616,8 @@ static int obmc(Dav1dTileContext *const t,
const Dav1dFrameContext *const f = t->f;
const refmvs *const r = &f->mvs[t->by * f->b4_stride + t->bx];
pixel *const lap = t->scratch.lap;
const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
int res;
@ -668,13 +674,13 @@ static int warp_affine(Dav1dTileContext *const t,
pixel *dst8, coef *dst16, const ptrdiff_t dstride,
const uint8_t *const b_dim, const int pl,
const Dav1dThreadPicture *const refp,
const WarpedMotionParams *const wmp)
const Dav1dWarpedMotionParams *const wmp)
{
assert((dst8 != NULL) ^ (dst16 != NULL));
const Dav1dFrameContext *const f = t->f;
const Dav1dDSPContext *const dsp = f->dsp;
const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
const int32_t *const mat = wmp->matrix;
@ -735,14 +741,14 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
const Dav1dFrameContext *const f = t->f;
const Dav1dDSPContext *const dsp = f->dsp;
const int bx4 = t->bx & 31, by4 = t->by & 31;
const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
const int bw4 = b_dim[0], bh4 = b_dim[1];
const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
(bw4 > ss_hor || t->bx & 1) &&
(bh4 > ss_ver || t->by & 1);
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
@ -753,13 +759,13 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
pixel *const edge = edge_buf + 128;
const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
const int intra_edge_filter_flag = f->seq_hdr.intra_edge_filter << 10;
const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
for (int init_y = 0; init_y < h4; init_y += 16) {
for (int init_x = 0; init_x < w4; init_x += 16) {
if (b->pal_sz[0]) {
pixel *dst = ((pixel *) f->cur.p.data[0]) +
4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
pixel *dst = ((pixel *) f->cur.data[0]) +
4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
const uint8_t *pal_idx;
if (f->frame_thread.pass) {
pal_idx = ts->frame_thread.pal_idx;
@ -770,10 +776,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
const uint16_t *const pal = f->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][0] : t->pal[0];
f->dsp->ipred.pal_pred(dst, f->cur.p.stride[0], pal,
f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
pal_idx, bw4 * 4, bh4 * 4);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
hex_dump(dst, PXSTRIDE(f->cur.p.stride[0]),
hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
bw4 * 4, bh4 * 4, "y-pal-pred");
}
@ -790,8 +796,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
for (y = init_y, t->by += init_y; y < sub_h4;
y += t_dim->h, t->by += t_dim->h)
{
pixel *dst = ((pixel *) f->cur.p.data[0]) +
4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) +
pixel *dst = ((pixel *) f->cur.data[0]) +
4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
t->bx + init_x);
for (x = init_x, t->bx += init_x; x < sub_w4;
x += t_dim->w, t->bx += t_dim->w)
@ -818,10 +824,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
ts->tiling.col_end,
ts->tiling.row_end,
edge_flags, dst,
f->cur.p.stride[0], top_sb_edge,
f->cur.stride[0], top_sb_edge,
b->y_mode, &angle,
t_dim->w, t_dim->h, edge);
dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge,
dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
t_dim->w * 4, t_dim->h * 4,
angle | intra_flags,
4 * f->bw - 4 * t->bx,
@ -833,7 +839,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
hex_dump(edge, 0, 1, 1, "tl");
hex_dump(edge + 1, t_dim->w * 4,
t_dim->w * 4, 2, "t");
hex_dump(dst, f->cur.p.stride[0],
hex_dump(dst, f->cur.stride[0],
t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
}
@ -875,10 +881,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
imin(t_dim->w, 8) * 4, 3, "dq");
dsp->itx.itxfm_add[b->tx]
[txtp](dst,
f->cur.p.stride[0],
f->cur.stride[0],
cf, eob);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
hex_dump(dst, f->cur.p.stride[0],
hex_dump(dst, f->cur.stride[0],
t_dim->w * 4, t_dim->h * 4, "recon");
}
} else if (!f->frame_thread.pass) {
@ -896,24 +902,24 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
if (!has_chroma) continue;
const ptrdiff_t stride = f->cur.p.stride[1];
const ptrdiff_t stride = f->cur.stride[1];
if (b->uv_mode == CFL_PRED) {
assert(!init_x && !init_y);
int16_t *const ac = t->scratch.ac;
pixel *y_src = ((pixel *) f->cur.p.data[0]) + 4 * (t->bx & ~ss_hor) +
4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.p.stride[0]);
pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
(t->by >> ss_ver) * PXSTRIDE(stride));
pixel *const uv_dst[2] = { ((pixel *) f->cur.p.data[1]) + uv_off,
((pixel *) f->cur.p.data[2]) + uv_off };
pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
((pixel *) f->cur.data[2]) + uv_off };
const int furthest_r =
((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
const int furthest_b =
((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
dsp->ipred.cfl_ac[f->cur.p.p.layout - 1](ac, y_src, f->cur.p.stride[0],
dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
cbw4 - (furthest_r >> ss_hor),
cbh4 - (furthest_b >> ss_ver),
cbw4 * 4, cbh4 * 4);
@ -950,7 +956,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
}
} else if (b->pal_sz[1]) {
ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
(t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
(t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
const uint8_t *pal_idx;
if (f->frame_thread.pass) {
pal_idx = ts->frame_thread.pal_idx;
@ -961,21 +967,21 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
const uint16_t *const pal_u = f->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][1] : t->pal[1];
f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[1]) + uv_dstoff,
f->cur.p.stride[1], pal_u,
f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
f->cur.stride[1], pal_u,
pal_idx, cbw4 * 4, cbh4 * 4);
const uint16_t *const pal_v = f->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[2]) + uv_dstoff,
f->cur.p.stride[1], pal_v,
f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
f->cur.stride[1], pal_v,
pal_idx, cbw4 * 4, cbh4 * 4);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
hex_dump(((pixel *) f->cur.p.data[1]) + uv_dstoff,
PXSTRIDE(f->cur.p.stride[1]),
hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
PXSTRIDE(f->cur.stride[1]),
cbw4 * 4, cbh4 * 4, "u-pal-pred");
hex_dump(((pixel *) f->cur.p.data[2]) + uv_dstoff,
PXSTRIDE(f->cur.p.stride[1]),
hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
PXSTRIDE(f->cur.stride[1]),
cbw4 * 4, cbh4 * 4, "v-pal-pred");
}
}
@ -984,17 +990,17 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
sm_uv_flag(&t->l, cby4);
const int uv_sb_has_tr =
((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.p.layout - 1));
intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
const int uv_sb_has_bl =
init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.p.layout - 1));
intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
for (int pl = 0; pl < 2; pl++) {
for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
{
pixel *dst = ((pixel *) f->cur.p.data[1 + pl]) +
pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
((t->bx + init_x) >> ss_hor));
for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
@ -1127,57 +1133,56 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
const Dav1dFrameContext *const f = t->f;
const Dav1dDSPContext *const dsp = f->dsp;
const int bx4 = t->bx & 31, by4 = t->by & 31;
const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
const int bw4 = b_dim[0], bh4 = b_dim[1];
const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
(bw4 > ss_hor || t->bx & 1) &&
(bh4 > ss_ver || t->by & 1);
const int chr_layout_idx = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout;
const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
int res;
// prediction
const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
pixel *dst = ((pixel *) f->cur.p.data[0]) +
4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
pixel *dst = ((pixel *) f->cur.data[0]) +
4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
const ptrdiff_t uvdstoff =
4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
if (!(f->frame_hdr.frame_type & 1)) {
4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
if (!(f->frame_hdr->frame_type & 1)) {
// intrabc
res = mc(t, dst, NULL, f->cur.p.stride[0],
bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, -1, FILTER_2D_BILINEAR);
assert(!f->frame_hdr->super_res.enabled);
res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
if (res) return res;
if (has_chroma) for (int pl = 1; pl < 3; pl++) {
res = mc(t, ((pixel *)f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],
res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
t->bx & ~ss_hor, t->by & ~ss_ver,
pl, b->mv[0], &f->cur, -1, FILTER_2D_BILINEAR);
t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
&f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
if (res) return res;
}
} else if (b->comp_type == COMP_INTER_NONE) {
const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
const enum Filter2d filter_2d = b->filter2d;
if (imin(bw4, bh4) > 1 && !f->frame_hdr.force_integer_mv &&
((b->inter_mode == GLOBALMV &&
f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
(b->motion_mode == MM_WARP &&
t->warpmv.type > WM_TYPE_TRANSLATION)))
if (imin(bw4, bh4) > 1 &&
((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
(b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
{
res = warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp,
res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
b->motion_mode == MM_WARP ? &t->warpmv :
&f->frame_hdr.gmv[b->ref[0]]);
&f->frame_hdr->gmv[b->ref[0]]);
if (res) return res;
} else {
res = mc(t, dst, NULL, f->cur.p.stride[0],
res = mc(t, dst, NULL, f->cur.stride[0],
bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
if (res) return res;
if (b->motion_mode == MM_OBMC) {
res = obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
if (res) return res;
}
}
@ -1197,7 +1202,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
t->by, t->by > ts->tiling.row_start,
ts->tiling.col_end, ts->tiling.row_end,
0, dst, f->cur.p.stride[0], top_sb_edge,
0, dst, f->cur.stride[0], top_sb_edge,
m, &angle, bw4, bh4, tl_edge);
dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0);
@ -1205,7 +1210,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
b->interintra_type == INTER_INTRA_BLEND ?
dav1d_ii_masks[bs][0][b->interintra_mode] :
dav1d_wedge_masks[bs][0][0][b->wedge_idx];
dsp->mc.blend(dst, f->cur.p.stride[0], tmp,
dsp->mc.blend(dst, f->cur.stride[0], tmp,
bw4 * 4, bh4 * 4, ii_mask);
}
@ -1229,8 +1234,8 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
int h_off = 0, v_off = 0;
if (bw4 == 1 && bh4 == ss_ver) {
for (int pl = 0; pl < 2; pl++) {
res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
NULL, f->cur.p.stride[1],
res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
NULL, f->cur.stride[1],
bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
r[-(f->b4_stride + 1)].mv[0],
&f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
@ -1239,15 +1244,15 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
if (res) return res;
}
v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
v_off = 2 * PXSTRIDE(f->cur.stride[1]);
h_off = 2;
}
if (bw4 == 1) {
const enum Filter2d left_filter_2d =
dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
for (int pl = 0; pl < 2; pl++) {
res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,
f->cur.p.stride[1], bw4, bh4, t->bx - 1,
res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
f->cur.stride[1], bw4, bh4, t->bx - 1,
t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
r[-1].ref[0] - 1,
f->frame_thread.pass != 2 ? left_filter_2d :
@ -1260,8 +1265,8 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
const enum Filter2d top_filter_2d =
dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
for (int pl = 0; pl < 2; pl++) {
res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off, NULL,
f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,
res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
1 + pl, r[-f->b4_stride].mv[0],
&f->refp[r[-f->b4_stride].ref[0] - 1],
r[-f->b4_stride].ref[0] - 1,
@ -1269,39 +1274,37 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
if (res) return res;
}
v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
v_off = 2 * PXSTRIDE(f->cur.stride[1]);
}
for (int pl = 0; pl < 2; pl++) {
res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],
res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
refp, b->ref[0], filter_2d);
if (res) return res;
}
} else {
if (imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
((b->inter_mode == GLOBALMV &&
f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
(b->motion_mode == MM_WARP &&
t->warpmv.type > WM_TYPE_TRANSLATION)))
if (imin(cbw4, cbh4) > 1 &&
((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
(b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
{
for (int pl = 0; pl < 2; pl++) {
res = warp_affine(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, NULL,
f->cur.p.stride[1], b_dim, 1 + pl, refp,
res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
f->cur.stride[1], b_dim, 1 + pl, refp,
b->motion_mode == MM_WARP ? &t->warpmv :
&f->frame_hdr.gmv[b->ref[0]]);
&f->frame_hdr->gmv[b->ref[0]]);
if (res) return res;
}
} else {
for (int pl = 0; pl < 2; pl++) {
res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
NULL, f->cur.p.stride[1],
res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
NULL, f->cur.stride[1],
bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
t->bx & ~ss_hor, t->by & ~ss_ver,
1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
if (res) return res;
if (b->motion_mode == MM_OBMC) {
res = obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
f->cur.p.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
if (res) return res;
}
}
@ -1317,12 +1320,13 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
for (int pl = 0; pl < 2; pl++) {
pixel *const tmp = t->scratch.interintra;
pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
ALIGN_STK_32(pixel, tl_edge_px, 65,);
pixel *const tl_edge = &tl_edge_px[32];
enum IntraPredMode m =
b->interintra_mode == II_SMOOTH_PRED ?
SMOOTH_PRED : b->interintra_mode;
int angle = 0;
pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
const pixel *top_sb_edge = NULL;
if (!(t->by & (f->sb_step - 1))) {
top_sb_edge = f->ipred_edge[pl + 1];
@ -1337,12 +1341,12 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
(ts->tiling.row_start >> ss_ver),
ts->tiling.col_end >> ss_hor,
ts->tiling.row_end >> ss_ver,
0, uvdst, f->cur.p.stride[1],
0, uvdst, f->cur.stride[1],
top_sb_edge, m,
&angle, cbw4, cbh4, tl_edge);
dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0);
dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp,
dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
cbw4 * 4, cbh4 * 4, ii_mask);
}
}
@ -1361,37 +1365,37 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
for (int i = 0; i < 2; i++) {
const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
if (b->inter_mode == GLOBALMV_GLOBALMV && !f->frame_hdr.force_integer_mv &&
f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
{
if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
&f->frame_hdr.gmv[b->ref[i]]);
&f->frame_hdr->gmv[b->ref[i]]);
if (res) return res;
} else {
res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
b->mv[i], refp, b->ref[i], filter_2d);
if (DEBUG_BLOCK_INFO)
coef_dump(tmp[i], bw4*4, bh4*4, 3, "med");
if (res) return res;
}
}
switch (b->comp_type) {
case COMP_INTER_AVG:
dsp->mc.avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
bw4 * 4, bh4 * 4);
break;
case COMP_INTER_WEIGHTED_AVG:
jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
dsp->mc.w_avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
bw4 * 4, bh4 * 4, jnt_weight);
break;
case COMP_INTER_SEG:
dsp->mc.w_mask[chr_layout_idx](dst, f->cur.p.stride[0],
dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
tmp[b->mask_sign], tmp[!b->mask_sign],
bw4 * 4, bh4 * 4, seg_mask, b->mask_sign);
mask = seg_mask;
break;
case COMP_INTER_WEDGE:
mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
dsp->mc.mask(dst, f->cur.p.stride[0],
dsp->mc.mask(dst, f->cur.stride[0],
tmp[b->mask_sign], tmp[!b->mask_sign],
bw4 * 4, bh4 * 4, mask);
if (has_chroma)
@ -1404,11 +1408,10 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
for (int i = 0; i < 2; i++) {
const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
if (b->inter_mode == GLOBALMV_GLOBALMV &&
imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
{
res = warp_affine(t, NULL, tmp[i], bw4 * 2, b_dim, 1 + pl,
refp, &f->frame_hdr.gmv[b->ref[i]]);
refp, &f->frame_hdr->gmv[b->ref[i]]);
if (res) return res;
} else {
res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
@ -1416,19 +1419,19 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
if (res) return res;
}
}
pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
switch (b->comp_type) {
case COMP_INTER_AVG:
dsp->mc.avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver);
break;
case COMP_INTER_WEIGHTED_AVG:
dsp->mc.w_avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight);
break;
case COMP_INTER_WEDGE:
case COMP_INTER_SEG:
dsp->mc.mask(uvdst, f->cur.p.stride[1],
dsp->mc.mask(uvdst, f->cur.stride[1],
tmp[b->mask_sign], tmp[!b->mask_sign],
bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask);
break;
@ -1437,11 +1440,11 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
}
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
hex_dump(dst, f->cur.p.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
if (has_chroma) {
hex_dump(&((pixel *) f->cur.p.data[1])[uvdstoff], f->cur.p.stride[1],
hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
cbw4 * 4, cbh4 * 4, "u-pred");
hex_dump(&((pixel *) f->cur.p.data[2])[uvdstoff], f->cur.p.stride[1],
hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
cbw4 * 4, cbh4 * 4, "v-pred");
}
}
@ -1473,7 +1476,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
for (int init_x = 0; init_x < bw4; init_x += 16) {
// coefficient coding & inverse transforms
int y_off = !!init_y, y;
dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * init_y;
dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
y += ytx->h, y_off++)
{
@ -1485,17 +1488,17 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
x_off, y_off, &dst[x * 4]);
t->bx += ytx->w;
}
dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * ytx->h;
dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
t->bx -= x;
t->by += ytx->h;
}
dst -= PXSTRIDE(f->cur.p.stride[0]) * 4 * y;
dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
t->by -= y;
// chroma coefs and inverse transform
if (has_chroma) for (int pl = 0; pl < 2; pl++) {
pixel *uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff +
(PXSTRIDE(f->cur.p.stride[1]) * init_y * 4 >> ss_ver);
pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
(PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
for (y = init_y >> ss_ver, t->by += init_y;
y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
{
@ -1544,15 +1547,15 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
dsp->itx.itxfm_add[b->uvtx]
[txtp](&uvdst[4 * x],
f->cur.p.stride[1],
f->cur.stride[1],
cf, eob);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
hex_dump(&uvdst[4 * x], f->cur.p.stride[1],
hex_dump(&uvdst[4 * x], f->cur.stride[1],
uvtx->w * 4, uvtx->h * 4, "recon");
}
t->bx += uvtx->w << ss_hor;
}
uvdst += PXSTRIDE(f->cur.p.stride[1]) * 4 * uvtx->h;
uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
t->bx -= x << ss_hor;
t->by += uvtx->h << ss_ver;
}
@ -1564,29 +1567,29 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
}
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int sbsz = f->sb_step, sbh = f->sbh;
if (f->frame_hdr.loopfilter.level_y[0] ||
f->frame_hdr.loopfilter.level_y[1])
if (f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1])
{
int start_of_tile_row = 0;
if (f->frame_hdr.tiling.row_start_sb[f->lf.tile_row] == sby)
if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby)
start_of_tile_row = f->lf.tile_row++;
bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
start_of_tile_row);
}
if (f->seq_hdr.restoration) {
if (f->seq_hdr->restoration) {
// Store loop filtered pixels required by loop restoration
bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
}
if (f->seq_hdr.cdef) {
if (f->seq_hdr->cdef) {
if (sby) {
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *p_up[3] = {
f->lf.p[0] - 8 * PXSTRIDE(f->cur.p.stride[0]),
f->lf.p[1] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
f->lf.p[2] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
};
bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
sby * sbsz - 2, sby * sbsz);
@ -1595,15 +1598,39 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
imin(sby * sbsz + n_blks, f->bh));
}
if (f->seq_hdr.restoration) {
bytefn(dav1d_lr_sbrow)(f, f->lf.p, sby);
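// Super-res: horizontally upscale the just-filtered superblock row from the
// coded resolution (f->cur) into the full-width output picture (f->sr_cur).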
if (f->frame_hdr->super_res.enabled) {
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int h_start = 8 * !!sby >> ss_ver;
const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
pixel *dst = f->lf.sr_p[pl] - h_start * PXSTRIDE(dst_stride);
const ptrdiff_t src_stride = f->cur.stride[!!pl];
const pixel *src = f->lf.p[pl] - h_start * PXSTRIDE(src_stride);
const int h_end = 4 * (sbsz - 2 * (sby + 1 < sbh)) >> ss_ver;
const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, src_w,
imin(img_h, h_end) + h_start, f->resize_step[!!pl],
f->resize_start[!!pl]);
}
}
if (f->seq_hdr->restoration) {
bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby);
}
f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[0]);
f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.stride[0]);
f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
f->lf.sr_p[0] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[0]);
f->lf.sr_p[1] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
f->lf.sr_p[2] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
f->lf.prev_mask_ptr = f->lf.mask_ptr;
if ((sby & 1) || f->seq_hdr.sb128) {
if ((sby & 1) || f->seq_hdr->sb128) {
f->lf.mask_ptr += f->sb128w;
}
}
@ -1616,20 +1643,20 @@ void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
const int x_off = ts->tiling.col_start;
const pixel *const y =
((const pixel *) f->cur.p.data[0]) + x_off * 4 +
((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.p.stride[0]);
((const pixel *) f->cur.data[0]) + x_off * 4 +
((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
4 * (ts->tiling.col_end - x_off));
if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
(((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.p.stride[1]);
(((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
for (int pl = 1; pl <= 2; pl++)
pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
&((const pixel *) f->cur.p.data[pl])[uv_off],
&((const pixel *) f->cur.data[pl])[uv_off],
4 * (ts->tiling.col_end - x_off) >> ss_hor);
}
}


@ -84,3 +84,7 @@ void dav1d_ref_dec(Dav1dRef **const pref) {
}
*pref = NULL;
}
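// Returns non-zero when the caller holds the only reference to the buffer,
// i.e. the underlying data may safely be modified in place.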
int dav1d_ref_is_writable(Dav1dRef *const ref) {
return atomic_load(&ref->ref_cnt) == 1;
}


@ -48,4 +48,6 @@ Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr,
void dav1d_ref_inc(Dav1dRef *ref);
void dav1d_ref_dec(Dav1dRef **ref);
int dav1d_ref_is_writable(Dav1dRef *ref);
#endif /* __DAV1D_SRC_REF_H__ */


@ -301,15 +301,15 @@ static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi,
const BLOCK_SIZE bsize = mbmi->sb_type;
const int block_size_allowed =
AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
block_size_allowed;
return block_size_allowed && type > TRANSLATION &&
(mode == GLOBALMV || mode == GLOBAL_GLOBALMV);
}
typedef struct {
TransformationType wmtype;
int32_t wmmat[6];
int16_t alpha, beta, gamma, delta;
} WarpedMotionParams;
} Dav1dWarpedMotionParams;
#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)
@ -381,7 +381,7 @@ typedef struct AV1Common {
// External BufferPool passed from outside.
BufferPool buffer_pool;
WarpedMotionParams global_motion[TOTAL_REFS_PER_FRAME];
Dav1dWarpedMotionParams global_motion[TOTAL_REFS_PER_FRAME];
struct {
BLOCK_SIZE sb_size;
int enable_order_hint;
@ -501,7 +501,7 @@ static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) {
// allow_hp is zero, the bottom bit will always be zero. If CONFIG_AMVR and
// is_integer is true, the bottom three bits will be zero (so the motion vector
// represents an integer)
static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
static INLINE int_mv gm_get_motion_vector(const Dav1dWarpedMotionParams *gm,
int allow_hp, BLOCK_SIZE bsize,
int mi_col, int mi_row,
int is_integer) {
@ -836,7 +836,7 @@ static void add_ref_mv_candidate(
const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2],
uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count,
CANDIDATE_MV *ref_mv_stack, int_mv *gm_mv_candidates,
const WarpedMotionParams *gm_params, int col, int weight) {
const Dav1dWarpedMotionParams *gm_params, int col, int weight) {
if (!is_inter_block(candidate)) return; // for intrabc
int index = 0, ref;
assert(weight % 2 == 0);
@ -1989,7 +1989,7 @@ int av1_init_ref_mv_common(AV1_COMMON *cm,
const unsigned cur_poc,
const unsigned ref_poc[7],
const unsigned ref_ref_poc[7][7],
const WarpedMotionParams gmv[7],
const Dav1dWarpedMotionParams gmv[7],
const int allow_hp,
const int force_int_mv,
const int allow_ref_frame_mvs,
@ -2003,7 +2003,7 @@ int av1_init_ref_mv_common(AV1_COMMON *cm,
const unsigned cur_poc,
const unsigned ref_poc[7],
const unsigned ref_ref_poc[7][7],
const WarpedMotionParams gmv[7],
const Dav1dWarpedMotionParams gmv[7],
const int allow_hp,
const int force_int_mv,
const int allow_ref_frame_mvs,


@ -45,7 +45,7 @@ int av1_init_ref_mv_common(AV1_COMMON *cm,
unsigned cur_poc,
const unsigned ref_poc[7],
const unsigned ref_ref_poc[7][7],
const WarpedMotionParams gmv[7],
const Dav1dWarpedMotionParams gmv[7],
int allow_hp, int force_int_mv,
int allow_ref_frame_mvs, int order_hint);
@ -156,7 +156,7 @@ static inline void splat_intraref(refmvs *r, const ptrdiff_t stride,
} while (--bh4);
}
static inline void fix_mv_precision(const Av1FrameHeader *const hdr,
static inline void fix_mv_precision(const Dav1dFrameHeader *const hdr,
mv *const mv)
{
if (hdr->force_integer_mv) {


@ -418,35 +418,35 @@ const uint8_t /* enum TxClass */ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL] = {
[WHT_WHT] = TX_CLASS_2D,
};
const uint8_t /* enum Filter2d */ dav1d_filter_2d[N_FILTERS][N_FILTERS] = {
[FILTER_8TAP_REGULAR] = {
[FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_REGULAR,
[FILTER_8TAP_SHARP] = FILTER_2D_8TAP_REGULAR_SHARP,
[FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_REGULAR_SMOOTH,
}, [FILTER_8TAP_SHARP] = {
[FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SHARP_REGULAR,
[FILTER_8TAP_SHARP] = FILTER_2D_8TAP_SHARP,
[FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_SHARP_SMOOTH,
}, [FILTER_8TAP_SMOOTH] = {
[FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SMOOTH_REGULAR,
[FILTER_8TAP_SHARP] = FILTER_2D_8TAP_SMOOTH_SHARP,
[FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_SMOOTH,
}, [FILTER_BILINEAR] = {
[FILTER_BILINEAR] = FILTER_2D_BILINEAR,
const uint8_t /* enum Filter2d */ dav1d_filter_2d[DAV1D_N_FILTERS][DAV1D_N_FILTERS] = {
[DAV1D_FILTER_8TAP_REGULAR] = {
[DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_REGULAR,
[DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_REGULAR_SHARP,
[DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_REGULAR_SMOOTH,
}, [DAV1D_FILTER_8TAP_SHARP] = {
[DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SHARP_REGULAR,
[DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_SHARP,
[DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_SHARP_SMOOTH,
}, [DAV1D_FILTER_8TAP_SMOOTH] = {
[DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SMOOTH_REGULAR,
[DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_SMOOTH_SHARP,
[DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_SMOOTH,
}, [DAV1D_FILTER_BILINEAR] = {
[DAV1D_FILTER_BILINEAR] = FILTER_2D_BILINEAR,
}
};
const uint8_t /* enum FilterMode */ dav1d_filter_dir[N_2D_FILTERS][2] = {
[FILTER_2D_8TAP_REGULAR] = { FILTER_8TAP_REGULAR, FILTER_8TAP_REGULAR },
[FILTER_2D_8TAP_REGULAR_SMOOTH] = { FILTER_8TAP_SMOOTH, FILTER_8TAP_REGULAR },
[FILTER_2D_8TAP_REGULAR_SHARP] = { FILTER_8TAP_SHARP, FILTER_8TAP_REGULAR },
[FILTER_2D_8TAP_SHARP_REGULAR] = { FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP },
[FILTER_2D_8TAP_SHARP_SMOOTH] = { FILTER_8TAP_SMOOTH, FILTER_8TAP_SHARP },
[FILTER_2D_8TAP_SHARP] = { FILTER_8TAP_SHARP, FILTER_8TAP_SHARP },
[FILTER_2D_8TAP_SMOOTH_REGULAR] = { FILTER_8TAP_REGULAR, FILTER_8TAP_SMOOTH },
[FILTER_2D_8TAP_SMOOTH] = { FILTER_8TAP_SMOOTH, FILTER_8TAP_SMOOTH },
[FILTER_2D_8TAP_SMOOTH_SHARP] = { FILTER_8TAP_SHARP, FILTER_8TAP_SMOOTH },
[FILTER_2D_BILINEAR] = { FILTER_BILINEAR, FILTER_BILINEAR },
const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2] = {
[FILTER_2D_8TAP_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR },
[FILTER_2D_8TAP_REGULAR_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_REGULAR },
[FILTER_2D_8TAP_REGULAR_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR },
[FILTER_2D_8TAP_SHARP_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP },
[FILTER_2D_8TAP_SHARP_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SHARP },
[FILTER_2D_8TAP_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SHARP },
[FILTER_2D_8TAP_SMOOTH_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH },
[FILTER_2D_8TAP_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SMOOTH },
[FILTER_2D_8TAP_SMOOTH_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH },
[FILTER_2D_BILINEAR] = { DAV1D_FILTER_BILINEAR, DAV1D_FILTER_BILINEAR },
};
const uint8_t dav1d_filter_mode_to_y_mode[5] = {
@ -481,8 +481,8 @@ const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES] = {
[BS_8x8] = 0,
};
const WarpedMotionParams dav1d_default_wm_params = {
.type = WM_TYPE_IDENTITY,
const Dav1dWarpedMotionParams dav1d_default_wm_params = {
.type = DAV1D_WM_TYPE_IDENTITY,
.matrix = {
0, 0, 1 << 16,
0, 0, 1 << 16,
@ -524,7 +524,7 @@ const int dav1d_sgr_x_by_xplus1[256] = {
};
const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
[FILTER_8TAP_REGULAR] = {
[DAV1D_FILTER_8TAP_REGULAR] = {
{ 0, 1, -3, 63, 4, -1, 0, 0 },
{ 0, 1, -5, 61, 9, -2, 0, 0 },
{ 0, 1, -6, 58, 14, -4, 1, 0 },
@ -540,7 +540,7 @@ const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
{ 0, 1, -4, 14, 58, -6, 1, 0 },
{ 0, 0, -2, 9, 61, -5, 1, 0 },
{ 0, 0, -1, 4, 63, -3, 1, 0 }
}, [FILTER_8TAP_SMOOTH] = {
}, [DAV1D_FILTER_8TAP_SMOOTH] = {
{ 0, 1, 14, 31, 17, 1, 0, 0 },
{ 0, 0, 13, 31, 18, 2, 0, 0 },
{ 0, 0, 11, 31, 20, 2, 0, 0 },
@ -556,7 +556,7 @@ const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
{ 0, 0, 2, 20, 31, 11, 0, 0 },
{ 0, 0, 2, 18, 31, 13, 0, 0 },
{ 0, 0, 1, 17, 31, 14, 1, 0 }
}, [FILTER_8TAP_SHARP] = {
}, [DAV1D_FILTER_8TAP_SHARP] = {
{ -1, 1, -3, 63, 4, -1, 1, 0 },
{ -1, 3, -6, 62, 8, -3, 2, -1 },
{ -1, 4, -9, 60, 13, -5, 3, -1 },
@ -573,7 +573,7 @@ const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
{ -1, 2, -3, 8, 62, -6, 3, -1 },
{ 0, 1, -1, 4, 63, -3, 1, -1 }
/* width <= 4 */
}, [3 + FILTER_8TAP_REGULAR] = {
}, [3 + DAV1D_FILTER_8TAP_REGULAR] = {
{ 0, 0, -2, 63, 4, -1, 0, 0 },
{ 0, 0, -4, 61, 9, -2, 0, 0 },
{ 0, 0, -5, 58, 14, -3, 0, 0 },
@ -589,7 +589,7 @@ const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
{ 0, 0, -3, 14, 58, -5, 0, 0 },
{ 0, 0, -2, 9, 61, -4, 0, 0 },
{ 0, 0, -1, 4, 63, -2, 0, 0 }
}, [3 + FILTER_8TAP_SMOOTH] = {
}, [3 + DAV1D_FILTER_8TAP_SMOOTH] = {
{ 0, 0, 15, 31, 17, 1, 0, 0 },
{ 0, 0, 13, 31, 18, 2, 0, 0 },
{ 0, 0, 11, 31, 20, 2, 0, 0 },
@ -712,6 +712,41 @@ const int8_t ALIGN(dav1d_mc_warp_filter[193][8], 8) = {
{ 0, 0, 2, -1, 0, 0, 127, 0 }
};
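// 8-tap coefficients for the super-res resize pass, one row per subpel phase
// (64 phases).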
const int16_t dav1d_resize_filter[64][8] = {
{ 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 },
{ 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 },
{ 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 },
{ -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 },
{ -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 },
{ -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 },
{ -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 },
{ -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 },
{ -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 },
{ -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 },
{ -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 },
{ -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 },
{ -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 },
{ -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 },
{ -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 },
{ -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 },
{ -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 },
{ -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 },
{ -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 },
{ -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 },
{ -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 },
{ -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 },
{ -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 },
{ -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 },
{ -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 },
{ -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 },
{ -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 },
{ -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 },
{ -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 },
{ 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 },
{ 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 },
{ 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 },
};
const uint8_t dav1d_sm_weights[128] = {
// Unused, because we always offset by bs, which is at least 2.
0, 0,
@ -837,3 +872,194 @@ const uint8_t ALIGN(dav1d_obmc_masks[64], 32) = {
31, 29, 28, 26, 24, 23, 21, 20, 19, 17, 16, 14, 13, 12, 11, 9,
8, 7, 6, 5, 4, 4, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0,
};
// Taken from the spec. Range is [-2048, 2047], mean is 0 and stddev is 512
const int16_t dav1d_gaussian_sequence[2048] = {
56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
-584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
-792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
-376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
-456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
-164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
-1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
-1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
-88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
-528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
-100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
-332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
-1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
-1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
-28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
-672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
-152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
-524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
-68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
-1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
-528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
-88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
-260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
-28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
-1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
-784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
-1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
-212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
-224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
-280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
-196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
-644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
-116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
-388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
-16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
-196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
-220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
-32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
-816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
-140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
-284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
-604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
-1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
-8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
-436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
-692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
-668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
-832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
-456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
-44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
-48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
-116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
-600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
-316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
-376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
-492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
-1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
-480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
-68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
-184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
-556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
-364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
-816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
-696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
-136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
-684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
-1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
-724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
-228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
-56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
-796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
-624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
-84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
-1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
-120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
-4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
-512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
-212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
-448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
-212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
-820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
-664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
-672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
-496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
-176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
-232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
-412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
-136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
-276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
-300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
-676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
428, -484
};

View File

@ -63,8 +63,8 @@ extern const uint8_t dav1d_nz_map_ctx_offset[N_RECT_TX_SIZES][5][5];
extern const uint8_t /* enum TxClass */
dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
extern const uint8_t /* enum Filter2d */
dav1d_filter_2d[N_FILTERS /* h */][N_FILTERS /* v */];
extern const uint8_t /* enum FilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
dav1d_filter_2d[DAV1D_N_FILTERS /* h */][DAV1D_N_FILTERS /* v */];
extern const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
extern const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES];
extern const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES];
@ -104,13 +104,14 @@ static const unsigned interintra_allowed_mask =
(1 << BS_8x16) |
(1 << BS_8x8);
extern const WarpedMotionParams dav1d_default_wm_params;
extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
extern const int16_t dav1d_sgr_params[16][4];
extern const int dav1d_sgr_x_by_xplus1[256];
extern const int8_t dav1d_mc_subpel_filters[5][15][8];
extern const int8_t dav1d_mc_warp_filter[193][8];
extern const int16_t dav1d_resize_filter[64][8];
extern const uint8_t dav1d_sm_weights[128];
extern const int16_t dav1d_dr_intra_derivative[90];
@ -118,4 +119,6 @@ extern const int8_t dav1d_filter_intra_taps[5][64];
extern const uint8_t dav1d_obmc_masks[64];
extern const int16_t dav1d_gaussian_sequence[2048]; // for fgs
#endif /* __DAV1D_SRC_TABLES_H__ */

View File

@ -41,7 +41,10 @@ void *dav1d_frame_task(void *const data) {
if (f->frame_thread.die) break;
pthread_mutex_unlock(&f->frame_thread.td.lock);
dav1d_decode_frame(f);
const int res = dav1d_decode_frame(f);
if (res)
memset(f->frame_thread.cf, 0,
sizeof(int32_t) * 3 * f->lf.mask_sz * 128 * 128);
pthread_mutex_lock(&f->frame_thread.td.lock);
f->n_tile_data = 0;
@ -79,7 +82,7 @@ void *dav1d_tile_task(void *const data) {
const int task_idx = fttd->num_tasks - fttd->tasks_left--;
pthread_mutex_unlock(&fttd->lock);
if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr.tiling.cols) {
if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) {
// we can (or in fact, if >, we need to) do full tile decoding.
// loopfilter happens in the main thread
Dav1dTileState *const ts = t->ts = &f->ts[task_idx];

View File

@ -78,7 +78,7 @@ static inline int resolve_divisor_32(const unsigned d, int *const shift) {
return div_lut[f];
}
int dav1d_get_shear_params(WarpedMotionParams *const wm) {
int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) {
const int32_t *const mat = wm->matrix;
if (mat[2] <= 0) return 1;
@ -129,7 +129,7 @@ static int get_mult_shift_diag(const int64_t px,
int dav1d_find_affine_int(const int (*pts)[2][2], const int np,
const int bw4, const int bh4,
const mv mv, WarpedMotionParams *const wm,
const mv mv, Dav1dWarpedMotionParams *const wm,
const int bx4, const int by4)
{
int32_t *const mat = wm->matrix;

View File

@ -30,8 +30,8 @@
#include "src/levels.h"
int dav1d_get_shear_params(WarpedMotionParams *wm);
int dav1d_get_shear_params(Dav1dWarpedMotionParams *wm);
int dav1d_find_affine_int(const int (*pts)[2][2], int np, int bw4, int bh4,
mv mv, WarpedMotionParams *wm, int by, int bx);
mv mv, Dav1dWarpedMotionParams *wm, int by, int bx);
#endif /* __DAV1D_SRC_WARPMV_H__ */

View File

@ -51,8 +51,11 @@ decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_avg_fn(dav1d_avg_avx2);
decl_avg_fn(dav1d_avg_ssse3);
decl_w_avg_fn(dav1d_w_avg_avx2);
decl_w_avg_fn(dav1d_w_avg_ssse3);
decl_mask_fn(dav1d_mask_avx2);
decl_mask_fn(dav1d_mask_ssse3);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
decl_blend_fn(dav1d_blend_avx2);
decl_blend_dir_fn(dav1d_blend_v_avx2);
@ -70,7 +73,18 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->mct[type] = dav1d_prep_##name##_##suffix
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
return;
#if BITDEPTH == 8 && ARCH_X86_64
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
c->mask = dav1d_mask_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
return;
#if BITDEPTH == 8 && ARCH_X86_64
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);

251 third_party/dav1d/src/x86/mc_ssse3.asm vendored Normal file
View File

@ -0,0 +1,251 @@
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 16
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
%macro BIDIR_JMP_TABLE 1-*
;evaluated at definition time (in loop below)
%xdefine %1_table (%%table - 2*%2)
%xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
; dynamically generated label
%%table:
%rep %0 - 1 ; repeat for num args
dd %%prefix %+ .w%2 - %%base
%rotate 1
%endrep
%endmacro
BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128
SECTION .text
INIT_XMM ssse3
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif
%macro BIDIR_FN 1 ; op
%1 0
lea stride3q, [strideq*3]
jmp wq
.w4_loop:
%1_INC_PTR 2
%1 0
lea dstq, [dstq+strideq*4]
.w4: ; tile 4x
movd [dstq ], m0 ; copy dw[0]
pshuflw m1, m0, q1032 ; swap dw[1] and dw[0]
movd [dstq+strideq*1], m1 ; copy dw[1]
punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0]
movd [dstq+strideq*2], m0 ; dw[2]
psrlq m0, 32 ; shift dw[3] into the low dword
movd [dstq+stride3q ], m0 ; copy dw[3]
sub hd, 4
jg .w4_loop
RET
.w8_loop:
%1_INC_PTR 2
%1 0
lea dstq, [dstq+strideq*2]
.w8:
movq [dstq ], m0
movhps [dstq+strideq*1], m0
sub hd, 2
jg .w8_loop
RET
.w16_loop:
%1_INC_PTR 2
%1 0
lea dstq, [dstq+strideq]
.w16:
mova [dstq ], m0
dec hd
jg .w16_loop
RET
.w32_loop:
%1_INC_PTR 4
%1 0
lea dstq, [dstq+strideq]
.w32:
mova [dstq ], m0
%1 2
mova [dstq + 16 ], m0
dec hd
jg .w32_loop
RET
.w64_loop:
%1_INC_PTR 8
%1 0
add dstq, strideq
.w64:
%assign i 0
%rep 4
mova [dstq + i*16 ], m0
%assign i i+1
%if i < 4
%1 2*i
%endif
%endrep
dec hd
jg .w64_loop
RET
.w128_loop:
%1_INC_PTR 16
%1 0
add dstq, strideq
.w128:
%assign i 0
%rep 8
mova [dstq + i*16 ], m0
%assign i i+1
%if i < 8
%1 2*i
%endif
%endrep
dec hd
jg .w128_loop
RET
%endmacro
%macro AVG 1 ; src_offset
; writes the average of the tmp1/tmp2 16-bit coefficients as 8-bit pixels
mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 16-bit coefficients from tmp1
paddw m0, [tmp2q+(%1+0)*mmsize] ; add the 8 matching coefficients from tmp2
mova m1, [tmp1q+(%1+1)*mmsize]
paddw m1, [tmp2q+(%1+1)*mmsize]
pmulhrsw m0, m2
pmulhrsw m1, m2
packuswb m0, m1 ; pack the 16-bit results in m0/m1 to 8-bit with unsigned saturation
%endmacro
%macro AVG_INC_PTR 1
add tmp1q, %1*mmsize
add tmp2q, %1*mmsize
%endmacro
cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
lea r6, [avg_ssse3_table]
tzcnt wd, wm ; trailing zeros, i.e. log2 of the (power-of-two) width
movifnidn hd, hm ; load h from the stack into a register if it is not already in one
movsxd wq, dword [r6+wq*4] ; sign-extend the jump-table entry for this tile width
mova m2, [pw_1024+r6-avg_ssse3_table] ; rounding constant for pmulhrsw
add wq, r6
BIDIR_FN AVG
%macro W_AVG 1 ; src_offset
; (a * weight + b * (16 - weight) + 128) >> 8
; = ((a - b) * weight + (b << 4) + 128) >> 8
; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
mova m0, [tmp2q+(%1+0)*mmsize]
psubw m2, m0, [tmp1q+(%1+0)*mmsize]
mova m1, [tmp2q+(%1+1)*mmsize]
psubw m3, m1, [tmp1q+(%1+1)*mmsize]
paddw m2, m2 ; compensate for the weight only being half
paddw m3, m3 ; of what it should be
pmulhw m2, m4 ; (b-a) * (-weight << 12)
pmulhw m3, m4 ; (b-a) * (-weight << 12)
paddw m0, m2 ; ((b-a) * -weight) + b
paddw m1, m3
pmulhrsw m0, m5
pmulhrsw m1, m5
packuswb m0, m1
%endmacro
%define W_AVG_INC_PTR AVG_INC_PTR
cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
lea r6, [w_avg_ssse3_table]
tzcnt wd, wm
movifnidn hd, hm
movd m0, r6m
pshuflw m0, m0, q0000
punpcklqdq m0, m0
movsxd wq, dword [r6+wq*4]
pxor m4, m4
psllw m0, 11 ; can't shift by 12, sign bit must be preserved
psubw m4, m0
mova m5, [pw_2048+r6-w_avg_ssse3_table]
add wq, r6
BIDIR_FN W_AVG
%macro MASK 1 ; src_offset
; (a * m + b * (64 - m) + 512) >> 10
; = ((a - b) * m + (b << 6) + 512) >> 10
; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
mova m3, [maskq+(%1+0)*(mmsize/2)]
mova m0, [tmp2q+(%1+0)*mmsize] ; b
psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
mova m6, m3 ; m
psubb m3, m4, m6 ; -m
paddw m1, m1 ; (b - a) << 1
paddb m3, m3 ; -m << 1
punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
pmulhw m1, m2 ; (-m * (b - a)) << 10
paddw m0, m1 ; + b
mova m1, [tmp2q+(%1+1)*mmsize] ; b
psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
paddw m2, m2 ; (b - a) << 1
mova m6, m3 ; (-m << 1)
punpckhbw m3, m4, m6 ; (-m << 9)
pmulhw m2, m3 ; (-m * (b - a)) << 10
paddw m1, m2 ; + b
pmulhrsw m0, m5 ; round
pmulhrsw m1, m5 ; round
packuswb m0, m1 ; interleave 16 -> 8
%endmacro
%macro MASK_INC_PTR 1
add maskq, %1*mmsize/2
add tmp1q, %1*mmsize
add tmp2q, %1*mmsize
%endmacro
cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
lea r7, [mask_ssse3_table]
tzcnt wd, wm
movifnidn hd, hm
mov maskq, maskmp
movsxd wq, dword [r7+wq*4]
pxor m4, m4
mova m5, [pw_2048+r7-mask_ssse3_table]
add wq, r7
BIDIR_FN MASK
%endif ; ARCH_X86_64

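For readers less fluent in SIMD assembly, the arithmetic implemented by the AVG, W_AVG and MASK macros above is the same as the scalar math spelled out in their comments, just performed on packed 16-bit intermediates produced by the prep functions. The following is a minimal scalar sketch of that arithmetic, assuming 8 bpc output, dav1d's 4 bits of intermediate precision, a weight in [0, 16] and a 6-bit blend mask in [0, 64]; clip_u8 and the *_ref names are illustrative only, not library API.

#include <stdint.h>

/* Clip an intermediate value into the 8-bit pixel range. */
static inline uint8_t clip_u8(const int v) {
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
}

/* AVG: (a + b + 16) >> 5, which is what pmulhrsw with pw_1024 computes. */
static void avg_ref(uint8_t *dst, const int16_t *t1, const int16_t *t2, int n) {
    for (int i = 0; i < n; i++)
        dst[i] = clip_u8((t1[i] + t2[i] + 16) >> 5);
}

/* W_AVG: (a * weight + b * (16 - weight) + 128) >> 8, per the macro comment. */
static void w_avg_ref(uint8_t *dst, const int16_t *t1, const int16_t *t2,
                      int n, int weight) {
    for (int i = 0; i < n; i++)
        dst[i] = clip_u8((t1[i] * weight + t2[i] * (16 - weight) + 128) >> 8);
}

/* MASK: (a * m + b * (64 - m) + 512) >> 10, per the macro comment. */
static void mask_ref(uint8_t *dst, const int16_t *t1, const int16_t *t2,
                     const uint8_t *m, int n) {
    for (int i = 0; i < n; i++)
        dst[i] = clip_u8((t1[i] * m[i] + t2[i] * (64 - m[i]) + 512) >> 10);
}
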
View File

@ -133,6 +133,7 @@ static struct {
unsigned cpu_flag;
const char *cpu_flag_name;
const char *test_name;
unsigned int seed;
} state;
/* float compare support code */
@ -413,6 +414,7 @@ static void check_cpu_flag(const char *const name, unsigned flag) {
for (int i = 0; tests[i].func; i++) {
if (state.test_name && strcmp(tests[i].name, state.test_name))
continue;
srand(state.seed);
state.current_test_name = tests[i].name;
tests[i].func();
}
@ -429,7 +431,7 @@ static void print_cpu_name(void) {
int main(int argc, char *argv[]) {
(void)func_new, (void)func_ref;
unsigned int seed = get_seed();
state.seed = get_seed();
int ret = 0;
while (argc > 1) {
@ -446,16 +448,21 @@ int main(int argc, char *argv[]) {
state.bench_pattern = "";
} else if (!strncmp(argv[1], "--test=", 7)) {
state.test_name = argv[1] + 7;
} else if (!strcmp(argv[1], "--list")) {
fprintf(stderr, "checkasm: available tests [");
for (int i = 0; tests[i].func; i++)
fprintf(stderr, "%s%s", i ? ", ": "", tests[i].name);
fprintf(stderr, "]\n");
return 0;
} else {
seed = strtoul(argv[1], NULL, 10);
state.seed = strtoul(argv[1], NULL, 10);
}
argc--;
argv++;
}
fprintf(stderr, "checkasm: using random seed %u\n", seed);
srand(seed);
fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
check_cpu_flag(NULL, 0);
for (int i = 0; cpus[i].flag; i++)
@ -521,6 +528,7 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
v->ok = 1;
v->cpu = state.cpu_flag;
state.current_func_ver = v;
srand(state.seed);
if (state.cpu_flag)
state.num_checked++;

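The checkasm changes above move the random seed into the global state and re-seed before each test and before each checked function version. The point is reproducibility: with a single srand() call at startup, the inputs a given test received depended on how many other tests ran first, so a failure seen under one CPU flag was hard to reproduce in isolation. A minimal sketch of the pattern (run_test and test_fn are illustrative names):

#include <stdlib.h>

static struct { unsigned seed; } state;      /* mirrors the checkasm state */

static void run_test(void (*test_fn)(void)) {
    srand(state.seed);   /* same input sequence for every test and CPU flag */
    test_fn();
}
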
View File

@ -46,6 +46,8 @@ static const char *const intra_pred_mode_names[N_IMPL_INTRA_PRED_MODES] = {
[FILTER_PRED] = "filter"
};
static const char *const cfl_ac_names[3] = { "420", "422", "444" };
static const char *const cfl_pred_mode_names[DC_128_PRED + 1] = {
[DC_PRED] = "cfl",
[DC_128_PRED] = "cfl_128",
@ -101,6 +103,42 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
report("intra_pred");
}
static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_32(int16_t, c_dst, 32 * 32,);
ALIGN_STK_32(int16_t, a_dst, 32 * 32,);
ALIGN_STK_32(pixel, luma, 32 * 32,);
declare_func(void, int16_t *ac, const pixel *y, ptrdiff_t stride,
int w_pad, int h_pad, int cw, int ch);
for (int layout = 1; layout <= DAV1D_PIXEL_LAYOUT_I444; layout++) {
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
for (int w = 4; w <= (32 >> ss_hor); w <<= 1)
if (check_func(c->cfl_ac[layout - 1], "cfl_ac_%s_w%d_%dbpc",
cfl_ac_names[layout - 1], w, BITDEPTH))
{
for (int h = imax(w / 4, 4); h <= imin(w * 4, (32 >> ss_ver)); h <<= 1) {
const ptrdiff_t stride = 32 * sizeof(pixel);
const int w_pad = rand() & ((w >> 2) - 1);
const int h_pad = rand() & ((h >> 2) - 1);
for (int y = 0; y < (h << ss_ver); y++)
for (int x = 0; x < (w << ss_hor); x++)
luma[y * 32 + x] = rand() & ((1 << BITDEPTH) - 1);
call_ref(c_dst, luma, stride, w_pad, h_pad, w, h);
call_new(a_dst, luma, stride, w_pad, h_pad, w, h);
if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
fail();
bench_new(a_dst, luma, stride, 0, 0, w, h);
}
}
}
report("cfl_ac");
}
static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_32(pixel, c_dst, 32 * 32,);
ALIGN_STK_32(pixel, a_dst, 32 * 32,);
@ -179,6 +217,7 @@ void bitfn(checkasm_check_ipred)(void) {
bitfn(dav1d_intra_pred_dsp_init)(&c);
check_intra_pred(&c);
check_cfl_ac(&c);
check_cfl_pred(&c);
check_pal_pred(&c);
}

View File

@ -141,7 +141,7 @@ static void check_lpf_sb(loopfilter_sb_fn fn, const char *const name,
for (int j = 0; j < n_blks; j++) {
const int idx = rand() % (i + 2);
if (idx) vmask[idx - 1] |= 1 << j;
if (idx) vmask[idx - 1] |= 1U << j;
if (dir) {
l[j][lf_idx] = rand() & 63;
l[j + 32][lf_idx] = rand() & 63;

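The 1U change above is an undefined-behaviour fix rather than a functional one: the vmask entries are 32-bit, and once j reaches 31 the signed expression 1 << j shifts into the sign bit of an int, which is undefined in C, while the unsigned form is well defined for all j in [0, 31]. A small illustration (set_bit is a hypothetical helper):

#include <stdint.h>

static uint32_t set_bit(uint32_t mask, int j) {
    return mask | (1U << j);   /* well defined for j in [0, 31] */
    /* mask | (1 << j) would be undefined behaviour when j == 31 */
}
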
View File

@ -25,12 +25,15 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <dav1d/dav1d.h>
#include "src/cpu.h"
#include "dav1d_fuzzer.h"
static unsigned r32le(const uint8_t *const p) {
@ -58,10 +61,18 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
Dav1dContext * ctx = NULL;
Dav1dPicture pic;
const uint8_t *ptr = data;
int have_seq_hdr = 0;
int err;
dav1d_version();
// memory sanitizer is inherently incompatible with asm
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
dav1d_set_cpu_flags_mask(0);
#endif
#endif
if (size < 32) goto end;
ptr += 32; // skip ivf header
@ -92,6 +103,17 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
if (!frame_size) continue;
if (!have_seq_hdr) {
Dav1dSequenceHeader seq = { 0 };
int err = dav1d_parse_sequence_header(&seq, ptr, frame_size);
// skip frames until we see a sequence header
if (err != 0) {
ptr += frame_size;
continue;
}
have_seq_hdr = 1;
}
// copy frame data to a new buffer to catch reads past the end of input
p = dav1d_data_create(&buf, frame_size);
if (!p) goto cleanup;

View File

@ -67,10 +67,15 @@ if is_asm_enabled
m_lib = cc.find_library('m', required: false)
libdav1d_nasm_objs_if_needed = []
if meson.version().version_compare('< 0.48.999')
libdav1d_nasm_objs_if_needed = libdav1d_nasm_objs
endif
checkasm = executable('checkasm',
checkasm_sources,
checkasm_nasm_objs,
libdav1d_nasm_objs,
libdav1d_nasm_objs_if_needed,
objects: [
checkasm_bitdepth_objs,

View File

@ -99,6 +99,23 @@ int main(const int argc, char *const *const argv) {
if (!cli_settings.quiet)
fprintf(stderr, "dav1d %s - by VideoLAN\n", DAV1D_VERSION);
// skip frames until a sequence header is found
if (cli_settings.skip) {
Dav1dSequenceHeader seq;
unsigned seq_skip = 0;
while (dav1d_parse_sequence_header(&seq, data.data, data.sz)) {
if ((res = input_read(in, &data)) < 0) {
input_close(in);
return res;
}
seq_skip++;
}
if (seq_skip && !cli_settings.quiet)
fprintf(stderr,
"skipped %u packets due to missing sequence header\n",
seq_skip);
}
//getc(stdin);
if (cli_settings.limit != 0 && cli_settings.limit < total)
total = cli_settings.limit;

View File

@ -48,6 +48,9 @@ enum {
ARG_FRAME_THREADS,
ARG_TILE_THREADS,
ARG_VERIFY,
ARG_FILM_GRAIN,
ARG_OPPOINT,
ARG_ALL_LAYERS,
};
static const struct option long_opts[] = {
@ -62,6 +65,9 @@ static const struct option long_opts[] = {
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
{ "verify", 1, NULL, ARG_VERIFY },
{ "filmgrain", 1, NULL, ARG_FILM_GRAIN },
{ "oppoint", 1, NULL, ARG_OPPOINT },
{ "alllayers", 1, NULL, ARG_ALL_LAYERS },
{ NULL, 0, NULL, 0 },
};
@ -86,6 +92,9 @@ static void usage(const char *const app, const char *const reason, ...) {
" --version/-v: print version and exit\n"
" --framethreads $num: number of frame threads (default: 1)\n"
" --tilethreads $num: number of tile threads (default: 1)\n"
" --filmgrain enable film grain application (default: 1, except if muxer is md5)\n"
" --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 32)\n"
" --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n"
" --verify $md5: verify decoded md5. implies --muxer md5, no output\n");
exit(1);
}
@ -124,8 +133,9 @@ void parse(const int argc, char *const *const argv,
memset(cli_settings, 0, sizeof(*cli_settings));
dav1d_default_settings(lib_settings);
int grain_specified = 0;
while ((o = getopt_long(argc, argv, short_opts, long_opts, NULL)) >= 0) {
while ((o = getopt_long(argc, argv, short_opts, long_opts, NULL)) != -1) {
switch (o) {
case 'o':
cli_settings->outputfile = optarg;
@ -159,14 +169,29 @@ void parse(const int argc, char *const *const argv,
case ARG_VERIFY:
cli_settings->verify = optarg;
break;
case ARG_FILM_GRAIN:
lib_settings->apply_grain =
!!parse_unsigned(optarg, ARG_FILM_GRAIN, argv[0]);
grain_specified = 1;
break;
case ARG_OPPOINT:
lib_settings->operating_point =
parse_unsigned(optarg, ARG_OPPOINT, argv[0]);
break;
case ARG_ALL_LAYERS:
lib_settings->all_layers =
!!parse_unsigned(optarg, ARG_ALL_LAYERS, argv[0]);
break;
case 'v':
fprintf(stderr, "%s\n", dav1d_version());
exit(0);
default:
break;
usage(argv[0], NULL);
}
}
if (optind < argc)
usage(argv[0], "Extra/unused arguments found, e.g. '%s'\n", argv[optind]);
if (cli_settings->verify) {
if (cli_settings->outputfile)
usage(argv[0], "Verification (--verify) requires output file (-o/--output) to not be set");
@ -178,6 +203,12 @@ void parse(const int argc, char *const *const argv,
cli_settings->muxer = "md5";
}
if (!grain_specified && cli_settings->muxer &&
!strcmp(cli_settings->muxer, "md5"))
{
lib_settings->apply_grain = 0;
}
if (!cli_settings->inputfile)
usage(argv[0], "Input file (-i/--input) is required");
if ((!cli_settings->muxer || strcmp(cli_settings->muxer, "null")) &&

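The three new switches map directly onto Dav1dSettings fields, as the parsing code above shows. A minimal sketch of the equivalent programmatic configuration, using example values only:

#include <dav1d/dav1d.h>

static void example_settings(Dav1dSettings *s) {
    dav1d_default_settings(s);
    s->apply_grain     = 0;  /* --filmgrain 0: skip film grain synthesis */
    s->operating_point = 1;  /* --oppoint 1: operating point 1 of a scalable stream */
    s->all_layers      = 0;  /* --alllayers 0: output only the selected spatial layer */
}
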
View File

@ -36,6 +36,10 @@
#include "input/demuxer.h"
#ifdef _MSC_VER
#define ftello _ftelli64
#endif
typedef struct DemuxerPriv {
FILE *f;
} IvfInputContext;
@ -44,6 +48,10 @@ static unsigned rl32(const uint8_t *const p) {
return ((uint32_t)p[3] << 24U) | (p[2] << 16U) | (p[1] << 8U) | p[0];
}
static int64_t rl64(const uint8_t *const p) {
return (((uint64_t) rl32(&p[4])) << 32) | rl32(p);
}
static int ivf_open(IvfInputContext *const c, const char *const file,
unsigned fps[2], unsigned *const num_frames)
{
@ -87,16 +95,20 @@ static int ivf_open(IvfInputContext *const c, const char *const file,
}
static int ivf_read(IvfInputContext *const c, Dav1dData *const buf) {
uint8_t data[4];
uint8_t data[8];
uint8_t *ptr;
int res;
const int64_t off = ftello(c->f);
if ((res = fread(data, 4, 1, c->f)) != 1)
return -1; // EOF
fseek(c->f, 8, SEEK_CUR); // skip timestamp
const ptrdiff_t sz = rl32(data);
if ((res = fread(data, 8, 1, c->f)) != 1)
return -1; // EOF
ptr = dav1d_data_create(buf, sz);
if (!ptr) return -1;
buf->m.offset = off;
buf->m.timestamp = rl64(data);
if ((res = fread(ptr, sz, 1, c->f)) != 1) {
fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno));
dav1d_data_unref(buf);

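The demuxer change above stops skipping the IVF timestamp and instead forwards it to the decoder via buf->m.timestamp. Each IVF frame is preceded by a 12-byte little-endian header: a 4-byte frame size followed by an 8-byte timestamp, which is what the two fread calls plus rl32/rl64 parse. A rough standalone sketch of that parse, assuming the same layout (read_ivf_frame_header is an illustrative name):

#include <stdint.h>
#include <stdio.h>

static int read_ivf_frame_header(FILE *f, uint32_t *size, uint64_t *ts) {
    uint8_t hdr[12];
    if (fread(hdr, 12, 1, f) != 1) return -1;            /* EOF or short read */
    *size = (uint32_t)hdr[0]       | (uint32_t)hdr[1] << 8 |
            (uint32_t)hdr[2] << 16 | (uint32_t)hdr[3] << 24;
    *ts = 0;
    for (int i = 7; i >= 0; i--)                          /* little-endian u64 */
        *ts = *ts << 8 | hdr[4 + i];
    return 0;
}
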
View File

@ -148,7 +148,8 @@ static void md5_update(MD5Context *const md5, const uint8_t *data, unsigned len)
}
while (len >= 64) {
md5_body(md5, data);
memcpy(md5->data, data, 64);
md5_body(md5, md5->data);
md5->len += 64;
data += 64;
len -= 64;

View File

@ -37,6 +37,8 @@
typedef struct MuxerPriv {
FILE *f;
int first;
unsigned fps[2];
} Y4m2OutputContext;
static int y4m2_open(Y4m2OutputContext *const c, const char *const file,
@ -49,6 +51,14 @@ static int y4m2_open(Y4m2OutputContext *const c, const char *const file,
return -1;
}
c->first = 1;
c->fps[0] = fps[0];
c->fps[1] = fps[1];
return 0;
}
static int write_header(Y4m2OutputContext *const c, const Dav1dPicture *const p) {
static const char *const ss_names[][2] = {
[DAV1D_PIXEL_LAYOUT_I400] = { "mono", "mono10" },
[DAV1D_PIXEL_LAYOUT_I420] = { NULL, "420p10" },
@ -59,20 +69,26 @@ static int y4m2_open(Y4m2OutputContext *const c, const char *const file,
static const char *const chr_names_8bpc_i420[] = {
[DAV1D_CHR_UNKNOWN] = "420jpeg",
[DAV1D_CHR_VERTICAL] = "420mpeg2",
[DAV1D_CHR_COLOCATED] = "420paldv"
[DAV1D_CHR_COLOCATED] = "420"
};
const char *const ss_name = p->layout == DAV1D_PIXEL_LAYOUT_I420 && p->bpc == 8 ?
chr_names_8bpc_i420[p->chr > 2 ? DAV1D_CHR_UNKNOWN : p->chr] :
ss_names[p->layout][p->bpc > 8];
const char *const ss_name =
p->p.layout == DAV1D_PIXEL_LAYOUT_I420 && p->p.bpc == 8 ?
chr_names_8bpc_i420[p->seq_hdr->chr > 2 ? DAV1D_CHR_UNKNOWN : p->seq_hdr->chr] :
ss_names[p->p.layout][p->p.bpc > 8];
fprintf(c->f, "YUV4MPEG2 W%d H%d C%s Ip F%d:%d\n",
p->w, p->h, ss_name, fps[0], fps[1]);
fprintf(c->f, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s\n",
p->p.w, p->p.h, c->fps[0], c->fps[1], ss_name);
return 0;
}
static int y4m2_write(Y4m2OutputContext *const c, Dav1dPicture *const p) {
if (c->first) {
c->first = 0;
const int res = write_header(c, p);
if (res < 0) return res;
}
fprintf(c->f, "FRAME\n");
uint8_t *ptr;