Bug 1582743 - Update dav1d from upstream to commit c0865f3. r=TD-Linux

Differential Revision: https://phabricator.services.mozilla.com/D46762

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Alex Chronopoulos 2019-09-23 18:02:33 +00:00
parent 5be5470fba
commit ac0da8b368
87 changed files with 13062 additions and 5655 deletions

View File

@ -82,6 +82,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
# an error when it compiles empty files.
SOURCES += [
'../../../third_party/dav1d/src/x86/cdef.asm',
'../../../third_party/dav1d/src/x86/film_grain.asm',
'../../../third_party/dav1d/src/x86/ipred.asm',
'../../../third_party/dav1d/src/x86/itx.asm',
'../../../third_party/dav1d/src/x86/loopfilter.asm',
@ -94,6 +95,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/cpuid.asm',
'../../../third_party/dav1d/src/x86/ipred_ssse3.asm',
'../../../third_party/dav1d/src/x86/itx_ssse3.asm',
'../../../third_party/dav1d/src/x86/loopfilter_ssse3.asm',
'../../../third_party/dav1d/src/x86/looprestoration_ssse3.asm',
'../../../third_party/dav1d/src/x86/mc_ssse3.asm',
'../../../third_party/dav1d/src/x86/msac.asm',
@ -103,6 +105,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
relative_path = '../../../third_party/dav1d/src/x86/'
bitdepth_basenames = [
'cdef_init_tmpl.c',
'film_grain_init_tmpl.c',
'ipred_init_tmpl.c',
'itx_init_tmpl.c',
'loopfilter_init_tmpl.c',

View File

@ -1,7 +1,7 @@
#define API_VERSION_NUMBER 2,0,0,0
#define API_VERSION_NUMBER_STR "2.0.0"
#define PROJECT_VERSION_NUMBER 0,3,1,0
#define PROJECT_VERSION_NUMBER_STR "0.3.1"
#define API_VERSION_NUMBER 3,0,0,0
#define API_VERSION_NUMBER_STR "3.0.0"
#define PROJECT_VERSION_NUMBER 0,4,0,0
#define PROJECT_VERSION_NUMBER_STR "0.4.0"
#include <windows.h>

View File

@ -124,6 +124,7 @@ relative_path = '../../third_party/dav1d/src/'
bitdepth_basenames = [
'cdef_apply_tmpl.c',
'cdef_tmpl.c',
'fg_apply_tmpl.c',
'film_grain_tmpl.c',
'ipred_prepare_tmpl.c',
'ipred_tmpl.c',
@ -163,6 +164,7 @@ SOURCES += [
EXPORTS.dav1d.src += [
'../../third_party/dav1d/src/cdef.h',
'../../third_party/dav1d/src/cdef_apply.h',
'../../third_party/dav1d/src/fg_apply.h',
'../../third_party/dav1d/src/ipred.h',
'../../third_party/dav1d/src/ipred_prepare.h',
'../../third_party/dav1d/src/itx.h',

View File

@ -20,7 +20,7 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit c138435f5aee794ff9d9ac23c3718017927f2e20 (2019-07-17T12:39:10.000Z).
release: commit c0865f35c74bdcc71021630f64dca2db35d2bc8c (2019-09-19T12:07:23.000+02:00).
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View File

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.3.1-69-gc138435"
#define DAV1D_VERSION "0.4.0-49-gc0865f3"

View File

@ -27,7 +27,7 @@
#ifndef DAV1D_VERSION_H
#define DAV1D_VERSION_H
#define DAV1D_API_VERSION_MAJOR 2
#define DAV1D_API_VERSION_MAJOR 3
#define DAV1D_API_VERSION_MINOR 0
#define DAV1D_API_VERSION_PATCH 0

View File

@ -269,6 +269,7 @@ build-debian-ppc64le:
test-debian:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
@ -289,6 +290,7 @@ test-debian:
test-debian-unaligned-stack:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
@ -309,6 +311,7 @@ test-debian-unaligned-stack:
test-debian-asan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
@ -331,6 +334,7 @@ test-debian-asan:
test-debian-msan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
@ -353,6 +357,7 @@ test-debian-msan:
test-debian-ubsan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
@ -375,6 +380,7 @@ test-debian-ubsan:
test-win64:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-win64"]
tags:
- debian
- amd64
@ -399,6 +405,7 @@ test-win64:
test-debian-aarch64:
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
stage: test
needs: ["build-debian-aarch64"]
tags:
- aarch64
- debian
@ -421,6 +428,7 @@ test-debian-aarch64:
test-debian-ppc64le:
image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20190606105121
stage: test
needs: ["build-debian-ppc64le"]
tags:
- ppc64le
- docker
@ -443,6 +451,7 @@ test-debian-ppc64le:
test-debian-armv7-clang-5:
stage: test
image: registry.videolan.org:5000/dav1d-debian-unstable-armv7:20190202101732
needs: ["build-debian-armv7-clang-5"]
tags:
- armv7
- debian

View File

@ -6,6 +6,13 @@ Changes for 0.4.0 'Cheetah':
- SSE2 and ARM64 optimizations for MSAC
- Improve speed on 32bits systems
- Optimization in obmc blend
- Reduce RAM usage significantly
- The initial PPC SIMD code, cdef_filter
- NEON optimizations for blend functions on ARM
- NEON optimizations for w_mask functions on ARM
- NEON optimizations for inverse transforms on ARM64
- Improve handling of malloc failures
- Simple Player example in tools
Changes for 0.3.1 'Sailfish':

1087
third_party/dav1d/examples/dav1dplay.c vendored Normal file

File diff suppressed because it is too large Load Diff

62
third_party/dav1d/examples/meson.build vendored Normal file
View File

@ -0,0 +1,62 @@
# Copyright © 2018, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Build definition for the dav1d examples
#
# Leave subdir if examples are disabled
if not get_option('enable_examples')
subdir_done()
endif
# dav1d player sources
dav1dplay_sources = files(
'dav1dplay.c',
)
sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: false)
if sdl2_dependency.found()
placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false)
vulkan_dependency = dependency('vulkan', required: false)
sdl_has_vulkan = cc.has_header('SDL_vulkan.h', dependencies: [sdl2_dependency])
cflag_placebo = []
deps_placebo = []
if placebo_dependency.found() and vulkan_dependency.found() and sdl_has_vulkan
cflag_placebo += '-DHAVE_PLACEBO_VULKAN=1'
deps_placebo = [vulkan_dependency, placebo_dependency]
endif
dav1dplay = executable('dav1dplay',
dav1dplay_sources,
rev_target,
link_with : [libdav1d, dav1d_input_objs],
include_directories : [dav1d_inc_dirs],
dependencies : [getopt_dependency, sdl2_dependency, deps_placebo],
install : true,
c_args : cflag_placebo,
)
endif

View File

@ -46,7 +46,7 @@
/* x86-64 needs 32-byte alignment for AVX2. */
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
/* ARM doesn't benefit from anything more than 16-byte alignment. */
#define ALIGN_32_VAL 16
#define ALIGN_16_VAL 16
@ -92,6 +92,14 @@
#define NOINLINE __attribute__((noinline))
#endif /* !_MSC_VER */
#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
#elif defined(NDEBUG) && defined(_MSC_VER)
#define assert __assume
#else
#include <assert.h>
#endif
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
# define dav1d_uninit(x) x=x
#else

View File

@ -40,6 +40,14 @@ static inline int imin(const int a, const int b) {
return a < b ? a : b;
}
static inline unsigned umax(const unsigned a, const unsigned b) {
return a > b ? a : b;
}
static inline unsigned umin(const unsigned a, const unsigned b) {
return a < b ? a : b;
}
static inline int iclip(const int v, const int min, const int max) {
return v < min ? min : v > max ? max : v;
}

View File

@ -28,13 +28,14 @@
#ifndef DAV1D_COMMON_MEM_H
#define DAV1D_COMMON_MEM_H
#include <assert.h>
#include <stdlib.h>
#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
#include <malloc.h>
#endif
#include "common/attributes.h"
/*
* Allocate 32-byte aligned memory. The return value can be released
* by calling the standard free() function.

View File

@ -28,6 +28,8 @@
#ifndef DAV1D_HEADERS_H
#define DAV1D_HEADERS_H
#include <stddef.h>
// Constants from Section 3. "Symbols and abbreviated terms"
#define DAV1D_MAX_CDEF_STRENGTHS 8
#define DAV1D_MAX_OPERATING_POINTS 32
@ -176,6 +178,13 @@ typedef struct Dav1dMasteringDisplay {
uint32_t min_luminance;
} Dav1dMasteringDisplay;
typedef struct Dav1dITUTT35 {
uint8_t country_code;
uint8_t country_code_extension_byte;
size_t payload_size;
uint8_t *payload;
} Dav1dITUTT35;
typedef struct Dav1dSequenceHeader {
/**
* Stream profile, 0 for 8-10 bits/component 4:2:0 or monochrome;
@ -289,7 +298,7 @@ typedef struct Dav1dLoopfilterModeRefDeltas {
} Dav1dLoopfilterModeRefDeltas;
typedef struct Dav1dFilmGrainData {
uint16_t seed;
unsigned seed;
int num_y_points;
uint8_t y_points[14][2 /* value, scaling */];
int chroma_scaling_from_luma;

View File

@ -77,9 +77,16 @@ typedef struct Dav1dPicture {
* this picture, as defined in section 5.8.4 and 6.7.4
*/
Dav1dMasteringDisplay *mastering_display;
/**
* ITU-T T.35 metadata as defined in section 5.8.2 and 6.7.2
*/
Dav1dITUTT35 *itut_t35;
uintptr_t reserved[4]; ///< reserved for future use
struct Dav1dRef *frame_hdr_ref, *seq_hdr_ref; ///< Frame parameter allocation origins
struct Dav1dRef *content_light_ref, *mastering_display_ref; ///< Metadata allocation origins
struct Dav1dRef *content_light_ref, *mastering_display_ref, *itut_t35_ref; ///< Metadata allocation origins
uintptr_t reserved_ref[4]; ///< reserved for future use
struct Dav1dRef *ref; ///< Frame data allocation origin
void *allocator_data; ///< pointer managed by the allocator

View File

@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.3.1',
version: '0.4.0',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
dav1d_soname_version = '2.0.0'
dav1d_soname_version = '3.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@ -88,6 +88,11 @@ optional_arguments = []
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
if host_machine.system() == 'darwin'
test_args += '-D_DARWIN_C_SOURCE'
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
endif
if host_machine.system() == 'windows'
cdata.set('_WIN32_WINNT', '0x0601')
cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
@ -389,4 +394,6 @@ subdir('src')
subdir('tools')
subdir('examples')
subdir('tests')

View File

@ -15,6 +15,11 @@ option('enable_tools',
value: true,
description: 'Build dav1d cli tools')
option('enable_examples',
type: 'boolean',
value: false,
description: 'Build dav1d examples')
option('enable_tests',
type: 'boolean',
value: true,

View File

@ -91,6 +91,7 @@ function \type\()_8bpc_neon, export=1
\type d16, d17, q0, q1, q2, q3
add r12, r12, r4
bx r12
.align 2
L(\type\()_tbl):
.word 1280f - L(\type\()_tbl) + CONFIG_THUMB
@ -99,6 +100,7 @@ L(\type\()_tbl):
.word 160f - L(\type\()_tbl) + CONFIG_THUMB
.word 80f - L(\type\()_tbl) + CONFIG_THUMB
.word 4f - L(\type\()_tbl) + CONFIG_THUMB
4:
add r6, r0, r1
lsl r1, r1, #1
@ -217,17 +219,17 @@ bidir_fn mask
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
push {r4-r10,lr}
ldr r4, [sp, #32]
ldr r5, [sp, #36]
ldr r6, [sp, #40]
ldr r7, [sp, #44]
push {r4-r9,lr}
ldr r4, [sp, #28]
ldr r5, [sp, #32]
ldr r6, [sp, #36]
ldr r7, [sp, #40]
clz r8, r4
adr r9, L(w_mask_\type\()_tbl)
sub r8, r8, #24
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
mov r12, #6903
movw r12, #6903
vdup.16 q14, r12
.if \type == 444
vmov.i8 q15, #64
@ -243,6 +245,7 @@ function w_mask_\type\()_8bpc_neon, export=1
add r12, r0, r1
lsl r1, r1, #1
bx r9
.align 2
L(w_mask_\type\()_tbl):
.word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
@ -251,9 +254,10 @@ L(w_mask_\type\()_tbl):
.word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
.word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
.word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
4:
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1 (four rows at once)
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2 (four rows at once)
vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1 (four rows at once)
vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2 (four rows at once)
subs r5, r5, #4
vsub.i16 q8, q2, q0 // tmp2-tmp1
vsub.i16 q9, q3, q1
@ -275,30 +279,30 @@ L(w_mask_\type\()_tbl):
vmovn.u16 d20, q10 // 64 - m
vmovn.u16 d21, q11
vsub.i8 q10, q15, q10 // m
vst1.8 {d20, d21}, [r6]!
vst1.8 {d20, d21}, [r6, :128]!
.elseif \type == 422
vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition)
vpadd.s16 d21, d22, d23
vmovn.s16 d6, q10
vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
vst1.8 {d6}, [r6]!
vst1.8 {d6}, [r6, :64]!
.elseif \type == 420
vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition)
vadd.s16 d21, d22, d23
vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vst1.32 {d20[0]}, [r6]!
vst1.32 {d20[0]}, [r6, :32]!
.endif
vst1.32 {d24[0]}, [r0], r1
vst1.32 {d24[1]}, [r12], r1
vst1.32 {d25[0]}, [r0], r1
vst1.32 {d25[1]}, [r12], r1
vst1.32 {d24[0]}, [r0, :32], r1
vst1.32 {d24[1]}, [r12, :32], r1
vst1.32 {d25[0]}, [r0, :32], r1
vst1.32 {d25[1]}, [r12, :32], r1
bgt 4b
pop {r4-r10,pc}
pop {r4-r9,pc}
8:
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1y1, tmp1y2
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2y1, tmp2y2
vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1, tmp1y2
vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1, tmp2y2
subs r5, r5, #2
vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1
vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2
@ -320,43 +324,42 @@ L(w_mask_\type\()_tbl):
vmovn.u16 d20, q10 // 64 - m
vmovn.u16 d21, q11
vsub.i8 q10, q15, q10 // m
vst1.8 {d20, d21}, [r6]!
vst1.8 {d20, d21}, [r6, :128]!
.elseif \type == 422
vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
vpadd.s16 d21, d22, d23 // (64 - my2) + (64 - ny2)
vmovn.s16 d20, q10
vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1
vst1.8 {d20}, [r6]!
vst1.8 {d20}, [r6, :64]!
.elseif \type == 420
vadd.s16 q10, q10, q11 // (64 - my1) + (64 - my2) (row wise addition)
vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vst1.32 {d20[0]}, [r6]!
vst1.32 {d20[0]}, [r6, :32]!
.endif
vst1.16 {d24}, [r0], r1
vst1.16 {d25}, [r12], r1
vst1.16 {d24}, [r0, :64], r1
vst1.16 {d25}, [r12, :64], r1
bgt 8b
pop {r4-r10,pc}
pop {r4-r9,pc}
1280:
640:
320:
160:
sub r1, r1, r4
.if \type == 444
add r10, r6, r4
add lr, r6, r4
.elseif \type == 422
add r10, r6, r4, lsr #1
add lr, r6, r4, lsr #1
.endif
mov lr, r7
add r9, r3, r4, lsl #1
add r7, r2, r4, lsl #1
161:
mov r8, r4
16:
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1y1
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2y1
vld1.16 {d16, d17, d18, d19}, [r7]! // tmp1y2
vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1
vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1
vld1.16 {d16, d17, d18, d19}, [r7, :128]! // tmp1y2
subs r8, r8, #16
vsub.i16 q2, q2, q0 // tmp2y1 - tmp1y1
vsub.i16 q3, q3, q1
@ -372,24 +375,24 @@ L(w_mask_\type\()_tbl):
vqdmulh.s16 q13, q13, q3
vadd.i16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
vadd.i16 q13, q13, q1
vld1.16 {d0, d1, d2, d3}, [r9]! // tmp2h2
vld1.16 {d0, d1, d2, d3}, [r9, :128]! // tmp2h2
.if \type == 444
vmovn.u16 d20, q10 // 64 - my1
vmovn.u16 d21, q11
vsub.i8 q10, q15, q10 // my1
vst1.8 {d20, d21}, [r6]!
vst1.8 {d20, d21}, [r6, :128]!
.elseif \type == 422
vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
vpadd.s16 d21, d22, d23
vmovn.s16 d20, q10
vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1
vst1.8 {d20}, [r6]!
vst1.8 {d20}, [r6, :64]!
.endif
vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
vqrshrun.s16 d25, q13, #4
vsub.i16 q0, q0, q8 // tmp2y2 - tmp1y2
vsub.i16 q1, q1, q9
vst1.16 {d24, d25}, [r0]! // store dsty1
vst1.16 {d24, d25}, [r0, :128]! // store dsty1
vabs.s16 q2, q0 // abs(tmp2y2 - tmp1y2)
vabs.s16 q3, q1
vqsub.u16 q2, q14, q2 // 6903 - abs(tmp2y2 - tmp1y2)
@ -402,13 +405,13 @@ L(w_mask_\type\()_tbl):
vmovn.u16 d4, q2 // 64 - my2
vmovn.u16 d5, q3
vsub.i8 q2, q15, q2 // my2
vst1.8 {d4, d5}, [r10]!
vst1.8 {d4, d5}, [lr, :128]!
.elseif \type == 422
vpadd.s16 d4, d4, d5 // (64 - my2) + (64 - ny2) (column wise addition)
vpadd.s16 d5, d6, d7
vmovn.s16 d4, q2
vhsub.u8 d4, d30, d4 // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1
vst1.8 {d4}, [r10]!
vst1.8 {d4}, [lr, :64]!
.elseif \type == 420
vadd.s16 q10, q10, q2 // (64 - my1) + (64 - my2) (row wise addition)
vadd.s16 q11, q11, q3
@ -416,7 +419,7 @@ L(w_mask_\type\()_tbl):
vpadd.s16 d21, d22, d23
vsub.s16 q10, q15, q10 // (256 - sign) - ((128 - m) + (128 - n))
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vst1.8 {d20}, [r6]!
vst1.8 {d20}, [r6, :64]!
.endif
vqdmulh.s16 q12, q12, q0 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
vqdmulh.s16 q13, q13, q1
@ -424,7 +427,7 @@ L(w_mask_\type\()_tbl):
vadd.i16 q13, q13, q9
vqrshrun.s16 d24, q12, #4 // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
vqrshrun.s16 d25, q13, #4
vst1.16 {d24, d25}, [r12]! // store dsty2
vst1.16 {d24, d25}, [r12, :128]! // store dsty2
bgt 16b
subs r5, r5, #2
add r2, r2, r4, lsl #1
@ -433,15 +436,15 @@ L(w_mask_\type\()_tbl):
add r9, r9, r4, lsl #1
.if \type == 444
add r6, r6, r4
add r10, r10, r4
add lr, lr, r4
.elseif \type == 422
add r6, r6, r4, lsr #1
add r10, r10, r4, lsr #1
add lr, lr, r4, lsr #1
.endif
add r0, r0, r1
add r12, r12, r1
bgt 161b
pop {r4-r10,pc}
pop {r4-r9,pc}
endfunc
.endm
@ -451,15 +454,16 @@ w_mask_fn 420
function blend_8bpc_neon, export=1
push {r4-r8,lr}
ldr r4, [sp, #24]
ldr r5, [sp, #28]
clz r6, r3
adr r7, L(blend_tbl)
sub r6, r6, #26
ldr r6, [r7, r6, lsl #2]
add r7, r7, r6
bx r7
push {r4-r5,lr}
ldr r4, [sp, #12]
ldr r5, [sp, #16]
clz lr, r3
adr r3, L(blend_tbl)
sub lr, lr, #26
ldr lr, [r3, lr, lsl #2]
add r3, r3, lr
bx r3
.align 2
L(blend_tbl):
.word 320f - L(blend_tbl) + CONFIG_THUMB
@ -472,33 +476,29 @@ L(blend_tbl):
add r12, r0, r1
lsl r1, r1, #1
4:
vld1.32 {d2[]}, [r5], r3
vld1.32 {d1[]}, [r2], r3
vld1.32 {d0[]}, [r0]
vld1.u8 {d2}, [r5, :64]!
vld1.u8 {d1}, [r2, :64]!
vld1.32 {d0[]}, [r0, :32]
subs r4, r4, #2
vld1.32 {d2[1]}, [r5], r3
vld1.32 {d1[1]}, [r2], r3
vld1.32 {d0[1]}, [r12]
vld1.32 {d0[1]}, [r12, :32]
vsub.i8 d3, d22, d2
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d3
vrshrn.i16 d20, q8, #6
vst1.32 {d20[0]}, [r0], r1
vst1.32 {d20[1]}, [r12], r1
vst1.32 {d20[0]}, [r0, :32], r1
vst1.32 {d20[1]}, [r12, :32], r1
bgt 4b
pop {r4-r8,pc}
pop {r4-r5,pc}
80:
vmov.i8 d16, #64
add r12, r0, r1
lsl r1, r1, #1
8:
vld1.u8 {d2}, [r5], r3
vld1.u8 {d4}, [r2], r3
vld1.u8 {d0}, [r0]
vld1.u8 {q1}, [r5, :128]!
vld1.u8 {q2}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
vsub.i8 d17, d16, d2
vld1.u8 {d3}, [r5], r3
vld1.u8 {d5}, [r2], r3
vld1.u8 {d1}, [r12]
vld1.u8 {d1}, [r12, :64]
subs r4, r4, #2
vsub.i8 d18, d16, d3
vmull.u8 q3, d2, d4
@ -507,47 +507,44 @@ L(blend_tbl):
vmlal.u8 q10, d1, d18
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.u8 {d22}, [r0], r1
vst1.u8 {d23}, [r12], r1
vst1.u8 {d22}, [r0, :64], r1
vst1.u8 {d23}, [r12, :64], r1
bgt 8b
pop {r4-r8,pc}
pop {r4-r5,pc}
160:
vmov.i8 q12, #64
add r12, r0, r1
lsl r1, r1, #1
16:
vld1.u8 {q2}, [r5], r3
vld1.u8 {q1}, [r2], r3
vld1.u8 {q0}, [r0]
vld1.u8 {q1, q2}, [r5, :128]!
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0}, [r0, :128]
subs r4, r4, #2
vsub.i8 q11, q12, q2
vld1.u8 {q15}, [r5], r3
vld1.u8 {q14}, [r2], r3
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d22
vmull.u8 q8, d3, d5
vmlal.u8 q8, d1, d23
vsub.i8 q11, q12, q15
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d28, d30
vmlal.u8 q3, d26, d22
vmull.u8 q8, d29, d31
vmlal.u8 q8, d27, d23
vsub.i8 q15, q12, q1
vld1.u8 {q13}, [r12, :128]
vmull.u8 q3, d16, d2
vmlal.u8 q3, d0, d30
vmull.u8 q14, d17, d3
vmlal.u8 q14, d1, d31
vsub.i8 q15, q12, q2
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
vst1.u8 {q9}, [r0], r1
vst1.u8 {q10}, [r12], r1
vrshrn.i16 d21, q14, #6
vmull.u8 q3, d18, d4
vmlal.u8 q3, d26, d30
vmull.u8 q14, d19, d5
vmlal.u8 q14, d27, d31
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q14, #6
vst1.u8 {q10}, [r0, :128], r1
vst1.u8 {q11}, [r12, :128], r1
bgt 16b
pop {r4-r8,pc}
pop {r4-r5,pc}
320:
vmov.i8 q10, #64
32:
vld1.u8 {q2, q3}, [r5], r3
vld1.u8 {q8, q9}, [r2], r3
vld1.u8 {q0, q1}, [r0]
vld1.u8 {q2, q3}, [r5, :128]!
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0, q1}, [r0, :128]
subs r4, r4, #1
vsub.i8 q11, q10, q2
vmull.u8 q15, d16, d4
@ -563,9 +560,9 @@ L(blend_tbl):
vmlal.u8 q14, d3, d23
vrshrn.i16 d26, q15, #6
vrshrn.i16 d27, q14, #6
vst1.u8 {q12, q13}, [r0], r1
vst1.u8 {q12, q13}, [r0, :128], r1
bgt 32b
pop {r4-r8,pc}
pop {r4-r5,pc}
endfunc
function blend_h_8bpc_neon, export=1
@ -580,6 +577,7 @@ function blend_h_8bpc_neon, export=1
ldr r6, [r7, r6, lsl #2]
add r7, r7, r6
bx r7
.align 2
L(blend_h_tbl):
.word 1280f - L(blend_h_tbl) + CONFIG_THUMB
@ -595,19 +593,18 @@ L(blend_h_tbl):
add r12, r0, r1
lsl r1, r1, #1
2:
vld1.16 {d2[], d3[]}, [r5]!
vld1.16 {d1[]}, [r2], r3
vld1.16 {d2[], d3[]}, [r5, :16]!
vld1.32 {d1[0]}, [r2, :32]!
subs r4, r4, #2
vld1.16 {d0[]}, [r0]
vld1.16 {d0[]}, [r0, :16]
vzip.8 d2, d3
vld1.16 {d1[1]}, [r2], r3
vsub.i8 d4, d22, d2
vld1.16 {d0[1]}, [r12]
vld1.16 {d0[1]}, [r12, :16]
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d4
vrshrn.i16 d20, q8, #6
vst1.16 {d20[0]}, [r0], r1
vst1.16 {d20[1]}, [r12], r1
vst1.16 {d20[0]}, [r0, :16], r1
vst1.16 {d20[1]}, [r12, :16], r1
bgt 2b
pop {r4-r8,pc}
40:
@ -615,74 +612,66 @@ L(blend_h_tbl):
add r12, r0, r1
lsl r1, r1, #1
4:
vld1.u8 {d2[]}, [r5]!
vld1.32 {d1[]}, [r2], r3
vld2.u8 {d2[], d3[]}, [r5, :16]!
vld1.u8 {d1}, [r2, :64]!
subs r4, r4, #2
vld1.u8 {d6[]}, [r5]!
vld1.32 {d1[1]}, [r2], r3
vext.u8 d2, d2, d6, #4
vld1.32 {d0[]}, [r0]
vsub.i8 d3, d22, d2
vld1.32 {d0[1]}, [r12]
vext.u8 d2, d2, d3, #4
vld1.32 {d0[]}, [r0, :32]
vsub.i8 d6, d22, d2
vld1.32 {d0[1]}, [r12, :32]
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d3
vmlal.u8 q8, d0, d6
vrshrn.i16 d20, q8, #6
vst1.32 {d20[0]}, [r0], r1
vst1.32 {d20[1]}, [r12], r1
vst1.32 {d20[0]}, [r0, :32], r1
vst1.32 {d20[1]}, [r12, :32], r1
bgt 4b
pop {r4-r8,pc}
80:
vmov.i8 d16, #64
vmov.i8 q8, #64
add r12, r0, r1
lsl r1, r1, #1
8:
vld1.u8 {d2[]}, [r5]!
vld1.u8 {d4}, [r2], r3
vld1.u8 {d0}, [r0]
vsub.i8 d17, d16, d2
vld1.u8 {d3[]}, [r5]!
vld1.u8 {d5}, [r2], r3
vld1.u8 {d1}, [r12]
vld2.u8 {d2[], d3[]}, [r5, :16]!
vld1.u8 {d4, d5}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
vsub.i8 q9, q8, q1
vld1.u8 {d1}, [r12, :64]
subs r4, r4, #2
vsub.i8 d18, d16, d3
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d17
vmlal.u8 q3, d0, d18
vmull.u8 q10, d3, d5
vmlal.u8 q10, d1, d18
vmlal.u8 q10, d1, d19
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.u8 {d22}, [r0], r1
vst1.u8 {d23}, [r12], r1
vst1.u8 {d22}, [r0, :64], r1
vst1.u8 {d23}, [r12, :64], r1
bgt 8b
pop {r4-r8,pc}
160:
vmov.i8 d24, #64
vmov.i8 q12, #64
add r12, r0, r1
lsl r1, r1, #1
16:
vld1.u8 {d4[]}, [r5]!
vld1.u8 {q1}, [r2], r3
vsub.i8 d5, d24, d4
vld1.u8 {q0}, [r0]
vld2.u8 {d28[], d29[]}, [r5, :16]!
vld1.u8 {d2, d3, d4, d5}, [r2, :128]!
vsub.i8 q15, q12, q14
vld1.u8 {q0}, [r0, :128]
subs r4, r4, #2
vld1.u8 {d30[]}, [r5]!
vld1.u8 {q14}, [r2], r3
vsub.i8 d31, d24, d30
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d5
vmull.u8 q8, d3, d4
vmlal.u8 q8, d1, d5
vld1.u8 {q13}, [r12, :128]
vmull.u8 q3, d2, d28
vmlal.u8 q3, d0, d30
vmull.u8 q8, d3, d28
vmlal.u8 q8, d1, d30
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d28, d30
vmull.u8 q3, d4, d29
vmlal.u8 q3, d26, d31
vmull.u8 q8, d29, d30
vmull.u8 q8, d5, d29
vmlal.u8 q8, d27, d31
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
vst1.u8 {q9}, [r0], r1
vst1.u8 {q10}, [r12], r1
vst1.u8 {q9}, [r0, :128], r1
vst1.u8 {q10}, [r12, :128], r1
bgt 16b
pop {r4-r8,pc}
320:
@ -695,8 +684,8 @@ L(blend_h_tbl):
vsub.i8 d7, d20, d6
mov r8, r3
32:
vld1.u8 {q8, q9}, [r2]!
vld1.u8 {q0, q1}, [r0]
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0, q1}, [r0, :128]
vmull.u8 q15, d16, d6
vmlal.u8 q15, d0, d7
vmull.u8 q14, d17, d6
@ -709,7 +698,7 @@ L(blend_h_tbl):
vmlal.u8 q14, d3, d7
vrshrn.i16 d2, q15, #6
vrshrn.i16 d3, q14, #6
vst1.u8 {q0, q1}, [r0]!
vst1.u8 {q0, q1}, [r0, :128]!
subs r8, r8, #32
bgt 32b
add r0, r0, r1
@ -719,16 +708,17 @@ L(blend_h_tbl):
endfunc
function blend_v_8bpc_neon, export=1
push {r4-r8,lr}
ldr r4, [sp, #24]
push {r4-r5,lr}
ldr r4, [sp, #12]
movrel r5, X(obmc_masks)
add r5, r5, r3
clz r8, r3
adr r7, L(blend_v_tbl)
sub r8, r8, #26
ldr r8, [r7, r8, lsl #2]
add r7, r7, r8
bx r7
clz lr, r3
adr r3, L(blend_v_tbl)
sub lr, lr, #26
ldr lr, [r3, lr, lsl #2]
add r3, r3, lr
bx r3
.align 2
L(blend_v_tbl):
.word 320f - L(blend_v_tbl) + CONFIG_THUMB
@ -744,54 +734,53 @@ L(blend_v_tbl):
lsl r1, r1, #1
vsub.i8 d3, d22, d2
2:
vld1.8 {d1[]}, [r2], r3
vld1.16 {d1[0]}, [r2, :16]!
vld1.8 {d0[]}, [r0]
subs r4, r4, #2
vld1.8 {d1[1]}, [r2], r3
vld1.8 {d1[1]}, [r2]
vld1.8 {d0[1]}, [r12]
vmull.u8 q2, d1, d2
vmlal.u8 q2, d0, d3
vrshrn.i16 d6, q2, #6
add r2, r2, #2
vst1.8 {d6[0]}, [r0], r1
vst1.8 {d6[1]}, [r12], r1
bgt 2b
pop {r4-r8,pc}
pop {r4-r5,pc}
40:
vmov.i8 d22, #64
vld1.32 {d4[]}, [r5]
vld1.32 {d4[]}, [r5, :32]
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d5, d22, d4
sub r1, r1, #3
4:
vld1.32 {d2[]}, [r2], r3
vld1.32 {d0[]}, [r0]
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d0[1]}, [r12]
vld1.u8 {d2}, [r2, :64]!
vld1.32 {d0[]}, [r0, :32]
vld1.32 {d0[1]}, [r12, :32]
subs r4, r4, #2
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d5
vrshrn.i16 d20, q3, #6
vst1.16 {d20[0]}, [r0]!
vst1.16 {d20[2]}, [r12]!
vst1.16 {d20[0]}, [r0, :16]!
vst1.16 {d20[2]}, [r12, :16]!
vst1.8 {d20[2]}, [r0]!
vst1.8 {d20[6]}, [r12]!
add r0, r0, r1
add r12, r12, r1
bgt 4b
pop {r4-r8,pc}
pop {r4-r5,pc}
80:
vmov.i8 d16, #64
vld1.u8 {d2}, [r5]
vld1.u8 {d2}, [r5, :64]
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d17, d16, d2
sub r1, r1, #6
8:
vld1.u8 {d4}, [r2], r3
vld1.u8 {d0}, [r0]
vld1.u8 {d5}, [r2], r3
vld1.u8 {d1}, [r12]
vld1.u8 {d4, d5}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
vld1.u8 {d1}, [r12, :64]
subs r4, r4, #2
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d17
@ -799,55 +788,54 @@ L(blend_v_tbl):
vmlal.u8 q10, d1, d17
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.32 {d22[0]}, [r0]!
vst1.32 {d23[0]}, [r12]!
vst1.16 {d22[2]}, [r0]!
vst1.16 {d23[2]}, [r12]!
vst1.32 {d22[0]}, [r0, :32]!
vst1.32 {d23[0]}, [r12, :32]!
vst1.16 {d22[2]}, [r0, :16]!
vst1.16 {d23[2]}, [r12, :16]!
add r0, r0, r1
add r12, r12, r1
bgt 8b
pop {r4-r8,pc}
pop {r4-r5,pc}
160:
vmov.i8 q12, #64
vld1.u8 {q2}, [r5]
vld1.u8 {q14}, [r5, :128]
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 q11, q12, q2
vsub.i8 q11, q12, q14
sub r1, r1, #12
16:
vld1.u8 {q1}, [r2], r3
vld1.u8 {q0}, [r0]
vld1.u8 {q1, q2}, [r2, :128]!
vld1.u8 {q0}, [r0, :128]
subs r4, r4, #2
vld1.u8 {q14}, [r2], r3
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d4
vld1.u8 {q13}, [r12, :128]
vmull.u8 q3, d2, d28
vmlal.u8 q3, d0, d22
vmull.u8 q8, d3, d5
vmull.u8 q8, d3, d29
vmlal.u8 q8, d1, d23
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d28, d4
vmull.u8 q3, d4, d28
vmlal.u8 q3, d26, d22
vmull.u8 q8, d29, d5
vmull.u8 q8, d5, d29
vmlal.u8 q8, d27, d23
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
vst1.u8 {d18}, [r0]!
vst1.u8 {d20}, [r12]!
vst1.32 {d19[0]}, [r0]!
vst1.32 {d21[0]}, [r12]!
vst1.u8 {d18}, [r0, :64]!
vst1.u8 {d20}, [r12, :64]!
vst1.32 {d19[0]}, [r0, :32]!
vst1.32 {d21[0]}, [r12, :32]!
add r0, r0, r1
add r12, r12, r1
bgt 16b
pop {r4-r8,pc}
pop {r4-r5,pc}
320:
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5]
vld1.u8 {q2, q3}, [r5, :128]
vsub.i8 q11, q10, q2
vsub.i8 q12, q10, q3
32:
vld1.u8 {q8, q9}, [r2], r3
vld1.u8 {q0, q1}, [r0]
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0, q1}, [r0, :128]
subs r4, r4, #1
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22
@ -858,9 +846,9 @@ L(blend_v_tbl):
vmull.u8 q15, d18, d6
vmlal.u8 q15, d2, d24
vrshrn.i16 d2, q15, #6
vst1.u8 {d0, d1, d2}, [r0], r1
vst1.u8 {d0, d1, d2}, [r0, :64], r1
bgt 32b
pop {r4-r8,pc}
pop {r4-r5,pc}
endfunc

View File

@ -98,7 +98,8 @@ const idct64_coeffs, align=4
endconst
const iadst4_coeffs, align=4
.short 1321, 3803, 2482, 3344, 3344*8
// .h[4-5] can be interpreted as .s[2]
.short 1321, 3803, 2482, 3344, 3344, 0
endconst
const iadst8_coeffs, align=4
@ -147,6 +148,27 @@ endconst
.endif
.endm
.macro saddl_sz d0, d1, s0, s1, sz
saddl \d0\().4s, \s0\().4h, \s1\().4h
.ifc \sz, .8h
saddl2 \d1\().4s, \s0\().8h, \s1\().8h
.endif
.endm
.macro ssubl_sz d0, d1, s0, s1, sz
ssubl \d0\().4s, \s0\().4h, \s1\().4h
.ifc \sz, .8h
ssubl2 \d1\().4s, \s0\().8h, \s1\().8h
.endif
.endm
.macro mul_4s_sz d0, d1, s0, s1, c, sz
mul \d0\().4s, \s0\().4s, \c
.ifc \sz, .8h
mul \d1\().4s, \s1\().4s, \c
.endif
.endm
.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
sqrdmulh \r0\sz, \r0\sz, \c
sqrdmulh \r1\sz, \r1\sz, \c
@ -499,23 +521,24 @@ endfunc
movrel x16, iadst4_coeffs
ld1 {v0.8h}, [x16]
sub v3.4h, v16.4h, v18.4h
ssubl v3.4s, v16.4h, v18.4h
smull v4.4s, v16.4h, v0.h[0]
smlal v4.4s, v18.4h, v0.h[1]
smlal v4.4s, v19.4h, v0.h[2]
smull v7.4s, v17.4h, v0.h[3]
add v3.4h, v3.4h, v19.4h
saddw v3.4s, v3.4s, v19.4h
smull v5.4s, v16.4h, v0.h[2]
smlsl v5.4s, v18.4h, v0.h[0]
smlsl v5.4s, v19.4h, v0.h[1]
add \o3\().4s, v4.4s, v5.4s
sqrdmulh \o2\().4h, v3.4h, v0.h[4]
mul \o2\().4s, v3.4s, v0.s[2]
add \o0\().4s, v4.4s, v7.4s
add \o1\().4s, v5.4s, v7.4s
sub \o3\().4s, \o3\().4s, v7.4s
rshrn \o0\().4h, \o0\().4s, #12
rshrn \o2\().4h, \o2\().4s, #12
rshrn \o1\().4h, \o1\().4s, #12
rshrn \o3\().4h, \o3\().4s, #12
.endm
@ -534,14 +557,16 @@ endfunc
movrel x16, iadst4_coeffs
ld1 {v0.8h}, [x16]
sub v3.8h, v16.8h, v18.8h
ssubl v2.4s, v16.4h, v18.4h
ssubl2 v3.4s, v16.8h, v18.8h
smull v4.4s, v16.4h, v0.h[0]
smlal v4.4s, v18.4h, v0.h[1]
smlal v4.4s, v19.4h, v0.h[2]
smull2 v5.4s, v16.8h, v0.h[0]
smlal2 v5.4s, v18.8h, v0.h[1]
smlal2 v5.4s, v19.8h, v0.h[2]
add v3.8h, v3.8h, v19.8h
saddw v2.4s, v2.4s, v19.4h
saddw2 v3.4s, v3.4s, v19.8h
smull v6.4s, v16.4h, v0.h[2]
smlsl v6.4s, v18.4h, v0.h[0]
smlsl v6.4s, v19.4h, v0.h[1]
@ -549,7 +574,8 @@ endfunc
smlsl2 v7.4s, v18.8h, v0.h[0]
smlsl2 v7.4s, v19.8h, v0.h[1]
sqrdmulh v18.8h, v3.8h, v0.h[4]
mul v18.4s, v2.4s, v0.s[2]
mul v19.4s, v3.4s, v0.s[2]
smull v2.4s, v17.4h, v0.h[3]
smull2 v3.4s, v17.8h, v0.h[3]
@ -566,6 +592,9 @@ endfunc
sub v4.4s, v4.4s, v2.4s // out3
sub v5.4s, v5.4s, v3.4s
rshrn v18.4h, v18.4s, #12
rshrn2 v18.8h, v19.4s, #12
rshrn \o0\().4h, v16.4s, #12
rshrn2 \o0\().8h, v17.4s, #12
@ -836,16 +865,25 @@ endfunc
sqsub v5\sz, v5\sz, v19\sz // t7
sqneg \o1\()\sz, \o1\()\sz // out1
add v6\sz, v2\sz, v4\sz
sub v7\sz, v2\sz, v4\sz
add v4\sz, v3\sz, v5\sz
sub v5\sz, v3\sz, v5\sz
sqrdmulh \o3\sz, v6\sz, v1.h[1] // out3
sqrdmulh \o4\sz, v7\sz, v1.h[1] // out4
sqrdmulh \o2\sz, v4\sz, v1.h[1] // out2
sqrdmulh \o5\sz, v5\sz, v1.h[1] // out5
neg \o3\()\sz, \o3\()\sz // out3
neg \o5\()\sz, \o5\()\sz // out5
movi v0.4s, #2896>>4
saddl_sz v18, v19, v2, v4, \sz // -> out3 (v19 or v20)
ssubl_sz v6, v7, v2, v4, \sz // -> out4 (v20 or v19)
ssubl_sz v20, v21, v3, v5, \sz // -> out5 (v21 or v18)
saddl_sz v4, v5, v3, v5, \sz // -> out2 (v18 or v21)
mul_4s_sz v18, v19, v18, v19, v0.s[0], \sz
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
mul_4s_sz v20, v21, v20, v21, v0.s[0], \sz
mul_4s_sz v4, v5, v4, v5, v0.s[0], \sz
rshrn_sz v2, v18, v19, #8, \sz // out3
rshrn_sz v3, v20, v21, #8, \sz // out5
rshrn_sz \o2, v4, v5, #8, \sz // out2 (v18 or v21)
rshrn_sz \o4, v6, v7, #8, \sz // out4 (v20 or v19)
sqneg \o3\()\sz, v2\sz // out3
sqneg \o5\()\sz, v3\sz // out5
.endm
function inv_adst_8x8_neon
@ -1272,28 +1310,47 @@ endfunc
sqsub v23\sz, v25\sz, v23\sz // t7
sqneg \o3\sz, \o3\sz // out3
sqsub v24\sz, v2\sz, v21\sz // -> out8
sqadd v2\sz, v2\sz, v21\sz // -> out7
sqadd v21\sz, v26\sz, v3\sz // -> out5
sqsub v26\sz, v26\sz, v3\sz // -> out10
sqadd v3\sz, v27\sz, v20\sz // -> out6
sqsub v25\sz, v27\sz, v20\sz // -> out9
sqadd v20\sz, v22\sz, v23\sz // -> out4
sqsub v27\sz, v22\sz, v23\sz // -> out11
movi v0.4s, #2896>>4
sqrdmulh v2\sz, v2\sz, v0.h[1] // out7
sqrdmulh v4\sz, v21\sz, v0.h[1] // out5
sqrdmulh v5\sz, v25\sz, v0.h[1] // out9
sqrdmulh v6\sz, v27\sz, v0.h[1] // out11
sqrdmulh \o6\sz, v3\sz, v0.h[1] // out6
sqrdmulh \o8\sz, v24\sz, v0.h[1] // out8
sqrdmulh \o10\sz, v26\sz, v0.h[1] // out10
sqrdmulh \o4\sz, v20\sz, v0.h[1] // out4
ssubl_sz v24, v25, v2, v21, \sz // -> out8 (v24 or v23)
saddl_sz v4, v5, v2, v21, \sz // -> out7 (v23 or v24)
saddl_sz v6, v7, v26, v3, \sz // -> out5 (v21 or v26)
ssubl_sz v2, v3, v26, v3, \sz // -> out10 (v26 or v21)
neg \o7\sz, v2\sz // out7
neg \o5\sz, v4\sz // out5
neg \o9\sz, v5\sz // out9
neg \o11\sz, v6\sz // out11
mul_4s_sz v24, v25, v24, v25, v0.s[0], \sz
mul_4s_sz v4, v5, v4, v5, v0.s[0], \sz
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
mul_4s_sz v2, v3, v2, v3, v0.s[0], \sz
rshrn_sz v24, v24, v25, #8, \sz // out8
rshrn_sz v4, v4, v5, #8, \sz // out7
rshrn_sz v5, v6, v7, #8, \sz // out5
rshrn_sz v26, v2, v3, #8, \sz // out10
saddl_sz v2, v3, v22, v23, \sz // -> out4 (v20 or v27)
ssubl_sz v6, v7, v22, v23, \sz // -> out11 (v27 or v20)
saddl_sz v22, v23, v27, v20, \sz // -> out6 (v22 or v25)
ssubl_sz v21, v25, v27, v20, \sz // -> out9 (v25 or v22)
mul_4s_sz v2, v3, v2, v3, v0.s[0], \sz
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
mul_4s_sz v22, v23, v22, v23, v0.s[0], \sz
mul_4s_sz v21, v25, v21, v25, v0.s[0], \sz
rshrn_sz \o4, v2, v3, #8, \sz // out4
rshrn_sz v6, v6, v7, #8, \sz // out11
rshrn_sz v7, v21, v25, #8, \sz // out9
rshrn_sz \o6, v22, v23, #8, \sz // out6
.ifc \o8, v23
mov \o8\szb, v24\szb
mov \o10\szb, v26\szb
.endif
sqneg \o7\sz, v4\sz // out7
sqneg \o5\sz, v5\sz // out5
sqneg \o11\sz, v6\sz // out11
sqneg \o9\sz, v7\sz // out9
.endm
function inv_adst_8x16_neon

View File

@ -234,6 +234,635 @@ bidir_fn w_avg
bidir_fn mask
// Macro emitting w_mask_{444,422,420}_8bpc_neon: blends two 16-bit
// intermediate buffers (tmp1/tmp2) into 8-bit dst, deriving a per-pixel
// blend weight from |tmp1 - tmp2| and storing the resulting mask at full
// (\type == 444), half-horizontal (422) or quarter (420) resolution.
// Register use follows dav1d's w_mask signature (assumed — confirm
// against src/mc.h): x0 = dst, x1 = dst stride, x2 = tmp1, x3 = tmp2,
// w4 = width, w5 = height, x6 = mask output, w7 = sign.
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
// Dispatch on log2(width) via the offset table at the end.
clz w8, w4
adr x9, L(w_mask_\type\()_tbl)
sub w8, w8, #24
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
// v0 = 6903: saturation bound applied to |tmp1 - tmp2| below.
mov w10, #6903
dup v0.8h, w10
.if \type == 444
// v1 = 64: mask is stored as 64 - m.
movi v1.16b, #64
.elseif \type == 422
// v3 = 129 - sign: bias for the horizontally-pairwise-summed mask.
dup v2.8b, w7
movi v3.8b, #129
sub v3.8b, v3.8b, v2.8b
.elseif \type == 420
// v3 = 256 - sign: bias for the 2x2-summed mask.
dup v2.8h, w7
movi v3.8h, #1, lsl #8
sub v3.8h, v3.8h, v2.8h
.endif
// Process two output rows per iteration: x0/x12 point at even/odd rows.
add x12, x0, x1
lsl x1, x1, #1
br x9
// Width 4: four rows (16 pixels) per iteration.
4:
ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
subs w5, w5, #4
sub v16.8h, v6.8h, v4.8h
sub v17.8h, v7.8h, v5.8h
// m = min(|tmp1 - tmp2|, 6903) >> 8 (via saturating subtract-from-bound).
sabd v18.8h, v4.8h, v6.8h
sabd v19.8h, v5.8h, v7.8h
uqsub v18.8h, v0.8h, v18.8h
uqsub v19.8h, v0.8h, v19.8h
ushr v18.8h, v18.8h, #8
ushr v19.8h, v19.8h, #8
// dst = tmp1 + ((tmp2 - tmp1) * m) via sqdmulh on (m << 9).
shl v20.8h, v18.8h, #9
shl v21.8h, v19.8h, #9
sqdmulh v20.8h, v20.8h, v16.8h
sqdmulh v21.8h, v21.8h, v17.8h
add v20.8h, v20.8h, v4.8h
add v21.8h, v21.8h, v5.8h
sqrshrun v22.8b, v20.8h, #4
sqrshrun v23.8b, v21.8h, #4
.if \type == 444
xtn v18.8b, v18.8h
xtn2 v18.16b, v19.8h
sub v18.16b, v1.16b, v18.16b
st1 {v18.16b}, [x6], #16
.elseif \type == 422
// Horizontal pairwise sum, then halving subtract from the bias.
addp v18.8h, v18.8h, v19.8h
xtn v18.8b, v18.8h
uhsub v18.8b, v3.8b, v18.8b
st1 {v18.8b}, [x6], #8
.elseif \type == 420
// 2x2 sum across the four 4-pixel rows, then round-shift by 2.
trn1 v24.2d, v18.2d, v19.2d
trn2 v25.2d, v18.2d, v19.2d
add v24.8h, v24.8h, v25.8h
addp v18.8h, v24.8h, v24.8h
sub v18.4h, v3.4h, v18.4h
rshrn v18.8b, v18.8h, #2
st1 {v18.s}[0], [x6], #4
.endif
st1 {v22.s}[0], [x0], x1
st1 {v22.s}[1], [x12], x1
st1 {v23.s}[0], [x0], x1
st1 {v23.s}[1], [x12], x1
b.gt 4b
ret
// Width 8: two rows per iteration; same arithmetic as the width-4 path.
8:
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v6.8h, v7.8h}, [x3], #32
subs w5, w5, #2
sub v16.8h, v6.8h, v4.8h
sub v17.8h, v7.8h, v5.8h
sabd v18.8h, v4.8h, v6.8h
sabd v19.8h, v5.8h, v7.8h
uqsub v18.8h, v0.8h, v18.8h
uqsub v19.8h, v0.8h, v19.8h
ushr v18.8h, v18.8h, #8
ushr v19.8h, v19.8h, #8
shl v20.8h, v18.8h, #9
shl v21.8h, v19.8h, #9
sqdmulh v20.8h, v20.8h, v16.8h
sqdmulh v21.8h, v21.8h, v17.8h
add v20.8h, v20.8h, v4.8h
add v21.8h, v21.8h, v5.8h
sqrshrun v22.8b, v20.8h, #4
sqrshrun v23.8b, v21.8h, #4
.if \type == 444
xtn v18.8b, v18.8h
xtn2 v18.16b, v19.8h
sub v18.16b, v1.16b, v18.16b
st1 {v18.16b}, [x6], #16
.elseif \type == 422
addp v18.8h, v18.8h, v19.8h
xtn v18.8b, v18.8h
uhsub v18.8b, v3.8b, v18.8b
st1 {v18.8b}, [x6], #8
.elseif \type == 420
// Here the two vectors are the two rows, so a plain add is the 2x1 sum.
add v18.8h, v18.8h, v19.8h
addp v18.8h, v18.8h, v18.8h
sub v18.4h, v3.4h, v18.4h
rshrn v18.8b, v18.8h, #2
st1 {v18.s}[0], [x6], #4
.endif
st1 {v22.8b}, [x0], x1
st1 {v23.8b}, [x12], x1
b.gt 8b
ret
// Widths 16..128 share one two-row loop; x7/x9 walk the second row of
// tmp1/tmp2, x10 the second mask row (444/422).
1280:
640:
320:
160:
mov w11, w4
sub x1, x1, w4, uxtw
.if \type == 444
add x10, x6, w4, uxtw
.elseif \type == 422
add x10, x6, x11, lsr #1
.endif
add x9, x3, w4, uxtw #1
add x7, x2, w4, uxtw #1
161:
mov w8, w4
16:
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v6.8h, v7.8h}, [x3], #32
ld1 {v16.8h, v17.8h}, [x7], #32
ld1 {v18.8h, v19.8h}, [x9], #32
subs w8, w8, #16
sub v6.8h, v6.8h, v4.8h
sub v7.8h, v7.8h, v5.8h
sub v18.8h, v18.8h, v16.8h
sub v19.8h, v19.8h, v17.8h
abs v20.8h, v6.8h
abs v21.8h, v7.8h
abs v22.8h, v18.8h
abs v23.8h, v19.8h
uqsub v20.8h, v0.8h, v20.8h
uqsub v21.8h, v0.8h, v21.8h
uqsub v22.8h, v0.8h, v22.8h
uqsub v23.8h, v0.8h, v23.8h
ushr v20.8h, v20.8h, #8
ushr v21.8h, v21.8h, #8
ushr v22.8h, v22.8h, #8
ushr v23.8h, v23.8h, #8
shl v24.8h, v20.8h, #9
shl v25.8h, v21.8h, #9
shl v26.8h, v22.8h, #9
shl v27.8h, v23.8h, #9
sqdmulh v24.8h, v24.8h, v6.8h
sqdmulh v25.8h, v25.8h, v7.8h
sqdmulh v26.8h, v26.8h, v18.8h
sqdmulh v27.8h, v27.8h, v19.8h
add v24.8h, v24.8h, v4.8h
add v25.8h, v25.8h, v5.8h
add v26.8h, v26.8h, v16.8h
add v27.8h, v27.8h, v17.8h
sqrshrun v24.8b, v24.8h, #4
sqrshrun v25.8b, v25.8h, #4
sqrshrun v26.8b, v26.8h, #4
sqrshrun v27.8b, v27.8h, #4
.if \type == 444
xtn v20.8b, v20.8h
xtn2 v20.16b, v21.8h
xtn v21.8b, v22.8h
xtn2 v21.16b, v23.8h
sub v20.16b, v1.16b, v20.16b
sub v21.16b, v1.16b, v21.16b
st1 {v20.16b}, [x6], #16
st1 {v21.16b}, [x10], #16
.elseif \type == 422
addp v20.8h, v20.8h, v21.8h
addp v21.8h, v22.8h, v23.8h
xtn v20.8b, v20.8h
xtn v21.8b, v21.8h
uhsub v20.8b, v3.8b, v20.8b
uhsub v21.8b, v3.8b, v21.8b
st1 {v20.8b}, [x6], #8
st1 {v21.8b}, [x10], #8
.elseif \type == 420
add v20.8h, v20.8h, v22.8h
add v21.8h, v21.8h, v23.8h
addp v20.8h, v20.8h, v21.8h
sub v20.8h, v3.8h, v20.8h
rshrn v20.8b, v20.8h, #2
st1 {v20.8b}, [x6], #8
.endif
st1 {v24.8b, v25.8b}, [x0], #16
st1 {v26.8b, v27.8b}, [x12], #16
b.gt 16b
// Advance all row pointers past the second row processed above.
subs w5, w5, #2
add x2, x2, w4, uxtw #1
add x3, x3, w4, uxtw #1
add x7, x7, w4, uxtw #1
add x9, x9, w4, uxtw #1
.if \type == 444
add x6, x6, w4, uxtw
add x10, x10, w4, uxtw
.elseif \type == 422
add x6, x6, x11, lsr #1
add x10, x10, x11, lsr #1
.endif
add x0, x0, x1
add x12, x12, x1
b.gt 161b
ret
// Offset table, indexed by clz(width) - 24 (i.e. by log2(width)).
L(w_mask_\type\()_tbl):
.hword L(w_mask_\type\()_tbl) - 1280b
.hword L(w_mask_\type\()_tbl) - 640b
.hword L(w_mask_\type\()_tbl) - 320b
.hword L(w_mask_\type\()_tbl) - 160b
.hword L(w_mask_\type\()_tbl) - 8b
.hword L(w_mask_\type\()_tbl) - 4b
endfunc
.endm
// Instantiate the three mask-resolution variants (4:4:4, 4:2:2, 4:2:0).
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420
// blend_8bpc_neon: dst = (tmp*mask + dst*(64 - mask) + 32) >> 6, with a
// per-pixel 6-bit mask. Register use (assumed from dav1d's blend
// signature — confirm against src/mc.h): x0 = dst, x1 = dst stride,
// x2 = tmp, w3 = width, w4 = height, x5 = mask. Two rows per iteration
// (x0 = even rows, x8 = odd rows).
function blend_8bpc_neon, export=1
// Dispatch on log2(width) via the offset table at the end.
adr x6, L(blend_tbl)
clz w3, w3
sub w3, w3, #26
ldrh w3, [x6, x3, lsl #1]
sub x6, x6, w3, uxtw
movi v4.16b, #64 // v4 = 64: weight complement
add x8, x0, x1
lsl w1, w1, #1
br x6
// Width 4: two rows per iteration.
4:
ld1 {v2.d}[0], [x5], #8
ld1 {v1.d}[0], [x2], #8
ld1 {v0.s}[0], [x0]
subs w4, w4, #2
ld1 {v0.s}[1], [x8]
sub v3.8b, v4.8b, v2.8b
umull v5.8h, v1.8b, v2.8b
umlal v5.8h, v0.8b, v3.8b
rshrn v6.8b, v5.8h, #6
st1 {v6.s}[0], [x0], x1
st1 {v6.s}[1], [x8], x1
b.gt 4b
ret
// Width 8: two rows per iteration.
8:
ld1 {v2.2d}, [x5], #16
ld1 {v1.2d}, [x2], #16
ld1 {v0.d}[0], [x0]
ld1 {v0.d}[1], [x8]
sub v3.16b, v4.16b, v2.16b
subs w4, w4, #2
umull v5.8h, v1.8b, v2.8b
umlal v5.8h, v0.8b, v3.8b
umull2 v6.8h, v1.16b, v2.16b
umlal2 v6.8h, v0.16b, v3.16b
rshrn v7.8b, v5.8h, #6
rshrn2 v7.16b, v6.8h, #6
st1 {v7.d}[0], [x0], x1
st1 {v7.d}[1], [x8], x1
b.gt 8b
ret
// Width 16: two rows per iteration.
16:
ld1 {v1.2d, v2.2d}, [x5], #32
ld1 {v5.2d, v6.2d}, [x2], #32
ld1 {v0.2d}, [x0]
subs w4, w4, #2
sub v7.16b, v4.16b, v1.16b
sub v20.16b, v4.16b, v2.16b
ld1 {v3.2d}, [x8]
umull v16.8h, v5.8b, v1.8b
umlal v16.8h, v0.8b, v7.8b
umull2 v17.8h, v5.16b, v1.16b
umlal2 v17.8h, v0.16b, v7.16b
umull v21.8h, v6.8b, v2.8b
umlal v21.8h, v3.8b, v20.8b
umull2 v22.8h, v6.16b, v2.16b
umlal2 v22.8h, v3.16b, v20.16b
rshrn v18.8b, v16.8h, #6
rshrn2 v18.16b, v17.8h, #6
rshrn v19.8b, v21.8h, #6
rshrn2 v19.16b, v22.8h, #6
st1 {v18.2d}, [x0], x1
st1 {v19.2d}, [x8], x1
b.gt 16b
ret
// Width 32: two rows per iteration (four 16-byte mask vectors).
32:
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x5], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x2], #64
ld1 {v20.2d, v21.2d}, [x0]
subs w4, w4, #2
ld1 {v22.2d, v23.2d}, [x8]
sub v5.16b, v4.16b, v0.16b
sub v6.16b, v4.16b, v1.16b
sub v30.16b, v4.16b, v2.16b
sub v31.16b, v4.16b, v3.16b
umull v24.8h, v16.8b, v0.8b
umlal v24.8h, v20.8b, v5.8b
umull2 v26.8h, v16.16b, v0.16b
umlal2 v26.8h, v20.16b, v5.16b
umull v28.8h, v17.8b, v1.8b
umlal v28.8h, v21.8b, v6.8b
umull2 v7.8h, v17.16b, v1.16b
umlal2 v7.8h, v21.16b, v6.16b
umull v27.8h, v18.8b, v2.8b
umlal v27.8h, v22.8b, v30.8b
umull2 v1.8h, v18.16b, v2.16b
umlal2 v1.8h, v22.16b, v30.16b
umull v29.8h, v19.8b, v3.8b
umlal v29.8h, v23.8b, v31.8b
umull2 v21.8h, v19.16b, v3.16b
umlal2 v21.8h, v23.16b, v31.16b
rshrn v24.8b, v24.8h, #6
rshrn2 v24.16b, v26.8h, #6
rshrn v25.8b, v28.8h, #6
rshrn2 v25.16b, v7.8h, #6
rshrn v27.8b, v27.8h, #6
rshrn2 v27.16b, v1.8h, #6
rshrn v28.8b, v29.8h, #6
rshrn2 v28.16b, v21.8h, #6
st1 {v24.2d, v25.2d}, [x0], x1
st1 {v27.2d, v28.2d}, [x8], x1
b.gt 32b
ret
// Offset table, indexed by clz(width) - 26.
L(blend_tbl):
.hword L(blend_tbl) - 32b
.hword L(blend_tbl) - 16b
.hword L(blend_tbl) - 8b
.hword L(blend_tbl) - 4b
endfunc
// blend_h_8bpc_neon: horizontal-edge (top) OBMC blend. Blends tmp into
// dst using the per-row weights from the shared obmc_masks table,
// indexed by height; only height - height/4 rows are written.
// Register use (assumed from dav1d's blend_h signature — confirm
// against src/mc.h): x0 = dst, x1 = dst stride, x2 = tmp, w3 = width,
// w4 = height. Two rows per iteration (x0 = even, x8 = odd rows).
function blend_h_8bpc_neon, export=1
adr x6, L(blend_h_tbl)
movrel x5, X(obmc_masks)
add x5, x5, w4, uxtw // x5 -> mask row for this height
sub w4, w4, w4, lsr #2 // rows to process = h - h/4
clz w7, w3
movi v4.16b, #64 // weight complement
add x8, x0, x1
lsl x1, x1, #1
sub w7, w7, #24
ldrh w7, [x6, x7, lsl #1]
sub x6, x6, w7, uxtw
br x6
// Width 2: two rows per iteration; one mask byte per row.
2:
ld1 {v0.h}[0], [x5], #2
ld1 {v1.s}[0], [x2], #4
subs w4, w4, #2
ld1 {v2.h}[0], [x0]
zip1 v0.8b, v0.8b, v0.8b // duplicate each row weight across pixels
sub v3.8b, v4.8b, v0.8b
ld1 {v2.h}[1], [x8]
umull v5.8h, v1.8b, v0.8b
umlal v5.8h, v2.8b, v3.8b
rshrn v5.8b, v5.8h, #6
st1 {v5.h}[0], [x0], x1
st1 {v5.h}[1], [x8], x1
b.gt 2b
ret
// Width 4: two rows per iteration.
4:
ld2r {v0.8b, v1.8b}, [x5], #2 // broadcast the two row weights
ld1 {v2.2s}, [x2], #8
subs w4, w4, #2
ext v0.8b, v0.8b, v1.8b, #4 // v0 = row0 weight x4 | row1 weight x4
ld1 {v3.s}[0], [x0]
sub v5.8b, v4.8b, v0.8b
ld1 {v3.s}[1], [x8]
umull v6.8h, v2.8b, v0.8b
umlal v6.8h, v3.8b, v5.8b
rshrn v6.8b, v6.8h, #6
st1 {v6.s}[0], [x0], x1
st1 {v6.s}[1], [x8], x1
b.gt 4b
ret
// Width 8: two rows per iteration.
8:
ld2r {v0.16b, v1.16b}, [x5], #2
ld1 {v2.16b}, [x2], #16
ld1 {v3.d}[0], [x0]
ext v0.16b, v0.16b, v1.16b, #8 // row0 weight x8 | row1 weight x8
sub v5.16b, v4.16b, v0.16b
ld1 {v3.d}[1], [x8]
subs w4, w4, #2
umull v6.8h, v0.8b, v2.8b
umlal v6.8h, v3.8b, v5.8b
umull2 v7.8h, v0.16b, v2.16b
umlal2 v7.8h, v3.16b, v5.16b
rshrn v16.8b, v6.8h, #6
rshrn2 v16.16b, v7.8h, #6
st1 {v16.d}[0], [x0], x1
st1 {v16.d}[1], [x8], x1
b.gt 8b
ret
// Width 16: two rows per iteration, one weight vector per row.
16:
ld2r {v0.16b, v1.16b}, [x5], #2
ld1 {v2.16b, v3.16b}, [x2], #32
ld1 {v5.16b}, [x0]
sub v7.16b, v4.16b, v0.16b
sub v16.16b, v4.16b, v1.16b
ld1 {v6.16b}, [x8]
subs w4, w4, #2
umull v17.8h, v0.8b, v2.8b
umlal v17.8h, v5.8b, v7.8b
umull2 v18.8h, v0.16b, v2.16b
umlal2 v18.8h, v5.16b, v7.16b
umull v19.8h, v1.8b, v3.8b
umlal v19.8h, v6.8b, v16.8b
umull2 v20.8h, v1.16b, v3.16b
umlal2 v20.8h, v6.16b, v16.16b
rshrn v21.8b, v17.8h, #6
rshrn2 v21.16b, v18.8h, #6
rshrn v22.8b, v19.8h, #6
rshrn2 v22.16b, v20.8h, #6
st1 {v21.16b}, [x0], x1
st1 {v22.16b}, [x8], x1
b.gt 16b
ret
// Widths 32..128 share one loop; x7 walks tmp for the odd row.
1280:
640:
320:
sub x1, x1, w3, uxtw
add x7, x2, w3, uxtw
321:
ld2r {v0.16b, v1.16b}, [x5], #2
mov w6, w3
sub v20.16b, v4.16b, v0.16b
sub v21.16b, v4.16b, v1.16b
32:
ld1 {v16.16b, v17.16b}, [x2], #32
ld1 {v2.16b, v3.16b}, [x0]
subs w6, w6, #32
umull v23.8h, v0.8b, v16.8b
umlal v23.8h, v2.8b, v20.8b
ld1 {v18.16b, v19.16b}, [x7], #32
umull2 v27.8h, v0.16b, v16.16b
umlal2 v27.8h, v2.16b, v20.16b
ld1 {v6.16b, v7.16b}, [x8]
umull v24.8h, v0.8b, v17.8b
umlal v24.8h, v3.8b, v20.8b
umull2 v28.8h, v0.16b, v17.16b
umlal2 v28.8h, v3.16b, v20.16b
umull v25.8h, v1.8b, v18.8b
umlal v25.8h, v6.8b, v21.8b
umull2 v5.8h, v1.16b, v18.16b
umlal2 v5.8h, v6.16b, v21.16b
rshrn v29.8b, v23.8h, #6
rshrn2 v29.16b, v27.8h, #6
umull v26.8h, v1.8b, v19.8b
umlal v26.8h, v7.8b, v21.8b
umull2 v31.8h, v1.16b, v19.16b
umlal2 v31.8h, v7.16b, v21.16b
rshrn v30.8b, v24.8h, #6
rshrn2 v30.16b, v28.8h, #6
rshrn v23.8b, v25.8h, #6
rshrn2 v23.16b, v5.8h, #6
rshrn v24.8b, v26.8h, #6
st1 {v29.16b, v30.16b}, [x0], #32
rshrn2 v24.16b, v31.8h, #6
st1 {v23.16b, v24.16b}, [x8], #32
b.gt 32b
subs w4, w4, #2
add x0, x0, x1
add x8, x8, x1
add x2, x2, w3, uxtw
add x7, x7, w3, uxtw
b.gt 321b
ret
// Offset table, indexed by clz(width) - 24.
L(blend_h_tbl):
.hword L(blend_h_tbl) - 1280b
.hword L(blend_h_tbl) - 640b
.hword L(blend_h_tbl) - 320b
.hword L(blend_h_tbl) - 16b
.hword L(blend_h_tbl) - 8b
.hword L(blend_h_tbl) - 4b
.hword L(blend_h_tbl) - 2b
endfunc
// blend_v_8bpc_neon: vertical-edge (left) OBMC blend. Blends tmp into
// dst using per-column weights from the shared obmc_masks table, indexed
// by width; only the leftmost width - width/4 columns are written back
// (hence the partial stores and the reduced stride adjustments).
// Register use (assumed from dav1d's blend_v signature — confirm
// against src/mc.h): x0 = dst, x1 = dst stride, x2 = tmp, w3 = width,
// w4 = height. Two rows per iteration (x0 = even, x8 = odd rows).
function blend_v_8bpc_neon, export=1
adr x6, L(blend_v_tbl)
movrel x5, X(obmc_masks)
add x5, x5, w3, uxtw // x5 -> mask entries for this width
clz w3, w3
movi v4.16b, #64 // weight complement
add x8, x0, x1
lsl x1, x1, #1
sub w3, w3, #26
ldrh w3, [x6, x3, lsl #1]
sub x6, x6, w3, uxtw
br x6
// Width 2: only column 0 is written (2 - 2/4 = 1.5 -> 1 column).
20:
ld1r {v0.8b}, [x5]
sub v1.8b, v4.8b, v0.8b
2:
ld1 {v2.h}[0], [x2], #2
ld1 {v3.b}[0], [x0]
subs w4, w4, #2
ld1 {v2.b}[1], [x2]
ld1 {v3.b}[1], [x8]
umull v5.8h, v2.8b, v0.8b
umlal v5.8h, v3.8b, v1.8b
rshrn v5.8b, v5.8h, #6
add x2, x2, #2
st1 {v5.b}[0], [x0], x1
st1 {v5.b}[1], [x8], x1
b.gt 2b
ret
// Width 4: 3 columns written per row (2-byte + 1-byte stores).
40:
ld1r {v0.2s}, [x5]
sub v1.8b, v4.8b, v0.8b
sub x1, x1, #3
4:
ld1 {v2.8b}, [x2], #8
ld1 {v3.s}[0], [x0]
ld1 {v3.s}[1], [x8]
subs w4, w4, #2
umull v5.8h, v2.8b, v0.8b
umlal v5.8h, v3.8b, v1.8b
rshrn v5.8b, v5.8h, #6
st1 {v5.h}[0], [x0], #2
st1 {v5.h}[2], [x8], #2
st1 {v5.b}[2], [x0], #1
st1 {v5.b}[6], [x8], #1
add x0, x0, x1
add x8, x8, x1
b.gt 4b
ret
// Width 8: 6 columns written per row (4-byte + 2-byte stores).
80:
ld1r {v0.2d}, [x5]
sub v1.16b, v4.16b, v0.16b
sub x1, x1, #6
8:
ld1 {v2.16b}, [x2], #16
ld1 {v3.d}[0], [x0]
ld1 {v3.d}[1], [x8]
subs w4, w4, #2
umull v5.8h, v0.8b, v2.8b
umlal v5.8h, v3.8b, v1.8b
umull2 v6.8h, v0.16b, v2.16b
umlal2 v6.8h, v3.16b, v1.16b
rshrn v7.8b, v5.8h, #6
rshrn2 v7.16b, v6.8h, #6
st1 {v7.s}[0], [x0], #4
st1 {v7.s}[2], [x8], #4
st1 {v7.h}[2], [x0], #2
st1 {v7.h}[6], [x8], #2
add x0, x0, x1
add x8, x8, x1
b.gt 8b
ret
// Width 16: 12 columns written per row (8-byte + 4-byte stores).
160:
ld1 {v0.16b}, [x5]
sub v2.16b, v4.16b, v0.16b
sub x1, x1, #12
16:
ld1 {v5.16b, v6.16b}, [x2], #32
ld1 {v7.16b}, [x0]
subs w4, w4, #2
ld1 {v16.16b}, [x8]
umull v17.8h, v5.8b, v0.8b
umlal v17.8h, v7.8b, v2.8b
umull2 v18.8h, v5.16b, v0.16b
umlal2 v18.8h, v7.16b, v2.16b
umull v20.8h, v6.8b, v0.8b
umlal v20.8h, v16.8b, v2.8b
umull2 v21.8h, v6.16b, v0.16b
umlal2 v21.8h, v16.16b, v2.16b
rshrn v19.8b, v17.8h, #6
rshrn2 v19.16b, v18.8h, #6
rshrn v22.8b, v20.8h, #6
rshrn2 v22.16b, v21.8h, #6
st1 {v19.8b}, [x0], #8
st1 {v22.8b}, [x8], #8
st1 {v19.s}[2], [x0], #4
st1 {v22.s}[2], [x8], #4
add x0, x0, x1
add x8, x8, x1
b.gt 16b
ret
// Width 32: 24 columns written per row (16-byte + 8-byte stores).
320:
ld1 {v0.16b, v1.16b}, [x5]
sub v2.16b, v4.16b, v0.16b
sub v3.16b, v4.16b, v1.16b
sub x1, x1, #24
32:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
ld1 {v5.16b, v6.16b}, [x0]
subs w4, w4, #2
ld1 {v20.16b, v21.16b}, [x8]
umull v22.8h, v16.8b, v0.8b
umlal v22.8h, v5.8b, v2.8b
umull2 v23.8h, v16.16b, v0.16b
umlal2 v23.8h, v5.16b, v2.16b
umull v28.8h, v17.8b, v1.8b
umlal v28.8h, v6.8b, v3.8b
umull2 v29.8h, v17.16b, v1.16b
umlal2 v29.8h, v6.16b, v3.16b
umull v30.8h, v18.8b, v0.8b
umlal v30.8h, v20.8b, v2.8b
umull2 v31.8h, v18.16b, v0.16b
umlal2 v31.8h, v20.16b, v2.16b
umull v25.8h, v19.8b, v1.8b
umlal v25.8h, v21.8b, v3.8b
umull2 v26.8h, v19.16b, v1.16b
umlal2 v26.8h, v21.16b, v3.16b
rshrn v24.8b, v22.8h, #6
rshrn2 v24.16b, v23.8h, #6
rshrn v28.8b, v28.8h, #6
rshrn2 v28.16b, v29.8h, #6
rshrn v30.8b, v30.8h, #6
rshrn2 v30.16b, v31.8h, #6
rshrn v27.8b, v25.8h, #6
rshrn2 v27.16b, v26.8h, #6
st1 {v24.16b}, [x0], #16
st1 {v30.16b}, [x8], #16
st1 {v28.8b}, [x0], #8
st1 {v27.8b}, [x8], #8
add x0, x0, x1
add x8, x8, x1
b.gt 32b
ret
// Offset table, indexed by clz(width) - 26.
L(blend_v_tbl):
.hword L(blend_v_tbl) - 320b
.hword L(blend_v_tbl) - 160b
.hword L(blend_v_tbl) - 80b
.hword L(blend_v_tbl) - 40b
.hword L(blend_v_tbl) - 20b
endfunc
// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (clz(w)-24).
function put_neon

View File

@ -148,7 +148,7 @@ function msac_decode_symbol_adapt4_neon, export=1
add x8, x0, #RNG
ld1_n v0, v1, x1, \sz, \n // cdf
ld1r {v4\sz}, [x8] // rng
movrel x9, coeffs, 32
movrel x9, coeffs, 30
sub x9, x9, x2, lsl #1
ushr_n v2, v3, v0, v1, #6, \sz, \n // cdf >> EC_PROB_SHIFT
str h4, [sp, #14] // store original u = s->rng
@ -183,16 +183,24 @@ function msac_decode_symbol_adapt4_neon, export=1
// update_cdf
ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols]
movi v5\szb, #0xff
cmp x2, #4 // set C if n_symbols >= 4 (n_symbols > 3)
mov w14, #4
lsr w4, w3, #4 // count >> 4
.if \n == 16
mov w4, #-5
.else
mvn w14, w2
mov w4, #-4
cmn w14, #3 // set C if n_symbols <= 2
.endif
urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768
adc w4, w4, w14 // (count >> 4) + (n_symbols > 3) + 4
neg w4, w4 // -rate
.if \n == 16
sub w4, w4, w3, lsr #4 // -((count >> 4) + 5)
.else
lsr w14, w3, #4 // count >> 4
sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4)
.endif
sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
dup v6.8h, w4 // -rate
sub w3, w3, w3, lsr #5 // count - (count >= 32)
sub w3, w3, w3, lsr #5 // count - (count == 32)
sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
add w3, w3, #1 // count + (count < 32)
@ -224,8 +232,7 @@ L(renorm2):
b.ge 9f
// refill
ldr x3, [x0, #BUF_POS]
ldr x4, [x0, #BUF_END]
ldp x3, x4, [x0] // BUF_POS, BUF_END
add x5, x3, #8
cmp x5, x4
b.gt 2f

View File

@ -101,16 +101,15 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
c->avg = dav1d_avg_8bpc_neon;
c->w_avg = dav1d_w_avg_8bpc_neon;
c->mask = dav1d_mask_8bpc_neon;
#if ARCH_AARCH64
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
#elif ARCH_ARM
c->blend = dav1d_blend_8bpc_neon;
c->blend_h = dav1d_blend_h_8bpc_neon;
c->blend_v = dav1d_blend_v_8bpc_neon;
c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
#if ARCH_AARCH64
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
#endif
#endif
}

View File

@ -67,6 +67,7 @@ typedef struct Dav1dCdefDSPContext {
bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c);
bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c);
bitfn_decls(void dav1d_cdef_dsp_init_ppc, Dav1dCdefDSPContext *c);
bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c);
#endif /* DAV1D_SRC_CDEF_H */

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include "common/intops.h"
@ -263,6 +262,8 @@ COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
bitfn(dav1d_cdef_dsp_init_arm)(c);
#elif ARCH_PPC64LE
bitfn(dav1d_cdef_dsp_init_ppc)(c);
#elif ARCH_X86
bitfn(dav1d_cdef_dsp_init_x86)(c);
#endif

File diff suppressed because it is too large Load Diff

View File

@ -37,91 +37,94 @@
/* Buffers padded to [8] or [16] for SIMD where needed. */
typedef struct CdfModeContext {
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1 + 2];
uint16_t use_filter_intra[N_BS_SIZES][2];
uint16_t filter_intra[5 + 1];
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1 + 1];
uint16_t angle_delta[8][8];
uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
uint16_t newmv_mode[6][2];
uint16_t globalmv_mode[2][2];
uint16_t refmv_mode[6][2];
uint16_t drl_bit[3][2];
uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES + 1];
uint16_t intra[4][2];
uint16_t comp[5][2];
uint16_t comp_dir[5][2];
uint16_t jnt_comp[6][2];
uint16_t mask_comp[6][2];
uint16_t wedge_comp[9][2];
uint16_t wedge_idx[9][16 + 1];
uint16_t interintra[7][2];
uint16_t interintra_mode[4][5];
uint16_t interintra_wedge[7][2];
uint16_t ref[6][3][2];
uint16_t comp_fwd_ref[3][3][2];
uint16_t comp_bwd_ref[2][3][2];
uint16_t comp_uni_ref[3][3][2];
uint16_t txsz[N_TX_SIZES - 1][3][4];
uint16_t txpart[7][3][2];
uint16_t txtp_inter[4][N_TX_SIZES][N_TX_TYPES + 1];
uint16_t txtp_intra[3][N_TX_SIZES][N_INTRA_PRED_MODES][N_TX_TYPES + 1];
uint16_t skip[3][2];
uint16_t skip_mode[3][2];
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1 + 5];
uint16_t seg_pred[3][2];
uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
uint16_t cfl_sign[8 + 1];
uint16_t cfl_alpha[6][16 + 1];
uint16_t restore_wiener[2];
uint16_t restore_sgrproj[2];
uint16_t restore_switchable[3 + 1];
uint16_t delta_q[4 + 1];
uint16_t delta_lf[5][4 + 1];
uint16_t obmc[N_BS_SIZES][2];
uint16_t motion_mode[N_BS_SIZES][3 + 1];
uint16_t pal_y[7][3][2];
uint16_t pal_uv[2][2];
uint16_t pal_sz[2][7][7 + 1];
uint16_t color_map[2][7][5][8 + 1];
uint16_t intrabc[2];
ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32);
ALIGN(uint16_t wedge_idx[9][16], 32);
ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32);
ALIGN(uint16_t cfl_alpha[6][16], 32);
ALIGN(uint16_t txtp_inter1[2][16], 32);
ALIGN(uint16_t txtp_inter2[12 + 4], 32);
ALIGN(uint16_t txtp_intra1[2][N_INTRA_PRED_MODES][7 + 1], 16);
ALIGN(uint16_t txtp_intra2[3][N_INTRA_PRED_MODES][5 + 3], 16);
ALIGN(uint16_t cfl_sign[8], 16);
ALIGN(uint16_t angle_delta[8][8], 16);
ALIGN(uint16_t filter_intra[5 + 3], 16);
ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16);
ALIGN(uint16_t pal_sz[2][7][7 + 1], 16);
ALIGN(uint16_t color_map[2][7][5][8], 16);
ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8);
ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
ALIGN(uint16_t delta_q[4], 8);
ALIGN(uint16_t delta_lf[5][4], 8);
ALIGN(uint16_t interintra_mode[4][4], 8);
ALIGN(uint16_t restore_switchable[3 + 1], 8);
ALIGN(uint16_t restore_wiener[2], 4);
ALIGN(uint16_t restore_sgrproj[2], 4);
ALIGN(uint16_t interintra[7][2], 4);
ALIGN(uint16_t interintra_wedge[7][2], 4);
ALIGN(uint16_t txtp_inter3[4][2], 4);
ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4);
ALIGN(uint16_t newmv_mode[6][2], 4);
ALIGN(uint16_t globalmv_mode[2][2], 4);
ALIGN(uint16_t refmv_mode[6][2], 4);
ALIGN(uint16_t drl_bit[3][2], 4);
ALIGN(uint16_t intra[4][2], 4);
ALIGN(uint16_t comp[5][2], 4);
ALIGN(uint16_t comp_dir[5][2], 4);
ALIGN(uint16_t jnt_comp[6][2], 4);
ALIGN(uint16_t mask_comp[6][2], 4);
ALIGN(uint16_t wedge_comp[9][2], 4);
ALIGN(uint16_t ref[6][3][2], 4);
ALIGN(uint16_t comp_fwd_ref[3][3][2], 4);
ALIGN(uint16_t comp_bwd_ref[2][3][2], 4);
ALIGN(uint16_t comp_uni_ref[3][3][2], 4);
ALIGN(uint16_t txpart[7][3][2], 4);
ALIGN(uint16_t skip[3][2], 4);
ALIGN(uint16_t skip_mode[3][2], 4);
ALIGN(uint16_t seg_pred[3][2], 4);
ALIGN(uint16_t obmc[N_BS_SIZES][2], 4);
ALIGN(uint16_t pal_y[7][3][2], 4);
ALIGN(uint16_t pal_uv[2][2], 4);
ALIGN(uint16_t intrabc[2], 4);
} CdfModeContext;
typedef struct CdfCoefContext {
uint16_t skip[N_TX_SIZES][13][2];
uint16_t eob_bin_16[2][2][6];
uint16_t eob_bin_32[2][2][7 + 1];
uint16_t eob_bin_64[2][2][8];
uint16_t eob_bin_128[2][2][9];
uint16_t eob_bin_256[2][2][10 + 6];
uint16_t eob_bin_512[2][2][11 + 5];
uint16_t eob_bin_1024[2][2][12 + 4];
uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2];
uint16_t eob_base_tok[N_TX_SIZES][2][4][4];
uint16_t base_tok[N_TX_SIZES][2][41][5];
uint16_t dc_sign[2][3][2];
uint16_t br_tok[4 /*5*/][2][21][5];
ALIGN(uint16_t eob_bin_16[2][2][5 + 3], 16);
ALIGN(uint16_t eob_bin_32[2][2][6 + 2], 16);
ALIGN(uint16_t eob_bin_64[2][2][7 + 1], 16);
ALIGN(uint16_t eob_bin_128[2][2][8 + 0], 16);
ALIGN(uint16_t eob_bin_256[2][2][9 + 7], 32);
ALIGN(uint16_t eob_bin_512[2][10 + 6], 32);
ALIGN(uint16_t eob_bin_1024[2][11 + 5], 32);
ALIGN(uint16_t eob_base_tok[N_TX_SIZES][2][4][4], 8);
ALIGN(uint16_t base_tok[N_TX_SIZES][2][41][4], 8);
ALIGN(uint16_t br_tok[4 /*5*/][2][21][4], 8);
ALIGN(uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2], 4);
ALIGN(uint16_t skip[N_TX_SIZES][13][2], 4);
ALIGN(uint16_t dc_sign[2][3][2], 4);
} CdfCoefContext;
typedef struct CdfMvComponent {
uint16_t classes[11 + 1 + 4];
uint16_t class0[2];
uint16_t classN[10][2];
uint16_t class0_fp[2][4 + 1];
uint16_t classN_fp[4 + 1];
uint16_t class0_hp[2];
uint16_t classN_hp[2];
uint16_t sign[2];
ALIGN(uint16_t classes[11 + 5], 32);
ALIGN(uint16_t class0_fp[2][4], 8);
ALIGN(uint16_t classN_fp[4], 8);
ALIGN(uint16_t class0_hp[2], 4);
ALIGN(uint16_t classN_hp[2], 4);
ALIGN(uint16_t class0[2], 4);
ALIGN(uint16_t classN[10][2], 4);
ALIGN(uint16_t sign[2], 4);
} CdfMvComponent;
typedef struct CdfMvContext {
CdfMvComponent comp[2];
uint16_t joint[N_MV_JOINTS + 1];
ALIGN(uint16_t joint[N_MV_JOINTS], 8);
} CdfMvContext;
typedef struct CdfContext {
CdfModeContext m;
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1 + 2];
ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
CdfCoefContext coef;
CdfMvContext mv, dmv;
} CdfContext;

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
@ -35,6 +34,7 @@
#include "dav1d/data.h"
#include "common/attributes.h"
#include "common/validate.h"
#include "src/data.h"

View File

@ -42,6 +42,7 @@
#include "src/decode.h"
#include "src/dequant_tables.h"
#include "src/env.h"
#include "src/film_grain.h"
#include "src/log.h"
#include "src/qm.h"
#include "src/recon.h"
@ -81,14 +82,14 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
const int have_hp = f->frame_hdr->hp;
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
mv_comp->classes, 11);
mv_comp->classes, 10);
int up, fp, hp;
if (!cl) {
up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
if (have_fp) {
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
mv_comp->class0_fp[up], 4);
mv_comp->class0_fp[up], 3);
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
mv_comp->class0_hp) : 1;
} else {
@ -102,7 +103,7 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
mv_comp->classN[n]) << n;
if (have_fp) {
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
mv_comp->classN_fp, 4);
mv_comp->classN_fp, 3);
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
mv_comp->classN_hp) : 1;
} else {
@ -120,7 +121,7 @@ static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
CdfMvContext *const mv_cdf, const int have_fp)
{
switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
N_MV_JOINTS))
N_MV_JOINTS - 1))
{
case MV_JOINT_HV:
ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
@ -380,7 +381,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2;
ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
uint16_t cache[16], used_cache[8];
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
int n_cache = 0;
@ -586,7 +587,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
Dav1dTileState *const ts = t->ts;
const ptrdiff_t stride = bw4 * 4;
pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
uint16_t (*const color_map_cdf)[8 + 1] =
uint16_t (*const color_map_cdf)[8] =
ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
uint8_t (*const order)[8] = t->scratch.pal_order;
uint8_t *const ctx = t->scratch.pal_ctx;
@ -597,7 +598,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
order_palette(pal_idx, stride, i, first, last, order, ctx);
for (int j = first, m = 0; j >= last; j--, m++) {
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
color_map_cdf[ctx[m]], b->pal_sz[pl]);
color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
pal_idx[(i - j) * stride + j] = order[m][color_idx];
}
}
@ -647,7 +648,7 @@ static void read_vartx_tree(Dav1dTileContext *const t,
}
b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
} else {
assert(imin(bw4, bh4) <= 16 || b->max_ytx == TX_64X64);
assert(bw4 <= 16 || bh4 <= 16 || b->max_ytx == TX_64X64);
int y, x, y_off, x_off;
const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
@ -673,8 +674,6 @@ static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
const uint8_t *ref_seg_map,
const ptrdiff_t stride)
{
unsigned seg_id = 8;
assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr->primary_ref_frame],
(by + h4) * 4, PLANE_TYPE_BLOCK))
@ -682,12 +681,13 @@ static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
return 8;
}
unsigned seg_id = 8;
ref_seg_map += by * stride + bx;
do {
for (int x = 0; x < w4; x++)
seg_id = imin(seg_id, ref_seg_map[x]);
ref_seg_map += stride;
} while (--h4 > 0);
} while (--h4 > 0 && seg_id);
assert(seg_id < 8);
return seg_id;
@ -814,7 +814,7 @@ static int decode_b(Dav1dTileContext *const t,
&seg_ctx, f->cur_segmap, f->b4_stride);
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.seg_id[seg_ctx],
DAV1D_MAX_SEGMENTS);
DAV1D_MAX_SEGMENTS - 1);
const unsigned last_active_seg_id =
f->frame_hdr->segmentation.seg_data.last_active_segid;
b->seg_id = neg_deinterleave(diff, pred_seg_id,
@ -886,7 +886,7 @@ static int decode_b(Dav1dTileContext *const t,
} else {
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.seg_id[seg_ctx],
DAV1D_MAX_SEGMENTS);
DAV1D_MAX_SEGMENTS - 1);
const unsigned last_active_seg_id =
f->frame_hdr->segmentation.seg_data.last_active_segid;
b->seg_id = neg_deinterleave(diff, pred_seg_id,
@ -934,7 +934,7 @@ static int decode_b(Dav1dTileContext *const t,
if (have_delta_q) {
int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.delta_q, 4);
ts->cdf.m.delta_q, 3);
if (delta_q == 3) {
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
@ -955,7 +955,7 @@ static int decode_b(Dav1dTileContext *const t,
for (int i = 0; i < n_lfs; i++) {
int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4);
ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 3);
if (delta_lf == 3) {
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) +
@ -1020,7 +1020,7 @@ static int decode_b(Dav1dTileContext *const t,
ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
[dav1d_intra_mode_context[t->l.mode[by4]]];
b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
N_INTRA_PRED_MODES);
N_INTRA_PRED_MODES - 1);
if (DEBUG_BLOCK_INFO)
printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
@ -1029,7 +1029,7 @@ static int decode_b(Dav1dTileContext *const t,
b->y_mode <= VERT_LEFT_PRED)
{
uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
b->y_angle = angle - 3;
} else {
b->y_angle = 0;
@ -1040,20 +1040,20 @@ static int decode_b(Dav1dTileContext *const t,
cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
N_UV_INTRA_PRED_MODES - !cfl_allowed);
N_UV_INTRA_PRED_MODES - 1 - !cfl_allowed);
if (DEBUG_BLOCK_INFO)
printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
if (b->uv_mode == CFL_PRED) {
#define SIGN(a) (!!(a) + ((a) > 0))
const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.cfl_sign, 8) + 1;
ts->cdf.m.cfl_sign, 7) + 1;
const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
assert(sign_u == sign / 3);
if (sign_u) {
const int ctx = (sign_u == 2) * 3 + sign_v;
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
ts->cdf.m.cfl_alpha[ctx], 15) + 1;
if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
} else {
b->cfl_alpha[0] = 0;
@ -1061,7 +1061,7 @@ static int decode_b(Dav1dTileContext *const t,
if (sign_v) {
const int ctx = (sign_v == 2) * 3 + sign_u;
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
ts->cdf.m.cfl_alpha[ctx], 15) + 1;
if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
} else {
b->cfl_alpha[1] = 0;
@ -1074,7 +1074,7 @@ static int decode_b(Dav1dTileContext *const t,
b->uv_mode <= VERT_LEFT_PRED)
{
uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
b->uv_angle = angle - 3;
} else {
b->uv_angle = 0;
@ -1115,7 +1115,7 @@ static int decode_b(Dav1dTileContext *const t,
if (is_filter) {
b->y_mode = FILTER_PRED;
b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter_intra, 5);
ts->cdf.m.filter_intra, 4);
}
if (DEBUG_BLOCK_INFO)
printf("Post-filterintramode[%d/%d]: r=%d\n",
@ -1158,7 +1158,7 @@ static int decode_b(Dav1dTileContext *const t,
const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
imin(t_dim->max + 1, 3));
imin(t_dim->max, 2));
while (depth--) {
b->tx = t_dim->sub;
@ -1480,7 +1480,7 @@ static int decode_b(Dav1dTileContext *const t,
b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.comp_inter_mode[ctx],
N_COMP_INTER_PRED_MODES);
N_COMP_INTER_PRED_MODES - 1);
if (DEBUG_BLOCK_INFO)
printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n",
b->inter_mode, ctx, n_mvs, ts->msac.rng);
@ -1588,7 +1588,7 @@ static int decode_b(Dav1dTileContext *const t,
ts->cdf.m.wedge_comp[ctx]);
if (b->comp_type == COMP_INTER_WEDGE)
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.wedge_idx[ctx], 16);
ts->cdf.m.wedge_idx[ctx], 15);
} else {
b->comp_type = COMP_INTER_SEG;
}
@ -1743,14 +1743,14 @@ static int decode_b(Dav1dTileContext *const t,
{
b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.interintra_mode[ii_sz_grp],
N_INTER_INTRA_PRED_MODES);
N_INTER_INTRA_PRED_MODES - 1);
const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
b->interintra_type = INTER_INTRA_BLEND +
dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.interintra_wedge[wedge_ctx]);
if (b->interintra_type == INTER_INTRA_WEDGE)
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.wedge_idx[wedge_ctx], 16);
ts->cdf.m.wedge_idx[wedge_ctx], 15);
} else {
b->interintra_type = INTER_INTRA_NONE;
}
@ -1783,7 +1783,7 @@ static int decode_b(Dav1dTileContext *const t,
b->motion_mode = allow_warp ?
dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.motion_mode[bs], 3) :
ts->cdf.m.motion_mode[bs], 2) :
dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
if (b->motion_mode == MM_WARP) {
has_subpel_filter = 0;
@ -1823,7 +1823,7 @@ static int decode_b(Dav1dTileContext *const t,
by4, bx4);
filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter[0][ctx1],
DAV1D_N_SWITCHABLE_FILTERS);
DAV1D_N_SWITCHABLE_FILTERS - 1);
if (f->seq_hdr->dual_filter) {
const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1,
b->ref[0], by4, bx4);
@ -1832,7 +1832,7 @@ static int decode_b(Dav1dTileContext *const t,
filter[0], ctx1, ts->msac.rng);
filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter[1][ctx2],
DAV1D_N_SWITCHABLE_FILTERS);
DAV1D_N_SWITCHABLE_FILTERS - 1);
if (DEBUG_BLOCK_INFO)
printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n",
filter[1], ctx2, ts->msac.rng);
@ -2023,9 +2023,8 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
} else {
const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS :
bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS;
bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, n_part);
bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc,
dav1d_partition_type_count[bl]);
if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
(bp == PARTITION_V || bp == PARTITION_V4 ||
bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
@ -2381,7 +2380,7 @@ static void read_restoration_info(Dav1dTileContext *const t,
if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.restore_switchable, 3);
ts->cdf.m.restore_switchable, 2);
lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
DAV1D_RESTORATION_WIENER :
DAV1D_RESTORATION_NONE;
@ -2597,8 +2596,10 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
f->tile_thread.titsati_sz = titsati_sz;
}
if (f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols ||
f->tile_thread.titsati_init[1] != f->sbh ||
f->tile_thread.titsati_init[2] != f->frame_hdr->tiling.rows)
f->tile_thread.titsati_init[1] != f->frame_hdr->tiling.rows ||
memcmp(f->frame_hdr->tiling.row_start_sb, f->tile_thread.titsati_index_rows,
sizeof(*f->tile_thread.titsati_index_rows) *
(f->frame_hdr->tiling.rows + 1)))
{
for (int tile_row = 0, tile_idx = 0;
tile_row < f->frame_hdr->tiling.rows; tile_row++)
@ -2616,8 +2617,10 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
}
f->tile_thread.titsati_init[0] = f->frame_hdr->tiling.cols;
f->tile_thread.titsati_init[1] = f->sbh;
f->tile_thread.titsati_init[2] = f->frame_hdr->tiling.rows;
f->tile_thread.titsati_init[1] = f->frame_hdr->tiling.rows;
memcpy(f->tile_thread.titsati_index_rows, f->frame_hdr->tiling.row_start_sb,
sizeof(*f->tile_thread.titsati_index_rows) *
(f->frame_hdr->tiling.rows + 1));
}
}
@ -2637,9 +2640,13 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
goto error;
}
}
if (n_ts > f->n_ts) {
Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts);
Dav1dTileState *ts_new = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32);
if (!ts_new) goto error;
if (n_ts > f->n_ts) {
if (f->ts) {
memcpy(ts_new, f->ts, sizeof(*f->ts) * f->n_ts);
dav1d_free_aligned(f->ts);
}
f->ts = ts_new;
for (int n = f->n_ts; n < n_ts; f->n_ts = ++n) {
Dav1dTileState *const ts = &f->ts[n];
@ -2655,9 +2662,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
pthread_cond_destroy(&ts->tile_thread.cond);
pthread_mutex_destroy(&ts->tile_thread.lock);
}
memcpy(ts_new, f->ts, sizeof(*f->ts) * n_ts);
dav1d_free_aligned(f->ts);
f->n_ts = n_ts;
Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts);
if (!ts_new) goto error;
f->ts = ts_new;
}
}
@ -3184,6 +3191,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr); \
dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
break
#if CONFIG_8BPC
case 8:

View File

@ -28,7 +28,6 @@
#ifndef DAV1D_SRC_ENV_H
#define DAV1D_SRC_ENV_H
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
@ -90,95 +89,37 @@ static inline int get_partition_ctx(const BlockContext *const a,
(((l->partition[yb8] >> (4 - bl)) & 1) << 1);
}
static inline unsigned cdf_element_prob(const uint16_t *const cdf, const int e) {
assert(e > 0);
return cdf[e - 1] - cdf[e];
}
static inline unsigned gather_left_partition_prob(const uint16_t *const in,
const enum BlockLevel bl)
{
unsigned out = 0;
out += cdf_element_prob(in, PARTITION_H);
if (bl != BL_128X128)
out += cdf_element_prob(in, PARTITION_H4);
unsigned out = in[PARTITION_H - 1] - in[PARTITION_H];
// Exploit the fact that cdfs for PARTITION_SPLIT, PARTITION_T_TOP_SPLIT,
// PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
out += in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT];
if (bl != BL_128X128)
out += in[PARTITION_H4 - 1] - in[PARTITION_H4];
return out;
}
static inline unsigned gather_top_partition_prob(const uint16_t *const in,
const enum BlockLevel bl)
{
unsigned out = 0;
// Exploit the fact that cdfs for PARTITION_V, PARTITION_SPLIT and
// PARTITION_T_TOP_SPLIT are neighbors.
unsigned out = in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
// Exploit the facts that cdfs for PARTITION_T_LEFT_SPLIT and
// PARTITION_T_RIGHT_SPLIT are neighbors, the probability for
// PARTITION_V4 is always zero, and the probability for
// PARTITION_T_RIGHT_SPLIT is zero in 128x128 blocks.
out += in[PARTITION_T_LEFT_SPLIT - 1];
if (bl != BL_128X128)
out += cdf_element_prob(in, PARTITION_V4);
// Exploit the fact that cdfs for PARTITION_T_LEFT_SPLIT and PARTITION_T_RIGHT_SPLIT,
// and PARTITION_V, PARTITION_SPLIT and PARTITION_T_TOP_SPLIT are neighbors.
out += in[PARTITION_T_LEFT_SPLIT - 1] - in[PARTITION_T_RIGHT_SPLIT];
out += in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
out += in[PARTITION_V4 - 1] - in[PARTITION_T_RIGHT_SPLIT];
return out;
}
static inline enum TxfmTypeSet get_ext_txtp_set(const enum RectTxfmSize tx,
const int inter,
const Dav1dFrameHeader *const hdr,
const int seg_id)
{
if (!hdr->segmentation.qidx[seg_id]) {
if (hdr->segmentation.lossless[seg_id]) {
assert(tx == (int) TX_4X4);
return TXTP_SET_LOSSLESS;
} else {
return TXTP_SET_DCT;
}
}
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
if (t_dim->max >= TX_64X64)
return TXTP_SET_DCT;
if (t_dim->max == TX_32X32)
return inter ? TXTP_SET_DCT_ID : TXTP_SET_DCT;
if (hdr->reduced_txtp_set)
return inter ? TXTP_SET_DCT_ID : TXTP_SET_DT4_ID;
const enum TxfmSize txsqsz = t_dim->min;
if (inter)
return txsqsz == TX_16X16 ? TXTP_SET_DT9_ID_1D : TXTP_SET_ALL;
else
return txsqsz == TX_16X16 ? TXTP_SET_DT4_ID : TXTP_SET_DT4_ID_1D;
}
static inline enum TxfmType get_uv_intra_txtp(const enum IntraPredMode uv_mode,
const enum RectTxfmSize tx,
const Dav1dFrameHeader *const hdr,
const int seg_id)
{
if (hdr->segmentation.lossless[seg_id]) {
assert(tx == (int) TX_4X4);
return WHT_WHT;
}
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
return t_dim->max == TX_32X32 ? DCT_DCT : dav1d_txtp_from_uvmode[uv_mode];
}
static inline enum TxfmType get_uv_inter_txtp(const TxfmInfo *const uvt_dim,
const enum TxfmType ytxtp,
const Dav1dFrameHeader *const hdr,
const int seg_id)
const enum TxfmType ytxtp)
{
if (hdr->segmentation.lossless[seg_id]) {
assert(uvt_dim->max == TX_4X4);
return WHT_WHT;
}
if (uvt_dim->max == TX_32X32)
return ytxtp == IDTX ? IDTX : DCT_DCT;
if (uvt_dim->min == TX_16X16 &&
@ -528,180 +469,6 @@ static inline unsigned get_cur_frame_segid(const int by, const int bx,
}
}
static inline int get_coef_skip_ctx(const TxfmInfo *const t_dim,
const enum BlockSize bs,
const uint8_t *const a,
const uint8_t *const l,
const int chroma,
const enum Dav1dPixelLayout layout)
{
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
if (chroma) {
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
int ca, cl;
#define MERGE_CTX(dir, type, mask) \
c##dir = !!((*(const type *) dir) & mask); \
break
switch (t_dim->lw) {
case TX_4X4: MERGE_CTX(a, uint8_t, 0x3F);
case TX_8X8: MERGE_CTX(a, uint16_t, 0x3F3F);
case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
default: abort();
}
switch (t_dim->lh) {
case TX_4X4: MERGE_CTX(l, uint8_t, 0x3F);
case TX_8X8: MERGE_CTX(l, uint16_t, 0x3F3F);
case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
default: abort();
}
#undef MERGE_CTX
return 7 + not_one_blk * 3 + ca + cl;
} else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
return 0;
} else {
static const uint8_t skip_contexts[5][5] = {
{ 1, 2, 2, 2, 3 },
{ 1, 4, 4, 4, 5 },
{ 1, 4, 4, 4, 5 },
{ 1, 4, 4, 4, 5 },
{ 1, 4, 4, 4, 6 }
};
uint64_t la, ll;
#define MERGE_CTX(dir, type, tx) do { \
l##dir = *(const type *) dir; \
if (tx == TX_64X64) \
l##dir |= *(const type *) &dir[sizeof(type)]; \
if (tx >= TX_32X32) l##dir |= l##dir >> 32; \
if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
if (tx >= TX_8X8) l##dir |= l##dir >> 8; \
l##dir &= 0x3F; \
} while (0); \
break
switch (t_dim->lw) {
case TX_4X4: MERGE_CTX(a, uint8_t, TX_4X4);
case TX_8X8: MERGE_CTX(a, uint16_t, TX_8X8);
case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
case TX_32X32: MERGE_CTX(a, uint64_t, TX_32X32);
case TX_64X64: MERGE_CTX(a, uint64_t, TX_64X64);
}
switch (t_dim->lh) {
case TX_4X4: MERGE_CTX(l, uint8_t, TX_4X4);
case TX_8X8: MERGE_CTX(l, uint16_t, TX_8X8);
case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
case TX_32X32: MERGE_CTX(l, uint64_t, TX_32X32);
case TX_64X64: MERGE_CTX(l, uint64_t, TX_64X64);
}
#undef MERGE_CTX
const int max = imin((int) (la | ll), 4);
const int min = imin(imin((int) la, (int) ll), 4);
return skip_contexts[min][max];
}
}
static inline int get_coef_nz_ctx(uint8_t *const levels,
const enum RectTxfmSize tx,
const enum TxClass tx_class,
const int x, const int y,
const ptrdiff_t stride)
{
static const uint8_t offsets[3][5][2 /* x, y */] = {
[TX_CLASS_2D] = {
{ 0, 1 }, { 1, 0 }, { 2, 0 }, { 0, 2 }, { 1, 1 }
}, [TX_CLASS_V] = {
{ 0, 1 }, { 1, 0 }, { 0, 2 }, { 0, 3 }, { 0, 4 }
}, [TX_CLASS_H] = {
{ 0, 1 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, { 4, 0 }
}
};
const uint8_t (*const off)[2] = offsets[tx_class];
int mag = 0;
for (int i = 0; i < 5; i++)
mag += imin(levels[(x + off[i][0]) * stride + (y + off[i][1])], 3);
const int ctx = imin((mag + 1) >> 1, 4);
if (tx_class == TX_CLASS_2D) {
return dav1d_nz_map_ctx_offset[tx][imin(y, 4)][imin(x, 4)] + ctx;
} else {
return 26 + imin((tx_class == TX_CLASS_V) ? y : x, 2) * 5 + ctx;
}
}
static inline int get_dc_sign_ctx(const TxfmInfo *const t_dim,
const uint8_t *const a,
const uint8_t *const l)
{
uint64_t sa, sl;
#define MERGE_CTX(dir, type, tx, mask) do { \
s##dir = ((*(const type *) dir) >> 6) & mask; \
if (tx == TX_64X64) \
s##dir += ((*(const type *) &dir[sizeof(type)]) >> 6) & mask; \
if (tx >= TX_32X32) s##dir += s##dir >> 32; \
if (tx >= TX_16X16) s##dir += s##dir >> 16; \
if (tx >= TX_8X8) s##dir += s##dir >> 8; \
} while (0); \
break
switch (t_dim->lw) {
case TX_4X4: MERGE_CTX(a, uint8_t, TX_4X4, 0x03);
case TX_8X8: MERGE_CTX(a, uint16_t, TX_8X8, 0x0303);
case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16, 0x03030303U);
case TX_32X32: MERGE_CTX(a, uint64_t, TX_32X32, 0x0303030303030303ULL);
case TX_64X64: MERGE_CTX(a, uint64_t, TX_64X64, 0x0303030303030303ULL);
}
switch (t_dim->lh) {
case TX_4X4: MERGE_CTX(l, uint8_t, TX_4X4, 0x03);
case TX_8X8: MERGE_CTX(l, uint16_t, TX_8X8, 0x0303);
case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16, 0x03030303U);
case TX_32X32: MERGE_CTX(l, uint64_t, TX_32X32, 0x0303030303030303ULL);
case TX_64X64: MERGE_CTX(l, uint64_t, TX_64X64, 0x0303030303030303ULL);
}
#undef MERGE_CTX
const int s = ((int) ((sa + sl) & 0xFF)) - (t_dim->w + t_dim->h);
return s < 0 ? 1 : s > 0 ? 2 : 0;
}
static inline int get_br_ctx(const uint8_t *const levels,
const int ac, const enum TxClass tx_class,
const int x, const int y,
const ptrdiff_t stride)
{
int mag = 0;
static const uint8_t offsets_from_txclass[3][3][2] = {
[TX_CLASS_2D] = { { 0, 1 }, { 1, 0 }, { 1, 1 } },
[TX_CLASS_H] = { { 0, 1 }, { 1, 0 }, { 0, 2 } },
[TX_CLASS_V] = { { 0, 1 }, { 1, 0 }, { 2, 0 } }
};
const uint8_t (*const offsets)[2] = offsets_from_txclass[tx_class];
for (int i = 0; i < 3; i++)
mag += levels[(x + offsets[i][1]) * stride + y + offsets[i][0]];
mag = imin((mag + 1) >> 1, 6);
if (!ac) return mag;
switch (tx_class) {
case TX_CLASS_2D:
if (y < 2 && x < 2) return mag + 7;
break;
case TX_CLASS_H:
if (x == 0) return mag + 7;
break;
case TX_CLASS_V:
if (y == 0) return mag + 7;
break;
}
return mag + 14;
}
static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv,
const int bx4, const int by4,
const int bw4, const int bh4,

41
third_party/dav1d/src/fg_apply.h vendored Normal file
View File

@ -0,0 +1,41 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_FG_APPLY_H
#define DAV1D_SRC_FG_APPLY_H
#include "dav1d/picture.h"
#include "common/bitdepth.h"
#include "src/film_grain.h"
bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp,
Dav1dPicture *const out,
const Dav1dPicture *const in);
#endif /* DAV1D_SRC_FG_APPLY_H */

175
third_party/dav1d/src/fg_apply_tmpl.c vendored Normal file
View File

@ -0,0 +1,175 @@
/*
* Copyright © 2018, Niklas Haas
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <stdint.h>
#include "dav1d/picture.h"
#include "common.h"
#include "common/intops.h"
#include "common/bitdepth.h"
#include "fg_apply.h"
// Build the per-intensity film-grain scaling lookup table from the
// piecewise-linear scaling function signalled in the frame header
// (AV1 spec, film grain synthesis process).
//
// bitdepth: source bit depth; the table covers the full 1 << bitdepth
//           intensity range.
// points:   `num` (x, y) control points; x coordinates are 8-bit values
//           regardless of bit depth.
// scaling:  output LUT of SCALING_SIZE entries.
//
// NOTE(review): assumes num >= 1 — callers in dav1d_apply_grain only invoke
// this for planes that actually have scaling points.
static void generate_scaling(const int bitdepth,
                             const uint8_t points[][2], const int num,
                             uint8_t scaling[SCALING_SIZE])
{
    // For bitdepth > 8 each 8-bit control-point x maps to a stride of `pad`
    // table entries; only those coarse entries are written by the loops
    // below, and the gaps are filled in by the final interpolation pass.
    const int shift_x = bitdepth - 8;
    const int scaling_size = 1 << bitdepth;
    const int pad = 1 << shift_x;

    // Fill up the preceding entries with the initial value
    for (int i = 0; i < points[0][0] << shift_x; i++)
        scaling[i] = points[0][1];

    // Linearly interpolate the values in the middle
    for (int i = 0; i < num - 1; i++) {
        const int bx = points[i][0];
        const int by = points[i][1];
        const int ex = points[i+1][0];
        const int ey = points[i+1][1];
        const int dx = ex - bx;
        const int dy = ey - by;
        // Slope in 16.16 fixed point, with the divisor rounded to nearest.
        const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
        for (int x = 0; x < dx; x++) {
            // Round-to-nearest back to integer (+0x8000 before the shift).
            const int v = by + ((x * delta + 0x8000) >> 16);
            scaling[(bx + x) << shift_x] = v;
        }
    }

    // Fill up the remaining entries with the final value
    for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
        scaling[i] = points[num - 1][1];

    // 8-bit content: the table is already dense, nothing left to do.
    if (pad <= 1) return;

    // High bit depths: linearly interpolate the `pad - 1` entries between
    // each pair of coarse entries written above, rounding to nearest.
    const int rnd = pad >> 1;
    for (int i = 0; i < num - 1; i++) {
        const int bx = points[i][0] << shift_x;
        const int ex = points[i+1][0] << shift_x;
        const int dx = ex - bx;
        for (int x = 0; x < dx; x += pad) {
            const int range = scaling[bx + x + pad] - scaling[bx + x];
            for (int n = 1; n < pad; n++) {
                scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x);
            }
        }
    }
}
#ifndef UNIT_TEST
// Apply film grain synthesis to a decoded picture: generate the grain and
// scaling LUTs required by the frame header's film grain parameters, copy
// any planes that grain is not applied to, then synthesize grain into `out`
// in 32-pixel-high row stripes via the bitdepth-specific DSP functions.
//
// dsp: film grain DSP vtable for the current bit depth.
// out: destination picture (receives grain); must share strides with `in`.
// in:  source (grain-free) decoded picture.
void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
                              Dav1dPicture *const out,
                              const Dav1dPicture *const in)
{
    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;

    entry grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
    uint8_t scaling[3][SCALING_SIZE];
#if BITDEPTH != 8
    // Maximum pixel value for the picture's actual bit depth (10 or 12).
    const int bitdepth_max = (1 << out->p.bpc) - 1;
#endif

    // Generate grain LUTs as needed
    dsp->generate_grain_y(grain_lut[0], data HIGHBD_TAIL_SUFFIX); // always needed
    // Chroma grain is needed either when a plane has its own scaling points
    // or when chroma scaling is derived from luma.
    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0],
                                                 data, 0 HIGHBD_TAIL_SUFFIX);
    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0],
                                                 data, 1 HIGHBD_TAIL_SUFFIX);

    // Generate scaling LUTs as needed
    if (data->num_y_points)
        generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
    if (data->num_uv_points[0])
        generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
    if (data->num_uv_points[1])
        generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);

    // Copy over the non-modified planes
    // TODO: eliminate in favor of per-plane refs
    assert(out->stride[0] == in->stride[0]);
    if (!data->num_y_points) {
        memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
    }

    if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
        assert(out->stride[1] == in->stride[1]);
        for (int i = 0; i < 2; i++) {
            // Copy the chroma plane only if grain is applied to it neither
            // via its own points nor via luma-derived scaling.
            if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
                const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
                memcpy(out->data[1+i], in->data[1+i],
                       (out->p.h >> suby) * out->stride[1]);
            }
        }
    }

    // Synthesize grain for the affected planes
    const int rows = (out->p.h + 31) >> 5; // number of 32-row luma stripes
    // Chroma subsampling factors derived from the pixel layout.
    const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int cpw = (out->p.w + ss_x) >> ss_x; // chroma plane width
    // NOTE(review): is_id flags identity matrix coefficients; presumably it
    // selects the value-clipping behavior inside the chroma DSP function —
    // confirm against the DSP implementation.
    const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
    for (int row = 0; row < rows; row++) {
        // Luma source row for this stripe; also fed to the chroma functions
        // for luma-derived scaling.
        const pixel *const luma_src =
            ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);

        if (data->num_y_points) {
            // Last stripe may be shorter than BLOCK_SIZE rows.
            const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
            dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
                             luma_src, out->stride[0], data,
                             out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
        }

        // Chroma stripe height and offset, adjusted for vertical subsampling.
        const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
        const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
        if (data->chroma_scaling_from_luma) {
            // Both chroma planes use the luma scaling LUT (scaling[0]).
            for (int pl = 0; pl < 2; pl++)
                dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
                                                    ((const pixel *) in->data[1 + pl]) + uv_off,
                                                    in->stride[1], data, cpw,
                                                    scaling[0], grain_lut[1 + pl],
                                                    bh, row, luma_src, in->stride[0],
                                                    pl, is_id HIGHBD_TAIL_SUFFIX);
        } else {
            // Each chroma plane uses its own scaling LUT, and only if it
            // actually has scaling points.
            for (int pl = 0; pl < 2; pl++)
                if (data->num_uv_points[pl])
                    dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
                                                        ((const pixel *) in->data[1 + pl]) + uv_off,
                                                        in->stride[1], data, cpw,
                                                        scaling[1 + pl], grain_lut[1 + pl],
                                                        bh, row, luma_src, in->stride[0],
                                                        pl, is_id HIGHBD_TAIL_SUFFIX);
        }
    }
}
#endif

View File

@ -28,9 +28,58 @@
#ifndef DAV1D_SRC_FILM_GRAIN_H
#define DAV1D_SRC_FILM_GRAIN_H
#include "dav1d/dav1d.h"
#include "common/bitdepth.h"
bitfn_decls(void dav1d_apply_grain, Dav1dPicture *const out,
const Dav1dPicture *const in);
#include "src/levels.h"
#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73
#define BLOCK_SIZE 32
#if !defined(BITDEPTH) || BITDEPTH == 8
#define SCALING_SIZE 256
typedef int8_t entry;
#else
#define SCALING_SIZE 4096
typedef int16_t entry;
#endif
#define decl_generate_grain_y_fn(name) \
void (name)(entry buf[][GRAIN_WIDTH], \
const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX)
typedef decl_generate_grain_y_fn(*generate_grain_y_fn);
#define decl_generate_grain_uv_fn(name) \
void (name)(entry buf[][GRAIN_WIDTH], \
const entry buf_y[][GRAIN_WIDTH], \
const Dav1dFilmGrainData *const data, const int uv HIGHBD_DECL_SUFFIX)
typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn);
#define decl_fgy_32x32xn_fn(name) \
void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
const Dav1dFilmGrainData *data, \
size_t pw, const uint8_t scaling[SCALING_SIZE], \
const entry grain_lut[][GRAIN_WIDTH], \
int bh, int row_num HIGHBD_DECL_SUFFIX)
typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn);
#define decl_fguv_32x32xn_fn(name) \
void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
const Dav1dFilmGrainData *data, int pw, \
const uint8_t scaling[SCALING_SIZE], \
const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
const pixel *luma_row, ptrdiff_t luma_stride, \
int uv_pl, int is_id HIGHBD_DECL_SUFFIX)
typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn);
typedef struct Dav1dFilmGrainDSPContext {
generate_grain_y_fn generate_grain_y;
generate_grain_uv_fn generate_grain_uv[3];
fgy_32x32xn_fn fgy_32x32xn;
fguv_32x32xn_fn fguv_32x32xn[3];
} Dav1dFilmGrainDSPContext;
bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c);
bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c);
#endif /* DAV1D_SRC_FILM_GRAIN_H */

View File

@ -26,39 +26,16 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <assert.h>
#include <stdint.h>
#include "common.h"
#include "common/attributes.h"
#include "common/intops.h"
#include "common/bitdepth.h"
#include "tables.h"
#include "film_grain.h"
#include "tables.h"
#if BITDEPTH == 8
typedef int8_t entry;
#else
typedef int16_t entry;
#endif
#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38
enum {
GRAIN_WIDTH = 82,
GRAIN_HEIGHT = 73,
SUB_GRAIN_WIDTH = 44,
SUB_GRAIN_HEIGHT = 38,
SUB_GRAIN_OFFSET = 6,
BLOCK_SIZE = 32,
#if BITDEPTH == 8
SCALING_SIZE = 256
#else
SCALING_SIZE = 4096
#endif
};
static inline int get_random_number(const int bits, unsigned *state) {
static inline int get_random_number(const int bits, unsigned *const state) {
const int r = *state;
unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
*state = (r >> 1) | (bit << 15);
@ -70,13 +47,14 @@ static inline int round2(const int x, const int shift) {
return (x + ((1 << shift) >> 1)) >> shift;
}
static void generate_grain_y(const Dav1dPicture *const in,
entry buf[GRAIN_HEIGHT][GRAIN_WIDTH])
static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
const Dav1dFilmGrainData *const data
HIGHBD_DECL_SUFFIX)
{
const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
unsigned seed = data->seed;
const int shift = 12 - in->p.bpc + data->grain_scale_shift;
const int grain_ctr = 128 << (in->p.bpc - 8);
const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
const int grain_ctr = 128 << bitdepth_min_8;
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
for (int y = 0; y < GRAIN_HEIGHT; y++) {
@ -101,25 +79,24 @@ static void generate_grain_y(const Dav1dPicture *const in,
}
}
int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
buf[y][x] = iclip(grain, grain_min, grain_max);
}
}
}
static void generate_grain_uv(const Dav1dPicture *const in, int uv,
entry buf[GRAIN_HEIGHT][GRAIN_WIDTH],
entry buf_y[GRAIN_HEIGHT][GRAIN_WIDTH])
static NOINLINE void
generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
const entry buf_y[][GRAIN_WIDTH],
const Dav1dFilmGrainData *const data, const int uv,
const int subx, const int suby HIGHBD_DECL_SUFFIX)
{
const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);
const int shift = 12 - in->p.bpc + data->grain_scale_shift;
const int grain_ctr = 128 << (in->p.bpc - 8);
const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
const int grain_ctr = 128 << bitdepth_min_8;
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
const int subx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
@ -167,40 +144,18 @@ static void generate_grain_uv(const Dav1dPicture *const in, int uv,
}
}
static void generate_scaling(const int bitdepth,
const uint8_t points[][2], int num,
uint8_t scaling[SCALING_SIZE])
{
const int shift_x = bitdepth - 8;
const int scaling_size = 1 << bitdepth;
// Fill up the preceding entries with the initial value
for (int i = 0; i < points[0][0] << shift_x; i++)
scaling[i] = points[0][1];
// Linearly interpolate the values in the middle
for (int i = 0; i < num - 1; i++) {
const int bx = points[i][0] << shift_x;
const int by = points[i][1];
const int ex = points[i+1][0] << shift_x;
const int ey = points[i+1][1];
const int dx = ex - bx;
const int dy = ey - by;
const int delta = dy * ((0xFFFF + (dx >> 1))) / dx;
for (int x = 0; x < dx; x++) {
const int v = by + ((x * delta + 0x8000) >> 16);
scaling[bx + x] = v;
}
}
// Fill up the remaining entries with the final value
for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
scaling[i] = points[num - 1][1];
#define gnuv_ss_fn(nm, ss_x, ss_y) \
static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \
generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \
}
gnuv_ss_fn(420, 1, 1);
gnuv_ss_fn(422, 1, 0);
gnuv_ss_fn(444, 0, 0);
// samples from the correct block of a grain LUT, while taking into account the
// offsets provided by the offsets cache
static inline entry sample_lut(entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
int offsets[2][2], int subx, int suby,
int bx, int by, int x, int y)
{
@ -211,13 +166,15 @@ static inline entry sample_lut(entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
[offx + x + (BLOCK_SIZE >> subx) * bx];
}
static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in,
entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
uint8_t scaling[SCALING_SIZE], int row_num)
static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
const ptrdiff_t stride,
const Dav1dFilmGrainData *const data, const size_t pw,
const uint8_t scaling[SCALING_SIZE],
const entry grain_lut[][GRAIN_WIDTH],
const int bh, const int row_num HIGHBD_DECL_SUFFIX)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
const int rows = 1 + (data->overlap_flag && row_num > 0);
const int bitdepth_min_8 = in->p.bpc - 8;
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
const int grain_ctr = 128 << bitdepth_min_8;
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
@ -227,7 +184,11 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
max_value = 235 << bitdepth_min_8;
} else {
min_value = 0;
max_value = (1U << in->p.bpc) - 1;
#if BITDEPTH == 8
max_value = 0xff;
#else
max_value = bitdepth_max;
#endif
}
// seed[0] contains the current row, seed[1] contains the previous
@ -238,18 +199,13 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
}
const ptrdiff_t stride = out->stride[0];
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
assert(stride == in->stride[0]);
pixel *const src_row = (pixel *) in->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;
pixel *const dst_row = (pixel *) out->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks
const int bh = imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE);
for (int bx = 0; bx < out->p.w; bx += BLOCK_SIZE) {
const int bw = imin(BLOCK_SIZE, out->p.w - bx);
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
const int bw = imin(BLOCK_SIZE, (int) pw - bx);
if (data->overlap_flag && bx) {
// shift previous offsets left
@ -268,9 +224,9 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
#define add_noise_y(x, y, grain) \
pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \
pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \
const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
*dst = iclip(*src + noise, min_value, max_value);
for (int y = ystart; y < bh; y++) {
@ -323,33 +279,33 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
}
}
static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const in,
entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
uint8_t scaling[SCALING_SIZE], int uv, int row_num)
static NOINLINE void
fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
const int pw, const uint8_t scaling[SCALING_SIZE],
const entry grain_lut[][GRAIN_WIDTH], const int bh,
const int row_num, const pixel *const luma_row,
const ptrdiff_t luma_stride, const int uv, const int is_id,
const int sx, const int sy HIGHBD_DECL_SUFFIX)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
const int rows = 1 + (data->overlap_flag && row_num > 0);
const int bitdepth_max = (1 << in->p.bpc) - 1;
const int bitdepth_min_8 = in->p.bpc - 8;
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
const int grain_ctr = 128 << bitdepth_min_8;
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
int min_value, max_value;
if (data->clip_to_restricted_range) {
min_value = 16 << bitdepth_min_8;
if (out->seq_hdr->mtrx == DAV1D_MC_IDENTITY) {
max_value = 235 << bitdepth_min_8;
} else {
max_value = 240 << bitdepth_min_8;
}
max_value = (is_id ? 235 : 240) << bitdepth_min_8;
} else {
min_value = 0;
#if BITDEPTH == 8
max_value = 0xff;
#else
max_value = bitdepth_max;
#endif
}
const int sx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int sy = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
// seed[0] contains the current row, seed[1] contains the previous
unsigned seed[2];
for (int i = 0; i < rows; i++) {
@ -358,21 +314,13 @@ static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const i
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
}
const ptrdiff_t stride = out->stride[1];
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
assert(stride == in->stride[1]);
const int by = row_num * (BLOCK_SIZE >> sy);
pixel *const dst_row = (pixel *) out->data[1 + uv] + PXSTRIDE(stride) * by;
pixel *const src_row = (pixel *) in->data[1 + uv] + PXSTRIDE(stride) * by;
pixel *const luma_row = (pixel *) out->data[0] + PXSTRIDE(out->stride[0]) * row_num * BLOCK_SIZE;
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks (subsampled)
const int bh = (imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE) + sy) >> sy;
for (int bx = 0; bx < (out->p.w + sx) >> sx; bx += BLOCK_SIZE >> sx) {
const int bw = (imin(BLOCK_SIZE, out->p.w - (bx << sx)) + sx) >> sx;
for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
const int bw = imin(BLOCK_SIZE >> sx, pw - bx);
if (data->overlap_flag && bx) {
// shift previous offsets left
for (int i = 0; i < rows; i++)
@ -395,22 +343,20 @@ static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const i
#define add_noise_uv(x, y, grain) \
const int lx = (bx + x) << sx; \
const int ly = y << sy; \
pixel *luma = luma_row + ly * PXSTRIDE(out->stride[0]) + lx; \
const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \
pixel avg = luma[0]; \
if (sx && lx + 1 < out->p.w) \
if (sx) \
avg = (avg + luma[1] + 1) >> 1; \
\
pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
int val = avg; \
if (!data->chroma_scaling_from_luma) { \
int combined = avg * data->uv_luma_mult[uv] + \
const int combined = avg * data->uv_luma_mult[uv] + \
*src * data->uv_mult[uv]; \
val = iclip_pixel( (combined >> 6) + \
(data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \
} \
\
int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
*dst = iclip(*src + noise, min_value, max_value);
for (int y = ystart; y < bh; y++) {
@ -463,61 +409,29 @@ static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const i
}
}
void bitfn(dav1d_apply_grain)(Dav1dPicture *const out,
const Dav1dPicture *const in)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
entry grain_lut[3][GRAIN_HEIGHT][GRAIN_WIDTH];
uint8_t scaling[3][SCALING_SIZE];
// Generate grain LUTs as needed
generate_grain_y(out, grain_lut[0]); // always needed
if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
generate_grain_uv(out, 0, grain_lut[1], grain_lut[0]);
if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
generate_grain_uv(out, 1, grain_lut[2], grain_lut[0]);
// Generate scaling LUTs as needed
if (data->num_y_points)
generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
if (data->num_uv_points[0])
generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
if (data->num_uv_points[1])
generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);
// Copy over the non-modified planes
// TODO: eliminate in favor of per-plane refs
if (!data->num_y_points) {
assert(out->stride[0] == in->stride[0]);
memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
}
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
for (int i = 0; i < 2; i++) {
if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
assert(out->stride[1] == in->stride[1]);
memcpy(out->data[1+i], in->data[1+i],
(out->p.h >> suby) * out->stride[1]);
}
}
}
// Synthesize grain for the affected planes
int rows = (out->p.h + 31) >> 5;
for (int row = 0; row < rows; row++) {
if (data->num_y_points)
apply_to_row_y(out, in, grain_lut[0], scaling[0], row);
if (data->chroma_scaling_from_luma) {
apply_to_row_uv(out, in, grain_lut[1], scaling[0], 0, row);
apply_to_row_uv(out, in, grain_lut[2], scaling[0], 1, row);
} else {
if (data->num_uv_points[0])
apply_to_row_uv(out, in, grain_lut[1], scaling[1], 0, row);
if (data->num_uv_points[1])
apply_to_row_uv(out, in, grain_lut[2], scaling[2], 1, row);
}
}
/* Stamps out one chroma film-grain application wrapper per layout around
 * fguv_32x32xn_c, passing the subsampling factors (ss_x, ss_y) as
 * compile-time constants. */
#define fguv_ss_fn(nm, ss_x, ss_y) \
static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \
row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
HIGHBD_TAIL_SUFFIX); \
}
// One instantiation per supported chroma layout: 4:2:0, 4:2:2 and 4:4:4.
fguv_ss_fn(420, 1, 1);
fguv_ss_fn(422, 1, 0);
fguv_ss_fn(444, 0, 0);
/* Populates the film-grain DSP function table for the current bitdepth
 * (selected via the bitfn() name-mangling macro) with the C reference
 * implementations, then invokes the x86-specific init — when built with
 * assembly — which may replace entries with optimized versions. */
COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
c->generate_grain_y = generate_grain_y_c;
// Chroma tables are indexed by Dav1dPixelLayout minus 1 (I400 excluded).
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
c->fgy_32x32xn = fgy_32x32xn_c;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;
#if HAVE_ASM && ARCH_X86
bitfn(dav1d_film_grain_dsp_init_x86)(c);
#endif
}

View File

@ -27,8 +27,6 @@
#include "config.h"
#include <assert.h>
#include "common/intops.h"
#include "src/getbits.h"

View File

@ -42,6 +42,7 @@ typedef struct Dav1dTileContext Dav1dTileContext;
#include "src/cdf.h"
#include "src/data.h"
#include "src/env.h"
#include "src/film_grain.h"
#include "src/intra_edge.h"
#include "src/ipred.h"
#include "src/itx.h"
@ -57,6 +58,7 @@ typedef struct Dav1dTileContext Dav1dTileContext;
#include "src/thread.h"
typedef struct Dav1dDSPContext {
Dav1dFilmGrainDSPContext fg;
Dav1dIntraPredDSPContext ipred;
Dav1dMCDSPContext mc;
Dav1dInvTxfmDSPContext itx;
@ -89,6 +91,8 @@ struct Dav1dContext {
Dav1dContentLightLevel *content_light;
Dav1dRef *mastering_display_ref;
Dav1dMasteringDisplay *mastering_display;
Dav1dRef *itut_t35_ref;
Dav1dITUTT35 *itut_t35;
// decoded output picture queue
Dav1dData in;
@ -213,7 +217,7 @@ struct Dav1dFrameContext {
Av1Restoration *lr_mask;
int top_pre_cdef_toggle;
int mask_sz /* w*h */, lr_mask_sz, line_sz /* w */, lr_line_sz, re_sz /* h */;
Av1FilterLUT lim_lut;
ALIGN(Av1FilterLUT lim_lut, 16);
int last_sharpness;
uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
uint8_t *tx_lpf_right_edge[2];
@ -233,20 +237,21 @@ struct Dav1dFrameContext {
pthread_cond_t cond, icond;
int tasks_left, num_tasks;
int (*task_idx_to_sby_and_tile_idx)[2];
int titsati_sz, titsati_init[3];
int titsati_sz, titsati_init[2];
uint16_t titsati_index_rows[1 + DAV1D_MAX_TILE_ROWS];
int inited;
} tile_thread;
};
struct Dav1dTileState {
CdfContext cdf;
MsacContext msac;
struct {
int col_start, col_end, row_start, row_end; // in 4px units
int col, row; // in tile units
} tiling;
CdfContext cdf;
MsacContext msac;
atomic_int progress; // in sby units, TILE_ERROR after a decoding error
struct {
pthread_mutex_t lock;
@ -298,6 +303,18 @@ struct Dav1dTileContext {
uint16_t emu_edge_16bpc[320 * (256 + 7)];
};
};
struct {
union {
uint8_t levels[32 * 34];
struct {
uint8_t pal_order[64][8];
uint8_t pal_ctx[64];
};
};
int16_t ac[32 * 32];
uint8_t pal_idx[2 * 64 * 64];
uint16_t pal[3 /* plane */][8 /* palette_idx */];
ALIGN(union, 32) {
struct {
uint8_t interintra_8bpc[64 * 64];
uint8_t edge_8bpc[257];
@ -306,18 +323,8 @@ struct Dav1dTileContext {
uint16_t interintra_16bpc[64 * 64];
uint16_t edge_16bpc[257];
};
struct {
uint8_t pal_idx[2 * 64 * 64];
union {
struct {
uint8_t pal_order[64][8];
uint8_t pal_ctx[64];
};
uint8_t levels[36 * 36];
};
uint16_t pal[3 /* plane */][8 /* palette_idx */];
};
int16_t ac[32 * 32];
} scratch;
Dav1dWarpedMotionParams warpmv;

View File

@ -27,9 +27,10 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include "common/attributes.h"
#include "src/intra_edge.h"
#include "src/levels.h"

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stdint.h>
#include <string.h>

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

View File

@ -109,17 +109,6 @@ enum TxfmType {
N_TX_TYPES_PLUS_LL,
};
enum TxfmTypeSet {
TXTP_SET_DCT,
TXTP_SET_DCT_ID,
TXTP_SET_DT4_ID,
TXTP_SET_DT4_ID_1D,
TXTP_SET_DT9_ID_1D,
TXTP_SET_ALL,
TXTP_SET_LOSSLESS,
N_TXTP_SETS
};
enum TxClass {
TX_CLASS_2D,
TX_CLASS_H,

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <string.h>
#include "common/intops.h"

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <string.h>
#include "common/intops.h"

View File

@ -37,6 +37,7 @@
#include "common/mem.h"
#include "common/validate.h"
#include "src/fg_apply.h"
#include "src/internal.h"
#include "src/log.h"
#include "src/obu.h"
@ -44,12 +45,12 @@
#include "src/ref.h"
#include "src/thread_task.h"
#include "src/wedge.h"
#include "src/film_grain.h"
static COLD void init_internal(void) {
dav1d_init_wedge_masks();
dav1d_init_interintra_masks();
dav1d_init_qm_tables();
dav1d_init_thread();
}
COLD const char *dav1d_version(void) {
@ -289,13 +290,13 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
switch (out->p.bpc) {
#if CONFIG_8BPC
case 8:
dav1d_apply_grain_8bpc(out, in);
dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
break;
#endif
#if CONFIG_16BPC
case 10:
case 12:
dav1d_apply_grain_16bpc(out, in);
dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
break;
#endif
default:
@ -409,8 +410,10 @@ void dav1d_flush(Dav1dContext *const c) {
c->mastering_display = NULL;
c->content_light = NULL;
c->itut_t35 = NULL;
dav1d_ref_dec(&c->mastering_display_ref);
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);
if (c->n_fc == 1) return;
@ -499,7 +502,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
pthread_cond_destroy(&ts->tile_thread.cond);
pthread_mutex_destroy(&ts->tile_thread.lock);
}
free(f->ts);
dav1d_free_aligned(f->ts);
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);
free(f->a);
@ -535,6 +538,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
dav1d_ref_dec(&c->mastering_display_ref);
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);
dav1d_freep_aligned(c_out);
}

View File

@ -172,8 +172,8 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
const int round_bits_v = 11 - (bitdepth == 12) * 2;
const int rounding_off_v = 1 << (round_bits_v - 1);
const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
for (int i = 0; i < w; i++) {
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset;
for (int k = 0; k < 7; k++) {

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
@ -906,6 +905,7 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
src_x += mx >> 14;
mx &= 0x3fff;
}
if (dst_w & 1) dst[dst_w] = dst[dst_w - 1];
dst += PXSTRIDE(dst_stride);
src += PXSTRIDE(src_stride);

View File

@ -55,6 +55,7 @@ libdav1d_sources = files(
libdav1d_tmpl_sources = files(
'cdef_apply_tmpl.c',
'cdef_tmpl.c',
'fg_apply_tmpl.c',
'film_grain_tmpl.c',
'ipred_prepare_tmpl.c',
'ipred_tmpl.c',
@ -67,6 +68,10 @@ libdav1d_tmpl_sources = files(
'recon_tmpl.c',
)
libdav1d_arch_tmpl_sources = []
libdav1d_bitdepth_objs = []
# libdav1d entrypoint source files
# These source files contain library entry points and are
# built with the stack-realign flag set, where necessary.
@ -77,6 +82,8 @@ libdav1d_entrypoints_sources = files(
# ASM specific sources
libdav1d_nasm_objs = []
# Arch-specific flags
arch_flags = []
if is_asm_enabled
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm'))
@ -114,6 +121,7 @@ if is_asm_enabled
libdav1d_tmpl_sources += files(
'x86/cdef_init_tmpl.c',
'x86/film_grain_init_tmpl.c',
'x86/ipred_init_tmpl.c',
'x86/itx_init_tmpl.c',
'x86/loopfilter_init_tmpl.c',
@ -130,6 +138,7 @@ if is_asm_enabled
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'x86/cdef.asm',
'x86/film_grain.asm',
'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
@ -138,6 +147,7 @@ if is_asm_enabled
'x86/cdef_sse.asm',
'x86/ipred_ssse3.asm',
'x86/itx_ssse3.asm',
'x86/loopfilter_ssse3.asm',
'x86/looprestoration_ssse3.asm',
'x86/mc_ssse3.asm',
)
@ -151,9 +161,13 @@ if is_asm_enabled
# Compile the ASM sources with NASM
libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
elif host_machine.cpu() == 'ppc64le'
arch_flags = ['-maltivec', '-mvsx']
libdav1d_sources += files(
'ppc/cpu.c',
)
libdav1d_arch_tmpl_sources += files(
'ppc/cdef_init_tmpl.c',
)
endif
endif
@ -223,6 +237,19 @@ foreach bitdepth : dav1d_bitdepths
).extract_all_objects()
endforeach
# Helper library for each bitdepth and architecture-specific flags
foreach bitdepth : dav1d_bitdepths
libdav1d_bitdepth_objs += static_library(
'dav1d_arch_bitdepth_@0@'.format(bitdepth),
libdav1d_arch_tmpl_sources, config_h_target,
include_directories: dav1d_inc_dirs,
dependencies : [stdatomic_dependency],
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags,
install : false,
build_by_default : false,
).extract_all_objects()
endforeach
# The final dav1d library
if host_machine.system() == 'windows'
dav1d_soversion = ''

View File

@ -116,42 +116,39 @@ int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
/* Decodes a symbol given an inverse cumulative distribution function (CDF)
* table in Q15. */
static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
uint16_t *const cdf,
const size_t n_symbols)
{
const unsigned c = s->dif >> (EC_WIN_SIZE - 16);
unsigned u, v = s->rng, r = s->rng >> 8, ret = 0;
const unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8;
unsigned u, v = s->rng, val = -1;
assert(!cdf[n_symbols - 1]);
assert(n_symbols <= 15);
assert(cdf[n_symbols] <= 32);
do {
val++;
u = v;
v = r * (cdf[ret++] >> EC_PROB_SHIFT);
v = r * (cdf[val] >> EC_PROB_SHIFT);
v >>= 7 - EC_PROB_SHIFT;
v += EC_MIN_PROB * (int) (n_symbols - ret);
v += EC_MIN_PROB * ((unsigned)n_symbols - val);
} while (c < v);
assert(u <= s->rng);
ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
return ret - 1;
}
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
uint16_t *const cdf,
const size_t n_symbols)
{
const unsigned val = decode_symbol(s, cdf, n_symbols);
if (s->allow_update_cdf) {
const unsigned count = cdf[n_symbols];
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
const unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
unsigned i;
for (i = 0; i < val; i++)
cdf[i] += (32768 - cdf[i]) >> rate;
for (; i < n_symbols - 1; i++)
for (; i < n_symbols; i++)
cdf[i] -= cdf[i] >> rate;
cdf[n_symbols] = count + (count < 32);
}
return val;
}
@ -163,7 +160,7 @@ unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s,
if (s->allow_update_cdf) {
// update_cdf() specialized for boolean CDFs
const unsigned count = cdf[1];
const int rate = (count >> 4) | 4;
const int rate = 4 + (count >> 4);
if (bit)
cdf[0] += (32768 - cdf[0]) >> rate;
else
@ -174,6 +171,22 @@ unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s,
return bit;
}
/* Decodes the "high token" (coefficient magnitude continuation) as up to
 * four reads from the same adapted 4-symbol CDF. Each read yields a value
 * in [0, 3]; a 3 means "continue" and triggers another read, capped at
 * four reads total, so the returned token lies in the range [3, 15]. */
unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) {
    unsigned base = 3, br;
    for (;;) {
        br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
        // Stop on a non-continue symbol, or unconditionally after the
        // fourth read (base has reached its 12 cap by then).
        if (br < 3 || base == 12)
            break;
        base += 3;
    }
    return base + br;
}
void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
const size_t sz, const int disable_cdf_update_flag)
{

View File

@ -28,10 +28,11 @@
#ifndef DAV1D_SRC_MSAC_H
#define DAV1D_SRC_MSAC_H
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include "common/attributes.h"
typedef size_t ec_win;
typedef struct MsacContext {
@ -58,9 +59,10 @@ unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_c(MsacContext *s);
unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_c(MsacContext *s, uint16_t *cdf);
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
/* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */
#ifndef dav1d_msac_decode_symbol_adapt4
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
#endif
@ -79,6 +81,9 @@ int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
#ifndef dav1d_msac_decode_bool
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_c
#endif
#ifndef dav1d_msac_decode_hi_tok
#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_c
#endif
static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
unsigned v = 0;

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
@ -299,9 +298,10 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
Dav1dThreadPicture *const ref =
&c->refs[c->frame_hdr->refidx[i]].p;
if (!ref->p.data[0]) return -1;
// FIXME render_* may be wrong
hdr->render_width = hdr->width[1] = ref->p.p.w;
hdr->render_height = hdr->height = ref->p.p.h;
hdr->width[1] = ref->p.p.w;
hdr->height = ref->p.p.h;
hdr->render_width = ref->p.frame_hdr->render_width;
hdr->render_height = ref->p.frame_hdr->render_height;
hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1);
if (hdr->super_res.enabled) {
const int d = hdr->super_res.width_scale_denominator =
@ -1275,8 +1275,10 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
c->frame_hdr_ref = dav1d_ref_create(sizeof(Dav1dFrameHeader));
if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
}
#ifndef NDEBUG
// ensure that the reference is writable
assert(dav1d_ref_is_writable(c->frame_hdr_ref));
#endif
c->frame_hdr = c->frame_hdr_ref->data;
memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
c->frame_hdr->temporal_id = temporal_id;
@ -1364,10 +1366,12 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
case OBU_METADATA: {
// obu metadta type field
const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
if (gb.error) goto error;
Dav1dRef *ref;
Dav1dContentLightLevel *content_light;
Dav1dMasteringDisplay *mastering_display;
Dav1dITUTT35 *itut_t35_metadata;
switch (meta_type) {
case OBU_META_HDR_CLL:
@ -1420,7 +1424,47 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
c->mastering_display_ref = ref;
break;
}
case OBU_META_ITUT_T35:
case OBU_META_ITUT_T35: {
int payload_size = len;
// Don't take into account all the trailing bits for payload_size
while (payload_size > 0 && !in->data[init_byte_pos + payload_size - 1])
payload_size--; // trailing_zero_bit x 8
payload_size--; // trailing_one_bit + trailing_zero_bit x 7
// Don't take into account meta_type bytes
payload_size -= meta_type_len;
int country_code_extension_byte = 0;
int country_code = dav1d_get_bits(&gb, 8);
payload_size--;
if (country_code == 0xFF) {
country_code_extension_byte = dav1d_get_bits(&gb, 8);
payload_size--;
}
if (payload_size <= 0) {
dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
goto error;
}
ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
if (!ref) return DAV1D_ERR(ENOMEM);
itut_t35_metadata = ref->data;
// We need our public headers to be C++ compatible, so payload can't be
// a flexible array member
itut_t35_metadata->payload = (uint8_t *) &itut_t35_metadata[1];
itut_t35_metadata->country_code = country_code;
itut_t35_metadata->country_code_extension_byte = country_code_extension_byte;
for (int i = 0; i < payload_size; i++)
itut_t35_metadata->payload[i] = dav1d_get_bits(&gb, 8);
itut_t35_metadata->payload_size = payload_size;
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = itut_t35_metadata;
c->itut_t35_ref = ref;
break;
}
case OBU_META_SCALABILITY:
case OBU_META_TIMECODE:
// ignore metadata OBUs we don't care about

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
@ -104,6 +103,7 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref,
Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref,
Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref,
Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref,
const int bpc, const Dav1dDataProps *props,
Dav1dPicAllocator *const p_allocator,
const size_t extra, void **const extra_ptr)
@ -125,6 +125,7 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
p->frame_hdr = frame_hdr;
p->content_light = content_light;
p->mastering_display = mastering_display;
p->itut_t35 = itut_t35;
p->p.layout = seq_hdr->layout;
p->p.bpc = bpc;
dav1d_data_props_set_defaults(&p->m);
@ -161,6 +162,9 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
p->mastering_display_ref = mastering_display_ref;
if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref);
p->itut_t35_ref = itut_t35_ref;
if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref);
return 0;
}
@ -176,11 +180,16 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
f->frame_hdr, f->frame_hdr_ref,
c->content_light, c->content_light_ref,
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref,
bpc, &f->tile[0].data.m, &c->allocator,
p->t != NULL ? sizeof(atomic_int) * 2 : 0,
(void **) &p->progress);
if (res) return res;
// Must be removed from the context after being attached to the frame
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
p->visible = f->frame_hdr->show_frame;
if (p->t) {
atomic_init(&p->progress[0], 0);
@ -198,6 +207,7 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con
src->frame_hdr, src->frame_hdr_ref,
src->content_light, src->content_light_ref,
src->mastering_display, src->mastering_display_ref,
src->itut_t35, src->itut_t35_ref,
src->p.bpc, &src->m, &pic_ctx->allocator,
0, NULL);
return res;
@ -216,6 +226,7 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref);
if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref);
if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref);
}
*dst = *src;
}
@ -252,6 +263,7 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) {
dav1d_ref_dec(&p->m.user_data.ref);
dav1d_ref_dec(&p->content_light_ref);
dav1d_ref_dec(&p->mastering_display_ref);
dav1d_ref_dec(&p->itut_t35_ref);
}
memset(p, 0, sizeof(*p));
}

View File

@ -0,0 +1,488 @@
/*
* Copyright © 2019, Luca Barbato
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdlib.h>
#include "common/bitdepth.h"
#include "common/intops.h"
#include "src/cdef.h"
#include "src/cpu.h"
#include "src/ppc/types.h"
#if BITDEPTH == 8
/* Vector (VSX) version of the CDEF constrain() function. For each 16-bit
 * lane it computes sign(diff) * min(|diff|, max(0, threshold - (|diff| >>
 * shift))) with shift = max(0, damping - ulog2(threshold)). A threshold of
 * 0 disables the tap entirely (all-zero result). */
static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
const int damping)
{
const i16x8 zero = vec_splat_s16(0);
if (!threshold) return zero;
const uint16_t shift = imax(0, damping - ulog2(threshold));
const i16x8 abs_diff = vec_abs(diff);
// Lanes where diff is negative; used to restore the sign at the end.
const b16x8 mask = vec_cmplt(diff, zero);
const i16x8 thr = vec_splats(threshold);
// threshold - (|diff| >> shift), clamped below at 0 ...
const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift)));
const i16x8 max = vec_max(zero, sub);
// ... then capped at |diff| itself.
const i16x8 min = vec_min(abs_diff, max);
// Negated magnitude, selected on the originally-negative lanes.
const i16x8 neg = vec_sub(zero, min);
return vec_sel(min, neg, mask);
}
/* Gathers a 4-pixel-wide, h-tall block of 8-bit source pixels — plus a
 * 2-pixel border on every side — into the 16-bit tmp buffer, widening each
 * byte and padding with the INT16_MAX sentinel wherever a neighbouring
 * edge is unavailable (per `edges`). Rows in tmp are 8 uint16_t apart and
 * tmp points at the first centre row, so the two rows of top padding live
 * at negative offsets. NOTE(review): `w` and `tmp_stride` are unused here;
 * the width (4) and row stride (8) appear to be hard-coded — confirm
 * against the 8-wide variant below. */
static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], uint8_t *const top[2],
const int w, const int h,
const enum CdefEdgeFlags edges)
{
const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
u16x8 l0;
u16x8 l1;
// Row span that still needs left/right sentinel padding below.
int y_start = -2, y_end = h + 2;
// Copy top and bottom first
if (!(edges & CDEF_HAVE_TOP)) {
l0 = fill;
l1 = fill;
y_start = 0;
} else {
// top[0]/top[1] are the two rows above the block; -2 includes the
// left border columns.
l0 = u8h_to_u16(vec_vsx_ld(0, top[0] - 2));
l1 = u8h_to_u16(vec_vsx_ld(0, top[1] - 2));
}
vec_st(l0, 0, tmp - 2 * 8);
vec_st(l1, 0, tmp - 1 * 8);
if (!(edges & CDEF_HAVE_BOTTOM)) {
l0 = fill;
l1 = fill;
y_end -= 2;
} else {
l0 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 0) * src_stride));
l1 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 1) * src_stride));
}
vec_st(l0, 0, tmp + (h + 0) * 8);
vec_st(l1, 0, tmp + (h + 1) * 8);
// Widen and store the h centre rows (src - 2 includes the left border).
for (int y = 0; y < h; y++) {
u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride));
vec_st(l, 0, tmp + y * 8);
}
// Left border (columns 0 and 1): real pixels from left[][] when
// available, otherwise the sentinel over the full padded row span.
if (!(edges & CDEF_HAVE_LEFT)) {
for (int y = y_start; y < y_end; y++) {
tmp[y * 8] = INT16_MAX;
tmp[1 + y * 8] = INT16_MAX;
}
} else {
for (int y = 0; y < h; y++) {
tmp[y * 8] = left[y][0];
tmp[1 + y * 8] = left[y][1];
}
}
// Right border (columns 6 and 7 of each row) when unavailable; the
// vector stores above already wrote real pixels there otherwise.
if (!(edges & CDEF_HAVE_RIGHT)) {
for (int y = y_start; y < y_end; y++) {
tmp[- 2 + (y + 1) * 8] = INT16_MAX;
tmp[- 1 + (y + 1) * 8] = INT16_MAX;
}
}
}
/* 8-pixel-wide counterpart of copy4xN: gathers an 8-wide, h-tall block of
 * 8-bit source pixels plus a 2-pixel border on every side into the 16-bit
 * tmp buffer, widening bytes via two half-vector stores per row and
 * padding unavailable edges (per `edges`) with the INT16_MAX sentinel.
 * Rows in tmp are 16 uint16_t apart and tmp points at the first centre
 * row, so the top padding lives at negative offsets. NOTE(review): `w`
 * and `tmp_stride` are unused; the width (8) and row stride (16) appear
 * to be hard-coded — confirm. */
static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], uint8_t *const top[2],
const int w, const int h,
const enum CdefEdgeFlags edges)
{
const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
u16x8 l0h, l0l;
u16x8 l1h, l1l;
// Row span that still needs left/right sentinel padding below.
int y_start = -2, y_end = h + 2;
// Copy top and bottom first
if (!(edges & CDEF_HAVE_TOP)) {
l0h = fill;
l0l = fill;
l1h = fill;
l1l = fill;
y_start = 0;
} else {
// top[0]/top[1] are the two rows above the block; -2 includes the
// left border; each 16-byte load is split into high/low u16 halves.
u8x16 l0 = vec_vsx_ld(0, top[0] - 2);
u8x16 l1 = vec_vsx_ld(0, top[1] - 2);
l0h = u8h_to_u16(l0);
l0l = u8l_to_u16(l0);
l1h = u8h_to_u16(l1);
l1l = u8l_to_u16(l1);
}
vec_st(l0h, 0, tmp - 4 * 8);
vec_st(l0l, 0, tmp - 3 * 8);
vec_st(l1h, 0, tmp - 2 * 8);
vec_st(l1l, 0, tmp - 1 * 8);
if (!(edges & CDEF_HAVE_BOTTOM)) {
l0h = fill;
l0l = fill;
l1h = fill;
l1l = fill;
y_end -= 2;
} else {
u8x16 l0 = vec_vsx_ld(0, src - 2 + (h + 0) * src_stride);
u8x16 l1 = vec_vsx_ld(0, src - 2 + (h + 1) * src_stride);
l0h = u8h_to_u16(l0);
l0l = u8l_to_u16(l0);
l1h = u8h_to_u16(l1);
l1l = u8l_to_u16(l1);
}
vec_st(l0h, 0, tmp + (h + 0) * 16);
vec_st(l0l, 0, tmp + (h + 0) * 16 + 8);
vec_st(l1h, 0, tmp + (h + 1) * 16);
vec_st(l1l, 0, tmp + (h + 1) * 16 + 8);
// Widen and store the h centre rows (src - 2 includes the left border).
for (int y = 0; y < h; y++) {
u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride);
u16x8 lh = u8h_to_u16(l);
u16x8 ll = u8l_to_u16(l);
vec_st(lh, 0, tmp + y * 16);
vec_st(ll, 0, tmp + 8 + y * 16);
}
// Left border (columns 0 and 1): real pixels from left[][] when
// available, otherwise the sentinel over the full padded row span.
if (!(edges & CDEF_HAVE_LEFT)) {
for (int y = y_start; y < y_end; y++) {
tmp[y * 16] = INT16_MAX;
tmp[1 + y * 16] = INT16_MAX;
}
} else {
for (int y = 0; y < h; y++) {
tmp[y * 16] = left[y][0];
tmp[1 + y * 16] = left[y][1];
}
}
// Right border (columns 10 and 11 of each row) when unavailable; the
// vector stores above already wrote real pixels there otherwise.
if (!(edges & CDEF_HAVE_RIGHT)) {
for (int y = y_start; y < y_end; y++) {
tmp[- 6 + (y + 1) * 16] = INT16_MAX;
tmp[- 5 + (y + 1) * 16] = INT16_MAX;
}
}
}
/* Lane-wise maximum that ignores lanes of `a` holding the INT16_MAX
 * padding sentinel (written by copy4xN/copy8xN for missing edges):
 * such lanes are substituted with the corresponding lane of `b`
 * before taking the maximum. */
static inline i16x8 max_mask(i16x8 a, i16x8 b) {
    const i16x8 sentinel = vec_splats((int16_t)INT16_MAX);
    const b16x8 is_pad = vec_cmpeq(a, sentinel);
    return vec_max(vec_sel(a, b, is_pad), b);
}
// Load the 8 current pixels (one 8-wide row) and initialize the running
// clamp range (min/max) and the filter accumulator (sum).
#define LOAD_PIX(addr) \
    const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
    i16x8 max = px; \
    i16x8 min = px; \
    i16x8 sum = vec_splat_s16(0);

// Same for 4-wide blocks: two consecutive 4-pixel rows are combined into
// a single 8-lane vector with vec_xxpermdi.
#define LOAD_PIX4(addr) \
    const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
    const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \
    const i16x8 px = vec_xxpermdi(a, b, 0); \
    i16x8 max = px; \
    i16x8 min = px; \
    i16x8 sum = vec_splat_s16(0);

// Load the four tap samples for one filter direction: +/-o0 (first tap
// pair) and +/-o1 (second tap pair), offsets in temp-buffer units.
#define LOAD_DIR(p, addr, o0, o1) \
    const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \
    const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \
    const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \
    const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1);

// Two-row variant of LOAD_DIR for 4-wide blocks (packs row y and y+1).
#define LOAD_DIR4(p, addr, o0, o1) \
    LOAD_DIR(p ## a, addr, o0, o1) \
    LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \
    const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
    const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
    const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
    const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);

// Tap-center differences and their strength/damping-constrained values
// (via vconstrain) for all four taps of a direction.
#define CONSTRAIN(p, strength) \
    const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
    const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
    const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
    const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
\
    i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \
    i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \
    i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \
    i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping);

// Widen the output clamp range by the raw tap values; max_mask() skips
// lanes that hold the INT16_MAX border sentinel.
#define MIN_MAX(p) \
    max = max_mask(p ## 0, max); \
    min = vec_min(p ## 0, min); \
    max = max_mask(p ## 1, max); \
    min = vec_min(p ## 1, min); \
    max = max_mask(p ## 2, max); \
    min = vec_min(p ## 2, min); \
    max = max_mask(p ## 3, max); \
    min = vec_min(p ## 3, min);

// Primary filter weights for the first tap pair: c*2 + (c << tap_even),
// i.e. weight 4 when tap_even is 1, 3 when it is 0.
#define PRI_0(p) \
    p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
    p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));

// Primary filter weights for the second tap pair: c*4 - (c << tap_even),
// i.e. weight 2 when tap_even is 1, 3 when it is 0.
#define PRI_1(p) \
    p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
    p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));

// Secondary filter first pass: all four taps weighted by 2.
#define SEC_0(p) \
    p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
    p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
    p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
    p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));

// Accumulate the four weighted tap contributions into `sum`.
#define UPDATE_SUM(p) \
    const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
    const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \
    sum = vec_add(sum, p ## sum0); \
    sum = vec_add(sum, p ## sum1);
/* Apply the CDEF filter to a 4-pixel-wide block.  Operates on the padded
 * 16-bit temp buffer produced by copy4xN(); two rows are processed per
 * loop iteration by packing them into one 8-lane vector.
 * pri_strength/sec_strength select the primary/secondary filter
 * strengths, dir the primary direction (0-7), damping the constraint
 * shift.  NOTE(review): `w` is only forwarded to copy4xN(). */
static inline void
filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
           const pixel (*left)[2], /*const*/ pixel *const top[2],
           const int w, const int h, const int pri_strength,
           const int sec_strength, const int dir,
           const int damping, const enum CdefEdgeFlags edges,
           const ptrdiff_t tmp_stride, uint16_t *tmp)
{
    // Per-direction tap offsets in temp-buffer units; column [0] is used
    // for the first tap pair, column [1] for the second.
    const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 },
        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 },
        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 },
        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 },
        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 },
        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 },
        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 }
    };
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    // Selects the primary tap weights: (4,2) when the (bitdepth-scaled)
    // strength is even, (3,3) when odd — see PRI_0/PRI_1.
    const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
    const int off1 = cdef_directions[dir][0];
    const int off1_1 = cdef_directions[dir][1];
    // Secondary taps use the primary direction rotated by +/-2 steps.
    const int off2 = cdef_directions[(dir + 2) & 7][0];
    const int off3 = cdef_directions[(dir + 6) & 7][0];
    const int off2_1 = cdef_directions[(dir + 2) & 7][1];
    const int off3_1 = cdef_directions[(dir + 6) & 7][1];

    // Widen/pad the source block into tmp (tmp - 2 points at the border).
    copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges);

    for (int y = 0; y < h / 2; y++) {
        LOAD_PIX4(tmp)

        // Primary pass
        LOAD_DIR4(p, tmp, off1, off1_1)

        CONSTRAIN(p, pri_strength)

        MIN_MAX(p)

        PRI_0(p)
        PRI_1(p)

        UPDATE_SUM(p)

        // Secondary pass 1
        LOAD_DIR4(s, tmp, off2, off3)

        CONSTRAIN(s, sec_strength)

        MIN_MAX(s)

        SEC_0(s)

        UPDATE_SUM(s)

        // Secondary pass 2
        LOAD_DIR4(s2, tmp, off2_1, off3_1)

        CONSTRAIN(s2, sec_strength)

        MIN_MAX(s2)

        UPDATE_SUM(s2)

        // Store: round the accumulator as (sum + 8 - (sum < 0)) >> 4,
        // add it to the center pixel, then clamp to the [min, max] range
        // gathered from the taps.
        i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
        bias = vec_sub(vec_splat_s16(8), bias);
        i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
        i16x8 vdst = vec_max(vec_min(unclamped, max), min);

        // Unpack the two rows held in the vector back to the destination.
        dst[0] = vdst[0];
        dst[1] = vdst[1];
        dst[2] = vdst[2];
        dst[3] = vdst[3];

        tmp += tmp_stride;
        dst += PXSTRIDE(dst_stride);

        dst[0] = vdst[4];
        dst[1] = vdst[5];
        dst[2] = vdst[6];
        dst[3] = vdst[7];

        tmp += tmp_stride;
        dst += PXSTRIDE(dst_stride);
    }
}
/* Apply the CDEF filter to an 8-pixel-wide block.  Same structure as
 * filter_4xN(), but each 8-lane vector covers exactly one row of the
 * padded temp buffer produced by copy8xN().
 * NOTE(review): `w` is only forwarded to copy8xN(). */
static inline void
filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
           const pixel (*left)[2], /*const*/ pixel *const top[2],
           const int w, const int h, const int pri_strength,
           const int sec_strength, const int dir,
           const int damping, const enum CdefEdgeFlags edges,
           const ptrdiff_t tmp_stride, uint16_t *tmp)
{
    // Per-direction tap offsets in temp-buffer units; column [0] is used
    // for the first tap pair, column [1] for the second.
    const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 },
        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 },
        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 },
        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 },
        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 },
        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 },
        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 }
    };

    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    // Selects the primary tap weights: (4,2) when the (bitdepth-scaled)
    // strength is even, (3,3) when odd — see PRI_0/PRI_1.
    const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
    const int off1 = cdef_directions[dir][0];
    const int off1_1 = cdef_directions[dir][1];
    // Secondary taps use the primary direction rotated by +/-2 steps.
    const int off2 = cdef_directions[(dir + 2) & 7][0];
    const int off3 = cdef_directions[(dir + 6) & 7][0];
    const int off2_1 = cdef_directions[(dir + 2) & 7][1];
    const int off3_1 = cdef_directions[(dir + 6) & 7][1];

    // Widen/pad the source block into tmp (tmp - 2 points at the border).
    copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges);

    for (int y = 0; y < h; y++) {
        LOAD_PIX(tmp)

        // Primary pass
        LOAD_DIR(p, tmp, off1, off1_1)

        CONSTRAIN(p, pri_strength)

        MIN_MAX(p)

        PRI_0(p)
        PRI_1(p)

        UPDATE_SUM(p)

        // Secondary pass 1
        LOAD_DIR(s, tmp, off2, off3)

        CONSTRAIN(s, sec_strength)

        MIN_MAX(s)

        SEC_0(s)

        UPDATE_SUM(s)

        // Secondary pass 2
        LOAD_DIR(s2, tmp, off2_1, off3_1)

        CONSTRAIN(s2, sec_strength)

        MIN_MAX(s2)

        UPDATE_SUM(s2)

        // Store: round the accumulator as (sum + 8 - (sum < 0)) >> 4,
        // add it to the center pixel, then clamp to the [min, max] range
        // gathered from the taps.
        i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
        bias = vec_sub(vec_splat_s16(8), bias);
        i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
        i16x8 vdst = vec_max(vec_min(unclamped, max), min);

        dst[0] = vdst[0];
        dst[1] = vdst[1];
        dst[2] = vdst[2];
        dst[3] = vdst[3];
        dst[4] = vdst[4];
        dst[5] = vdst[5];
        dst[6] = vdst[6];
        dst[7] = vdst[7];

        tmp += tmp_stride;
        dst += PXSTRIDE(dst_stride);
    }
}
/* Instantiate the public cdef_filter_WxH_vsx() entry point for a given
 * block size.  Allocates a 16-byte-aligned stack temp buffer of
 * 12 * tmp_stride uint16_t and offsets the working pointer by 2 rows and
 * 2 columns so the filter/copy helpers can address negative border
 * offsets, then dispatches to filter_4xN()/filter_8xN(). */
#define cdef_fn(w, h, tmp_stride) \
static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \
                                        const ptrdiff_t dst_stride, \
                                        const pixel (*left)[2], \
                                        /*const*/ pixel *const top[2], \
                                        const int pri_strength, \
                                        const int sec_strength, \
                                        const int dir, \
                                        const int damping, \
                                        const enum CdefEdgeFlags edges) \
{ \
    ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride,); \
    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
    filter_##w##xN(dst, dst_stride, left, top, w, h, pri_strength, sec_strength, \
                   dir, damping, edges, tmp_stride, tmp); \
}

cdef_fn(4, 4, 8);
cdef_fn(4, 8, 8);
cdef_fn(8, 8, 16);
#endif
/* Install the PowerPC VSX CDEF entry points into the DSP context.
 * Leaves the context untouched when the CPU lacks VSX support; the
 * SIMD filters are only provided for 8-bit builds. */
COLD void bitfn(dav1d_cdef_dsp_init_ppc)(Dav1dCdefDSPContext *const c) {
    const unsigned cpu_flags = dav1d_get_cpu_flags();

    if (!(cpu_flags & DAV1D_PPC_CPU_FLAG_VSX))
        return;

#if BITDEPTH == 8
    // c->dir = dav1d_cdef_find_dir_vsx;  (direction search not ported yet)
    c->fb[2] = cdef_filter_4x4_vsx;
    c->fb[1] = cdef_filter_4x8_vsx;
    c->fb[0] = cdef_filter_8x8_vsx;
#endif
}

52
third_party/dav1d/src/ppc/types.h vendored Normal file
View File

@ -0,0 +1,52 @@
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Luca Barbato
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_PPC_TYPES_H
#define DAV1D_SRC_PPC_TYPES_H

#include <altivec.h>

// <altivec.h> defines a `pixel` keyword which collides with dav1d's own
// per-bitdepth `pixel` type; remove it.
#undef pixel

// Short aliases for the AltiVec/VSX vector types, named <lane type>x<count>.
// The b* variants are the boolean (mask) vector types.
#define u8x16 vector unsigned char
#define i8x16 vector signed char
#define b8x16 vector bool char
#define u16x8 vector unsigned short
#define i16x8 vector signed short
#define b16x8 vector bool short
#define u32x4 vector unsigned int
#define i32x4 vector signed int
#define b32x4 vector bool int
#define u64x2 vector unsigned long long
#define i64x2 vector signed long long
#define b64x2 vector bool long long

// Zero-extend the high (mergeh) or low (mergel) 8 bytes of a u8x16 into
// eight 16-bit lanes, and likewise the high/low 4 u16 lanes into four
// 32-bit lanes.
#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0)))
#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0)))

#endif /* DAV1D_SRC_PPC_TYPES_H */

View File

@ -46,16 +46,278 @@
#include "src/tables.h"
#include "src/wedge.h"
static unsigned read_golomb(MsacContext *const msac) {
static inline unsigned read_golomb(MsacContext *const msac) {
int len = 0;
unsigned val = 1;
while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
while (len--) val = (val << 1) | dav1d_msac_decode_bool_equi(msac);
while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);
return val - 1;
}
/* Derive the CDF context index for the "all coefficients skipped" flag of
 * one transform block, from the per-4x4 coefficient contexts stored in
 * the above (`a`) and left (`l`) context arrays.  `t_dim` describes the
 * transform size, `bs`/`layout` the block size and chroma subsampling.
 * The MERGE_CTX macros gather one context byte per covered 4x4 unit in a
 * single (up to 64-bit) load and fold them down; the 0x3F masks strip the
 * upper two bits of each byte (used elsewhere for the DC sign). */
static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
                                    const enum BlockSize bs,
                                    const uint8_t *const a,
                                    const uint8_t *const l,
                                    const int chroma,
                                    const enum Dav1dPixelLayout layout)
{
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];

    if (chroma) {
        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
        // Whether the transform is smaller than the (subsampled) block.
        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
        int ca, cl;

        // ca/cl: 1 if any covered above/left 4x4 unit has a nonzero
        // coefficient context.
#define MERGE_CTX(dir, type, mask) \
        c##dir = !!((*(const type *) dir) & mask); \
        break

        switch (t_dim->lw) {
        /* For some reason the MSVC CRT _wassert() function is not flagged as
         * __declspec(noreturn), so when using those headers the compiler will
         * expect execution to continue after an assertion has been triggered
         * and will therefore complain about the use of uninitialized variables
         * when compiled in debug mode if we put the default case at the end. */
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x3F);
        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x3F3F);
        case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
        case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x3F);
        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x3F3F);
        case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
        case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
        }
#undef MERGE_CTX

        return 7 + not_one_blk * 3 + ca + cl;
    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
        // Luma transform covering the whole block: fixed context 0.
        return 0;
    } else {
        unsigned la, ll;

        // la/ll: OR of the covered above/left context bytes, folded into
        // the low byte.
#define MERGE_CTX(dir, type, tx) \
        if (tx == TX_64X64) { \
            uint64_t tmp = *(const uint64_t *) dir; \
            tmp |= *(const uint64_t *) &dir[8]; \
            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
        } else \
            l##dir = *(const type *) dir; \
        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
        break

        switch (t_dim->lw) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
        }
#undef MERGE_CTX

        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
    }
}
/* Derive the CDF context for the DC coefficient's sign bit.  The sign
 * information of neighboring blocks is packed into the top two bits
 * (mask 0xC0) of each above/left context byte.  For every transform
 * shape the covered bytes are summed with a masked multiply trick, the
 * expected neutral count is subtracted (the literal "- w - h" terms),
 * and the signed balance s is mapped to 0 (s < 0), 1 (s == 0) or
 * 2 (s > 0). */
static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
                                       const uint8_t *const a,
                                       const uint8_t *const l)
{
    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
    int s;

#if ARCH_X86_64 && defined(__GNUC__)
    /* Coerce compilers into producing better code. For some reason
     * every x86-64 compiler is awful at handling 64-bit constants. */
    __asm__("" : "+r"(mask), "+r"(mul));
#endif

    switch(tx) {
    default: assert(0); /* fall-through */
    case TX_4X4: {
        int t = *(const uint8_t *) a >> 6;
        t    += *(const uint8_t *) l >> 6;
        s = t - 1 - 1;
        break;
    }
    case TX_8X8: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        // Multiplying by 0x04040404 sums the per-byte fields into the
        // top byte (the 0xC0-masked values are pre-shifted by 6 here).
        t *= 0x04040404U;
        s = (int) (t >> 24) - 2 - 2;
        break;
    }
    case TX_16X16: {
        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
        t         += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
        t *= (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 4;
        break;
    }
    case TX_32X32: {
        uint64_t t = (*(const uint64_t *) a & mask) >> 6;
        t         += (*(const uint64_t *) l & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 8 - 8;
        break;
    }
    case TX_64X64: {
        // 64-wide edges span two 8-byte context loads per side.
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 16 - 16;
        break;
    }
    case RTX_4X8: {
        uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 1 - 2;
        break;
    }
    case RTX_8X4: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint8_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 2 - 1;
        break;
    }
    case RTX_8X16: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 2 - 4;
        break;
    }
    case RTX_16X8: {
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 2;
        break;
    }
    case RTX_16X32: {
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint64_t *) l & mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 4 - 8;
        break;
    }
    case RTX_32X16: {
        uint64_t t = *(const uint64_t *) a & mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 8 - 4;
        break;
    }
    case RTX_32X64: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 8 - 16;
        break;
    }
    case RTX_64X32: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 16 - 8;
        break;
    }
    case RTX_4X16: {
        uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 1 - 4;
        break;
    }
    case RTX_16X4: {
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint8_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 1;
        break;
    }
    case RTX_8X32: {
        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint64_t *) l & mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 2 - 8;
        break;
    }
    case RTX_32X8: {
        uint64_t t = *(const uint64_t *) a & mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 8 - 2;
        break;
    }
    case RTX_16X64: {
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint64_t *) &l[0] & mask;
        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
        t *= mul;
        s = (int) (t >> 56) - 4 - 16;
        break;
    }
    case RTX_64X16: {
        uint64_t t = *(const uint64_t *) &a[0] & mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
        t *= mul;
        s = (int) (t >> 56) - 16 - 4;
        break;
    }
    }

    // 0 when the balance is negative, 1 when zero, 2 when positive.
    return (s != 0) + (s > 0);
}
/* Context for a low-range coefficient token, derived from the magnitudes
 * of already-decoded neighbor levels.  Also writes the partial
 * (high-token) magnitude sum to *hi_mag.  `levels` points at the current
 * position in the level buffer; ctx_offsets supplies the 2D offset table
 * (unused for the 1D classes). */
static inline unsigned get_lo_ctx(const uint8_t *const levels,
                                  const enum TxClass tx_class,
                                  unsigned *const hi_mag,
                                  const uint8_t (*const ctx_offsets)[5],
                                  const unsigned x, const unsigned y,
                                  const ptrdiff_t stride)
{
    // Right and below neighbors contribute for every class.
    unsigned sum = levels[stride] + levels[1];
    unsigned base;

    if (tx_class != TX_CLASS_2D) {
        // 1D classes scan along one axis only (buffer is transposed for
        // TX_CLASS_H, so the scan axis is always the row here).
        sum += levels[2];
        *hi_mag = sum;
        sum += levels[3] + levels[4];
        base = 26 + (y > 1 ? 10 : y * 5);
    } else {
        sum += levels[stride + 1];          // diagonal neighbor
        *hi_mag = sum;
        sum += levels[2] + levels[2 * stride];
        base = ctx_offsets[umin(y, 4)][umin(x, 4)];
    }

    return base + (sum > 512 ? 4 : (sum + 64) >> 7);
}
static int decode_coefs(Dav1dTileContext *const t,
uint8_t *const a, uint8_t *const l,
const enum RectTxfmSize tx, const enum BlockSize bs,
@ -66,6 +328,7 @@ static int decode_coefs(Dav1dTileContext *const t,
Dav1dTileState *const ts = t->ts;
const int chroma = !!plane;
const Dav1dFrameContext *const f = t->f;
const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
const int dbg = DEBUG_BLOCK_INFO && plane && 0;
@ -73,7 +336,7 @@ static int decode_coefs(Dav1dTileContext *const t,
printf("Start: r=%d\n", ts->msac.rng);
// does this block have any non-zero coefficients
const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.coef.skip[t_dim->ctx][sctx]);
if (dbg)
@ -81,41 +344,56 @@ static int decode_coefs(Dav1dTileContext *const t,
t_dim->ctx, sctx, all_skip, ts->msac.rng);
if (all_skip) {
*res_ctx = 0x40;
*txtp = f->frame_hdr->segmentation.lossless[b->seg_id] ? WHT_WHT : DCT_DCT;
*txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
return -1;
}
// transform type (chroma: derived, luma: explicitly coded)
if (chroma) {
if (intra) {
*txtp = get_uv_intra_txtp(b->uv_mode, tx, f->frame_hdr, b->seg_id);
if (lossless) {
assert(t_dim->max == TX_4X4);
*txtp = WHT_WHT;
} else if (!f->frame_hdr->segmentation.qidx[b->seg_id] ||
t_dim->max + intra >= TX_64X64)
{
*txtp = DCT_DCT;
} else if (chroma) {
*txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
get_uv_inter_txtp(t_dim, *txtp);
} else {
const enum TxfmType y_txtp = *txtp;
*txtp = get_uv_inter_txtp(t_dim, y_txtp, f->frame_hdr, b->seg_id);
}
} else {
const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
f->frame_hdr, b->seg_id);
const unsigned set_cnt = dav1d_tx_type_count[set];
unsigned idx;
if (set_cnt == 1) {
idx = 0;
if (intra) {
const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
idx = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
*txtp = dav1d_tx_types_per_set[idx + 0];
} else {
const int set_idx = dav1d_tx_type_set_index[!intra][set];
const enum IntraPredMode y_mode_nofilt = intra ? b->y_mode == FILTER_PRED ?
dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode : 0;
uint16_t *const txtp_cdf = intra ?
ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
ts->cdf.m.txtp_inter[set_idx][t_dim->min];
idx = (set_cnt <= 8 ? dav1d_msac_decode_symbol_adapt8 :
dav1d_msac_decode_symbol_adapt16)(&ts->msac, txtp_cdf, set_cnt);
if (dbg)
printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1,
idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
*txtp = dav1d_tx_types_per_set[idx + 5];
}
if (dbg)
printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
} else {
if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
idx = dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.txtp_inter3[t_dim->min]);
*txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
} else if (t_dim->min == TX_16X16) {
idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.txtp_inter2, 11);
*txtp = dav1d_tx_types_per_set[idx + 12];
} else {
idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.txtp_inter1[t_dim->min], 15);
*txtp = dav1d_tx_types_per_set[idx + 24];
}
if (dbg)
printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
tx, t_dim->min, idx, *txtp, ts->msac.rng);
}
*txtp = dav1d_tx_types_per_set[set][idx];
}
// find end-of-block (eob)
@ -124,19 +402,19 @@ static int decode_coefs(Dav1dTileContext *const t,
const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
const int is_1d = tx_class != TX_CLASS_2D;
switch (tx2dszctx) {
#define case_sz(sz, bin, ns) \
#define case_sz(sz, bin, ns, is_1d) \
case sz: { \
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 5 + sz); \
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
break; \
}
case_sz(0, 16, 4);
case_sz(1, 32, 8);
case_sz(2, 64, 8);
case_sz(3, 128, 8);
case_sz(4, 256, 16);
case_sz(5, 512, 16);
case_sz(6, 1024, 16);
case_sz(0, 16, 4, [is_1d]);
case_sz(1, 32, 8, [is_1d]);
case_sz(2, 64, 8, [is_1d]);
case_sz(3, 128, 8, [is_1d]);
case_sz(4, 256, 16, [is_1d]);
case_sz(5, 512, 16, );
case_sz(6, 1024, 16, );
#undef case_sz
}
if (dbg)
@ -159,122 +437,134 @@ static int decode_coefs(Dav1dTileContext *const t,
}
// base tokens
uint16_t (*const br_cdf)[5] =
ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
const int16_t *const scan = dav1d_scans[tx][tx_class];
uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
const uint16_t *const scan = dav1d_scans[tx][tx_class];
int dc_tok;
if (eob) {
uint8_t *const levels = t->scratch.levels;
uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
const ptrdiff_t stride = 4 * (sh + 1);
memset(levels, 0, stride * 4 * (sw + 1));
const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
const unsigned shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
{ // eob
const int rc = scan[eob], x = rc >> shift, y = rc & mask;
const int ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx];
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3) + 1;
/* eob */
unsigned rc = scan[eob], x = rc >> shift, y = rc & mask;
unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
int tok = eob_tok + 1;
int level_tok = tok * 0x41;
unsigned mag;
if (dbg)
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng);
if (tok == 3) {
const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride);
do {
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[br_ctx], 4);
if (dbg)
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, br_ctx,
eob, rc, tok_br, tok, ts->msac.rng);
tok += tok_br;
if (tok_br < 3) break;
} while (tok < 15);
}
#define DECODE_COEFS_CLASS(tx_class) \
if (eob_tok == 2) { \
ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : \
tx_class == TX_CLASS_H ? x != 0 : y != 0) ? 14 : 7; \
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
level_tok = tok + (3 << 6); \
if (dbg) \
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
ts->msac.rng); \
} \
cf[rc] = tok; \
if (tx_class == TX_CLASS_H) \
/* Transposing reduces the stride and padding requirements */ \
levels[y * stride + x] = (uint8_t) level_tok; \
else \
levels[x * stride + y] = (uint8_t) level_tok; \
for (int i = eob - 1; i > 0; i--) { /* ac */ \
if (tx_class == TX_CLASS_H) \
rc = i, x = rc & mask, y = rc >> shift; \
else \
rc = scan[i], x = rc >> shift, y = rc & mask; \
assert(x < 32 && y < 32); \
uint8_t *const level = levels + x * stride + y; \
ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
if (tx_class == TX_CLASS_2D) \
y |= x; \
tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
level_tok = tok * 0x41; \
if (dbg) \
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); \
if (tok == 3) { \
mag &= 63; \
ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
(mag > 12 ? 6 : (mag + 1) >> 1); \
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
level_tok = tok + (3 << 6); \
if (dbg) \
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
imin(t_dim->ctx, 3), chroma, ctx, i, rc, tok, \
ts->msac.rng); \
} \
cf[rc] = tok; \
*level = (uint8_t) level_tok; \
} \
/* dc */ \
ctx = (tx_class == TX_CLASS_2D) ? 0 : \
get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
if (dbg) \
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
if (dc_tok == 3) { \
if (tx_class == TX_CLASS_2D) \
mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
levels[1 * stride + 1]; \
mag &= 63; \
ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
if (dbg) \
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
} \
break
cf[rc] = tok;
levels[x * stride + y] = (uint8_t) tok;
switch (tx_class) {
case TX_CLASS_2D: {
const unsigned nonsquare_tx = tx >= RTX_4X8;
const uint8_t (*const lo_ctx_offsets)[5] =
dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
const ptrdiff_t stride = 4 * sh;
memset(levels, 0, stride * (4 * sw + 2));
DECODE_COEFS_CLASS(TX_CLASS_2D);
}
for (int i = eob - 1; i > 0; i--) { // ac
const int rc = scan[i], x = rc >> shift, y = rc & mask;
// lo tok
const int ctx = get_coef_nz_ctx(levels, tx, tx_class, x, y, stride);
uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 4);
if (dbg)
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
// hi tok
if (tok == 3) {
const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride);
do {
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[br_ctx], 4);
if (dbg)
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, br_ctx,
i, rc, tok_br, tok, ts->msac.rng);
tok += tok_br;
if (tok_br < 3) break;
} while (tok < 15);
case TX_CLASS_H: {
#define lo_ctx_offsets NULL
const ptrdiff_t stride = 16;
memset(levels, 0, stride * (4 * sh + 2));
DECODE_COEFS_CLASS(TX_CLASS_H);
}
cf[rc] = tok;
levels[x * stride + y] = (uint8_t) tok;
}
{ // dc
int ctx = 0;
if (tx_class != TX_CLASS_2D)
ctx = get_coef_nz_ctx(levels, tx, tx_class, 0, 0, stride);
uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 4);
if (dbg)
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng);
if (dc_tok == 3) {
const int br_ctx = get_br_ctx(levels, 0, tx_class, 0, 0, stride);
do {
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[br_ctx], 4);
if (dbg)
printf("Post-dc_hi_tok[%d][%d][%d][%d->%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, br_ctx,
tok_br, dc_tok, ts->msac.rng);
dc_tok += tok_br;
if (tok_br < 3) break;
} while (dc_tok < 15);
case TX_CLASS_V: {
const ptrdiff_t stride = 16;
memset(levels, 0, stride * (4 * sw + 2));
DECODE_COEFS_CLASS(TX_CLASS_V);
}
#undef lo_ctx_offsets
#undef DECODE_COEFS_CLASS
default: assert(0);
}
} else { // dc-only
uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][0];
dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3) + 1;
int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
dc_tok = 1 + tok_br;
if (dbg)
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
if (dc_tok == 3) {
do {
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[0], 4);
if (tok_br == 2) {
dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
if (dbg)
printf("Post-dc_hi_tok[%d][%d][%d][%d->%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, 0,
tok_br, dc_tok, ts->msac.rng);
dc_tok += tok_br;
if (tok_br < 3) break;
} while (dc_tok < 15);
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
}
}
// residual and sign
int dc_sign = 1 << 6;
const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
const int dq_shift = imax(0, t_dim->ctx - 2);
@ -283,7 +573,7 @@ static int decode_coefs(Dav1dTileContext *const t,
unsigned cul_level = 0;
if (dc_tok) { // dc
const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
uint16_t *const dc_sign_cdf =
ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
@ -335,7 +625,7 @@ static int decode_coefs(Dav1dTileContext *const t,
}
// context
*res_ctx = imin(cul_level, 63) | dc_sign;
*res_ctx = umin(cul_level, 63) | dc_sign;
return eob;
}
@ -782,14 +1072,16 @@ static int warp_affine(Dav1dTileContext *const t,
// luma pixel units
const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
const int src_y = t->by * 4 + ((y + 4) << ss_ver);
const int mvx = (mat[2] * src_x + mat[3] * src_y + mat[0]) >> ss_hor;
const int mvy = (mat[4] * src_x + mat[5] * src_y + mat[1]) >> ss_ver;
const int64_t mvx = ((int64_t) mat[2] * src_x +
(int64_t) mat[3] * src_y + mat[0]) >> ss_hor;
const int64_t mvy = ((int64_t) mat[4] * src_x +
(int64_t) mat[5] * src_y + mat[1]) >> ss_ver;
const int dx = (mvx >> 16) - 4;
const int mx = ((mvx & 0xffff) - wmp->alpha * 4 -
const int dx = (int) (mvx >> 16) - 4;
const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
wmp->beta * 7) & ~0x3f;
const int dy = (mvy >> 16) - 4;
const int my = ((mvy & 0xffff) - wmp->gamma * 4 -
const int dy = (int) (mvy >> 16) - 4;
const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 -
wmp->delta * 4) & ~0x3f;
const pixel *ref_ptr;

View File

@ -47,7 +47,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stddef.h>

View File

@ -30,25 +30,19 @@
#include "common/attributes.h"
#include "src/scan.h"
static const int16_t ALIGN(av1_default_scan_4x4[], 32) = {
static const uint16_t ALIGN(av1_default_scan_4x4[], 32) = {
0, 4, 1, 2,
5, 8, 12, 9,
6, 3, 7, 10,
13, 14, 11, 15,
};
static const int16_t ALIGN(av1_mrow_scan_4x4[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_4x4[], 32) = {
0, 4, 8, 12,
1, 5, 9, 13,
2, 6, 10, 14,
3, 7, 11, 15,
};
static const int16_t ALIGN(av1_mcol_scan_4x4[], 32) = {
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
};
static const int16_t ALIGN(av1_default_scan_4x8[], 32) = {
static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = {
0, 8, 1, 16,
9, 2, 24, 17,
10, 3, 25, 18,
@ -58,7 +52,7 @@ static const int16_t ALIGN(av1_default_scan_4x8[], 32) = {
14, 7, 29, 22,
15, 30, 23, 31,
};
static const int16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
0, 8, 16, 24,
1, 9, 17, 25,
2, 10, 18, 26,
@ -68,17 +62,7 @@ static const int16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
6, 14, 22, 30,
7, 15, 23, 31,
};
static const int16_t ALIGN(av1_mcol_scan_4x8[], 32) = {
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
16, 17, 18, 19,
20, 21, 22, 23,
24, 25, 26, 27,
28, 29, 30, 31,
};
static const int16_t ALIGN(av1_default_scan_4x16[], 32) = {
static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = {
0, 16, 1, 32,
17, 2, 48, 33,
18, 3, 49, 34,
@ -96,7 +80,7 @@ static const int16_t ALIGN(av1_default_scan_4x16[], 32) = {
30, 15, 61, 46,
31, 62, 47, 63,
};
static const int16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
0, 16, 32, 48,
1, 17, 33, 49,
2, 18, 34, 50,
@ -114,43 +98,19 @@ static const int16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
14, 30, 46, 62,
15, 31, 47, 63,
};
static const int16_t ALIGN(av1_mcol_scan_4x16[], 32) = {
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
16, 17, 18, 19,
20, 21, 22, 23,
24, 25, 26, 27,
28, 29, 30, 31,
32, 33, 34, 35,
36, 37, 38, 39,
40, 41, 42, 43,
44, 45, 46, 47,
48, 49, 50, 51,
52, 53, 54, 55,
56, 57, 58, 59,
60, 61, 62, 63,
};
static const int16_t ALIGN(av1_default_scan_8x4[], 32) = {
static const uint16_t ALIGN(av1_default_scan_8x4[], 32) = {
0, 1, 4, 2, 5, 8, 3, 6,
9, 12, 7, 10, 13, 16, 11, 14,
17, 20, 15, 18, 21, 24, 19, 22,
25, 28, 23, 26, 29, 27, 30, 31,
};
static const int16_t ALIGN(av1_mrow_scan_8x4[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_8x4[], 32) = {
0, 4, 8, 12, 16, 20, 24, 28,
1, 5, 9, 13, 17, 21, 25, 29,
2, 6, 10, 14, 18, 22, 26, 30,
3, 7, 11, 15, 19, 23, 27, 31,
};
static const int16_t ALIGN(av1_mcol_scan_8x4[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
};
static const int16_t ALIGN(av1_default_scan_8x8[], 32) = {
static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = {
0, 8, 1, 2, 9, 16, 24, 17,
10, 3, 4, 11, 18, 25, 32, 40,
33, 26, 19, 12, 5, 6, 13, 20,
@ -160,7 +120,7 @@ static const int16_t ALIGN(av1_default_scan_8x8[], 32) = {
23, 31, 38, 45, 52, 59, 60, 53,
46, 39, 47, 54, 61, 62, 55, 63,
};
static const int16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
0, 8, 16, 24, 32, 40, 48, 56,
1, 9, 17, 25, 33, 41, 49, 57,
2, 10, 18, 26, 34, 42, 50, 58,
@ -170,17 +130,7 @@ static const int16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
6, 14, 22, 30, 38, 46, 54, 62,
7, 15, 23, 31, 39, 47, 55, 63,
};
static const int16_t ALIGN(av1_mcol_scan_8x8[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63,
};
static const int16_t ALIGN(av1_default_scan_8x16[], 32) = {
static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = {
0, 16, 1, 32, 17, 2, 48, 33,
18, 3, 64, 49, 34, 19, 4, 80,
65, 50, 35, 20, 5, 96, 81, 66,
@ -198,7 +148,7 @@ static const int16_t ALIGN(av1_default_scan_8x16[], 32) = {
47, 123, 108, 93, 78, 63, 124, 109,
94, 79, 125, 110, 95, 126, 111, 127,
};
static const int16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
0, 16, 32, 48, 64, 80, 96, 112,
1, 17, 33, 49, 65, 81, 97, 113,
2, 18, 34, 50, 66, 82, 98, 114,
@ -216,25 +166,7 @@ static const int16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
14, 30, 46, 62, 78, 94, 110, 126,
15, 31, 47, 63, 79, 95, 111, 127,
};
static const int16_t ALIGN(av1_mcol_scan_8x16[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87,
88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103,
104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127,
};
static const int16_t ALIGN(av1_default_scan_8x32[], 32) = {
static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = {
0, 32, 1, 64, 33, 2, 96, 65,
34, 3, 128, 97, 66, 35, 4, 160,
129, 98, 67, 36, 5, 192, 161, 130,
@ -268,25 +200,19 @@ static const int16_t ALIGN(av1_default_scan_8x32[], 32) = {
95, 251, 220, 189, 158, 127, 252, 221,
190, 159, 253, 222, 191, 254, 223, 255,
};
static const int16_t ALIGN(av1_default_scan_16x4[], 32) = {
static const uint16_t ALIGN(av1_default_scan_16x4[], 32) = {
0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
};
static const int16_t ALIGN(av1_mrow_scan_16x4[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_16x4[], 32) = {
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
};
static const int16_t ALIGN(av1_mcol_scan_16x4[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
};
static const int16_t ALIGN(av1_default_scan_16x8[], 32) = {
static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = {
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5,
12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44,
@ -296,7 +222,7 @@ static const int16_t ALIGN(av1_default_scan_16x8[], 32) = {
99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115,
122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127,
};
static const int16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
@ -306,17 +232,7 @@ static const int16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
};
static const int16_t ALIGN(av1_mcol_scan_16x8[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
};
static const int16_t ALIGN(av1_default_scan_16x16[], 32) = {
static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = {
0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80,
65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67,
52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114,
@ -334,7 +250,7 @@ static const int16_t ALIGN(av1_default_scan_16x16[], 32) = {
188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190,
175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255,
};
static const int16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
@ -352,7 +268,7 @@ static const int16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
};
static const int16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
static const uint16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
@ -370,7 +286,7 @@ static const int16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
};
static const int16_t ALIGN(av1_default_scan_16x32[], 32) = {
static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = {
0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160,
129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131,
100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226,
@ -404,7 +320,7 @@ static const int16_t ALIGN(av1_default_scan_16x32[], 32) = {
380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382,
351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511,
};
static const int16_t ALIGN(av1_default_scan_32x8[], 32) = {
static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = {
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60,
67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92,
@ -414,7 +330,7 @@ static const int16_t ALIGN(av1_default_scan_32x8[], 32) = {
195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220,
227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255,
};
static const int16_t ALIGN(av1_default_scan_32x16[], 32) = {
static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = {
0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52,
67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130,
145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73,
@ -432,7 +348,7 @@ static const int16_t ALIGN(av1_default_scan_32x16[], 32) = {
381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511,
};
static const int16_t ALIGN(av1_default_scan_32x32[], 32) = {
static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = {
0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131,
100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258,
289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292,
@ -467,15 +383,15 @@ static const int16_t ALIGN(av1_default_scan_32x32[], 32) = {
892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023,
};
const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
[TX_4X4] = {
[TX_CLASS_2D] = av1_default_scan_4x4,
[TX_CLASS_V] = av1_mrow_scan_4x4,
[TX_CLASS_H] = av1_mcol_scan_4x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [TX_8X8] = {
[TX_CLASS_2D] = av1_default_scan_8x8,
[TX_CLASS_V] = av1_mrow_scan_8x8,
[TX_CLASS_H] = av1_mcol_scan_8x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [TX_16X16] = {
[TX_CLASS_2D] = av1_default_scan_16x16,
[TX_CLASS_V] = av1_mrow_scan_16x16,
@ -487,19 +403,19 @@ const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
}, [RTX_4X8] = {
[TX_CLASS_2D] = av1_default_scan_4x8,
[TX_CLASS_V] = av1_mrow_scan_4x8,
[TX_CLASS_H] = av1_mcol_scan_4x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X4] = {
[TX_CLASS_2D] = av1_default_scan_8x4,
[TX_CLASS_V] = av1_mrow_scan_8x4,
[TX_CLASS_H] = av1_mcol_scan_8x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X16] = {
[TX_CLASS_2D] = av1_default_scan_8x16,
[TX_CLASS_V] = av1_mrow_scan_8x16,
[TX_CLASS_H] = av1_mcol_scan_8x16,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X8] = {
[TX_CLASS_2D] = av1_default_scan_16x8,
[TX_CLASS_V] = av1_mrow_scan_16x8,
[TX_CLASS_H] = av1_mcol_scan_16x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X32] = {
[TX_CLASS_2D] = av1_default_scan_16x32,
}, [RTX_32X16] = {
@ -511,11 +427,11 @@ const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
}, [RTX_4X16] = {
[TX_CLASS_2D] = av1_default_scan_4x16,
[TX_CLASS_V] = av1_mrow_scan_4x16,
[TX_CLASS_H] = av1_mcol_scan_4x16,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X4] = {
[TX_CLASS_2D] = av1_default_scan_16x4,
[TX_CLASS_V] = av1_mrow_scan_16x4,
[TX_CLASS_H] = av1_mcol_scan_16x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X32] = {
[TX_CLASS_2D] = av1_default_scan_8x32,
}, [RTX_32X8] = {

View File

@ -32,6 +32,6 @@
#include "src/levels.h"
extern const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3];
extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3];
#endif /* DAV1D_SRC_SCAN_H */

View File

@ -225,37 +225,26 @@ const uint8_t /* enum InterPredMode */
[NEARMV_NEWMV] = { NEARMV, NEWMV },
};
const uint8_t dav1d_tx_type_count[N_TXTP_SETS] = {
[TXTP_SET_DCT] = 1,
[TXTP_SET_DCT_ID] = 2,
[TXTP_SET_DT4_ID] = 5,
[TXTP_SET_DT4_ID_1D] = 7,
[TXTP_SET_DT9_ID_1D] = 12,
[TXTP_SET_ALL] = 16,
[TXTP_SET_LOSSLESS] = 1,
const uint8_t dav1d_partition_type_count[N_BL_LEVELS] = {
[BL_128X128] = N_PARTITIONS - 3,
[BL_64X64] = N_PARTITIONS - 1,
[BL_32X32] = N_PARTITIONS - 1,
[BL_16X16] = N_PARTITIONS - 1,
[BL_8X8] = N_SUB8X8_PARTITIONS - 1,
};
const uint8_t /* enum TxfmType */
dav1d_tx_types_per_set[N_TXTP_SETS][N_TX_TYPES] =
{
[TXTP_SET_DCT] = { DCT_DCT },
[TXTP_SET_DCT_ID] = { IDTX, DCT_DCT },
[TXTP_SET_DT4_ID] = { IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST },
[TXTP_SET_DT4_ID_1D] = { IDTX, DCT_DCT, V_DCT, H_DCT, ADST_ADST, ADST_DCT,
DCT_ADST },
[TXTP_SET_DT9_ID_1D] = { IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST,
FLIPADST_DCT, DCT_FLIPADST, ADST_ADST,
FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST },
[TXTP_SET_ALL] = { IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST,
H_FLIPADST, DCT_DCT, ADST_DCT, DCT_ADST,
FLIPADST_DCT, DCT_FLIPADST, ADST_ADST,
FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST },
[TXTP_SET_LOSSLESS] = { WHT_WHT },
};
const uint8_t dav1d_tx_type_set_index[2][N_TXTP_SETS] = {
{ 0, -1, 2, 1, -1, -1, 3 },
{ 0, 3, -1, -1, 2, 1, 4 },
const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40] = {
/* Intra2 */
IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
/* Intra1 */
IDTX, DCT_DCT, V_DCT, H_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
/* Inter2 */
IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT,
DCT_FLIPADST, ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
/* Inter1 */
IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, H_FLIPADST,
DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT, DCT_FLIPADST,
ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
};
const uint8_t dav1d_ymode_size_context[N_BS_SIZES] = {
@ -283,119 +272,34 @@ const uint8_t dav1d_ymode_size_context[N_BS_SIZES] = {
[BS_4x4 ] = 0,
};
const uint8_t dav1d_nz_map_ctx_offset[N_RECT_TX_SIZES][5][5] = {
[TX_4X4] = {
{ 0, 1, 6, 6 },
{ 1, 6, 6, 21 },
{ 6, 6, 21, 21 },
{ 6, 21, 21, 21 },
}, [TX_8X8] = {
const uint8_t dav1d_lo_ctx_offsets[3][5][5] = {
{ /* w == h */
{ 0, 1, 6, 6, 21 },
{ 1, 6, 6, 21, 21 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [TX_16X16] = {
{ 0, 1, 6, 6, 21 },
{ 1, 6, 6, 21, 21 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [TX_32X32] = {
{ 0, 1, 6, 6, 21 },
{ 1, 6, 6, 21, 21 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [TX_64X64] = {
{ 0, 1, 6, 6, 21 },
{ 1, 6, 6, 21, 21 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_4X8] = {
{ 0, 11, 11, 11 },
{ 11, 11, 11, 11 },
{ 6, 6, 21, 21 },
{ 6, 21, 21, 21 },
{ 21, 21, 21, 21 }
}, [RTX_8X4] = {
{ 21, 21, 21, 21, 21 },
}, { /* w > h */
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
}, [RTX_8X16] = {
{ 16, 16, 21, 21, 21 },
}, { /* w < h */
{ 0, 11, 11, 11, 11 },
{ 11, 11, 11, 11, 11 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_16X8] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 }
}, [RTX_16X32] = {
{ 0, 11, 11, 11, 11 },
{ 11, 11, 11, 11, 11 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_32X16] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 }
}, [RTX_32X64] = {
{ 0, 11, 11, 11, 11 },
{ 11, 11, 11, 11, 11 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_64X32] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 }
}, [RTX_4X16] = {
{ 0, 11, 11, 11 },
{ 11, 11, 11, 11 },
{ 6, 6, 21, 21 },
{ 6, 21, 21, 21 },
{ 21, 21, 21, 21 }
}, [RTX_16X4] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
}, [RTX_8X32] = {
{ 0, 11, 11, 11, 11 },
{ 11, 11, 11, 11, 11 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_32X8] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 }
}, [RTX_16X64] = {
{ 0, 11, 11, 11, 11 },
{ 11, 11, 11, 11, 11 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_64X16] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 }
}
{ 21, 21, 21, 21, 21 },
},
};
const uint8_t dav1d_skip_ctx[5][5] = {
{ 1, 2, 2, 2, 3 },
{ 2, 4, 4, 4, 5 },
{ 2, 4, 4, 4, 5 },
{ 2, 4, 4, 4, 5 },
{ 3, 5, 5, 5, 6 },
};
const uint8_t /* enum TxClass */ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL] = {
@ -861,7 +765,7 @@ const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
}
};
const uint8_t dav1d_obmc_masks[64] = {
const uint8_t ALIGN(dav1d_obmc_masks[64], 16) = {
/* Unused */
0, 0,
/* 2 */

View File

@ -52,14 +52,13 @@ extern const uint8_t /* enum TxfmType */
extern const uint8_t /* enum InterPredMode */
dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2];
extern const uint8_t dav1d_tx_type_count[N_TXTP_SETS];
extern const uint8_t /* enum TxfmType */
dav1d_tx_types_per_set[N_TXTP_SETS][N_TX_TYPES];
extern const uint8_t dav1d_tx_type_set_index[2][N_TXTP_SETS];
extern const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
extern const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
extern const uint8_t dav1d_filter_mode_to_y_mode[5];
extern const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
extern const uint8_t dav1d_nz_map_ctx_offset[N_RECT_TX_SIZES][5][5];
extern const uint8_t dav1d_lo_ctx_offsets[3][5][5];
extern const uint8_t dav1d_skip_ctx[5][5];
extern const uint8_t /* enum TxClass */
dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
extern const uint8_t /* enum Filter2d */

View File

@ -48,6 +48,10 @@ typedef SRWLOCK pthread_mutex_t;
typedef CONDITION_VARIABLE pthread_cond_t;
typedef INIT_ONCE pthread_once_t;
void dav1d_init_thread(void);
void dav1d_set_thread_name(const wchar_t *name);
#define dav1d_set_thread_name(name) dav1d_set_thread_name(L##name)
int dav1d_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
void *(*func)(void*), void *arg);
int dav1d_pthread_join(pthread_t *thread, void **res);
@ -126,7 +130,7 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
#include <pthread.h>
#endif
#define dav1d_init_thread() do {} while (0)
/* Thread naming support */
@ -134,13 +138,40 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
#include <sys/prctl.h>
static inline void dav1d_set_thread_name(const char* name) {
static inline void dav1d_set_thread_name(const char *const name) {
prctl(PR_SET_NAME, name);
}
#elif defined(__APPLE__)
static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(name);
}
#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
#if defined(__FreeBSD__)
/* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
#define _SYS_PARAM_H_
#include <sys/types.h>
#endif
#include <pthread_np.h>
static inline void dav1d_set_thread_name(const char *const name) {
pthread_set_name_np(pthread_self(), name);
}
#elif defined(__NetBSD__)
static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), "%s", (void*)name);
}
#else
#define dav1d_set_thread_name(name)
#define dav1d_set_thread_name(name) do {} while (0)
#endif
#endif

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include "common/intops.h"

View File

@ -37,6 +37,20 @@
#include "src/thread.h"
static HRESULT (WINAPI *set_thread_description)(HANDLE, PCWSTR);
COLD void dav1d_init_thread(void) {
set_thread_description =
(void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"),
"SetThreadDescription");
}
#undef dav1d_set_thread_name
COLD void dav1d_set_thread_name(const wchar_t *const name) {
if (set_thread_description) /* Only available since Windows 10 1607 */
set_thread_description(GetCurrentThread(), name);
}
static COLD unsigned __stdcall thread_entrypoint(void *const data) {
pthread_t *const t = data;
t->arg = t->func(t->arg);

1582
third_party/dav1d/src/x86/film_grain.asm vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,45 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/film_grain.h"
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
c->generate_grain_y = dav1d_generate_grain_y_avx2;
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
#endif
}

View File

@ -47,9 +47,11 @@ pw_m%2_%1: dw -%2, %1
pw_3803_1321: dw 3803, 1321
pw_m1321_2482: dw -1321, 2482
pw_2482_3344: dw 2482, 3344
pw_m3344_3344: dw -3344, 3344
pw_m3803_3344: dw -3803, 3344
pw_m3803_m6688: dw -3803, -6688
%define pw_3344x8 iadst4_dconly2b
COEF_PAIR 2896, 2896
pw_2896_m2896: dw 2896, -2896
pw_5: times 2 dw 5
pw_2048: times 2 dw 2048
@ -464,13 +466,15 @@ ALIGN function_align
%macro IADST4_1D_PACKED 0
punpcklwd m2, m1, m0
punpckhwd m3, m1, m0
psubw m0, m1
punpckhqdq m1, m1
paddw m1, m0 ; in0 - in2 + in3
vpbroadcastd m5, [o(pw_m3344_3344)]
vpbroadcastd m0, [o(pw_3803_1321)]
vpbroadcastd m4, [o(pw_m1321_2482)]
pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2
psrld m5, 16
pmaddwd m0, m2
pmaddwd m2, m4
pmaddwd m5, m3 ; 3344*in0
paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
vpbroadcastd m4, [o(pw_2482_3344)]
vpbroadcastd m5, [o(pw_m3803_3344)]
pmaddwd m4, m3
@ -478,19 +482,16 @@ ALIGN function_align
paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
vpbroadcastd m0, [o(pw_m3803_m6688)]
pmaddwd m3, m0
vpbroadcastd m0, [o(pw_3344x8)]
pmulhrsw m1, m0 ; out2 ____
vpbroadcastd m0, [o(pd_2048)]
paddd m2, m0
paddd m1, m0
paddd m0, m4
paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
paddd m2, m4
paddd m2, m3
psrad m0, 12
psrad m5, 12
psrad m2, 12
REPX {psrad x, 12}, m1, m2, m0, m5
packssdw m0, m5 ; out0 out1
packssdw m2, m2 ; out3 out3
packssdw m1, m2 ; out2 out3
%endmacro
INV_TXFM_4X4_FN dct, dct, 0
@ -524,14 +525,13 @@ cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
call .main
punpckhwd m3, m0, m2
punpckhwd m3, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q
.pass2:
call .main
vpblendd m1, m1, m2, 0x0c ; out2 out3
.end:
pxor m2, m2
mova [cq+16*0], m2
@ -552,14 +552,13 @@ cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
call m(iadst_4x4_internal).main
punpcklwd m1, m0
punpckhwd m2, m0
punpcklwd m0, m2, m1
punpckhwd m1, m2, m1
punpcklwd m2, m1, m0
punpckhwd m1, m0
punpcklwd m0, m1, m2
punpckhwd m1, m2
jmp tx2q
.pass2:
call m(iadst_4x4_internal).main
vpblendd m1, m1, m2, 0x0c ; out2 out3
.end:
pxor m2, m2
mova [cq+16*0], m2
@ -710,12 +709,55 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
paddsw m1, m5 ; out3 out2
%endmacro
%macro IADST8_1D_PACKED 0
%macro IADST8_1D_PACKED 1 ; pass
vpbroadcastd m6, [o(pd_2048)]
punpckhwd m0, m4, m3 ; 0 7
punpckhwd m1, m5, m2 ; 2 5
punpcklwd m2, m5 ; 4 3
punpcklwd m3, m4 ; 6 1
%if %1 == 1
ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
psubsw m4, m0, m2 ; t5 t4
paddsw m0, m2 ; t1 t0
psubsw m5, m1, m3 ; t6 t7
paddsw m1, m3 ; t2 t3
ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
%if mmsize > 16
vbroadcasti128 m2, [o(deint_shuf)]
%else
mova m2, [o(deint_shuf)]
%endif
pshuflw m1, m1, q2301
pshufhw m1, m1, q2301
psubsw m3, m0, m1 ; t3 t2
paddsw m0, m1 ; -out7 out0
psubsw m1, m4, m5 ; t7 t6
paddsw m4, m5 ; out6 -out1
pshufb m0, m2
pshufb m4, m2
vpbroadcastd m5, [o(pw_m2896_2896)]
pmaddwd m2, m5, m3
pmaddwd m5, m1
paddd m2, m6
paddd m5, m6
psrad m2, 12
psrad m5, 12
packssdw m2, m5 ; out4 -out5
vpbroadcastd m5, [o(pw_2896_2896)]
pmaddwd m3, m5
pmaddwd m1, m5
paddd m3, m6
paddd m1, m6
psrad m3, 12
psrad m1, 12
packssdw m1, m3 ; out2 -out3
punpcklqdq m3, m4, m0 ; out6 -out7
punpckhqdq m0, m4 ; out0 -out1
%else
ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
@ -738,11 +780,12 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
vpblendd m0, m0, m4, 0xcc ; out0 -out1
shufps m4, m2, m1, q1032 ; t3 t7
vpblendd m1, m2, m1, 0xcc ; t2 t6
psubw m2, m1, m4 ; t2-t3 t6-t7
paddw m1, m4 ; t2+t3 t6+t7
psubsw m2, m1, m4 ; t2-t3 t6-t7
paddsw m1, m4 ; t2+t3 t6+t7
pmulhrsw m2, m5 ; out4 -out5
pshufd m1, m1, q1032
pmulhrsw m1, m5 ; out2 -out3
%endif
%endmacro
INIT_YMM avx2
@ -790,7 +833,7 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m0, m2
pmulhrsw m1, m2
call m(iadst_8x4_internal).main
punpckhwd m3, m0, m2
punpckhwd m3, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
@ -800,7 +843,7 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vextracti128 xm3, m1, 1
pshufd xm4, xm0, q1032
pshufd xm5, xm1, q1032
call .main
call .main_pass2
vpbroadcastd m4, [o(pw_2048)]
vinserti128 m0, m0, xm2, 1
vinserti128 m1, m1, xm3, 1
@ -822,8 +865,12 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
WRITE_4X8 0, 1
RET
ALIGN function_align
.main:
WRAP_XMM IADST8_1D_PACKED
.main_pass1:
WRAP_XMM IADST8_1D_PACKED 1
ret
ALIGN function_align
.main_pass2:
WRAP_XMM IADST8_1D_PACKED 2
ret
INV_TXFM_4X8_FN flipadst, dct, 0
@ -839,7 +886,7 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m1, m2
call m(iadst_8x4_internal).main
punpcklwd m3, m1, m0
punpckhwd m1, m2, m0
punpckhwd m1, m0
punpcklwd m0, m1, m3
punpckhwd m1, m3
jmp tx2q
@ -848,7 +895,7 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vextracti128 xm3, m1, 1
pshufd xm4, xm0, q1032
pshufd xm5, xm1, q1032
call m(iadst_4x8_internal).main
call m(iadst_4x8_internal).main_pass2
vpbroadcastd m5, [o(pw_2048)]
vinserti128 m3, m3, xm1, 1
vinserti128 m2, m2, xm0, 1
@ -1099,8 +1146,13 @@ cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call .main
pshufd m1, m1, q1032
vpbroadcastd m5, [o(pw_2896x8)]
paddsw m1, m2, m4
psubsw m2, m4
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
vpbroadcastd m5, [o(pw_2048)]
pshufd m1, m1, q1032
vpblendd m4, m1, m0, 0x33
vpblendd m0, m0, m2, 0x33
vpblendd m2, m2, m3, 0x33
@ -1176,7 +1228,6 @@ ALIGN function_align
vinserti128 m0, m0, xm2, 1 ; t1 t0 t9a t8a
vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14
vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13
vpbroadcastd m5, [o(pw_2896x8)]
pshufd m2, m2, q1032 ; t6a t7a t14 t15
psubsw m1, m0, m3 ; t3a t2a t11 t10
paddsw m0, m3 ; -out15 out0 out14 -out1
@ -1184,10 +1235,21 @@ ALIGN function_align
psubsw m4, m2 ; t6 t7 t14a t15a
shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a
paddw m1, m2, m4
psubw m2, m4
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
ret
ALIGN function_align
.main_pass1_end:
vpbroadcastd m5, [o(pw_m2896_2896)]
vpbroadcastd m6, [o(pw_2896_2896)]
punpcklwd m1, m4, m2
punpckhwd m4, m2
pmaddwd m2, m5, m4
pmaddwd m4, m6
pmaddwd m5, m1
pmaddwd m1, m6
REPX {paddd x, m8}, m5, m1, m2, m4
REPX {psrad x, 12}, m5, m2, m1, m4
packssdw m2, m5 ; -out11 out8 out10 -out9
packssdw m1, m4 ; -out7 out4 out6 -out5
ret
INV_TXFM_4X16_FN flipadst, dct, 0
@ -1214,8 +1276,13 @@ cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(iadst_4x16_internal).main
pshufd m1, m1, q1032
vpbroadcastd m5, [o(pw_2896x8)]
paddsw m1, m2, m4
psubsw m2, m4
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
vpbroadcastd m6, [o(pw_2048)]
pshufd m1, m1, q1032
vpblendd m4, m0, m2, 0x33
vpblendd m0, m0, m1, 0xcc
vpblendd m1, m1, m3, 0xcc
@ -1381,7 +1448,7 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw xm2, xm0, [cq+16*2]
pmulhrsw xm4, xm0
pmulhrsw xm5, xm0
call m(iadst_4x8_internal).main
call m(iadst_4x8_internal).main_pass1
vinserti128 m0, m0, xm2, 1
vinserti128 m1, m1, xm3, 1
punpckhwd m2, m0, m1
@ -1393,7 +1460,6 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call .main
vpblendd m1, m1, m2, 0xcc
.end:
vpermq m0, m0, q3120
vpermq m1, m1, q3120
@ -1427,7 +1493,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw xm2, xm0, [cq+16*2]
pmulhrsw xm4, xm0
pmulhrsw xm5, xm0
call m(iadst_4x8_internal).main
call m(iadst_4x8_internal).main_pass1
vinserti128 m3, m3, xm1, 1
vinserti128 m2, m2, xm0, 1
punpckhwd m1, m3, m2
@ -1439,7 +1505,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(iadst_8x4_internal).main
vpblendd m2, m2, m1, 0x33
mova m2, m1
vpermq m1, m0, q2031
vpermq m0, m2, q2031
jmp m(iadst_8x4_internal).end2
@ -1580,7 +1646,7 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m5, [cq+32*1], q1302 ; 3 2
vpermq m2, [cq+32*2], q3120 ; 4 5
call .main
call .main_pass1
vpbroadcastd m5, [o(pw_16384)]
punpcklwd m4, m0, m1
punpckhwd m0, m1
@ -1604,7 +1670,7 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
.pass2:
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call .main
call .main_pass2
vpbroadcastd m5, [o(pw_2048)]
vpbroadcastd xm4, [o(pw_4096)]
psubw m4, m5 ; lower half = 2048, upper half = -2048
@ -1629,8 +1695,12 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
WRITE_8X4 2, 3, 4, 5
RET
ALIGN function_align
.main:
IADST8_1D_PACKED
.main_pass1:
IADST8_1D_PACKED 1
ret
ALIGN function_align
.main_pass2:
IADST8_1D_PACKED 2
ret
INV_TXFM_8X8_FN flipadst, dct
@ -1643,7 +1713,7 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m5, [cq+32*1], q1302 ; 3 2
vpermq m2, [cq+32*2], q3120 ; 4 5
call m(iadst_8x8_internal).main
call m(iadst_8x8_internal).main_pass1
vpbroadcastd m5, [o(pw_16384)]
punpckhwd m4, m3, m2
punpcklwd m3, m2
@ -1667,7 +1737,7 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
.pass2:
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal).main
call m(iadst_8x8_internal).main_pass2
vpbroadcastd m4, [o(pw_2048)]
vpbroadcastd xm5, [o(pw_4096)]
psubw m4, m5 ; lower half = -2048, upper half = 2048
@ -1867,6 +1937,7 @@ INV_TXFM_8X16_FN adst, identity
cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
vpbroadcastd m10, [o(pw_16384)]
pslld m9, m10, 17
psubw m10, m9 ; 16384, -16384
@ -1874,6 +1945,7 @@ cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ALIGN function_align
.pass2:
call .main
call .main_pass2_end
vpbroadcastd m9, [o(pw_2048)]
vpbroadcastd xm8, [o(pw_4096)]
psubw m8, m9
@ -1930,38 +2002,72 @@ ALIGN function_align
paddsw m4, m6 ; t8a t9a
vpbroadcastd m11, [o(pw_m3784_1567)]
vpbroadcastd m12, [o(pw_1567_3784)]
ITX_MUL2X_PACK 3, 6, _, 10, 11, 12, 4 ; t4a t5a
ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a
psubw m6, m9, m11 ; pw_3784_m1567
ITX_MUL2X_PACK 8, 12, _, 10, 12, 6, 4 ; t6a t7a
ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a
vpbroadcastd m11, [o(pw_m1567_3784)]
vpbroadcastd m12, [o(pw_3784_1567)]
ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 4 ; t15 t14
ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14
psubw m6, m9, m11 ; pw_1567_m3784
ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 4 ; t13 t12
vbroadcasti128 m11, [o(deint_shuf)]
vpbroadcastd m12, [o(pw_2896x8)]
psubsw m6, m0, m1 ; t3a t2a
ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12
vbroadcasti128 m12, [o(deint_shuf)]
paddsw m6, m4, m7 ; -out1 out14
psubsw m4, m7 ; t10 t11
psubsw m11, m3, m8 ; t7 t6
paddsw m8, m3 ; out12 -out3
psubsw m3, m0, m1 ; t3a t2a
paddsw m0, m1 ; -out15 out0
paddsw m1, m2, m5 ; -out13 out2
psubsw m5, m2 ; t15a t14a
paddsw m2, m4, m7 ; -out1 out14
psubsw m4, m7 ; t10 t11
psubsw m7, m3, m8 ; t6 t7
paddsw m8, m3 ; -out3 out12
REPX {pshufb x, m11}, m6, m4, m0, m2
vpblendd m3, m6, m4, 0xcc ; t3a t11
shufps m6, m6, m4, q1032 ; t2a t10
vpblendd m4, m5, m7, 0xcc ; t15a t7
shufps m5, m5, m7, q1032 ; t14a t6
shufps m7, m2, m0, q1032 ; out14 -out15
vpblendd m0, m0, m2, 0x33 ; -out1 out0
paddw m2, m5, m4 ; -out5 out4
psubw m5, m4 ; out10 -out11
psubw m4, m6, m3 ; out8 -out9
paddw m3, m6 ; -out7 out6
shufps m6, m8, m1, q1032 ; out12 -out13
vpblendd m1, m1, m8, 0x33 ; -out3 out2
REPX {pmulhrsw x, m12}, m2, m3, m4, m5
pshufb m0, m12
pshufb m6, m12
pshufb m8, m12
pshufb m1, m12
shufps m7, m6, m0, q1032 ; out14 -out15
vpblendd m0, m6, 0x33 ; -out1 out0
punpcklqdq m6, m8, m1 ; out12 -out13
punpckhqdq m1, m8, m1 ; -out3 out2
ret
ALIGN function_align
.main_pass1_end:
vpbroadcastd m8, [o(pw_m2896_2896)]
vpbroadcastd m12, [o(pw_2896_2896)]
pmaddwd m9, m8, m11 ; -out11
pmaddwd m2, m12, m5 ; -out5
pmaddwd m5, m8 ; out10
pmaddwd m11, m12 ; out4
REPX {paddd x, m10}, m9, m5, m2, m11
REPX {psrad x, 12 }, m9, m5, m2, m11
packssdw m5, m9 ; out10 -out11
packssdw m2, m11 ; -out5 out4
pmaddwd m11, m8, m3 ; out8
vpbroadcastd m8, [o(pw_2896_m2896)]
pmaddwd m3, m12 ; -out7
pmaddwd m8, m4 ; -out9
pmaddwd m4, m12 ; out6
REPX {paddd x, m10}, m11, m3, m8, m4
REPX {psrad x, 12 }, m11, m3, m8, m4
packssdw m3, m4 ; -out7 out6
packssdw m4, m11, m8 ; out8 -out9
vpbroadcastd m10, [o(pw_16384)]
pxor m9, m9
ret
ALIGN function_align
.main_pass2_end:
vpbroadcastd m8, [o(pw_2896x8)]
pshufb m2, m11, m12
pshufb m5, m12
pshufb m3, m12
pshufb m4, m12
punpcklqdq m11, m5, m2 ; t15a t7
punpckhqdq m5, m2 ; t14a t6
shufps m2, m3, m4, q1032 ; t2a t10
vpblendd m3, m4, 0xcc ; t3a t11
psubsw m4, m2, m3 ; out8 -out9
paddsw m3, m2 ; -out7 out6
paddsw m2, m5, m11 ; -out5 out4
psubsw m5, m11 ; out10 -out11
REPX {pmulhrsw x, m8}, m2, m3, m4, m5
ret
INV_TXFM_8X16_FN flipadst, dct
@ -1972,6 +2078,7 @@ INV_TXFM_8X16_FN flipadst, identity
cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
vpbroadcastd m9, [o(pw_16384)]
pslld m10, m9, 17
psubw m10, m9 ; -16384, 16384
@ -1990,6 +2097,7 @@ cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
jmp m(idct_8x16_internal).pass1_end2
.pass2:
call m(iadst_8x16_internal).main
call m(iadst_8x16_internal).main_pass2_end
vpbroadcastd m8, [o(pw_2048)]
vpbroadcastd xm9, [o(pw_4096)]
psubw m8, m9
@ -2232,7 +2340,7 @@ cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpermq m1, [cq+32*1], q1230
vpermq m2, [cq+32*2], q2103
call m(iadst_4x16_internal).main2
pshufd m2, m2, q1032
call m(iadst_4x16_internal).main_pass1_end
punpcklwd m4, m3, m1
punpcklwd m5, m2, m0
punpckhwd m0, m1
@ -2276,20 +2384,26 @@ cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
RET
ALIGN function_align
.main:
vpbroadcastd m6, [o(pw_m3344_3344)]
vpbroadcastd m7, [o(pw_3803_1321)]
vpbroadcastd m8, [o(pw_m1321_2482)]
vpbroadcastd m9, [o(pw_2482_3344)]
punpcklwd m4, m2, m0 ; in2 in0 l
psubw m6, m0, m2
punpckhwd m2, m0 ; in2 in0 h
paddw m6, m3 ; t2
psrld m5, m6, 16
pmaddwd m10, m6, m4 ; t2:02 l
pmaddwd m6, m2 ; t2:02 h
pmaddwd m0, m7, m4 ; t0:02 l
pmaddwd m7, m2 ; t0:02 h
pmaddwd m4, m8 ; t1:02 l
pmaddwd m8, m2 ; t1:02 h
punpckhwd m2, m3, m1 ; in3 in1 h
punpcklwd m3, m1 ; in3 in1 l
pmaddwd m1, m5, m2 ; t2:3 h
pmaddwd m5, m3 ; t2:3 l
paddd m6, m1
vpbroadcastd m1, [o(pd_2048)]
paddd m10, m5
pmaddwd m5, m9, m3
pmaddwd m9, m2
paddd m0, m1
@ -2299,6 +2413,8 @@ ALIGN function_align
vpbroadcastd m9, [o(pw_m3803_3344)]
pmaddwd m5, m9, m2
pmaddwd m9, m3
paddd m10, m1 ; t2 + 2048 l
paddd m6, m1 ; t2 + 2048 h
paddd m5, m1 ; t1:13 + 2048 h
paddd m1, m9 ; t1:13 + 2048 l
vpbroadcastd m9, [o(pw_m3803_m6688)]
@ -2310,12 +2426,11 @@ ALIGN function_align
paddd m4, m0
paddd m2, m8 ; t0 + t1 - t3 + 2048 h
paddd m3, m4 ; t0 + t1 - t3 + 2048 l
REPX {psrad x, 12}, m0, m7, m5, m1, m2, m3
REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
packssdw m0, m7
packssdw m1, m5
packssdw m3, m2
vpbroadcastd m2, [o(pw_3344x8)]
pmulhrsw m2, m6
packssdw m2, m10, m6
ret
INV_TXFM_16X4_FN flipadst, dct
@ -2329,7 +2444,7 @@ cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpermq m1, [cq+32*1], q1230
vpermq m2, [cq+32*2], q2103
call m(iadst_4x16_internal).main2
pshufd m2, m2, q1032
call m(iadst_4x16_internal).main_pass1_end
punpckhwd m4, m3, m2
punpckhwd m5, m1, m0
punpcklwd m0, m2
@ -2552,7 +2667,7 @@ INV_TXFM_16X8_FN adst, identity
cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 1302
call m(iadst_8x16_internal).main2
vpbroadcastd m10, [o(pw_16384)]
call m(iadst_8x16_internal).main_pass1_end
psubw m11, m9, m10
punpcklwd m8, m0, m2
punpckhwd m0, m2
@ -2567,7 +2682,7 @@ cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ALIGN function_align
.pass2:
call .main
vpbroadcastd m9, [o(pw_2048)]
call .main_pass2_end
pxor m8, m8
psubw m8, m9
REPX {pmulhrsw x, m9}, m0, m2, m4, m6
@ -2591,21 +2706,50 @@ ALIGN function_align
ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
psubsw m9, m6, m8 ; t7
paddsw m6, m8 ; out6
vpbroadcastd m8, [o(pw_2896x8)]
psubsw m3, m7, m5 ; t3
paddsw m7, m5 ; -out7
psubsw m5, m0, m2 ; t2
paddsw m0, m2 ; out0
psubsw m2, m1, m4 ; t6
paddsw m1, m4 ; -out1
psubw m4, m5, m3
paddw m3, m5
psubw m5, m2, m9
paddw m2, m9
ret
ALIGN function_align
.main_pass1_end:
vpbroadcastd m11, [o(pw_m2896_2896)]
vpbroadcastd m12, [o(pw_2896_2896)]
punpckhwd m4, m3, m5
punpcklwd m3, m5
pmaddwd m5, m11, m4
pmaddwd m4, m12
pmaddwd m8, m11, m3
pmaddwd m3, m12
REPX {paddd x, m10}, m5, m4, m8, m3
REPX {psrad x, 12 }, m5, m8, m4, m3
packssdw m3, m4 ; -out3
packssdw m4, m8, m5 ; out4
punpcklwd m5, m9, m2
punpckhwd m9, m2
pmaddwd m2, m12, m5
pmaddwd m5, m11
pmaddwd m12, m9
pmaddwd m11, m9
REPX {paddd x, m10}, m2, m5, m12, m11
REPX {psrad x, 12 }, m2, m12, m5, m11
packssdw m2, m12 ; out2
packssdw m5, m11 ; -out5
ret
ALIGN function_align
.main_pass2_end:
vpbroadcastd m8, [o(pw_2896x8)]
psubsw m4, m5, m3
paddsw m3, m5
psubsw m5, m2, m9
paddsw m2, m9
pmulhrsw m2, m8 ; out2
pmulhrsw m3, m8 ; -out3
pmulhrsw m4, m8 ; out4
pmulhrsw m5, m8 ; -out5
vpbroadcastd m9, [o(pw_2048)]
ret
INV_TXFM_16X8_FN flipadst, dct
@ -2616,7 +2760,7 @@ INV_TXFM_16X8_FN flipadst, identity
cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 1302
call m(iadst_8x16_internal).main2
vpbroadcastd m10, [o(pw_16384)]
call m(iadst_8x16_internal).main_pass1_end
psubw m9, m10
punpcklwd m8, m6, m4
punpckhwd m6, m4
@ -2655,7 +2799,7 @@ cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(iadst_16x8_internal).main
vpbroadcastd m9, [o(pw_2048)]
call m(iadst_16x8_internal).main_pass2_end
pxor m8, m8
psubw m8, m9
pmulhrsw m10, m7, m8
@ -2986,8 +3130,12 @@ INV_TXFM_16X16_FN adst, flipadst
cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call .main
vpbroadcastd m1, [o(pw_8192)]
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
call .main_pass1_end
pmulhrsw m0, m1, [cq+32*0]
pmulhrsw m2, m1, [cq+32*1]
REPX {pmulhrsw x, m1}, m4, m6, m8, m10
pmulhrsw m12, m1, [cq+32*2]
pmulhrsw m14, m1, [cq+32*3]
vextracti128 [rsp+16*5], m8, 1
mova [rsp+16*1], xm8
pxor m8, m8
@ -2996,7 +3144,7 @@ cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ALIGN function_align
.pass2:
call .main
vpbroadcastd m1, [o(pw_2048)]
call .main_pass2_end
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
mova [rsp+32*0], m6
pxor m6, m6
@ -3081,16 +3229,73 @@ ALIGN function_align
paddsw m0, m12 ; out0
paddsw m12, m8, m5 ; out12
psubsw m8, m5 ; t7
paddw m5, m10, m11 ; -out5
psubw m10, m11 ; out10
psubw m11, m4, m8 ; -out11
paddw m4, m8 ; out4
psubw m8, m7, m9 ; out8
paddw m7, m9 ; -out7
psubw m9, m1, m6 ; -out9
paddw m6, m1 ; out6
ret
ALIGN function_align
.main_pass1_end:
mova [cq+32*0], m0
mova [cq+32*1], m2
mova [cq+32*2], m12
mova [cq+32*3], m14
vpbroadcastd m14, [pw_m2896_2896]
vpbroadcastd m12, [pw_2896_2896]
vpbroadcastd m2, [pd_2048]
punpcklwd m5, m11, m10
punpckhwd m11, m10
pmaddwd m10, m14, m5
pmaddwd m0, m14, m11
pmaddwd m5, m12
pmaddwd m11, m12
REPX {paddd x, m2}, m10, m0, m5, m11
REPX {psrad x, 12}, m10, m0, m5, m11
packssdw m10, m0 ; out10
packssdw m5, m11 ; -out5
punpcklwd m11, m8, m4
punpckhwd m8, m4
pmaddwd m4, m12, m11
pmaddwd m0, m12, m8
pmaddwd m11, m14
pmaddwd m8, m14
REPX {paddd x, m2}, m4, m0, m11, m8
REPX {psrad x, 12}, m4, m0, m11, m8
packssdw m4, m0 ; out4
packssdw m11, m8 ; -out11
punpcklwd m8, m9, m7
punpckhwd m9, m7
pmaddwd m7, m12, m8
pmaddwd m0, m12, m9
pmaddwd m8, m14
pmaddwd m9, m14
REPX {paddd x, m2}, m7, m0, m8, m9
REPX {psrad x, 12}, m7, m0, m8, m9
packssdw m7, m0 ; -out7
packssdw m8, m9 ; out8
punpckhwd m0, m6, m1
punpcklwd m6, m1
pmaddwd m1, m14, m0
pmaddwd m9, m14, m6
pmaddwd m0, m12
pmaddwd m6, m12
REPX {paddd x, m2}, m1, m9, m0, m6
REPX {psrad x, 12}, m1, m9, m0, m6
packssdw m9, m1 ; -out7
packssdw m6, m0 ; out8
vpbroadcastd m1, [o(pw_8192)]
ret
ALIGN function_align
.main_pass2_end:
; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
; 16-bit here will produce the same result as using 32-bit intermediates.
paddsw m5, m10, m11 ; -out5
psubsw m10, m11 ; out10
psubsw m11, m4, m8 ; -out11
paddsw m4, m8 ; out4
psubsw m8, m7, m9 ; out8
paddsw m7, m9 ; -out7
psubsw m9, m1, m6 ; -out9
paddsw m6, m1 ; out6
vpbroadcastd m1, [o(pw_2896x8)]
REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
vpbroadcastd m1, [o(pw_2048)]
ret
INV_TXFM_16X16_FN flipadst, dct
@ -3100,16 +3305,16 @@ INV_TXFM_16X16_FN flipadst, flipadst
cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call m(iadst_16x16_internal).main
vpbroadcastd m1, [o(pw_8192)]
call m(iadst_16x16_internal).main_pass1_end
pmulhrsw m6, m1
pmulhrsw m2, m1, m8
mova [rsp+32*2], m6
pmulhrsw m6, m1, m4
pmulhrsw m4, m1, m10
pmulhrsw m10, m1, m12
pmulhrsw m12, m1, m2
pmulhrsw m2, m1, m8
pmulhrsw m8, m1, m14
pmulhrsw m14, m1, m0
pmulhrsw m8, m1, [cq+32*3]
pmulhrsw m10, m1, [cq+32*2]
pmulhrsw m12, m1, [cq+32*1]
pmulhrsw m14, m1, [cq+32*0]
pxor m0, m0
psubw m0, m1
REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15
@ -3136,7 +3341,7 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
jmp m(idct_16x16_internal).pass1_end3
.pass2:
call m(iadst_16x16_internal).main
vpbroadcastd m1, [o(pw_2048)]
call m(iadst_16x16_internal).main_pass2_end
pmulhrsw m0, m1
pmulhrsw m8, m1
mova [rsp+32*0], m0

View File

@ -43,8 +43,11 @@ pw_1321_3803: times 4 dw 1321, 3803
pw_2482_m1321: times 4 dw 2482, -1321
pw_3344_2482: times 4 dw 3344, 2482
pw_3344_m3803: times 4 dw 3344, -3803
pw_3344_m3344: times 4 dw 3344, -3344
pw_0_3344 times 4 dw 0, 3344
pw_m6688_m3803: times 4 dw -6688, -3803
COEF_PAIR 2896, 2896
COEF_PAIR 1567, 3784
COEF_PAIR 799, 4017
COEF_PAIR 3406, 2276
@ -126,7 +129,6 @@ pw_2675x8: times 8 dw 2675*8
pw_4085x8: times 8 dw 4085*8
pw_m301x8: times 8 dw -301*8
iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
@ -200,7 +202,6 @@ SECTION .text
ret
%endmacro
; flags: 1 = swap, 2: coef_regs
%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
%if %6 & 2
@ -239,35 +240,6 @@ SECTION .text
paddsw m0, m2 ;high: out1 ;low: out0
%endmacro
; 4-point inverse ADST, "packed" layout: two words per lane, with
; m0 = {in1|in0} and m1 = {in3|in2} (row layout established by the
; punpcklwd/punpckhwd comments below).  Products are widened to 32 bit
; with pmaddwd using the AV1 iadst4 constants (1321/2482/3344/3803),
; rounded via +2048 and >>12, then repacked to words.
; Outputs: m0 = {out1|out0}, m1 low half = out2 (from the pmulhrsw),
; m2 = out3 duplicated into both halves.  Clobbers m3-m5.
; NOTE(review): out2 = 3344*(in0 - in2 + in3) is computed with a single
; pmulhrsw by pw_3344x8 rather than the pmaddwd path — presumably an
; exactness-preserving shortcut; confirm against the reference iadst4.
%macro IADST4_1D_PACKED 0
punpcklwd m2, m0, m1 ;unpacked in0 in2
punpckhwd m3, m0, m1 ;unpacked in1 in3
psubw m0, m1 ;{in1 - in3 | in0 - in2}
punpckhqdq m1, m1 ;broadcast the in3 half to both qwords
paddw m1, m0 ;low: in0 - in2 + in3
pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
paddd m4, m0 ;t0 + t3
pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
pmulhrsw m1, [o(pw_3344x8)] ;low: out2
mova m0, [o(pd_2048)]
paddd m2, m0 ;t1 + 2048
paddd m0, m4 ;t0 + t3 + 2048
paddd m5, m2 ;t1 + t3 + 2048
paddd m2, m4 ;t0 + t1 + 2048 (t3 still included)
paddd m2, m3 ;t0 + t1 - t3 + 2048
psrad m0, 12 ;out0
psrad m5, 12 ;out1
psrad m2, 12 ;out3
packssdw m0, m5 ;high: out1 ;low: out0
packssdw m2, m2 ;high: out3 ;low: out3
%endmacro
%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
%undef cmp
@ -392,15 +364,14 @@ cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
call .main
punpckhwd m3, m0, m2
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m3 ;high: in3 ;low :in2
punpcklwd m0, m3 ;high: in1 ;low: in0
punpckhwd m1, m0, m2 ;high: in3 ;low :in2
punpcklwd m0, m2 ;high: in1 ;low: in0
jmp tx2q
.pass2:
call .main
punpcklqdq m1, m2 ;out2 out3
.end:
pxor m2, m2
@ -412,7 +383,28 @@ cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ALIGN function_align
.main:
IADST4_1D_PACKED
punpcklwd m2, m0, m1 ;unpacked in0 in2
punpckhwd m0, m1 ;unpacked in1 in3
mova m3, m0
pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
pmaddwd m0, [o(pw_0_3344)] ;3344 * in3
paddd m1, m0 ;t2
pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
paddd m4, m0 ;t0 + t3
pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
mova m0, [o(pd_2048)]
paddd m1, m0 ;t2 + 2048
paddd m2, m0
paddd m0, m4 ;t0 + t3 + 2048
paddd m5, m2 ;t1 + t3 + 2048
paddd m2, m4
paddd m2, m3 ;t0 + t1 - t3 + 2048
REPX {psrad x, 12}, m1, m0, m5, m2
packssdw m0, m5 ;high: out1 ;low: out0
packssdw m1, m2 ;high: out3 ;low: out3
ret
INV_TXFM_4X4_FN flipadst, dct, 0
@ -424,16 +416,14 @@ cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
call m(iadst_4x4_internal).main
punpcklwd m1, m0
punpckhwd m2, m0
punpcklwd m0, m2, m1 ;high: in3 ;low :in2
punpckhwd m2, m1 ;high: in1 ;low: in0
mova m1, m2
punpcklwd m2, m1, m0
punpckhwd m1, m0
punpcklwd m0, m1, m2 ;high: in3 ;low :in2
punpckhwd m1, m2 ;high: in1 ;low: in0
jmp tx2q
.pass2:
call m(iadst_4x4_internal).main
punpcklqdq m1, m2 ;out2 out3
.end:
pxor m2, m2
@ -584,99 +574,6 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
mova m%4, m%5
%endmacro
; 4-point inverse ADST, full width: m0-m3 hold rows in0..in3, eight word
; columns per register.  The low and high word halves of each row are
; widened separately (punpcklwd / punpckhwd), run through the same
; pmaddwd constant pairs as the packed variant, rounded (+2048, >>12),
; and repacked.  Outputs: m0 = out0, m1 = out1, m2 = out2 (via the
; pmulhrsw by pw_3344x8), m3 = out3.  Clobbers m4-m7.
%macro IADST4_1D 0
mova m4, m2 ;keep in2; m2 is reused as the out2 accumulator
psubw m2, m0, m4
paddw m2, m3 ;in0 - in2 + in3 (all eight columns)
punpckhwd m6, m0, m4 ;unpacked in0 in2
punpckhwd m7, m1, m3 ;unpacked in1 in3
punpcklwd m0, m4 ;unpacked in0 in2
punpcklwd m1, m3 ;unpacked in1 in3
; --- low halves ---
pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
paddd m3, m4 ;t0 + t3
pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
pmulhrsw m2, [o(pw_3344x8)] ;out2
mova m4, [o(pd_2048)]
paddd m0, m4
paddd m4, m3 ;t0 + t3 + 2048
paddd m5, m0 ;t1 + t3 + 2048
paddd m3, m0
paddd m3, m1 ;t0 + t1 - t3 + 2048
psrad m4, 12 ;out0
psrad m5, 12 ;out1
psrad m3, 12 ;out3
packssdw m0, m4, m5 ;low: out0 high: out1
; --- high halves (same computation on m6/m7) ---
pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
paddd m1, m4 ;t0 + t3
pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
mova m4, [o(pd_2048)]
paddd m6, m4
paddd m4, m1 ;t0 + t3 + 2048
paddd m5, m6 ;t1 + t3 + 2048
paddd m1, m6
paddd m1, m7 ;t0 + t1 - t3 + 2048
psrad m4, 12 ;out0
psrad m5, 12 ;out1
psrad m1, 12 ;out3
packssdw m3, m1 ;out3
packssdw m4, m5 ;low: out0 high: out1
punpckhqdq m1, m0, m4 ;out1
punpcklqdq m0, m4 ;out0
%endmacro
; 8-point inverse ADST, "packed" layout: m0-m3 carry the eight input rows
; two-per-register (the exact pairing is given by the unpack comments
; below: in7/in0, in5/in2, in3/in4, in1/in6).  Butterflies use
; ITX_MUL2X_PACK with pd_2048 rounding in m6; the final stage scales by
; 2896/4096 via pmulhrsw with pw_2896x8.
; Outputs: m0 = {-out1|out0}, m1 = {-out3|out2},
;          m2 = {-out5|out4}, m3 = {-out7|out6}.
; Clobbers m4-m6.
; NOTE(review): the last stage here uses 16-bit pmulhrsw for out2..out5;
; the updated code elsewhere in this patch replaces this with 32-bit
; pmaddwd intermediates in pass 1 — this macro is the pre-change form.
%macro IADST8_1D_PACKED 0
mova m6, [o(pd_2048)]
punpckhwd m4, m3, m0 ;unpacked in7 in0
punpckhwd m5, m2, m1 ;unpacked in5 in2
punpcklwd m1, m2 ;unpacked in3 in4
punpcklwd m0, m3 ;unpacked in1 in6
ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
psubsw m3, m4, m1 ;low: t4 high: t5
paddsw m4, m1 ;low: t0 high: t1
psubsw m2, m5, m0 ;low: t6 high: t7
paddsw m5, m0 ;low: t2 high: t3
shufps m1, m3, m2, q1032 ;pair t4/t5 with t6/t7 for the next rotation
punpckhwd m2, m1
punpcklwd m3, m1
ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
psubsw m1, m4, m5 ;low: t2 high: t3
paddsw m4, m5 ;low: out0 high: -out7
psubsw m5, m3, m2 ;low: t7 high: t6
paddsw m3, m2 ;low: out6 high: -out1
shufps m0, m4, m3, q3210 ;low: out0 high: -out1
shufps m3, m4, q3210 ;low: out6 high: -out7
shufps m4, m1, m5, q1032 ;low: t3 high: t7
shufps m1, m5, q3210 ;low: t2 high: t6
mova m5, [o(pw_2896x8)]
psubw m2, m1, m4 ;low: t2-t3 high: t6-t7
paddw m1, m4 ;low: t2+t3 high: t6+t7
pmulhrsw m2, m5 ;low: out4 high: -out5
shufps m1, m1, q1032
pmulhrsw m1, m5 ;low: out2 high: -out3
%endmacro
%macro WRITE_4X8 4 ;row[1-4]
WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4
lea dstq, [dstq+strideq*4]
@ -838,7 +735,48 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ALIGN function_align
.main:
IADST8_1D_PACKED
mova m6, [o(pd_2048)]
punpckhwd m4, m3, m0 ;unpacked in7 in0
punpckhwd m5, m2, m1 ;unpacked in5 in2
punpcklwd m1, m2 ;unpacked in3 in4
punpcklwd m0, m3 ;unpacked in1 in6
ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
psubsw m3, m4, m1 ;low: t4 high: t5
paddsw m4, m1 ;low: t0 high: t1
psubsw m2, m5, m0 ;low: t6 high: t7
paddsw m5, m0 ;low: t2 high: t3
shufps m1, m3, m2, q1032
punpckhwd m2, m1
punpcklwd m3, m1
ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
psubsw m1, m4, m5 ;low: t2 high: t3
paddsw m4, m5 ;low: out0 high: -out7
psubsw m5, m3, m2 ;low: t7 high: t6
paddsw m3, m2 ;low: out6 high: -out1
shufps m0, m4, m3, q3210 ;low: out0 high: -out1
shufps m3, m4, q3210 ;low: out6 high: -out7
mova m2, [o(pw_2896_m2896)]
mova m7, [o(pw_2896_2896)]
shufps m4, m1, m5, q1032 ;low: t3 high: t7
shufps m1, m5, q3210 ;low: t2 high: t6
punpcklwd m5, m1, m4
punpckhwd m1, m4
pmaddwd m4, m2, m1 ;-out5
pmaddwd m2, m5 ; out4
pmaddwd m1, m7 ; out2
pmaddwd m5, m7 ;-out3
REPX {paddd x, m6}, m4, m2, m1, m5
REPX {psrad x, 12}, m4, m2, m1, m5
packssdw m1, m5 ;low: out2 high: -out3
packssdw m2, m4 ;low: out4 high: -out5
ret
INV_TXFM_4X8_FN flipadst, dct, 0
@ -1109,7 +1047,67 @@ cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ALIGN function_align
.main:
IADST4_1D
punpckhwd m6, m0, m2 ;unpacked in0 in2
punpcklwd m0, m2 ;unpacked in0 in2
punpckhwd m7, m1, m3 ;unpacked in1 in3
punpcklwd m1, m3 ;unpacked in1 in3
mova m2, [o(pw_3344_m3344)]
mova m4, [o(pw_0_3344)]
pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2
pmaddwd m5, m4, m7 ;3344 * in3
pmaddwd m2, m0
pmaddwd m4, m1
paddd m3, m5
paddd m2, m4
mova m4, [o(pd_2048)]
paddd m3, m4 ;t2 + 2048
paddd m2, m4
psrad m3, 12
psrad m2, 12
packssdw m2, m3 ;out2
pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
paddd m3, m4 ;t0 + t3
pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
mova m4, [o(pd_2048)]
paddd m0, m4
paddd m4, m3 ;t0 + t3 + 2048
paddd m5, m0 ;t1 + t3 + 2048
paddd m3, m0
paddd m3, m1 ;t0 + t1 - t3 + 2048
psrad m4, 12 ;out0
psrad m5, 12 ;out1
psrad m3, 12 ;out3
packssdw m0, m4, m5 ;low: out0 high: out1
pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
paddd m1, m4 ;t0 + t3
pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
mova m4, [o(pd_2048)]
paddd m6, m4
paddd m4, m1 ;t0 + t3 + 2048
paddd m5, m6 ;t1 + t3 + 2048
paddd m1, m6
paddd m1, m7 ;t0 + t1 - t3 + 2048
psrad m4, 12 ;out0
psrad m5, 12 ;out1
psrad m1, 12 ;out3
packssdw m3, m1 ;out3
packssdw m4, m5 ;low: out0 high: out1
punpckhqdq m1, m0, m4 ;out1
punpcklqdq m0, m4 ;out0
ret
INV_TXFM_8X4_FN flipadst, dct
@ -1423,6 +1421,7 @@ cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1:
call .main
call .main_pass1_end
.pass1_end:
mova m7, [o(pw_16384)]
@ -1441,6 +1440,7 @@ ALIGN function_align
.pass2_main:
call .main
call .main_pass2_end
.end:
mova m7, [o(pw_2048)]
@ -1491,10 +1491,57 @@ ALIGN function_align
psubsw m5, m6 ;t6
paddsw m6, m2, m7 ;out6
psubsw m2, m7 ;t7
paddw m7, m4, m3 ;t2 + t3
psubw m4, m3 ;t2 - t3
paddw m3, m5, m2 ;t6 + t7
psubw m5, m2 ;t6 - t7
ret
ALIGN function_align
.main_pass1_end:
mova [rsp+gprsize*2+16*1], m1
mova [rsp+gprsize*2+16*2], m6
punpckhwd m1, m4, m3
punpcklwd m4, m3
punpckhwd m7, m5, m2
punpcklwd m5, m2
mova m2, [o(pw_2896_2896)]
mova m6, [o(pd_2048)]
pmaddwd m3, m2, m7
pmaddwd m2, m5
paddd m3, m6
paddd m2, m6
psrad m3, 12
psrad m2, 12
packssdw m2, m3 ;out2
mova m3, [o(pw_2896_m2896)]
pmaddwd m7, m3
pmaddwd m5, m3
paddd m7, m6
paddd m5, m6
psrad m7, 12
psrad m5, 12
packssdw m5, m7 ;-out5
mova m3, [o(pw_2896_2896)]
pmaddwd m7, m3, m1
pmaddwd m3, m4
paddd m7, m6
paddd m3, m6
psrad m7, 12
psrad m3, 12
packssdw m3, m7 ;-out3
mova m7, [o(pw_2896_m2896)]
pmaddwd m1, m7
pmaddwd m4, m7
paddd m1, m6
paddd m4, m6
psrad m1, 12
psrad m4, 12
packssdw m4, m1 ;-out5
mova m1, [rsp+gprsize*2+16*1]
mova m6, [rsp+gprsize*2+16*2]
ret
ALIGN function_align
.main_pass2_end:
paddsw m7, m4, m3 ;t2 + t3
psubsw m4, m3 ;t2 - t3
paddsw m3, m5, m2 ;t6 + t7
psubsw m5, m2 ;t6 - t7
mova m2, [o(pw_2896x8)]
pmulhrsw m4, m2 ;out4
pmulhrsw m5, m2 ;-out5
@ -1513,6 +1560,7 @@ cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1:
call m(iadst_8x8_internal).main
call m(iadst_8x8_internal).main_pass1_end
.pass1_end:
mova m7, [o(pw_m16384)]
@ -1542,6 +1590,7 @@ ALIGN function_align
.pass2_main:
call m(iadst_8x8_internal).main
call m(iadst_8x8_internal).main_pass2_end
.end:
mova m7, [o(pw_2048)]
@ -1753,6 +1802,7 @@ cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2:
call m(iadst_16x4_internal).main
call m(iadst_16x4_internal).main_pass2_end
punpcklqdq m6, m5, m4 ;low: -out5 high: -out7
punpckhqdq m4, m5 ;low: out8 high: out10
@ -1820,6 +1870,7 @@ cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2:
call m(iadst_16x4_internal).main
call m(iadst_16x4_internal).main_pass2_end
punpckhqdq m6, m5, m4 ;low: out5 high: out7
punpcklqdq m4, m5 ;low: -out8 high: -out10
@ -2160,6 +2211,7 @@ INV_TXFM_16X4_FN adst, identity
cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16
call .main
call .main_pass1_end
punpckhwd m6, m7, m0 ;packed -out11, -out15
punpcklwd m0, m7 ;packed out0, out4
@ -2193,92 +2245,137 @@ cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ALIGN function_align
.main:
mova [coeffq+16*6], m0
pshufd m1, m1, q1032
pshufd m0, m1, q1032
pshufd m2, m2, q1032
punpckhwd m0, m6, m1 ;packed in13, in2
punpcklwd m1, m6 ;packed in3, in12
punpckhwd m6, m5, m2 ;packed in11, in4
punpckhwd m1, m6, m0 ;packed in13, in2
punpcklwd m0, m6 ;packed in3, in12
punpckhwd m7, m5, m2 ;packed in11, in4
punpcklwd m2, m5 ;packed in5, in10
mova m7, [o(pd_2048)]
ITX_MUL2X_PACK 0, 5, 7, 995, 3973 ;low:t2 high:t3
ITX_MUL2X_PACK 6, 5, 7, 1751, 3703 ;low:t4 high:t5
ITX_MUL2X_PACK 2, 5, 7, 3513, 2106 ;low:t10 high:t11
ITX_MUL2X_PACK 1, 5, 7, 3857, 1380 ;low:t12 high:t13
psubsw m5, m0, m2 ;low:t10a high:t11a
paddsw m0, m2 ;low:t2a high:t3a
psubsw m2, m6, m1 ;low:t12a high:t13a
paddsw m6, m1 ;low:t4a high:t5a
punpcklqdq m1, m5
punpckhwd m1, m5 ;packed t10a, t11a
mova m6, [o(pd_2048)]
ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3
ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5
ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11
ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13
psubsw m5, m1, m2 ;low:t10a high:t11a
paddsw m1, m2 ;low:t2a high:t3a
psubsw m2, m7, m0 ;low:t12a high:t13a
paddsw m7, m0 ;low:t4a high:t5a
punpcklqdq m0, m5
punpckhwd m0, m5 ;packed t10a, t11a
punpcklqdq m5, m2
punpckhwd m2, m5 ;packed t13a, t12a
ITX_MUL2X_PACK 1, 5, 7, 3406, 2276 ;low:t10 high:t11
ITX_MUL2X_PACK 2, 5, 7, 4017, 799, 1 ;low:t12 high:t13
mova [coeffq+16*4], m0
mova [coeffq+16*5], m6
mova m0, [coeffq+16*6]
mova m6, [coeffq+16*7]
pshufd m0, m0, q1032
ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11
ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13
mova [coeffq+16*4], m1
mova [coeffq+16*5], m7
mova m1, [coeffq+16*6]
mova m7, [coeffq+16*7]
pshufd m1, m1, q1032
pshufd m3, m3, q1032
punpckhwd m5, m6, m0 ;packed in15, in0
punpcklwd m0, m6 ;packed in1, in14
punpckhwd m6, m4, m3 ;packed in9, in6
punpckhwd m5, m7, m1 ;packed in15, in0
punpcklwd m1, m7 ;packed in1, in14
punpckhwd m7, m4, m3 ;packed in9, in6
punpcklwd m3, m4 ;packed in7, in8
ITX_MUL2X_PACK 5, 4, 7, 201, 4091 ;low:t0 high:t1
ITX_MUL2X_PACK 6, 4, 7, 2440, 3290 ;low:t6 high:t7
ITX_MUL2X_PACK 3, 4, 7, 3035, 2751 ;low:t8 high:t9
ITX_MUL2X_PACK 0, 4, 7, 4052, 601 ;low:t14 high:t15
ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1
ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7
ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9
ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15
psubsw m4, m5, m3 ;low:t8a high:t9a
paddsw m5, m3 ;low:t0a high:t1a
psubsw m3, m6, m0 ;low:t14a high:t15a
paddsw m6, m0 ;low:t6a high:t7a
punpcklqdq m0, m4
punpckhwd m0, m4 ;packed t8a, t9a
psubsw m3, m7, m1 ;low:t14a high:t15a
paddsw m7, m1 ;low:t6a high:t7a
punpcklqdq m1, m4
punpckhwd m1, m4 ;packed t8a, t9a
punpcklqdq m4, m3
punpckhwd m3, m4 ;packed t15a, t14a
ITX_MUL2X_PACK 0, 4, 7, 799, 4017 ;low:t8 high:t9
ITX_MUL2X_PACK 3, 4, 7, 2276, 3406, 1 ;low:t14 high:t15
psubsw m4, m0, m2 ;low:t12a high:t13a
paddsw m0, m2 ;low:t8a high:t9a
psubsw m2, m1, m3 ;low:t14a high:t15a
paddsw m1, m3 ;low:t10a high:t11a
punpcklqdq m3, m4
punpckhwd m3, m4 ;packed t12a, t13a
punpcklqdq m4, m2
punpckhwd m2, m4 ;packed t15a, t14a
ITX_MUL2X_PACK 3, 4, 7, 1567, 3784 ;low:t12 high:t13
ITX_MUL2X_PACK 2, 4, 7, 3784, 1567, 1 ;low:t14 high:t15
psubsw m4, m0, m1 ;low:t10 high:t11
paddsw m0, m1 ;low:-out1 high:out14
ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9
ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15
paddsw m4, m1, m2 ;low:t12a high:t13a
psubsw m1, m2 ;low:t8a high:t9a
psubsw m2, m0, m3 ;low:t14a high:t15a
paddsw m0, m3 ;low:t10a high:t11a
punpcklqdq m3, m1
punpckhwd m3, m1 ;packed t12a, t13a
punpcklqdq m1, m2
punpckhwd m2, m1 ;packed t15a, t14a
ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13
ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15
psubsw m1, m3, m2 ;low:t14a high:t15a
paddsw m3, m2 ;low:out2 high:-out13
punpckhqdq m2, m4, m1 ;low:t11 high:t15a
punpcklqdq m4, m1 ;low:t10 high:t14a
psubw m1, m4, m2
paddw m2, m4
psubsw m2, m4, m0 ;low:t10 high:t11
paddsw m0, m4 ;low:-out1 high:out14
mova [coeffq+16*6], m0
mova [coeffq+16*7], m3
mova m0, [coeffq+16*4]
mova m3, [coeffq+16*5]
psubsw m4, m5, m3 ;low:t4 high:t5
paddsw m5, m3 ;low:t0 high:t1
psubsw m3, m0 ,m6 ;low:t6 high:t7
paddsw m0, m6 ;low:t2 high:t3
punpcklqdq m6, m4
punpckhwd m6, m4 ;packed t4, t5
psubsw m3, m0, m7 ;low:t6 high:t7
paddsw m0, m7 ;low:t2 high:t3
punpcklqdq m7, m4
punpckhwd m7, m4 ;packed t4, t5
punpcklqdq m4, m3
punpckhwd m3, m4 ;packed t7, t6
ITX_MUL2X_PACK 6, 4, 7, 1567, 3784 ;low:t4a high:t5a
ITX_MUL2X_PACK 3, 4, 7, 3784, 1567, 1 ;low:t6a high:t7a
ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a
ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a
psubsw m4, m5, m0 ;low:t2a high:t3a
paddsw m0, m5 ;low:out0 high:-out15
psubsw m5, m6, m3 ;low:t6 high:t7
paddsw m3, m6 ;low:-out3 high:out12
psubsw m5, m7, m3 ;low:t6 high:t7
paddsw m3, m7 ;low:-out3 high:out12
ret
ALIGN function_align
.main_pass1_end:
mova m7, [o(deint_shuf1)]
mova [coeffq+16*4], m0
mova [coeffq+16*5], m3
mova m0, [o(pw_2896_m2896)]
mova m3, [o(pw_2896_2896)]
pshufb m1, m7 ;t14a t15a
pshufb m2, m7 ;t10 t11
pshufb m4, m7 ;t2a t3a
pshufb m5, m7 ;t6 t7
pmaddwd m7, m0, m2
pmaddwd m2, m3
paddd m7, m6
paddd m2, m6
psrad m7, 12
psrad m2, 12
packssdw m2, m7 ;low:out6 high:-out9
pmaddwd m7, m0, m4
pmaddwd m4, m3
paddd m7, m6
paddd m4, m6
psrad m7, 12
psrad m4, 12
packssdw m4, m7 ;low:-out7 high:out8
pmaddwd m7, m3, m5
pmaddwd m5, m0
paddd m7, m6
paddd m5, m6
psrad m7, 12
psrad m5, 12
packssdw m7, m5 ;low:out4 high:-out11
pmaddwd m5, m3, m1
pmaddwd m1, m0
paddd m5, m6
paddd m1, m6
psrad m5, 12
psrad m1, 12
packssdw m5, m1 ;low:-out5 high:out10
mova m0, [coeffq+16*4]
mova m3, [coeffq+16*5]
ret
ALIGN function_align
.main_pass2_end:
mova m7, [o(pw_2896x8)]
punpckhqdq m6, m2, m1 ;low:t11 high:t15a
punpcklqdq m2, m1 ;low:t10 high:t14a
psubsw m1, m2, m6
paddsw m2, m6
punpckhqdq m6, m4, m5 ;low:t3a high:t7
punpcklqdq m4, m5 ;low:t2a high:t6
psubw m5, m4, m6
paddw m4, m6
psubsw m5, m4, m6
paddsw m4, m6
pmulhrsw m1, m7 ;low:-out9 high:out10
pmulhrsw m2, m7 ;low:out6 high:-out5
pmulhrsw m5, m7 ;low:out8 high:-out11
@ -2298,6 +2395,7 @@ INV_TXFM_16X4_FN flipadst, identity
cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16
call m(iadst_16x4_internal).main
call m(iadst_16x4_internal).main_pass1_end
punpcklwd m6, m7, m0 ;packed out11, out15
punpckhwd m0, m7 ;packed -out0, -out4
@ -2360,7 +2458,7 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%endmacro
%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*12
INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*16
%ifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
@ -2548,6 +2646,7 @@ cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m7, [coeffq+16*11]
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass2_end
mov r3, dstq
lea dstq, [dstq+strideq*8]
@ -2599,6 +2698,7 @@ cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m7, [coeffq+16*11]
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass2_end
jmp m(iflipadst_8x8_internal).end
.end:
@ -2652,7 +2752,7 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*12
INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*16
%ifidn %1_%2, dct_dct
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
@ -2893,6 +2993,7 @@ cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m7, [coeffq+16*13]
call .main
call .main_pass1_end
mov r3, tx2q
lea tx2q, [o(m(iadst_16x8_internal).pass1_end)]
jmp m(iadst_8x8_internal).pass1_end
@ -2998,23 +3099,15 @@ ALIGN function_align
mova [rsp+gprsize*2+16*6], m3 ;-out3
psubsw m3, m0, m4 ;t7
paddsw m0, m4 ;out12
mova m7, [o(pw_2896x8)]
psubw m4, m2, m3
paddw m2, m3
mova [rsp+gprsize*2+16*12], m3
mova m3, [rsp+gprsize*2+16*7] ;t3
pmulhrsw m4, m7 ;-out11
pmulhrsw m2, m7 ;out4
mova [rsp+gprsize*2+16*7], m2 ;out4
mova [rsp+gprsize*2+16* 7], m2 ;out4
psubsw m2, m5, m3 ;t3a
paddsw m5, m3 ;-out15
psubw m3, m1, m2
paddw m1, m2
mova [rsp+gprsize*2+16*11], m2
mova m2, [rsp+gprsize*2+32*5] ;t15
pmulhrsw m3, m7 ;out8
pmulhrsw m1, m7 ;-out7
mova [rsp+gprsize*2+32*5 ], m1 ;-out7
mova [rsp+gprsize*2+16*10], m1 ;-out7
mova m1, [rsp+gprsize*2+16*0] ;t11
mova [rsp+gprsize*2+16*11], m3 ;out8
mova [rsp+gprsize*2+16*0 ], m5 ;-out15
mova m3, [rsp+gprsize*2+16*1] ;t10
mova [rsp+gprsize*2+16*1 ], m4 ;-out11
@ -3044,26 +3137,106 @@ ALIGN function_align
paddsw m2, m6 ;-out1
paddsw m6, m4, m1 ;out14
psubsw m4, m1 ;t11
psubw m1, m3, m4
paddw m3, m4
pmulhrsw m1, m7 ;-out9
pmulhrsw m3, m7 ;out6
mova [rsp+gprsize*2+16*4], m2 ;-out1
mova [rsp+gprsize*2+16*14], m4
mova [rsp+gprsize*2+16* 4], m2 ;-out1
mova m4, [rsp+gprsize*2+16*8] ;t14
mova m2, [rsp+gprsize*2+16*9] ;t15
mova [rsp+gprsize*2+16*9], m3 ;out6
mova [rsp+gprsize*2+16* 9], m3 ;out6
psubsw m3, m0, m4 ;t14a
paddsw m0, m4 ;out2
psubsw m4, m5, m2 ;t15a
paddsw m5, m2 ;-out13
psubw m2, m3, m4
paddw m3, m4
mova [rsp+gprsize*2+16*5], m0 ;out2
pmulhrsw m3, m7 ;-out5
mova [rsp+gprsize*2+16* 5], m0 ;out2
ret
ALIGN function_align
.main_pass1_end:
mova m0, [rsp+gprsize*2+16*14]
mova [rsp+gprsize*2+16*14], m5
mova [rsp+gprsize*2+16*15], m6
mova m5, [o(pw_2896_2896)]
mova m6, [o(pw_2896_m2896)]
mova m7, [o(pd_2048)]
punpcklwd m2, m3, m4
punpckhwd m3, m4
pmaddwd m4, m5, m2
pmaddwd m2, m6
pmaddwd m1, m5, m3
pmaddwd m3, m6
REPX {paddd x, m7}, m4, m2, m1, m3
REPX {psrad x, 12}, m4, m1, m2, m3
packssdw m4, m1 ;-out5
packssdw m2, m3 ;out10
mova [rsp+gprsize*2+16* 8], m4
mova m3, [rsp+gprsize*2+16* 9]
punpcklwd m1, m3, m0
punpckhwd m3, m0
pmaddwd m0, m5, m1
pmaddwd m1, m6
pmaddwd m4, m5, m3
pmaddwd m3, m6
REPX {paddd x, m7}, m0, m1, m4, m3
REPX {psrad x, 12}, m0, m4, m1, m3
packssdw m0, m4 ;out6
packssdw m1, m3 ;-out9
mova [rsp+gprsize*2+16* 9], m0
mova m0, [rsp+gprsize*2+16* 7]
mova m4, [rsp+gprsize*2+16*12]
punpcklwd m3, m0, m4
punpckhwd m0, m4
pmaddwd m4, m5, m3
pmaddwd m3, m6
pmaddwd m5, m0
pmaddwd m0, m6
REPX {paddd x, m7}, m4, m3, m5, m0
REPX {psrad x, 12}, m4, m5, m3, m0
packssdw m4, m5 ;out4
packssdw m3, m0 ;-out11
mova [rsp+gprsize*2+16* 7], m4
mova m4, [rsp+gprsize*2+16*10]
mova m5, [rsp+gprsize*2+16*11]
punpcklwd m0, m4, m5
punpckhwd m4, m5
pmaddwd m5, m0, [o(pw_2896_2896)]
pmaddwd m0, m6
pmaddwd m6, m4
pmaddwd m4, [o(pw_2896_2896)]
REPX {paddd x, m7}, m5, m0, m6, m4
REPX {psrad x, 12}, m0, m6, m5, m4
packssdw m0, m6 ;out8
packssdw m5, m4 ;-out7
mova [rsp+gprsize*2+16*10], m5
mova m4, [rsp+gprsize*2+16* 2] ;out12
mova m5, [rsp+gprsize*2+16*14] ;-out13
mova m6, [rsp+gprsize*2+16*15] ;out14
ret
ALIGN function_align
.main_pass2_end:
mova m7, [o(pw_2896x8)]
mova m1, [rsp+gprsize*2+16* 9]
mova m2, [rsp+gprsize*2+16*14]
paddsw m0, m1, m2
psubsw m1, m2
pmulhrsw m0, m7 ;out6
pmulhrsw m1, m7 ;-out9
mova [rsp+gprsize*2+16* 9], m0
psubsw m2, m3, m4
paddsw m3, m4
pmulhrsw m2, m7 ;out10
mova [rsp+gprsize*2+16*8], m3 ;-out5
mova m0, [rsp+gprsize*2+16*11] ;out8
mova m3, [rsp+gprsize*2+16*1 ] ;-out11
pmulhrsw m3, m7 ;-out5
mova [rsp+gprsize*2+16* 8], m3
mova m3, [rsp+gprsize*2+16* 7]
mova m4, [rsp+gprsize*2+16*12]
paddsw m0, m3, m4
psubsw m3, m4
pmulhrsw m0, m7 ;out4
pmulhrsw m3, m7 ;-out11
mova [rsp+gprsize*2+16* 7], m0
mova m0, [rsp+gprsize*2+16*10]
paddsw m4, m0, [rsp+gprsize*2+16*11]
psubsw m0, [rsp+gprsize*2+16*11]
pmulhrsw m4, m7 ;-out7
pmulhrsw m0, m7 ;out8
mova [rsp+gprsize*2+16*10], m4
mova m4, [rsp+gprsize*2+16*2 ] ;out12
ret
@ -3100,6 +3273,7 @@ cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m7, [coeffq+16*13]
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
mova m7, [rsp+gprsize+16*0]
SAVE_8ROWS coeffq+16*0, 32
@ -3184,7 +3358,7 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*12
INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*16
%ifidn %1_%2, dct_dct
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
@ -3423,6 +3597,7 @@ INV_TXFM_16X16_FN adst, flipadst
cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X16_ADST_LOAD_ODD_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
mov r3, tx2q
lea tx2q, [o(m(iadst_16x16_internal).pass1_end)]
@ -3441,6 +3616,7 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_8ROWS coeffq+16*1, 32
ITX_16X16_ADST_LOAD_EVEN_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
lea tx2q, [o(m(iadst_16x16_internal).pass1_end2)]
mova m7, [o(pw_8192)]
@ -3496,6 +3672,7 @@ INV_TXFM_16X16_FN flipadst, flipadst
cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X16_ADST_LOAD_ODD_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
mov r3, tx2q
lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end)]
@ -3514,6 +3691,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_8ROWS coeffq+16*17, 32
ITX_16X16_ADST_LOAD_EVEN_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
mova m7, [rsp+gprsize+16*0]
SAVE_8ROWS coeffq+16*0, 32

View File

@ -28,14 +28,27 @@
#include "src/cpu.h"
#include "src/loopfilter.h"
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_avx2);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_avx2);
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_avx2);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_avx2);
#define decl_loopfilter_sb_fns(ext) \
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_##ext); \
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_##ext); \
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_##ext); \
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext)
decl_loopfilter_sb_fns(ssse3);
decl_loopfilter_sb_fns(avx2);
COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_ssse3;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64

File diff suppressed because it is too large Load Diff

View File

@ -170,8 +170,6 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.put:
movzx wd, word [t2+wq*2+table_offset(put,)]
add wq, t2
lea t1, [ssq*3]
lea t2, [dsq*3]
jmp wq
.put_w2:
movzx t0d, word [srcq+ssq*0]
@ -194,11 +192,11 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
jg .put_w4
RET
.put_w8:
movq m0, [srcq+ssq*0]
movq m1, [srcq+ssq*1]
mov t0, [srcq+ssq*0]
mov t1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movq [dstq+dsq*0], m0
movq [dstq+dsq*1], m1
mov [dstq+dsq*0], t0
mov [dstq+dsq*1], t1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
@ -206,30 +204,22 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.put_w16:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*2]
movu m3, [srcq+t1 ]
lea srcq, [srcq+ssq*4]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
mova [dstq+dsq*2], m2
mova [dstq+t2 ], m3
lea dstq, [dstq+dsq*4]
sub hd, 4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
INIT_YMM avx2
.put_w32:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*2]
movu m3, [srcq+t1 ]
lea srcq, [srcq+ssq*4]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
mova [dstq+dsq*2], m2
mova [dstq+t2 ], m3
lea dstq, [dstq+dsq*4]
sub hd, 4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w32
RET
.put_w64:

View File

@ -177,7 +177,6 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
.put:
movzx wd, word [t0+wq*2+table_offset(put,)]
add wq, t0
lea r6, [ssq*3]
RESTORE_DSQ_32 t0
jmp wq
.put_w2:
@ -211,20 +210,14 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
jg .put_w8
RET
.put_w16:
lea r4, [dsq*3]
.put_w16_in:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*2]
movu m3, [srcq+r6 ]
lea srcq, [srcq+ssq*4]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
mova [dstq+dsq*2], m2
mova [dstq+r4 ], m3
lea dstq, [dstq+dsq*4]
sub hd, 4
jg .put_w16_in
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
.put_w32:
movu m0, [srcq+ssq*0+16*0]

View File

@ -27,7 +27,7 @@
SECTION_RODATA 64 ; avoids cacheline splits
dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
min_prob: dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
pw_0xff00: times 8 dw 0xff00
pw_32: times 8 dw 32
@ -35,21 +35,24 @@ pw_32: times 8 dw 32
%define resp resq
%define movp movq
%define c_shuf q3333
%define DECODE_SYMBOL_ADAPT_INIT
%macro DECODE_SYMBOL_ADAPT_INIT 0-1
%endmacro
%else
%define resp resd
%define movp movd
%define c_shuf q1111
%macro DECODE_SYMBOL_ADAPT_INIT 0
%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok
mov t0, r0m
mov t1, r1m
%if %1 == 0
mov t2, r2m
%endif
%if STACK_ALIGNMENT >= 16
sub esp, 40
sub esp, 40-%1*4
%else
mov eax, esp
and esp, ~15
sub esp, 40
sub esp, 40-%1*4
mov [esp], eax
%endif
%endmacro
@ -69,13 +72,13 @@ endstruc
SECTION .text
%if WIN64
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3
%define buf rsp+8 ; shadow space
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8
%define buf rsp+stack_offset+8 ; shadow space
%elif UNIX64
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8
%define buf rsp-40 ; red zone
%else
DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2
DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3
%define buf esp+8
%endif
@ -88,7 +91,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
movp m3, [t0+msac.dif]
mov t3d, [t0+msac.update_cdf]
mov t4d, t2d
neg t2
not t2 ; -(n_symbols + 1)
pshuflw m2, m2, q0000
movd [buf+12], m2
pand m2, [rax]
@ -112,15 +115,15 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
pcmpeqw m2, m2
mov t2d, t3d
shr t3d, 4
cmp t4d, 4
sbb t3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
cmp t4d, 3
sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4
cmp t2d, 32
adc t2d, 0 ; count + (count < 32)
movd m3, t3d
pavgw m2, m1 ; i >= val ? -1 : 32768
psubw m2, m0 ; for (i = 0; i < val; i++)
psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
psraw m2, m3 ; for (; i < n_symbols - 1; i++)
psraw m2, m3 ; for (; i < n_symbols; i++)
paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
movq [t1], m0
mov [t1+t4*2], t2w
@ -214,11 +217,11 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
DECODE_SYMBOL_ADAPT_INIT
LEA rax, pw_0xff00
movd m2, [t0+msac.rng]
movu m1, [t1]
mova m1, [t1]
movp m3, [t0+msac.dif]
mov t3d, [t0+msac.update_cdf]
mov t4d, t2d
neg t2
not t2
pshuflw m2, m2, q0000
movd [buf+12], m2
punpcklqdq m2, m2
@ -242,7 +245,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
pcmpeqw m2, m2
mov t2d, t3d
shr t3d, 4
cmp t4d, 4 ; may be called with n_symbols < 4
cmp t4d, 3 ; may be called with n_symbols <= 2
sbb t3d, -5
cmp t2d, 32
adc t2d, 0
@ -252,7 +255,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
psubw m0, m1
psraw m2, m3
paddw m0, m2
movu [t1], m0
mova [t1], m0
mov [t1+t4*2], t2w
jmp m(msac_decode_symbol_adapt4).renorm
@ -260,12 +263,12 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
DECODE_SYMBOL_ADAPT_INIT
LEA rax, pw_0xff00
movd m4, [t0+msac.rng]
movu m2, [t1]
movu m3, [t1+16]
mova m2, [t1]
mova m3, [t1+16]
movp m5, [t0+msac.dif]
mov t3d, [t0+msac.update_cdf]
mov t4d, t2d
neg t2
not t2
%if WIN64
sub rsp, 48 ; need 36 bytes, shadow space is only 32
%endif
@ -288,8 +291,8 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
punpcklqdq m5, m5
paddw m3, m4
mova [buf], m2
mova [buf+16], m3
psubusw m2, m5
mova [buf+16], m3
psubusw m3, m5
pxor m4, m4
pcmpeqw m2, m4
@ -301,7 +304,7 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
movzx t3d, word [t1+t4*2]
pcmpeqw m4, m4
mova m5, m4
lea t2d, [t3+80] ; only support n_symbols >= 4
lea t2d, [t3+80] ; only support n_symbols > 2
shr t2d, 4
cmp t3d, 32
adc t3d, 0
@ -316,8 +319,8 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
psraw m5, m2
paddw m0, m4
paddw m1, m5
movu [t1], m0
movu [t1+16], m1
mova [t1], m0
mova [t1+16], m1
mov [t1+t4*2], t3w
.renorm:
tzcnt eax, eax
@ -440,3 +443,158 @@ cglobal msac_decode_bool, 0, 6, 0
movzx eax, al
%endif
jmp m(msac_decode_symbol_adapt4).renorm3
; Decode a run of "hi_tok" symbols from the MSAC bitstream, looping until the
; decoded token terminates the sequence.
; %1 (update_cdf): when 1, the CDF at [t1] is adapted after every decoded
; symbol; when 0 the CDF is read-only.
; NOTE(review): the register/stack layout (buf, t0-t8, the msac struct
; offsets, m0-m5 preload) is established by DECODE_SYMBOL_ADAPT_INIT and the
; msac_decode_hi_tok entry code below — comments here only describe what is
; visible in this macro; confirm details against the entry point.
%macro HI_TOK 1 ; update_cdf
 %if ARCH_X86_64 == 0
    mov eax, -24                ; 32-bit: token counter kept in eax (64-bit uses t6d)
 %endif
%%loop:
 %if %1
    movzx t2d, word [t1+3*2]    ; load adaptation count from the cdf array
 %endif
    mova m1, m0
    pshuflw m2, m2, q0000       ; broadcast rng
    psrlw m1, 6
    movd [buf+12], m2
    pand m2, m4                 ; rng & 0xff00
    psllw m1, 7
    pmulhuw m1, m2              ; scale cdf probabilities by rng
 %if ARCH_X86_64 == 0
    add eax, 5
    mov [buf+8], eax            ; 32-bit: spill the counter; no spare GPR
 %endif
    pshuflw m3, m3, c_shuf      ; broadcast the relevant dif word
    paddw m1, m5                ; add min_prob offsets
    movq [buf+16], m1           ; store candidate range values for lookup below
    psubusw m1, m3
    pxor m2, m2
    pcmpeqw m1, m2              ; mask of symbols with v <= dif
    pmovmskb eax, m1
 %if %1
    ; CDF adaptation: rate = 4 + (count >> 4) + 4 (via lea +80 then shr 4),
    ; count saturates at 32.
    lea ecx, [t2+80]
    pcmpeqw m2, m2
    shr ecx, 4
    cmp t2d, 32
    adc t2d, 0                  ; count += (count < 32)
    movd m3, ecx
    pavgw m2, m1                ; i >= val ? -1 : 32768
    psubw m2, m0
    psubw m0, m1
    psraw m2, m3
    paddw m0, m2                ; cdf[i] += (delta >> rate)
    movq [t1], m0
    mov [t1+3*2], t2w           ; store updated count
 %endif
    tzcnt eax, eax              ; index of the decoded symbol (in bytes)
    movzx ecx, word [buf+rax+16] ; new range low bound u
    movzx t2d, word [buf+rax+14] ; previous bound v
    not t4
 %if ARCH_X86_64
    add t6d, 5
 %endif
    sub eax, 5 ; setup for merging the tok_br and tok branches
    sub t2d, ecx                ; rng = v - u
    shl rcx, gprsize*8-16
    add t4, rcx                 ; dif -= u (in one's-complement form)
    ; renormalize: shift rng/dif left so rng's msb is set
    bsr ecx, t2d
    xor ecx, 15
    shl t2d, cl
    shl t4, cl
    movd m2, t2d
    mov [t7+msac.rng], t2d
    not t4
    sub t5d, ecx                ; cnt -= shift
    jge %%end                   ; enough bits left? skip refill
    ; --- refill dif from the input buffer ---
    mov t2, [t7+msac.buf]
    mov rcx, [t7+msac.end]
 %if UNIX64 == 0
    push t8                     ; t8 aliases another reg on win64/x86-32; save it
 %endif
    lea t8, [t2+gprsize]
    cmp t8, rcx
    ja %%refill_eob             ; fewer than gprsize bytes left -> bytewise path
    mov t2, [t2]
    lea ecx, [t5+23]
    add t5d, 16
    shr ecx, 3                  ; number of bytes consumed
    bswap t2                    ; input is big-endian within the word
    sub t8, rcx
    shl ecx, 3
    shr t2, cl
    sub ecx, t5d
    mov t5d, gprsize*8-16
    shl t2, cl
    mov [t7+msac.buf], t8
 %if UNIX64 == 0
    pop t8
 %endif
    sub t5d, ecx
    xor t4, t2                  ; merge new bits into dif
%%end:
    movp m3, t4
 %if ARCH_X86_64
    add t6d, eax ; CF = tok_br < 3 || tok == 15
    jnc %%loop
    lea eax, [t6+30]
 %else
    add eax, [buf+8]            ; 32-bit: reload spilled counter
    jnc %%loop
    add eax, 30
  %if STACK_ALIGNMENT >= 16
    add esp, 36                 ; undo DECODE_SYMBOL_ADAPT_INIT's stack frame
  %else
    mov esp, [esp]
  %endif
 %endif
    ; write back msac state and return the final token value
    mov [t7+msac.dif], t4
    shr eax, 1
    mov [t7+msac.cnt], t5d
    RET
%%refill_eob:                   ; near end-of-buffer: refill one byte at a time
    mov t8, rcx
    mov ecx, gprsize*8-24
    sub ecx, t5d
%%refill_eob_loop:
    cmp t2, t8
    jae %%refill_eob_end        ; buffer exhausted
    movzx t5d, byte [t2]
    inc t2
    shl t5, cl
    xor t4, t5
    sub ecx, 8
    jge %%refill_eob_loop
%%refill_eob_end:
 %if UNIX64 == 0
    pop t8
 %endif
    mov t5d, gprsize*8-24
    mov [t7+msac.buf], t2
    sub t5d, ecx
    jmp %%end
%endmacro
; unsigned dav1d_msac_decode_hi_tok(MsacContext *s, uint16_t *cdf)
; Loads the msac state (rng/dif/cnt) and the cdf into registers, then
; dispatches to the cdf-updating or read-only instantiation of HI_TOK
; depending on s->update_cdf, so neither path pays a per-iteration branch.
cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
    DECODE_SYMBOL_ADAPT_INIT 1
 %if ARCH_X86_64 == 0 && PIC
    ; x86-32 PIC: materialize a base pointer for RODATA accesses
    LEA t2, min_prob+12*2
    %define base t2-(min_prob+12*2)
 %else
    %define base 0
 %endif
    movq m0, [t1]               ; cdf
    movd m2, [t0+msac.rng]
    mov eax, [t0+msac.update_cdf]
    movq m4, [base+pw_0xff00]
    movp m3, [t0+msac.dif]
    movq m5, [base+min_prob+12*2]
    mov t4, [t0+msac.dif]
    mov t5d, [t0+msac.cnt]
 %if ARCH_X86_64
    mov t6d, -24                ; token counter (kept in memory on x86-32)
 %endif
    movifnidn t7, t0            ; keep the MsacContext pointer live across the loop
    test eax, eax
    jz .no_update_cdf
    HI_TOK 1
.no_update_cdf:
    HI_TOK 0

View File

@ -37,11 +37,13 @@ unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2
#endif
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2

View File

@ -65,6 +65,7 @@ static const struct {
{ "msac", checkasm_check_msac },
#if CONFIG_8BPC
{ "cdef_8bpc", checkasm_check_cdef_8bpc },
{ "filmgrain_8bpc", checkasm_check_filmgrain_8bpc },
{ "ipred_8bpc", checkasm_check_ipred_8bpc },
{ "itx_8bpc", checkasm_check_itx_8bpc },
{ "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
@ -73,6 +74,7 @@ static const struct {
#endif
#if CONFIG_16BPC
{ "cdef_16bpc", checkasm_check_cdef_16bpc },
{ "filmgrain_16bpc", checkasm_check_filmgrain_16bpc },
{ "ipred_16bpc", checkasm_check_ipred_16bpc },
{ "itx_16bpc", checkasm_check_itx_16bpc },
{ "loopfilter_16bpc", checkasm_check_loopfilter_16bpc },
@ -703,6 +705,7 @@ void checkasm_set_signal_handler_state(const int enabled) {
RemoveVectoredExceptionHandler(signal_handler);
#else
void (*const handler)(int) = enabled ? signal_handler : SIG_DFL;
signal(SIGBUS, handler);
signal(SIGFPE, handler);
signal(SIGILL, handler);
signal(SIGSEGV, handler);

View File

@ -60,6 +60,7 @@ name##_16bpc(void)
void checkasm_check_msac(void);
decl_check_bitfns(void checkasm_check_cdef);
decl_check_bitfns(void checkasm_check_filmgrain);
decl_check_bitfns(void checkasm_check_ipred);
decl_check_bitfns(void checkasm_check_itx);
decl_check_bitfns(void checkasm_check_loopfilter);
@ -279,7 +280,7 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
}\
} while (0)
#else
#define bench_new(...) while (0)
#define bench_new(...) do {} while (0)
#endif
#define DECL_CHECKASM_CHECK_FUNC(type) \

View File

@ -0,0 +1,269 @@
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include <string.h>
#include "src/levels.h"
#include "src/film_grain.h"
#define UNIT_TEST 1
#include "src/fg_apply_tmpl.c"
/* Checkasm test: compare the DSP implementation of generate_grain_y against
 * the C reference for every AR coefficient lag (0..3) at the current
 * BITDEPTH, with randomized film-grain parameters, then benchmark it. */
static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
    entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
    /* NOTE(review): one extra row for the asm output buffer — presumably
     * optimized implementations may scribble one row past the end; the
     * memcmp below only covers GRAIN_HEIGHT rows. Confirm. */
    entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];

    declare_func(void, entry grain_lut[][GRAIN_WIDTH],
                 const Dav1dFilmGrainData *data HIGHBD_DECL_SUFFIX);

    for (int i = 0; i < 4; i++) {
        if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) {
            /* NOTE(review): fg_data is deliberately not zero-initialized;
             * only the fields set below may be read by generate_grain_y —
             * confirm against the implementation. */
            Dav1dFilmGrainData fg_data;
            fg_data.seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; /* 10- or 12-bit */
#endif

            fg_data.grain_scale_shift = rnd() & 3;
            fg_data.ar_coeff_shift = (rnd() & 3) + 6;
            fg_data.ar_coeff_lag = i;
            /* number of luma AR coefficients for this lag */
            const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
            for (int n = 0; n < num_y_pos; n++)
                fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128; /* [-128, 127] */

            call_ref(grain_lut_c, &fg_data HIGHBD_TAIL_SUFFIX);
            call_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
            if (memcmp(grain_lut_c, grain_lut_a,
                       GRAIN_WIDTH * GRAIN_HEIGHT * sizeof(entry)))
            {
                fail();
            }

            bench_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
        }
    }

    report("gen_grain_y");
}
/* Checkasm test: compare the DSP fgy_32x32xn (luma film-grain application)
 * against the C reference for a random width/height, with and without edge
 * overlap, then benchmark the overlapped case at 64x32. */
static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
    ALIGN_STK_32(pixel, src, 128 * 32,);
    const ptrdiff_t stride = 128 * sizeof(pixel);

    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
                 const Dav1dFilmGrainData *data, size_t pw,
                 const uint8_t scaling[SCALING_SIZE],
                 const entry grain_lut[][GRAIN_WIDTH],
                 int bh, int row_num HIGHBD_DECL_SUFFIX);

    if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
        /* NOTE(review): fg_data is only partially initialized; the callee
         * must read nothing beyond the fields set below — confirm. */
        Dav1dFilmGrainData fg_data;
        fg_data.seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; /* 10- or 12-bit */
#else
        const int bitdepth_max = 0xff;
#endif

        uint8_t scaling[SCALING_SIZE];
        entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
        fg_data.grain_scale_shift = rnd() & 3;
        fg_data.ar_coeff_shift = (rnd() & 3) + 6;
        fg_data.ar_coeff_lag = rnd() & 3;
        const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
        for (int n = 0; n < num_y_pos; n++)
            fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
        dsp->generate_grain_y(grain_lut, &fg_data HIGHBD_TAIL_SUFFIX);

        /* Random strictly-increasing scaling points: each x coordinate gets
         * its own pad-sized slot, so points can never collide. */
        fg_data.num_y_points = 2 + (rnd() % 13);
        const int pad = 0xff / fg_data.num_y_points;
        for (int n = 0; n < fg_data.num_y_points; n++) {
            fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
            fg_data.y_points[n][0] += rnd() % pad;
            fg_data.y_points[n][1] = rnd() & 0xff;
        }
        generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
                         fg_data.num_y_points, scaling);

        const int w = 1 + (rnd() & 127); /* 1..128 */
        const int h = 1 + (rnd() & 31);  /* 1..32 */
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++)
                src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
        /* row_num is 0 half the time — presumably to exercise the
         * top-of-frame code path; confirm against the implementation. */
        const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;

        fg_data.clip_to_restricted_range = rnd() & 1;
        fg_data.scaling_shift = (rnd() & 3) + 8;
        for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
             fg_data.overlap_flag++)
        {
            call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
                     row_num HIGHBD_TAIL_SUFFIX);
            call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
                     row_num HIGHBD_TAIL_SUFFIX);
            checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
        }

        fg_data.overlap_flag = 1;
        bench_new(a_dst, src, stride, &fg_data, 64, scaling, grain_lut, 32,
                  row_num HIGHBD_TAIL_SUFFIX);
    }
    report("fgy_32x32xn");
}
/* Verify the chroma film grain application function (fguv_32x32xn) against
 * the C reference for every chroma layout (4:2:0, 4:2:2, 4:4:4), both with
 * and without chroma-scaling-from-luma (csfl), using randomized film grain
 * parameters and randomized source/luma planes. */
static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    /* 128x32 pixel planes: reference output, asm output, chroma source and
     * the co-located luma row used for csfl / luma-mult scaling. */
    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
    ALIGN_STK_32(pixel, src, 128 * 32,);
    ALIGN_STK_32(pixel, luma_src, 128 * 32,);
    const ptrdiff_t lstride = 128 * sizeof(pixel);

    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
                 const Dav1dFilmGrainData *data, size_t pw,
                 const uint8_t scaling[SCALING_SIZE],
                 const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num,
                 const pixel *luma_row, ptrdiff_t luma_stride, int uv_pl,
                 int is_identity HIGHBD_DECL_SUFFIX);

    for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
        const char ss_name[][4] = {
            [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
            [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
            [DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
        };
        const enum Dav1dPixelLayout layout = layout_idx + 1;
        /* Horizontal subsampling for 420/422, vertical only for 420. */
        const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
        const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
        /* Deliberately use a stride that differs from the plane width when
         * subsampled, to catch code that conflates stride with width. */
        const ptrdiff_t stride = (ss_x ? 96 : 128) * sizeof(pixel);

        for (int csfl = 0; csfl <= 1; csfl++) {
            if (check_func(dsp->fguv_32x32xn[layout_idx],
                           "fguv_32x32xn_%dbpc_%s_csfl%d",
                           BITDEPTH, ss_name[layout_idx], csfl))
            {
                /* NOTE(review): fg_data is only partially initialized below;
                 * the csfl branch fills y_points/num_y_points and the other
                 * branch fills the uv_* fields. Presumably the function under
                 * test only reads the fields relevant to the chosen mode —
                 * TODO confirm; also ar_coeffs_uv appears to be read by
                 * generate_grain_uv() without being initialized here. */
                Dav1dFilmGrainData fg_data;
                fg_data.seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;  /* 10- or 12-bit */
#else
                const int bitdepth_max = 0xff;
#endif

                const int uv_pl = rnd() & 1;       /* which chroma plane (U or V) */
                const int is_identity = rnd() & 1; /* identity matrix coefficients flag */

                uint8_t scaling[SCALING_SIZE];
                entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
                fg_data.grain_scale_shift = rnd() & 3;
                fg_data.ar_coeff_shift = (rnd() & 3) + 6;
                fg_data.ar_coeff_lag = rnd() & 3;
                /* Number of AR coefficients for the given lag (per AV1 spec). */
                const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
                for (int n = 0; n < num_y_pos; n++)
                    fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                /* Chroma grain is derived from the luma grain LUT. */
                dsp->generate_grain_y(grain_lut[0], &fg_data HIGHBD_TAIL_SUFFIX);
                dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
                                                   &fg_data, uv_pl HIGHBD_TAIL_SUFFIX);

                /* Random chroma block size within the plane; lw/lh are the
                 * corresponding (possibly upsampled) luma dimensions. */
                const int w = 1 + (rnd() & (127 >> ss_x));
                const int h = 1 + (rnd() & (31 >> ss_y));
                const int lw = w << ss_x, lh = h << ss_y;

                for (int y = 0; y < h; y++)
                    for (int x = 0; x < w; x++)
                        src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
                for (int y = 0; y < lh; y++)
                    for (int x = 0; x < lw; x++)
                        luma_src[y * PXSTRIDE(lstride) + x] = rnd() & bitdepth_max;
                /* Sometimes force row 0 to exercise the no-top-overlap path. */
                const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;

                if (csfl) {
                    /* csfl: chroma scaling is derived from the luma points. */
                    fg_data.num_y_points = 2 + (rnd() % 13);
                    /* Spread points so x coordinates stay strictly increasing. */
                    const int pad = 0xff / fg_data.num_y_points;
                    for (int n = 0; n < fg_data.num_y_points; n++) {
                        fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
                        fg_data.y_points[n][0] += rnd() % pad;
                        fg_data.y_points[n][1] = rnd() & 0xff;
                    }
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
                                     fg_data.num_y_points, scaling);
                } else {
                    /* Independent chroma scaling points plus the uv mult/offset
                     * parameters used to blend luma into the scaling index. */
                    fg_data.num_uv_points[uv_pl] = 2 + (rnd() % 9);
                    const int pad = 0xff / fg_data.num_uv_points[uv_pl];
                    for (int n = 0; n < fg_data.num_uv_points[uv_pl]; n++) {
                        fg_data.uv_points[uv_pl][n][0] = 0xff * n / fg_data.num_uv_points[uv_pl];
                        fg_data.uv_points[uv_pl][n][0] += rnd() % pad;
                        fg_data.uv_points[uv_pl][n][1] = rnd() & 0xff;
                    }
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.uv_points[uv_pl],
                                     fg_data.num_uv_points[uv_pl], scaling);

                    fg_data.uv_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data.uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data.uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
                }

                fg_data.clip_to_restricted_range = rnd() & 1;
                fg_data.scaling_shift = (rnd() & 3) + 8;
                fg_data.chroma_scaling_from_luma = csfl;
                /* Check both the overlapped and non-overlapped code paths. */
                for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
                     fg_data.overlap_flag++)
                {
                    call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
                             row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
                    call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
                             row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);

                    checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
                }

                /* Benchmark with fixed 32x16 dimensions and overlap enabled. */
                fg_data.overlap_flag = 1;
                bench_new(a_dst, src, stride, &fg_data, 32, scaling, grain_lut[1], 16,
                          row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
            }
        }
    }

    report("fguv_32x32xn");
}
/* Entry point for the film grain checkasm tests: set up the
 * bitdepth-templated DSP context, then run every film grain check
 * (grain generation, luma application, chroma application). */
void bitfn(checkasm_check_filmgrain)(void) {
    Dav1dFilmGrainDSPContext dsp;

    bitfn(dav1d_film_grain_dsp_init)(&dsp);

    check_gen_grny(&dsp);
    check_fgy_sbrow(&dsp);
    check_fguv_sbrow(&dsp);
}

View File

@ -138,7 +138,7 @@ static int copy_subcoefs(coef *coeff,
 * dimensions are non-zero. This leads to branching to specific optimized
* simd versions (e.g. dc-only) so that we get full asm coverage in this
* test */
const int16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]];
const uint16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]];
const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;
const int sub_low = subsh > 1 ? sub_high - 8 : 0;
int n, eob;

View File

@ -27,8 +27,6 @@
#include "tests/checkasm/checkasm.h"
#include <assert.h>
#include "src/levels.h"
#include "src/mc.h"

View File

@ -38,7 +38,7 @@
/* The normal code doesn't use function pointers */
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
typedef unsigned (*decode_bool_adapt_fn)(MsacContext *s, uint16_t *cdf);
typedef unsigned (*decode_adapt_fn)(MsacContext *s, uint16_t *cdf);
typedef unsigned (*decode_bool_equi_fn)(MsacContext *s);
typedef unsigned (*decode_bool_fn)(MsacContext *s, unsigned f);
@ -46,17 +46,20 @@ typedef struct {
decode_symbol_adapt_fn symbol_adapt4;
decode_symbol_adapt_fn symbol_adapt8;
decode_symbol_adapt_fn symbol_adapt16;
decode_bool_adapt_fn bool_adapt;
decode_adapt_fn bool_adapt;
decode_bool_equi_fn bool_equi;
decode_bool_fn bool;
decode_adapt_fn hi_tok;
} MsacDSPContext;
static void randomize_cdf(uint16_t *const cdf, int n) {
for (int i = 16; i > n; i--)
cdf[i] = rnd(); /* randomize padding */
cdf[n] = cdf[n-1] = 0;
while (--n > 0)
cdf[n-1] = cdf[n] + rnd() % (32768 - cdf[n] - n) + 1;
static void randomize_cdf(uint16_t *const cdf, const int n) {
int i;
for (i = 15; i > n; i--)
cdf[i] = rnd(); // padding
cdf[i] = 0; // count
do {
cdf[i - 1] = cdf[i] + rnd() % (32768 - cdf[i] - i) + 1;
} while (--i > 0);
}
/* memcmp() on structs can have weird behavior due to padding etc. */
@ -69,7 +72,7 @@ static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
static void msac_dump(unsigned c_res, unsigned a_res,
const MsacContext *const a, const MsacContext *const b,
const uint16_t *const cdf_a, const uint16_t *const cdf_b,
int num_cdf)
const int num_cdf)
{
if (c_res != a_res)
fprintf(stderr, "c_res %u a_res %u\n", c_res, a_res);
@ -86,16 +89,15 @@ static void msac_dump(unsigned c_res, unsigned a_res,
if (a->allow_update_cdf)
fprintf(stderr, "allow_update_cdf %d vs %d\n",
a->allow_update_cdf, b->allow_update_cdf);
if (cdf_a != NULL && cdf_b != NULL &&
memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * num_cdf)) {
if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) {
fprintf(stderr, "cdf:\n");
for (int i = 0; i < num_cdf; i++)
for (int i = 0; i <= num_cdf; i++)
fprintf(stderr, " %5u", cdf_a[i]);
fprintf(stderr, "\n");
for (int i = 0; i < num_cdf; i++)
for (int i = 0; i <= num_cdf; i++)
fprintf(stderr, " %5u", cdf_b[i]);
fprintf(stderr, "\n");
for (int i = 0; i < num_cdf; i++)
for (int i = 0; i <= num_cdf; i++)
fprintf(stderr, " %c", cdf_a[i] != cdf_b[i] ? 'x' : '.');
fprintf(stderr, "\n");
}
@ -117,26 +119,24 @@ static void msac_dump(unsigned c_res, unsigned a_res,
{ \
if (fail()) \
msac_dump(c_res, a_res, &s_c, &s_a, \
cdf[0], cdf[1], ns + 1); \
cdf[0], cdf[1], ns); \
} \
} \
if (cdf_update && ns == n) \
bench_new(&s_a, cdf[0], n); \
if (cdf_update && ns == n - 1) \
bench_new(&s_a, cdf[1], ns); \
} \
} \
} \
} while (0)
static void check_decode_symbol(MsacDSPContext *const c, uint8_t *const buf) {
/* Use an aligned CDF buffer for more consistent benchmark
* results, and a misaligned one for checking correctness. */
ALIGN_STK_16(uint16_t, cdf, 2, [17]);
ALIGN_STK_32(uint16_t, cdf, 2, [16]);
MsacContext s_c, s_a;
declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);
CHECK_SYMBOL_ADAPT( 4, 1, 5);
CHECK_SYMBOL_ADAPT( 8, 1, 8);
CHECK_SYMBOL_ADAPT(16, 4, 16);
CHECK_SYMBOL_ADAPT( 4, 1, 4);
CHECK_SYMBOL_ADAPT( 8, 1, 7);
CHECK_SYMBOL_ADAPT(16, 3, 15);
report("decode_symbol");
}
@ -158,11 +158,11 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
memcmp(cdf[0], cdf[1], sizeof(*cdf)))
{
if (fail())
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 2);
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 1);
}
}
if (cdf_update)
bench_new(&s_a, cdf[0]);
bench_new(&s_a, cdf[1]);
}
}
@ -200,6 +200,35 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
report("decode_bool");
}
static void check_decode_hi_tok(MsacDSPContext *const c, uint8_t *const buf) {
ALIGN_STK_16(uint16_t, cdf, 2, [16]);
MsacContext s_c, s_a;
if (check_func(c->hi_tok, "msac_decode_hi_tok")) {
declare_func(unsigned, MsacContext *s, uint16_t *cdf);
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
s_a = s_c;
randomize_cdf(cdf[0], 3);
memcpy(cdf[1], cdf[0], sizeof(*cdf));
for (int i = 0; i < 64; i++) {
unsigned c_res = call_ref(&s_c, cdf[0]);
unsigned a_res = call_new(&s_a, cdf[1]);
if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
memcmp(cdf[0], cdf[1], sizeof(*cdf)))
{
if (fail())
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 3);
break;
}
}
if (cdf_update)
bench_new(&s_a, cdf[1]);
}
}
report("decode_hi_tok");
}
void checkasm_check_msac(void) {
MsacDSPContext c;
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c;
@ -208,6 +237,7 @@ void checkasm_check_msac(void) {
c.bool_adapt = dav1d_msac_decode_bool_adapt_c;
c.bool_equi = dav1d_msac_decode_bool_equi_c;
c.bool = dav1d_msac_decode_bool_c;
c.hi_tok = dav1d_msac_decode_hi_tok_c;
#if ARCH_AARCH64 && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) {
@ -226,6 +256,7 @@ void checkasm_check_msac(void) {
c.bool_adapt = dav1d_msac_decode_bool_adapt_sse2;
c.bool_equi = dav1d_msac_decode_bool_equi_sse2;
c.bool = dav1d_msac_decode_bool_sse2;
c.hi_tok = dav1d_msac_decode_hi_tok_sse2;
}
#endif
@ -235,4 +266,5 @@ void checkasm_check_msac(void) {
check_decode_symbol(&c, buf);
check_decode_bool(&c, buf);
check_decode_hi_tok(&c, buf);
}

View File

@ -41,6 +41,7 @@ if is_asm_enabled
checkasm_tmpl_sources = files(
'checkasm/cdef.c',
'checkasm/filmgrain.c',
'checkasm/ipred.c',
'checkasm/itx.c',
'checkasm/loopfilter.c',

View File

@ -29,7 +29,6 @@
#include "vcs_version.h"
#include "cli_config.h"
#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <math.h>
@ -137,7 +136,7 @@ int main(const int argc, char *const *const argv) {
Dav1dPicture p;
Dav1dContext *c;
Dav1dData data;
unsigned n_out = 0, total, fps[2];
unsigned n_out = 0, total, fps[2], timebase[2];
uint64_t nspf, tfirst, elapsed;
double i_fps;
FILE *frametimes = NULL;
@ -155,7 +154,7 @@ int main(const int argc, char *const *const argv) {
if ((res = input_open(&in, cli_settings.demuxer,
cli_settings.inputfile,
fps, &total)) < 0)
fps, &total, timebase)) < 0)
{
return res;
}

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <getopt.h>
#include <limits.h>
#include <math.h>

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
@ -60,7 +59,7 @@ static int leb128(AnnexbInputContext *const c, size_t *const len) {
}
static int annexb_open(AnnexbInputContext *const c, const char *const file,
unsigned fps[2], unsigned *const num_frames)
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
{
int res;
size_t len;
@ -73,6 +72,8 @@ static int annexb_open(AnnexbInputContext *const c, const char *const file,
// TODO: Parse sequence header and read timing info if any.
fps[0] = 25;
fps[1] = 1;
timebase[0] = 25;
timebase[1] = 1;
for (*num_frames = 0;; (*num_frames)++) {
res = leb128(c, &len);
if (res < 0)

View File

@ -36,7 +36,7 @@ typedef struct Demuxer {
const char *name;
const char *extension;
int (*open)(DemuxerPriv *ctx, const char *filename,
unsigned fps[2], unsigned *num_frames);
unsigned fps[2], unsigned *num_frames, unsigned timebase[2]);
int (*read)(DemuxerPriv *ctx, Dav1dData *data);
void (*close)(DemuxerPriv *ctx);
} Demuxer;

View File

@ -27,12 +27,13 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common/attributes.h"
#include "input/input.h"
#include "input/demuxer.h"
@ -75,7 +76,7 @@ static const char *find_extension(const char *const f) {
int input_open(DemuxerContext **const c_out,
const char *const name, const char *const filename,
unsigned fps[2], unsigned *const num_frames)
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
{
const Demuxer *impl;
DemuxerContext *c;
@ -120,7 +121,7 @@ int input_open(DemuxerContext **const c_out,
memset(c, 0, sizeof(DemuxerContext) + impl->priv_data_size);
c->impl = impl;
c->data = (DemuxerPriv *) &c[1];
if ((res = impl->open(c->data, filename, fps, num_frames)) < 0) {
if ((res = impl->open(c->data, filename, fps, num_frames, timebase)) < 0) {
free(c);
return res;
}

View File

@ -35,7 +35,7 @@ typedef struct DemuxerContext DemuxerContext;
void init_demuxers(void);
int input_open(DemuxerContext **const c_out,
const char *const name, const char *const filename,
unsigned fps[2], unsigned *num_frames);
unsigned fps[2], unsigned *num_frames, unsigned timebase[2]);
int input_read(DemuxerContext *ctx, Dav1dData *data);
void input_close(DemuxerContext *ctx);

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
@ -49,7 +48,7 @@ static int64_t rl64(const uint8_t *const p) {
}
static int ivf_open(IvfInputContext *const c, const char *const file,
unsigned fps[2], unsigned *const num_frames)
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
{
size_t res;
uint8_t hdr[32];
@ -74,17 +73,18 @@ static int ivf_open(IvfInputContext *const c, const char *const file,
return -1;
}
fps[0] = rl32(&hdr[16]);
fps[1] = rl32(&hdr[20]);
timebase[0] = rl32(&hdr[16]);
timebase[1] = rl32(&hdr[20]);
const unsigned duration = rl32(&hdr[24]);
uint8_t data[4];
for (*num_frames = 0;; (*num_frames)++) {
if ((res = fread(data, 4, 1, c->f)) != 1)
break; // EOF
fseeko(c->f, rl32(data) + 8, SEEK_CUR);
}
fps[0] *= *num_frames;
fps[1] *= duration;
fps[0] = timebase[0] * *num_frames;
fps[1] = timebase[1] * duration;
fseeko(c->f, 32, SEEK_SET);
return 0;

View File

@ -22,9 +22,38 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Build definition for the dav1d tools
#
# Common source files used by tools and examples
dav1d_input_sources = files(
'input/input.c',
'input/annexb.c',
'input/ivf.c',
)
dav1d_output_sources = files(
'output/md5.c',
'output/null.c',
'output/output.c',
'output/y4m2.c',
'output/yuv.c',
)
dav1d_input_objs = static_library('dav1d_input',
dav1d_input_sources,
include_directories : dav1d_inc_dirs,
install : false,
build_by_default : false,
)
dav1d_output_objs = static_library('dav1d_output',
dav1d_output_sources,
include_directories : dav1d_inc_dirs,
install : false,
build_by_default : false,
)
# Leave subdir if tools are disabled
if not get_option('enable_tools')
@ -32,6 +61,10 @@ if not get_option('enable_tools')
endif
#
# Build definition for the dav1d tools
#
# Configuration data for cli_config.h
cli_cdata = configuration_data()
@ -56,21 +89,13 @@ cli_config_h_target = configure_file(output: 'cli_config.h', configuration: cli_
dav1d_sources = files(
'dav1d.c',
'dav1d_cli_parse.c',
'input/input.c',
'input/annexb.c',
'input/ivf.c',
'output/md5.c',
'output/null.c',
'output/output.c',
'output/y4m2.c',
'output/yuv.c',
)
dav1d = executable('dav1d',
dav1d_sources,
rev_target, cli_config_h_target,
link_with : libdav1d,
link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],
include_directories : [dav1d_inc_dirs],
dependencies : [getopt_dependency, thread_dependency, rt_dependency],
install : true,

View File

@ -27,12 +27,13 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common/attributes.h"
#include "output/output.h"
#include "output/muxer.h"