Mirror of https://github.com/mozilla/gecko-dev.git (synced 2024-11-23 21:01:08 +00:00)

Bug 1841624 - Update dav1d to 616bfd1506a8a75c6a358e578cbec9ca11931502 r=chunmin

Differential Revision: https://phabricator.services.mozilla.com/D182716

parent 115775cd64 | commit 1f101a78be
@@ -20,11 +20,11 @@ origin:

   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 8b419c16bf1e37bc98044089da58f06824462cb9 (2023-06-02T00:00:12.000+02:00).
+  release: 616bfd1506a8a75c6a358e578cbec9ca11931502 (2023-07-01T11:36:39.000+03:00).

   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 8b419c16bf1e37bc98044089da58f06824462cb9
+  revision: 616bfd1506a8a75c6a358e578cbec9ca11931502

   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "8b419c16bf1e37bc98044089da58f06824462cb9"
+#define DAV1D_VERSION "616bfd1506a8a75c6a358e578cbec9ca11931502"
@@ -27,8 +27,8 @@
 #ifndef DAV1D_VERSION_H
 #define DAV1D_VERSION_H

-#define DAV1D_API_VERSION_MAJOR 6
-#define DAV1D_API_VERSION_MINOR 9
+#define DAV1D_API_VERSION_MAJOR 7
+#define DAV1D_API_VERSION_MINOR 0
 #define DAV1D_API_VERSION_PATCH 0

 #endif /* DAV1D_VERSION_H */
third_party/dav1d/include/common/validate.h (vendored) | 14
@@ -32,24 +32,26 @@
 #include <stdlib.h>

 #if defined(NDEBUG)
-#define debug_abort()
+#define debug_print(...) do {} while (0)
+#define debug_abort() do {} while (0)
 #else
+#define debug_print(...) fprintf(stderr, __VA_ARGS__)
 #define debug_abort abort
 #endif

 #define validate_input_or_ret_with_msg(x, r, ...) \
     if (!(x)) { \
-        fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
-                #x, __func__); \
-        fprintf(stderr, __VA_ARGS__); \
+        debug_print("Input validation check \'%s\' failed in %s!\n", \
+                    #x, __func__); \
+        debug_print(__VA_ARGS__); \
         debug_abort(); \
         return r; \
     }

 #define validate_input_or_ret(x, r) \
     if (!(x)) { \
-        fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
-                #x, __func__); \
+        debug_print("Input validation check \'%s\' failed in %s!\n", \
+                    #x, __func__); \
         debug_abort(); \
         return r; \
     }
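The hunk above routes the validation messages through debug_print, so NDEBUG builds now compile the whole check body away instead of still emitting fprintf calls. A minimal usage sketch follows; the parse_obu function is hypothetical, not part of the patch:

    #include <stddef.h>
    #include <stdint.h>
    #include "common/validate.h"

    /* Hypothetical caller, for illustration only. */
    int parse_obu(const uint8_t *data, size_t size) {
        /* In debug builds a failed check prints and aborts; in NDEBUG
         * builds it silently returns the error value. */
        validate_input_or_ret(data != NULL, -1);
        validate_input_or_ret_with_msg(size > 0, -1,
                                       "size must be nonzero, got %zu\n", size);
        /* ... real parsing would go here ... */
        return 0;
    }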
third_party/dav1d/include/dav1d/dav1d.h (vendored) | 9
@@ -103,6 +103,15 @@ typedef struct Dav1dSettings {
  */
 DAV1D_API const char *dav1d_version(void);

+/**
+ * Get library API version.
+ *
+ * @return A value in the format 0x00XXYYZZ, where XX is the major version,
+ *         YY the minor version, and ZZ the patch version.
+ * @see DAV1D_API_MAJOR, DAV1D_API_MINOR, DAV1D_API_PATCH
+ */
+DAV1D_API unsigned dav1d_version_api(void);
+
 /**
  * Initialize settings to default values.
  *
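As a usage sketch (not part of the patch), the packed value documented above can be split with plain shifts; the main function here is hypothetical:

    #include <stdio.h>
    #include <dav1d/dav1d.h>

    int main(void) {
        /* 0x00XXYYZZ: XX = major, YY = minor, ZZ = patch. */
        const unsigned v = dav1d_version_api();
        printf("dav1d API %u.%u.%u\n",
               (v >> 16) & 0xFF, (v >> 8) & 0xFF, v & 0xFF);
        return 0;
    }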
third_party/dav1d/include/dav1d/headers.h (vendored) | 239
@@ -182,8 +182,8 @@ enum Dav1dChromaSamplePosition {
 };

 typedef struct Dav1dContentLightLevel {
-    int max_content_light_level;
-    int max_frame_average_light_level;
+    uint16_t max_content_light_level;
+    uint16_t max_frame_average_light_level;
 } Dav1dContentLightLevel;

 typedef struct Dav1dMasteringDisplay {
@@ -210,7 +210,7 @@ typedef struct Dav1dSequenceHeader {
      * 1 for 8-10 bits/component 4:4:4; 2 for 4:2:2 at any bits/component,
      * or 12 bits/component at any chroma subsampling.
      */
-    int profile;
+    uint8_t profile;
     /**
      * Maximum dimensions for this stream. In non-scalable streams, these
      * are often the actual dimensions of the stream, although that is not
@@ -229,60 +229,60 @@ typedef struct Dav1dSequenceHeader {
      * (twelve_bit) to distinguish between 10 and 12 bits/component. To get
      * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2.
      */
-    int hbd;
+    uint8_t hbd;
     /**
      * Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of
      * MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma).
      */
-    int color_range;
+    uint8_t color_range;

-    int num_operating_points;
+    uint8_t num_operating_points;
     struct Dav1dSequenceHeaderOperatingPoint {
-        int major_level, minor_level;
-        int initial_display_delay;
-        int idc;
-        int tier;
-        int decoder_model_param_present;
-        int display_model_param_present;
+        uint8_t major_level, minor_level;
+        uint8_t initial_display_delay;
+        uint16_t idc;
+        uint8_t tier;
+        uint8_t decoder_model_param_present;
+        uint8_t display_model_param_present;
     } operating_points[DAV1D_MAX_OPERATING_POINTS];

-    int still_picture;
-    int reduced_still_picture_header;
-    int timing_info_present;
-    int num_units_in_tick;
-    int time_scale;
-    int equal_picture_interval;
-    unsigned num_ticks_per_picture;
-    int decoder_model_info_present;
-    int encoder_decoder_buffer_delay_length;
-    int num_units_in_decoding_tick;
-    int buffer_removal_delay_length;
-    int frame_presentation_delay_length;
-    int display_model_info_present;
-    int width_n_bits, height_n_bits;
-    int frame_id_numbers_present;
-    int delta_frame_id_n_bits;
-    int frame_id_n_bits;
-    int sb128;
-    int filter_intra;
-    int intra_edge_filter;
-    int inter_intra;
-    int masked_compound;
-    int warped_motion;
-    int dual_filter;
-    int order_hint;
-    int jnt_comp;
-    int ref_frame_mvs;
+    uint8_t still_picture;
+    uint8_t reduced_still_picture_header;
+    uint8_t timing_info_present;
+    uint32_t num_units_in_tick;
+    uint32_t time_scale;
+    uint8_t equal_picture_interval;
+    uint32_t num_ticks_per_picture;
+    uint8_t decoder_model_info_present;
+    uint8_t encoder_decoder_buffer_delay_length;
+    uint32_t num_units_in_decoding_tick;
+    uint8_t buffer_removal_delay_length;
+    uint8_t frame_presentation_delay_length;
+    uint8_t display_model_info_present;
+    uint8_t width_n_bits, height_n_bits;
+    uint8_t frame_id_numbers_present;
+    uint8_t delta_frame_id_n_bits;
+    uint8_t frame_id_n_bits;
+    uint8_t sb128;
+    uint8_t filter_intra;
+    uint8_t intra_edge_filter;
+    uint8_t inter_intra;
+    uint8_t masked_compound;
+    uint8_t warped_motion;
+    uint8_t dual_filter;
+    uint8_t order_hint;
+    uint8_t jnt_comp;
+    uint8_t ref_frame_mvs;
     enum Dav1dAdaptiveBoolean screen_content_tools;
     enum Dav1dAdaptiveBoolean force_integer_mv;
-    int order_hint_n_bits;
-    int super_res;
-    int cdef;
-    int restoration;
-    int ss_hor, ss_ver, monochrome;
-    int color_description_present;
-    int separate_uv_delta_q;
-    int film_grain_present;
+    uint8_t order_hint_n_bits;
+    uint8_t super_res;
+    uint8_t cdef;
+    uint8_t restoration;
+    uint8_t ss_hor, ss_ver, monochrome;
+    uint8_t color_description_present;
+    uint8_t separate_uv_delta_q;
+    uint8_t film_grain_present;

     // Dav1dSequenceHeaders of the same sequence are required to be
     // bit-identical until this offset. See 7.5 "Ordering of OBUs":
@@ -291,29 +291,29 @@ typedef struct Dav1dSequenceHeader {
     // sequence header appears except for the contents of
     // operating_parameters_info.
     struct Dav1dSequenceHeaderOperatingParameterInfo {
-        int decoder_buffer_delay;
-        int encoder_buffer_delay;
-        int low_delay_mode;
+        uint32_t decoder_buffer_delay;
+        uint32_t encoder_buffer_delay;
+        uint8_t low_delay_mode;
     } operating_parameter_info[DAV1D_MAX_OPERATING_POINTS];
 } Dav1dSequenceHeader;

 typedef struct Dav1dSegmentationData {
-    int delta_q;
-    int delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
-    int ref;
-    int skip;
-    int globalmv;
+    int16_t delta_q;
+    int8_t delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
+    int8_t ref;
+    uint8_t skip;
+    uint8_t globalmv;
 } Dav1dSegmentationData;

 typedef struct Dav1dSegmentationDataSet {
     Dav1dSegmentationData d[DAV1D_MAX_SEGMENTS];
-    int preskip;
-    int last_active_segid;
+    uint8_t preskip;
+    int8_t last_active_segid;
 } Dav1dSegmentationDataSet;

 typedef struct Dav1dLoopfilterModeRefDeltas {
-    int mode_delta[2 /* is_zeromv */];
-    int ref_delta[DAV1D_TOTAL_REFS_PER_FRAME];
+    int8_t mode_delta[2 /* is_zeromv */];
+    int8_t ref_delta[DAV1D_TOTAL_REFS_PER_FRAME];
 } Dav1dLoopfilterModeRefDeltas;

 typedef struct Dav1dFilmGrainData {
@@ -339,100 +339,101 @@ typedef struct Dav1dFilmGrainData {
 typedef struct Dav1dFrameHeader {
     struct {
         Dav1dFilmGrainData data;
-        int present, update;
+        uint8_t present, update;
     } film_grain; ///< film grain parameters
     enum Dav1dFrameType frame_type; ///< type of the picture
     int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
-    int frame_offset; ///< frame number
-    int temporal_id; ///< temporal id of the frame for SVC
-    int spatial_id; ///< spatial id of the frame for SVC
+    uint8_t frame_offset; ///< frame number
+    uint8_t temporal_id; ///< temporal id of the frame for SVC
+    uint8_t spatial_id; ///< spatial id of the frame for SVC

-    int show_existing_frame;
-    int existing_frame_idx;
-    int frame_id;
-    int frame_presentation_delay;
-    int show_frame;
-    int showable_frame;
-    int error_resilient_mode;
-    int disable_cdf_update;
-    int allow_screen_content_tools;
-    int force_integer_mv;
-    int frame_size_override;
-    int primary_ref_frame;
-    int buffer_removal_time_present;
+    uint8_t show_existing_frame;
+    uint8_t existing_frame_idx;
+    uint32_t frame_id;
+    uint32_t frame_presentation_delay;
+    uint8_t show_frame;
+    uint8_t showable_frame;
+    uint8_t error_resilient_mode;
+    uint8_t disable_cdf_update;
+    uint8_t allow_screen_content_tools;
+    uint8_t force_integer_mv;
+    uint8_t frame_size_override;
+    uint8_t primary_ref_frame;
+    uint8_t buffer_removal_time_present;
     struct Dav1dFrameHeaderOperatingPoint {
-        int buffer_removal_time;
+        uint32_t buffer_removal_time;
     } operating_points[DAV1D_MAX_OPERATING_POINTS];
-    int refresh_frame_flags;
+    uint8_t refresh_frame_flags;
     int render_width, render_height;
     struct {
-        int width_scale_denominator;
-        int enabled;
+        uint8_t width_scale_denominator;
+        uint8_t enabled;
     } super_res;
-    int have_render_size;
-    int allow_intrabc;
-    int frame_ref_short_signaling;
-    int refidx[DAV1D_REFS_PER_FRAME];
-    int hp;
+    uint8_t have_render_size;
+    uint8_t allow_intrabc;
+    uint8_t frame_ref_short_signaling;
+    int8_t refidx[DAV1D_REFS_PER_FRAME];
+    uint8_t hp;
     enum Dav1dFilterMode subpel_filter_mode;
-    int switchable_motion_mode;
-    int use_ref_frame_mvs;
-    int refresh_context;
+    uint8_t switchable_motion_mode;
+    uint8_t use_ref_frame_mvs;
+    uint8_t refresh_context;
     struct {
-        int uniform;
-        unsigned n_bytes;
-        int min_log2_cols, max_log2_cols, log2_cols, cols;
-        int min_log2_rows, max_log2_rows, log2_rows, rows;
+        uint8_t uniform;
+        uint8_t n_bytes;
+        uint8_t min_log2_cols, max_log2_cols, log2_cols, cols;
+        uint8_t min_log2_rows, max_log2_rows, log2_rows, rows;
         uint16_t col_start_sb[DAV1D_MAX_TILE_COLS + 1];
         uint16_t row_start_sb[DAV1D_MAX_TILE_ROWS + 1];
-        int update;
+        uint16_t update;
     } tiling;
     struct {
-        int yac;
-        int ydc_delta;
-        int udc_delta, uac_delta, vdc_delta, vac_delta;
-        int qm, qm_y, qm_u, qm_v;
+        uint8_t yac;
+        int8_t ydc_delta;
+        int8_t udc_delta, uac_delta, vdc_delta, vac_delta;
+        uint8_t qm, qm_y, qm_u, qm_v;
     } quant;
     struct {
-        int enabled, update_map, temporal, update_data;
+        uint8_t enabled, update_map, temporal, update_data;
         Dav1dSegmentationDataSet seg_data;
-        int lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS];
+        uint8_t lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS];
     } segmentation;
     struct {
         struct {
-            int present;
-            int res_log2;
+            uint8_t present;
+            uint8_t res_log2;
         } q;
         struct {
-            int present;
-            int res_log2;
-            int multi;
+            uint8_t present;
+            uint8_t res_log2;
+            uint8_t multi;
         } lf;
     } delta;
-    int all_lossless;
+    uint8_t all_lossless;
     struct {
-        int level_y[2 /* dir */];
-        int level_u, level_v;
-        int mode_ref_delta_enabled;
-        int mode_ref_delta_update;
+        uint8_t level_y[2 /* dir */];
+        uint8_t level_u, level_v;
+        uint8_t mode_ref_delta_enabled;
+        uint8_t mode_ref_delta_update;
         Dav1dLoopfilterModeRefDeltas mode_ref_deltas;
-        int sharpness;
+        uint8_t sharpness;
     } loopfilter;
     struct {
-        int damping;
-        int n_bits;
-        int y_strength[DAV1D_MAX_CDEF_STRENGTHS];
-        int uv_strength[DAV1D_MAX_CDEF_STRENGTHS];
+        uint8_t damping;
+        uint8_t n_bits;
+        uint8_t y_strength[DAV1D_MAX_CDEF_STRENGTHS];
+        uint8_t uv_strength[DAV1D_MAX_CDEF_STRENGTHS];
     } cdef;
     struct {
         enum Dav1dRestorationType type[3 /* plane */];
-        int unit_size[2 /* y, uv */];
+        uint8_t unit_size[2 /* y, uv */];
     } restoration;
     enum Dav1dTxfmMode txfm_mode;
-    int switchable_comp_refs;
-    int skip_mode_allowed, skip_mode_enabled, skip_mode_refs[2];
-    int warp_motion;
-    int reduced_txtp_set;
+    uint8_t switchable_comp_refs;
+    uint8_t skip_mode_allowed, skip_mode_enabled;
+    int8_t skip_mode_refs[2];
+    uint8_t warp_motion;
+    uint8_t reduced_txtp_set;
     Dav1dWarpedMotionParams gmv[DAV1D_REFS_PER_FRAME];
 } Dav1dFrameHeader;
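The narrowing from int to fixed-width types shrinks these headers considerably but changes struct layout, which is why the commit bumps the API major version to 7. As a hedged sketch of consuming the new fields (the helper below is hypothetical, not dav1d API): uint8_t values still promote to int through varargs, so printf-style readers keep working unchanged, but code that bound a field's address to an int pointer must be updated and rebuilt.

    #include <stdio.h>
    #include <dav1d/headers.h>

    /* Hypothetical helper, for illustration only. */
    static void print_seq_info(const Dav1dSequenceHeader *seq) {
        /* The narrow fields promote to int in varargs, so %d remains correct. */
        printf("profile=%d hbd=%d monochrome=%d\n",
               seq->profile, seq->hbd, seq->monochrome);
    }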
third_party/dav1d/include/dav1d/picture.h (vendored) | 2
@@ -91,7 +91,7 @@ typedef struct Dav1dPicture {
      */
     size_t n_itut_t35;

-    uintptr_t reserved[3]; ///< reserved for future use
+    uintptr_t reserved[4]; ///< reserved for future use

     struct Dav1dRef *frame_hdr_ref; ///< Dav1dFrameHeader allocation origin
     struct Dav1dRef *seq_hdr_ref; ///< Dav1dSequenceHeader allocation origin
third_party/dav1d/include/dav1d/version.h.in (vendored) | 8
@@ -35,6 +35,14 @@ extern "C" {
 #define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
 #define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@

+/**
+ * Extract version components from the value returned by
+ * dav1d_version_int()
+ */
+#define DAV1D_API_MAJOR(v) (((v) >> 16) & 0xFF)
+#define DAV1D_API_MINOR(v) (((v) >>  8) & 0xFF)
+#define DAV1D_API_PATCH(v) (((v) >>  0) & 0xFF)
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
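A sketch of a runtime guard built on these macros, pairing them with the dav1d_version_api() call added in dav1d.h above; the helper name is an assumption, not dav1d API:

    #include <dav1d/dav1d.h>
    #include <dav1d/version.h>

    /* Hypothetical guard: nonzero when the loaded library's API major
     * version matches the headers we compiled against. */
    static int dav1d_abi_compatible(void) {
        const unsigned v = dav1d_version_api();
        return DAV1D_API_MAJOR(v) == DAV1D_API_VERSION_MAJOR;
    }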
third_party/dav1d/meson.build (vendored) | 14
@@ -30,7 +30,7 @@ project('dav1d', ['c'],
               'b_ndebug=if-release'],
     meson_version: '>= 0.49.0')

-dav1d_soname_version = '6.9.0'
+dav1d_soname_version = '7.0.0'
 dav1d_api_version_array = dav1d_soname_version.split('.')
 dav1d_api_version_major = dav1d_api_version_array[0]
 dav1d_api_version_minor = dav1d_api_version_array[1]
@@ -149,6 +149,10 @@ else
     endif
     cdata.set('HAVE_CLOCK_GETTIME', 1)
 endif

+if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
+    cdata.set('HAVE_POSIX_MEMALIGN', 1)
+endif
+
 endif

 # check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
@@ -226,14 +230,6 @@ else
     getopt_dependency = []
 endif

-if cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args)
-    cdata.set('HAVE_ALIGNED_MALLOC', 1)
-elif cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
-    cdata.set('HAVE_POSIX_MEMALIGN', 1)
-elif cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
-    cdata.set('HAVE_MEMALIGN', 1)
-endif
-
 if (host_machine.cpu_family() == 'aarch64' or
     host_machine.cpu_family().startswith('arm') or
     host_machine.cpu() == 'ppc64le')
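These checks only set config flags; a sketch of how such HAVE_* flags are commonly consumed downstream (this wrapper is illustrative, not dav1d's actual allocator):

    #include <stdlib.h>
    #if defined(HAVE_MEMALIGN)
    #include <malloc.h>
    #endif

    /* Hypothetical portable aligned allocation keyed on the config flags. */
    static void *aligned_malloc_compat(size_t align, size_t size) {
    #if defined(HAVE_POSIX_MEMALIGN)
        void *buf;
        /* posix_memalign returns 0 on success. */
        if (posix_memalign(&buf, align, size)) return NULL;
        return buf;
    #elif defined(HAVE_MEMALIGN)
        return memalign(align, size);
    #else
        /* Fallback: no alignment guarantee beyond malloc's. */
        (void)align;
        return malloc(size);
    #endif
    }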
third_party/dav1d/src/arm/32/refmvs.S (vendored) | 206
@@ -95,3 +95,209 @@ L(splat_tbl):
         bgt 1b
         pop {r4, pc}
 endfunc
+
+const mv_tbls, align=4
+        .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+        .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
+        .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+        .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+endconst
+
+const mask_mult, align=4
+        .byte 1, 2, 1, 2, 0, 0, 0, 0
+endconst
+
+// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
+//                           refmvs_block **rr, const uint8_t *ref_sign,
+//                           int col_end8, int row_end8,
+//                           int col_start8, int row_start8)
+function save_tmvs_neon, export=1
+        push {r4-r11,lr}
+        ldrd r4, r5, [sp, #36]
+        ldrd r6, r7, [sp, #44]
+
+        vmov.i8 d30, #0
+        vld1.8 {d31}, [r3]
+        adr r8, L(save_tmvs_tbl)
+        movrel_local lr, mask_mult
+        movrel_local r12, mv_tbls
+        vld1.8 {d29}, [lr]
+        vext.8 d31, d30, d31, #7 // [0, ref_sign]
+        mov r3, #5
+        mul r1, r1, r3 // stride *= 5
+        sub r5, r5, r7 // h = row_end8 - row_start8
+        lsl r7, r7, #1 // row_start8 <<= 1
+1:
+        mov r3, #5
+        mov r11, #12*2
+        and r9, r7, #30 // (y & 15) * 2
+        ldr r9, [r2, r9, lsl #2] // b = rr[(y & 15) * 2]
+        add r9, r9, #12 // &b[... + 1]
+        mla r10, r4, r11, r9 // end_cand_b = &b[col_end8*2 + 1]
+        mla r9, r6, r11, r9 // cand_b = &b[x*2 + 1]
+
+        mla r3, r6, r3, r0 // &rp[x]
+
+        push {r2,r4,r6}
+
+2:
+        ldrb r11, [r9, #10] // cand_b->bs
+        add lr, r9, #8
+        vld1.8 {d0, d1}, [r9] // cand_b->mv
+        add r11, r8, r11, lsl #3
+        vld1.16 {d2[]}, [lr] // cand_b->ref
+        ldrh lr, [r11] // bw8
+        mov r2, r8
+        add r9, r9, lr, lsl #1 // cand_b += bw8*2
+        cmp r9, r10
+        vmov d4, d0
+        bge 3f
+
+        ldrb r2, [r9, #10] // cand_b->bs
+        add lr, r9, #8
+        vld1.8 {d6, d7}, [r9] // cand_b->mv
+        add r2, r8, r2, lsl #3
+        vld1.16 {d2[1]}, [lr] // cand_b->ref
+        ldrh lr, [r2] // bw8
+        add r9, r9, lr, lsl #1 // cand_b += bw8*2
+        vmov d5, d6
+
+3:
+        vabs.s16 q2, q2 // abs(mv[].xy)
+        vtbl.8 d2, {d31}, d2 // ref_sign[ref]
+        vshr.u16 q2, q2, #12 // abs(mv[].xy) >> 12
+        vmull.u8 q1, d2, d29 // ref_sign[ref] * {1, 2}
+        vceq.i32 q2, q2, #0 // abs(mv[].xy) <= 4096
+        vmovn.i32 d4, q2 // abs() condition to 16 bit
+        vand d2, d2, d4 // h[0-3] contains conditions for mv[0-1]
+        vpadd.i16 d2, d2, d2 // Combine condition for [1] and [0]
+        vmov.u16 r4, d2[0] // Extract case for first block
+        vmov.u16 r6, d2[1]
+        ldr r11, [r11, #4] // Fetch jump table entry
+        ldr r2, [r2, #4]
+        add r4, r12, r4, lsl #4
+        add r6, r12, r6, lsl #4
+        vld1.8 {d2, d3}, [r4] // Load permutation table base on case
+        vld1.8 {d4, d5}, [r6]
+        add r11, r8, r11 // Find jump table target
+        add r2, r8, r2
+        vtbl.8 d16, {d0, d1}, d2 // Permute cand_b to output refmvs_temporal_block
+        vtbl.8 d17, {d0, d1}, d3
+        vtbl.8 d18, {d6, d7}, d4
+        vtbl.8 d19, {d6, d7}, d5
+        vmov q0, q8
+
+        // q1 follows on q0 (q8), with another 3 full repetitions of the pattern.
+        vext.8 q1, q8, q8, #1
+        vext.8 q10, q9, q9, #1
+        // q2 ends with 3 complete repetitions of the pattern.
+        vext.8 q2, q8, q1, #4
+        vext.8 q11, q9, q10, #4
+
+        blx r11
+        bge 4f // if (cand_b >= end)
+        vmov q0, q9
+        vmov q1, q10
+        vmov q2, q11
+        cmp r9, r10
+        blx r2
+        blt 2b // if (cand_b < end)
+
+4:
+        pop {r2,r4,r6}
+
+        subs r5, r5, #1 // h--
+        add r7, r7, #2 // y += 2
+        add r0, r0, r1 // rp += stride
+        bgt 1b
+
+        pop {r4-r11,pc}
+
+        .align 2
+L(save_tmvs_tbl):
+        .word 16 * 12
+        .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 16 * 12
+        .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 8 * 12
+        .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 8 * 12
+        .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 8 * 12
+        .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 8 * 12
+        .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 4 * 12
+        .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 4 * 12
+        .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 4 * 12
+        .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 4 * 12
+        .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 2 * 12
+        .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 2 * 12
+        .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 2 * 12
+        .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 2 * 12
+        .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 2 * 12
+        .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+
+10:
+        add r4, r3, #4
+        vst1.32 {d0[0]}, [r3]
+        vst1.8 {d0[4]}, [r4]
+        add r3, r3, #5
+        bx lr
+20:
+        add r4, r3, #8
+        vst1.8 {d0}, [r3]
+        vst1.16 {d1[0]}, [r4]
+        add r3, r3, #2*5
+        bx lr
+40:
+        add r4, r3, #16
+        vst1.8 {q0}, [r3]
+        vst1.32 {d2[0]}, [r4]
+        add r3, r3, #4*5
+        bx lr
+80:
+        add r4, r3, #(8*5-16)
+        // This writes 6 full entries plus 2 extra bytes
+        vst1.8 {q0, q1}, [r3]
+        // Write the last few, overlapping with the first write.
+        vst1.8 {q2}, [r4]
+        add r3, r3, #8*5
+        bx lr
+160:
+        add r4, r3, #6*5
+        add r6, r3, #12*5
+        // This writes 6 full entries plus 2 extra bytes
+        vst1.8 {q0, q1}, [r3]
+        // Write another 6 full entries, slightly overlapping with the first set
+        vst1.8 {q0, q1}, [r4]
+        add r4, r3, #(16*5-16)
+        // Write 8 bytes (one full entry) after the first 12
+        vst1.8 {d0}, [r6]
+        // Write the last 3 entries
+        vst1.8 {q2}, [r4]
+        add r3, r3, #16*5
+        bx lr
+endfunc
third_party/dav1d/src/arm/64/looprestoration.S (vendored) | 381
@@ -965,371 +965,338 @@ function wiener_filter5_hv_8bpc_neon
        ret
endfunc

#define SUM_STRIDE (384+16)

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                 const pixel (*left)[4],
//                                 const pixel *src, const ptrdiff_t stride,
//                                 const int w, const int h,
//                                 const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
        add w5, w5, #2 // w += 2
// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
function sgr_box3_row_h_8bpc_neon, export=1
        add w4, w4, #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add x10, x0, #(4*SUM_STRIDE) // sumsq
        add x11, x1, #(2*SUM_STRIDE) // sum
        add x12, x3, x4 // src
        lsl x4, x4, #1
        mov x9, #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add w13, w5, #7
        bic w13, w13, #7
        sub x9, x9, w13, uxtw #1

        // Store the width for the vertical loop
        mov w8, w5

        // Subtract the number of pixels read from the input from the stride
        add w13, w13, #8
        sub x4, x4, w13, uxtw

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst w7, #1 // LR_HAVE_LEFT
        b.eq 2f
        // LR_HAVE_LEFT
        tst w5, #1 // LR_HAVE_LEFT
        b.eq 1f
        cbnz x2, 0f
        // left == NULL

        // LR_HAVE_LEFT && left == NULL
        sub x3, x3, #2
        sub x12, x12, #2
        b 1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 2 pixels from the src pointer,
        // but shift it as if we had done that.
        add x4, x4, #2
        ld1 {v0.16b}, [x3], #16
        b 2f

1:      // Loop vertically
        ld1 {v0.16b}, [x3], #16
        ld1 {v4.16b}, [x12], #16

        tst w7, #1 // LR_HAVE_LEFT
        b.eq 0f
        cbz x2, 2f
0:
        // LR_HAVE_LEFT, left != NULL
        ld1 {v1.s}[3], [x2], #4
        // Move x3/x12 back to account for the last 2 bytes we loaded earlier,
        ld1 {v0.16b}, [x3], #16
        ld1 {v1.s}[3], [x2]
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub x3, x3, #2
        sub x12, x12, #2
        ld1 {v5.s}[3], [x2], #4
        ext v0.16b, v1.16b, v0.16b, #14
        ext v4.16b, v5.16b, v4.16b, #14
        b 2f
0:

1:
        ld1 {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 2x the first byte at the front.
        dup v1.16b, v0.b[0]
        dup v5.16b, v4.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub x3, x3, #2
        sub x12, x12, #2
        ext v0.16b, v1.16b, v0.16b, #14
        ext v4.16b, v5.16b, v4.16b, #14

2:
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b
        umull v5.8h, v4.8b, v4.8b
        umull2 v6.8h, v4.16b, v4.16b

        tst w7, #2 // LR_HAVE_RIGHT
        tst w5, #2 // LR_HAVE_RIGHT
        b.ne 4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub w13, w5, #(2 + 16 - 2 + 1)
        sub w13, w4, #(2 + 16 - 2 + 1)
        ldr b30, [x3, w13, sxtw]
        ldr b31, [x12, w13, sxtw]
        // Fill v30/v31 with the right padding pixel
        // Fill v30 with the right padding pixel
        dup v30.16b, v30.b[0]
        dup v31.16b, v31.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp w5, #10
        cmp w4, #10
        b.ge 4f // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in v0/4.b[w] onwards
        // Insert padding in v0.b[w] onwards
        movrel x13, right_ext_mask
        sub x13, x13, w5, uxtw
        sub x13, x13, w4, uxtw
        ld1 {v29.16b}, [x13]

        bit v0.16b, v30.16b, v29.16b
        bit v4.16b, v31.16b, v29.16b

        // Update the precalculated squares
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b
        umull v5.8h, v4.8b, v4.8b
        umull2 v6.8h, v4.16b, v4.16b

4:      // Loop horizontally
        ext v16.16b, v0.16b, v0.16b, #1
        ext v17.16b, v0.16b, v0.16b, #2
        ext v18.16b, v4.16b, v4.16b, #1
        ext v19.16b, v4.16b, v4.16b, #2
        uaddl v3.8h, v0.8b, v16.8b
        uaddw v3.8h, v3.8h, v17.8b
        uaddl v7.8h, v4.8b, v18.8b
        uaddw v7.8h, v7.8h, v19.8b

        ext v20.16b, v1.16b, v2.16b, #2
        uaddw v3.8h, v3.8h, v17.8b

        ext v21.16b, v1.16b, v2.16b, #4
        ext v22.16b, v5.16b, v6.16b, #2
        ext v23.16b, v5.16b, v6.16b, #4

        uaddl v26.4s, v1.4h, v20.4h
        uaddl2 v27.4s, v1.8h, v20.8h
        uaddw v26.4s, v26.4s, v21.4h
        uaddw2 v27.4s, v27.4s, v21.8h

        uaddl v28.4s, v5.4h, v22.4h
        uaddl2 v29.4s, v5.8h, v22.8h
        uaddw v28.4s, v28.4s, v23.4h
        uaddw2 v29.4s, v29.4s, v23.8h

        subs w5, w5, #8
        subs w4, w4, #8

        st1 {v3.8h}, [x1], #16
        st1 {v7.8h}, [x11], #16
        st1 {v26.4s,v27.4s}, [x0], #32
        st1 {v28.4s,v29.4s}, [x10], #32

        b.le 9f
        tst w7, #2 // LR_HAVE_RIGHT
        tst w5, #2 // LR_HAVE_RIGHT
        ld1 {v3.8b}, [x3], #8
        ld1 {v7.8b}, [x12], #8
        mov v1.16b, v2.16b
        mov v5.16b, v6.16b
        ext v0.16b, v0.16b, v3.16b, #8
        ext v4.16b, v4.16b, v7.16b, #8
        umull v2.8h, v3.8b, v3.8b
        umull v6.8h, v7.8b, v7.8b

        b.ne 4b // If we don't need to pad, just keep summing.
        b 3b // If we need to pad, check how many pixels we have left.

9:
        subs w6, w6, #2
        b.le 0f
        // Jump to the next row and loop horizontally
        add x0, x0, x9, lsl #1
        add x10, x10, x9, lsl #1
        add x1, x1, x9
        add x11, x11, x9
        add x3, x3, x4
        add x12, x12, x4
        mov w5, w8
        b 1b
0:
        ret
endfunc

// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                 const pixel (*left)[4],
//                                 const pixel *src, const ptrdiff_t stride,
//                                 const int w, const int h,
//                                 const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
        add w5, w5, #2 // w += 2
// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
function sgr_box5_row_h_8bpc_neon, export=1
        add w4, w4, #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add x10, x0, #(4*SUM_STRIDE) // sumsq
        add x11, x1, #(2*SUM_STRIDE) // sum
        add x12, x3, x4 // src
        lsl x4, x4, #1
        mov x9, #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add w13, w5, #7
        bic w13, w13, #7
        sub x9, x9, w13, uxtw #1
        add w13, w13, #8
        sub x4, x4, w13, uxtw

        // Store the width for the vertical loop
        mov w8, w5

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst w7, #1 // LR_HAVE_LEFT
        b.eq 2f
        // LR_HAVE_LEFT
        tst w5, #1 // LR_HAVE_LEFT
        b.eq 1f
        cbnz x2, 0f
        // left == NULL

        // LR_HAVE_LEFT && left == NULL
        sub x3, x3, #3
        sub x12, x12, #3
        b 1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add x4, x4, #3
        ld1 {v0.16b}, [x3], #16
        b 2f

1:      // Loop vertically
        ld1 {v0.16b}, [x3], #16
        ld1 {v4.16b}, [x12], #16

        tst w7, #1 // LR_HAVE_LEFT
        b.eq 0f
        cbz x2, 2f
0:
        // LR_HAVE_LEFT, left != NULL
        ld1 {v1.s}[3], [x2], #4
        // Move x3/x12 back to account for the last 3 bytes we loaded earlier,
        ld1 {v0.16b}, [x3], #16
        ld1 {v1.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub x3, x3, #3
        sub x12, x12, #3
        ld1 {v5.s}[3], [x2], #4
        ext v0.16b, v1.16b, v0.16b, #13
        ext v4.16b, v5.16b, v4.16b, #13
        b 2f
0:

1:
        ld1 {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup v1.16b, v0.b[0]
        dup v5.16b, v4.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub x3, x3, #3
        sub x12, x12, #3
        ext v0.16b, v1.16b, v0.16b, #13
        ext v4.16b, v5.16b, v4.16b, #13

2:
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b
        umull v5.8h, v4.8b, v4.8b
        umull2 v6.8h, v4.16b, v4.16b

        tst w7, #2 // LR_HAVE_RIGHT
        tst w5, #2 // LR_HAVE_RIGHT
        b.ne 4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub w13, w5, #(2 + 16 - 3 + 1)
        sub w13, w4, #(2 + 16 - 3 + 1)
        ldr b30, [x3, w13, sxtw]
        ldr b31, [x12, w13, sxtw]
        // Fill v30/v31 with the right padding pixel
        // Fill v30 with the right padding pixel
        dup v30.16b, v30.b[0]
        dup v31.16b, v31.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp w5, #11
        cmp w4, #11
        b.ge 4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0/4.b[w+1] onwards; fuse the +1 into the
        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel x13, right_ext_mask, -1
        sub x13, x13, w5, uxtw
        sub x13, x13, w4, uxtw
        ld1 {v29.16b}, [x13]

        bit v0.16b, v30.16b, v29.16b
        bit v4.16b, v31.16b, v29.16b

        // Update the precalculated squares
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b
        umull v5.8h, v4.8b, v4.8b
        umull2 v6.8h, v4.16b, v4.16b

4:      // Loop horizontally
        ext v16.16b, v0.16b, v0.16b, #1
        ext v17.16b, v0.16b, v0.16b, #2
        ext v18.16b, v0.16b, v0.16b, #3
        ext v19.16b, v0.16b, v0.16b, #4
        ext v20.16b, v4.16b, v4.16b, #1
        ext v21.16b, v4.16b, v4.16b, #2
        ext v22.16b, v4.16b, v4.16b, #3
        ext v23.16b, v4.16b, v4.16b, #4
        uaddl v3.8h, v0.8b, v16.8b
        uaddl v24.8h, v17.8b, v18.8b
        uaddl v7.8h, v4.8b, v20.8b
        uaddw v3.8h, v3.8h, v19.8b
        uaddl v25.8h, v21.8b, v22.8b
        uaddw v7.8h, v7.8h, v23.8b
        add v3.8h, v3.8h, v24.8h
        add v7.8h, v7.8h, v25.8h

        ext v16.16b, v1.16b, v2.16b, #2
        ext v17.16b, v1.16b, v2.16b, #4
        ext v18.16b, v1.16b, v2.16b, #6
        ext v19.16b, v1.16b, v2.16b, #8
        ext v20.16b, v5.16b, v6.16b, #2
        ext v21.16b, v5.16b, v6.16b, #4
        ext v22.16b, v5.16b, v6.16b, #6
        ext v23.16b, v5.16b, v6.16b, #8

        uaddl v26.4s, v1.4h, v16.4h
        uaddl2 v27.4s, v1.8h, v16.8h
        uaddl v16.4s, v17.4h, v18.4h
        uaddl2 v17.4s, v17.8h, v18.8h
        uaddl v28.4s, v5.4h, v20.4h
        uaddl2 v29.4s, v5.8h, v20.8h
        uaddw v26.4s, v26.4s, v19.4h
        uaddw2 v27.4s, v27.4s, v19.8h
        uaddl v20.4s, v21.4h, v22.4h
        uaddl2 v21.4s, v21.8h, v22.8h
        uaddw v28.4s, v28.4s, v23.4h
        uaddw2 v29.4s, v29.4s, v23.8h
        add v26.4s, v26.4s, v16.4s
        add v27.4s, v27.4s, v17.4s
        add v28.4s, v28.4s, v20.4s
        add v29.4s, v29.4s, v21.4s

        subs w5, w5, #8
        subs w4, w4, #8

        st1 {v3.8h}, [x1], #16
        st1 {v7.8h}, [x11], #16
        st1 {v26.4s,v27.4s}, [x0], #32
        st1 {v28.4s,v29.4s}, [x10], #32

        b.le 9f
        tst w7, #2 // LR_HAVE_RIGHT
        tst w5, #2 // LR_HAVE_RIGHT
        ld1 {v3.8b}, [x3], #8
        ld1 {v7.8b}, [x12], #8
        mov v1.16b, v2.16b
        mov v5.16b, v6.16b
        ext v0.16b, v0.16b, v3.16b, #8
        ext v4.16b, v4.16b, v7.16b, #8
        umull v2.8h, v3.8b, v3.8b
        umull v6.8h, v7.8b, v7.8b

        b.ne 4b // If we don't need to pad, just keep summing.
        b 3b // If we need to pad, check how many pixels we have left.

9:
        subs w6, w6, #2
        b.le 0f
        // Jump to the next row and loop horizontally
        add x0, x0, x9, lsl #1
        add x10, x10, x9, lsl #1
        add x1, x1, x9
        add x11, x11, x9
        add x3, x3, x4
        add x12, x12, x4
        mov w5, w8
        b 1b
        ret
endfunc

// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                      int32_t *sumsq5, int16_t *sum5,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
function sgr_box35_row_h_8bpc_neon, export=1
        add w6, w6, #2 // w += 2

        tst w7, #1 // LR_HAVE_LEFT
        b.eq 1f
        cbnz x4, 0f

        // LR_HAVE_LEFT && left == NULL
        sub x5, x5, #3
        ld1 {v0.16b}, [x5], #16
        b 2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1 {v0.16b}, [x5], #16
        ld1 {v1.s}[3], [x4], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub x5, x5, #3
        ext v0.16b, v1.16b, v0.16b, #13
        b 2f

1:
        ld1 {v0.16b}, [x5], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup v1.16b, v0.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub x5, x5, #3
        ext v0.16b, v1.16b, v0.16b, #13

2:
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b

        tst w7, #2 // LR_HAVE_RIGHT
        b.ne 4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub w13, w6, #(2 + 16 - 3 + 1)
        ldr b30, [x5, w13, sxtw]
        // Fill v30 with the right padding pixel
        dup v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp w6, #11
        b.ge 4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel x13, right_ext_mask, -1
        sub x13, x13, w6, uxtw
        ld1 {v29.16b}, [x13]

        bit v0.16b, v30.16b, v29.16b

        // Update the precalculated squares
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b

4:      // Loop horizontally
        ext v16.16b, v0.16b, v0.16b, #1
        ext v17.16b, v0.16b, v0.16b, #2
        ext v19.16b, v0.16b, v0.16b, #4
        ext v18.16b, v0.16b, v0.16b, #3
        uaddl v3.8h, v16.8b, v17.8b
        uaddl v24.8h, v0.8b, v19.8b
        uaddw v3.8h, v3.8h, v18.8b

        ext v16.16b, v1.16b, v2.16b, #2
        ext v17.16b, v1.16b, v2.16b, #4
        ext v19.16b, v1.16b, v2.16b, #8
        ext v18.16b, v1.16b, v2.16b, #6

        st1 {v3.8h}, [x1], #16
        add v3.8h, v3.8h, v24.8h

        uaddl v26.4s, v16.4h, v17.4h
        uaddl2 v27.4s, v16.8h, v17.8h
        uaddl v16.4s, v1.4h, v19.4h
        uaddl2 v17.4s, v1.8h, v19.8h
        uaddw v26.4s, v26.4s, v18.4h
        uaddw2 v27.4s, v27.4s, v18.8h

        st1 {v26.4s,v27.4s}, [x0], #32
        add v26.4s, v26.4s, v16.4s
        add v27.4s, v27.4s, v17.4s

        subs w6, w6, #8

        st1 {v3.8h}, [x3], #16
        st1 {v26.4s,v27.4s}, [x2], #32

        b.le 9f
        tst w7, #2 // LR_HAVE_RIGHT
        ld1 {v3.8b}, [x5], #8
        mov v1.16b, v2.16b
        ext v0.16b, v0.16b, v3.16b, #8
        umull v2.8h, v3.8b, v3.8b

        b.ne 4b // If we don't need to pad, just keep summing.
        b 3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc
third_party/dav1d/src/arm/64/looprestoration16.S (vendored) | 379
@ -1070,349 +1070,318 @@ function wiener_filter5_hv_16bpc_neon
|
||||
ret
|
||||
endfunc
|
||||
|
||||
#define SUM_STRIDE (384+16)
|
||||
|
||||
#include "looprestoration_tmpl.S"
|
||||
|
||||
// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box3_h_16bpc_neon, export=1
|
||||
add w5, w5, #2 // w += 2
|
||||
// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const int w,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box3_row_h_16bpc_neon, export=1
|
||||
add w4, w4, #2 // w += 2
|
||||
|
||||
// Set up pointers for reading/writing alternate rows
|
||||
add x10, x0, #(4*SUM_STRIDE) // sumsq
|
||||
add x11, x1, #(2*SUM_STRIDE) // sum
|
||||
add x12, x3, x4 // src
|
||||
lsl x4, x4, #1
|
||||
mov x9, #(2*2*SUM_STRIDE) // double sum stride
|
||||
|
||||
// Subtract the aligned width from the output stride.
|
||||
add w13, w5, #7
|
||||
bic w13, w13, #7
|
||||
sub x9, x9, w13, uxtw #1
|
||||
|
||||
// Store the width for the vertical loop
|
||||
mov w8, w5
|
||||
|
||||
// Subtract the number of pixels read from the input from the stride
|
||||
add w13, w13, #8
|
||||
sub x4, x4, w13, uxtw #1
|
||||
|
||||
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
|
||||
tst w7, #1 // LR_HAVE_LEFT
|
||||
b.eq 2f
|
||||
// LR_HAVE_LEFT
|
||||
tst w5, #1 // LR_HAVE_LEFT
|
||||
b.eq 1f
|
||||
cbnz x2, 0f
|
||||
// left == NULL
|
||||
|
||||
// LR_HAVE_LEFT && left == NULL
|
||||
sub x3, x3, #4
|
||||
sub x12, x12, #4
|
||||
b 1f
|
||||
0: // LR_HAVE_LEFT, left != NULL
|
||||
2: // !LR_HAVE_LEFT, increase the stride.
|
||||
// For this case we don't read the left 2 pixels from the src pointer,
|
||||
// but shift it as if we had done that.
|
||||
add x4, x4, #4
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
b 2f
|
||||
|
||||
|
||||
1: // Loop vertically
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
ld1 {v16.8h, v17.8h}, [x12], #32
|
||||
|
||||
tst w7, #1 // LR_HAVE_LEFT
|
||||
b.eq 0f
|
||||
cbz x2, 2f
|
||||
0:
|
||||
// LR_HAVE_LEFT, left != NULL
|
||||
ld1 {v2.d}[1], [x2], #8
|
||||
// Move x3/x12 back to account for the last 2 pixels we loaded earlier,
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
ld1 {v2.d}[1], [x2]
|
||||
// Move x3 back to account for the last 2 pixels we loaded earlier,
|
||||
// which we'll shift out.
|
||||
sub x3, x3, #4
|
||||
sub x12, x12, #4
|
||||
ld1 {v18.d}[1], [x2], #8
|
||||
ext v1.16b, v0.16b, v1.16b, #12
|
||||
ext v0.16b, v2.16b, v0.16b, #12
|
||||
ext v17.16b, v16.16b, v17.16b, #12
|
||||
ext v16.16b, v18.16b, v16.16b, #12
|
||||
ext v1.16b, v0.16b, v1.16b, #12
|
||||
ext v0.16b, v2.16b, v0.16b, #12
|
||||
b 2f
|
||||
0:
|
||||
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
|
||||
// and shift v0/v1 to have 2x the first pixel at the front.
|
||||
dup v2.8h, v0.h[0]
|
||||
dup v18.8h, v16.h[0]
|
||||
dup v2.8h, v0.h[0]
|
||||
// Move x3 back to account for the last 2 pixels we loaded before,
|
||||
// which we shifted out.
|
||||
sub x3, x3, #4
|
||||
sub x12, x12, #4
|
||||
ext v1.16b, v0.16b, v1.16b, #12
|
||||
ext v0.16b, v2.16b, v0.16b, #12
|
||||
ext v17.16b, v16.16b, v17.16b, #12
|
||||
ext v16.16b, v18.16b, v16.16b, #12
|
||||
ext v1.16b, v0.16b, v1.16b, #12
|
||||
ext v0.16b, v2.16b, v0.16b, #12
|
||||
|
||||
2:
|
||||
tst w7, #2 // LR_HAVE_RIGHT
|
||||
tst w5, #2 // LR_HAVE_RIGHT
|
||||
b.ne 4f
|
||||
// If we'll need to pad the right edge, load that pixel to pad with
|
||||
// here since we can find it pretty easily from here.
|
||||
sub w13, w5, #(2 + 16 - 2 + 1)
|
||||
sub w13, w4, #(2 + 16 - 2 + 1)
|
||||
ldr h30, [x3, w13, sxtw #1]
|
||||
ldr h31, [x12, w13, sxtw #1]
|
||||
// Fill v30/v31 with the right padding pixel
|
||||
// Fill v30 with the right padding pixel
|
||||
dup v30.8h, v30.h[0]
|
||||
dup v31.8h, v31.h[0]
|
||||
3: // !LR_HAVE_RIGHT
|
||||
|
||||
// Check whether we need to pad the right edge
|
||||
cmp w5, #10
|
||||
cmp w4, #10
|
||||
b.ge 4f // If w >= 10, all used input pixels are valid
|
||||
|
||||
// 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called
|
||||
// 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
|
||||
// again; it's not strictly needed in those cases (we pad enough here),
|
||||
// but keeping the code as simple as possible.
|
||||
|
||||
// Insert padding in v0/1.h[w] onwards
|
||||
// Insert padding in v0.b[w] onwards
|
||||
movrel x13, right_ext_mask
|
||||
sub x13, x13, w5, uxtw #1
|
||||
sub x13, x13, w4, uxtw #1
|
||||
ld1 {v28.16b, v29.16b}, [x13]
|
||||
|
||||
bit v0.16b, v30.16b, v28.16b
|
||||
bit v1.16b, v30.16b, v29.16b
|
||||
bit v16.16b, v31.16b, v28.16b
|
||||
bit v17.16b, v31.16b, v29.16b
|
||||
|
||||
4: // Loop horizontally
|
||||
ext v26.16b, v0.16b, v1.16b, #2
|
||||
ext v28.16b, v16.16b, v17.16b, #2
|
||||
ext v27.16b, v0.16b, v1.16b, #4
|
||||
ext v29.16b, v16.16b, v17.16b, #4
|
||||
|
||||
add v6.8h, v0.8h, v26.8h
|
||||
umull v22.4s, v0.4h, v0.4h
|
||||
umlal v22.4s, v26.4h, v26.4h
|
||||
umlal v22.4s, v27.4h, v27.4h
|
||||
add v7.8h, v16.8h, v28.8h
|
||||
umull v24.4s, v16.4h, v16.4h
|
||||
umlal v24.4s, v28.4h, v28.4h
|
||||
umlal v24.4s, v29.4h, v29.4h
|
||||
add v6.8h, v6.8h, v27.8h
|
||||
umull2 v23.4s, v0.8h, v0.8h
|
||||
umlal2 v23.4s, v26.8h, v26.8h
|
||||
umlal2 v23.4s, v27.8h, v27.8h
|
||||
add v7.8h, v7.8h, v29.8h
|
||||
umull2 v25.4s, v16.8h, v16.8h
|
||||
umlal2 v25.4s, v28.8h, v28.8h
|
||||
umlal2 v25.4s, v29.8h, v29.8h
|
||||
|
||||
subs w5, w5, #8
|
||||
subs w4, w4, #8
|
||||
|
||||
st1 {v6.8h}, [x1], #16
|
||||
st1 {v7.8h}, [x11], #16
|
||||
st1 {v22.4s,v23.4s}, [x0], #32
|
||||
st1 {v24.4s,v25.4s}, [x10], #32
|
||||
|
||||
b.le 9f
|
||||
tst w7, #2 // LR_HAVE_RIGHT
|
||||
tst w5, #2 // LR_HAVE_RIGHT
|
||||
mov v0.16b, v1.16b
|
||||
mov v16.16b, v17.16b
|
||||
ld1 {v1.8h}, [x3], #16
|
||||
ld1 {v17.8h}, [x12], #16
|
||||
|
||||
b.ne 4b // If we don't need to pad, just keep summing.
|
||||
b 3b // If we need to pad, check how many pixels we have left.
|
||||
|
||||
9:
|
||||
subs w6, w6, #2
|
||||
b.le 0f
|
||||
// Jump to the next row and loop horizontally
|
||||
add x0, x0, x9, lsl #1
|
||||
add x10, x10, x9, lsl #1
|
||||
add x1, x1, x9
|
||||
add x11, x11, x9
|
||||
add x3, x3, x4
|
||||
add x12, x12, x4
|
||||
mov w5, w8
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_h_16bpc_neon, export=1
|
||||
add w5, w5, #2 // w += 2
|
||||
// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const int w,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_row_h_16bpc_neon, export=1
|
||||
add w4, w4, #2 // w += 2
|
||||
|
||||
// Set up pointers for reading/writing alternate rows
|
||||
add x10, x0, #(4*SUM_STRIDE) // sumsq
|
||||
add x11, x1, #(2*SUM_STRIDE) // sum
|
||||
add x12, x3, x4 // src
|
||||
lsl x4, x4, #1
|
||||
mov x9, #(2*2*SUM_STRIDE) // double sum stride
|
||||
|
||||
// Subtract the aligned width from the output stride.
|
||||
add w13, w5, #7
|
||||
bic w13, w13, #7
|
||||
sub x9, x9, w13, uxtw #1
|
||||
add w13, w13, #8
|
||||
sub x4, x4, w13, uxtw #1
|
||||
|
||||
// Store the width for the vertical loop
|
||||
mov w8, w5
|
||||
|
||||
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
|
||||
tst w7, #1 // LR_HAVE_LEFT
|
||||
b.eq 2f
|
||||
// LR_HAVE_LEFT
|
||||
tst w5, #1 // LR_HAVE_LEFT
|
||||
b.eq 1f
|
||||
cbnz x2, 0f
|
||||
// left == NULL
|
||||
|
||||
// LR_HAVE_LEFT && left == NULL
|
||||
sub x3, x3, #6
|
||||
sub x12, x12, #6
|
||||
b 1f
|
||||
0: // LR_HAVE_LEFT, left != NULL
|
||||
2: // !LR_HAVE_LEFT, increase the stride.
|
||||
// For this case we don't read the left 3 pixels from the src pointer,
|
||||
// but shift it as if we had done that.
|
||||
add x4, x4, #6
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
b 2f
|
||||
|
||||
1: // Loop vertically
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
ld1 {v16.8h, v17.8h}, [x12], #32
|
||||
|
||||
tst w7, #1 // LR_HAVE_LEFT
|
||||
b.eq 0f
|
||||
cbz x2, 2f
|
||||
0:
|
||||
// LR_HAVE_LEFT, left != NULL
|
||||
ld1 {v2.d}[1], [x2], #8
|
||||
// Move x3/x12 back to account for the last 3 pixels we loaded earlier,
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
ld1 {v2.d}[1], [x2], #8
|
||||
// Move x3 back to account for the last 3 pixels we loaded earlier,
|
||||
// which we'll shift out.
|
||||
sub x3, x3, #6
|
||||
sub x12, x12, #6
|
||||
ld1 {v18.d}[1], [x2], #8
|
||||
ext v1.16b, v0.16b, v1.16b, #10
|
||||
ext v0.16b, v2.16b, v0.16b, #10
|
||||
ext v17.16b, v16.16b, v17.16b, #10
|
||||
ext v16.16b, v18.16b, v16.16b, #10
|
||||
b 2f
|
||||
0:
|
||||
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
|
||||
// and shift v0/v1 to have 3x the first pixel at the front.
|
||||
dup v2.8h, v0.h[0]
|
||||
dup v18.8h, v16.h[0]
|
||||
// Move x3 back to account for the last 3 pixels we loaded before,
|
||||
// which we shifted out.
|
||||
sub x3, x3, #6
|
||||
sub x12, x12, #6
|
||||
ext v1.16b, v0.16b, v1.16b, #10
|
||||
ext v0.16b, v2.16b, v0.16b, #10
|
||||
ext v17.16b, v16.16b, v17.16b, #10
|
||||
ext v16.16b, v18.16b, v16.16b, #10
|
||||
|
||||
2:
|
||||
tst w7, #2 // LR_HAVE_RIGHT
|
||||
tst w5, #2 // LR_HAVE_RIGHT
|
||||
b.ne 4f
|
||||
// If we'll need to pad the right edge, load that pixel to pad with
|
||||
// here since we can find it pretty easily from here.
|
||||
sub w13, w5, #(2 + 16 - 3 + 1)
|
||||
sub w13, w4, #(2 + 16 - 3 + 1)
|
||||
ldr h30, [x3, w13, sxtw #1]
|
||||
ldr h31, [x12, w13, sxtw #1]
|
||||
// Fill v30/v31 with the right padding pixel
|
||||
// Fill v30 with the right padding pixel
|
||||
dup v30.8h, v30.h[0]
|
||||
dup v31.8h, v31.h[0]
|
||||
3: // !LR_HAVE_RIGHT
|
||||
|
||||
// Check whether we need to pad the right edge
|
||||
cmp w5, #11
|
||||
cmp w4, #11
|
||||
b.ge 4f // If w >= 11, all used input pixels are valid
|
||||
|
||||
// 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10,
|
||||
// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
|
||||
// this ends up called again; it's not strictly needed in those
|
||||
// cases (we pad enough here), but keeping the code as simple as possible.
|
||||
|
||||
// Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the
|
||||
// Insert padding in v0.b[w+1] onwards; fuse the +1 into the
|
||||
// buffer pointer.
|
||||
movrel x13, right_ext_mask, -2
|
||||
sub x13, x13, w5, uxtw #1
|
||||
movrel x13, right_ext_mask, -1
|
||||
sub x13, x13, w4, uxtw #1
|
||||
ld1 {v28.16b, v29.16b}, [x13]
|
||||
|
||||
bit v0.16b, v30.16b, v28.16b
|
||||
bit v1.16b, v30.16b, v29.16b
|
||||
bit v16.16b, v31.16b, v28.16b
|
||||
bit v17.16b, v31.16b, v29.16b
|
||||
|
||||
4: // Loop horizontally
|
||||
ext v26.16b, v0.16b, v1.16b, #2
|
||||
ext v28.16b, v16.16b, v17.16b, #2
|
||||
ext v27.16b, v0.16b, v1.16b, #4
|
||||
ext v29.16b, v16.16b, v17.16b, #4
|
||||
|
||||
add v6.8h, v0.8h, v26.8h
|
||||
umull v22.4s, v0.4h, v0.4h
|
||||
umlal v22.4s, v26.4h, v26.4h
umlal v22.4s, v27.4h, v27.4h
add v7.8h, v16.8h, v28.8h
umull v24.4s, v16.4h, v16.4h
umlal v24.4s, v28.4h, v28.4h
umlal v24.4s, v29.4h, v29.4h
add v6.8h, v6.8h, v27.8h
umull2 v23.4s, v0.8h, v0.8h
umlal2 v23.4s, v26.8h, v26.8h
umlal2 v23.4s, v27.8h, v27.8h
add v7.8h, v7.8h, v29.8h
umull2 v25.4s, v16.8h, v16.8h
umlal2 v25.4s, v28.8h, v28.8h
umlal2 v25.4s, v29.8h, v29.8h

ext v26.16b, v0.16b, v1.16b, #6
ext v28.16b, v16.16b, v17.16b, #6
ext v27.16b, v0.16b, v1.16b, #8
ext v29.16b, v16.16b, v17.16b, #8

add v6.8h, v6.8h, v26.8h
umlal v22.4s, v26.4h, v26.4h
umlal v22.4s, v27.4h, v27.4h
add v7.8h, v7.8h, v28.8h
umlal v24.4s, v28.4h, v28.4h
umlal v24.4s, v29.4h, v29.4h
add v6.8h, v6.8h, v27.8h
umlal2 v23.4s, v26.8h, v26.8h
umlal2 v23.4s, v27.8h, v27.8h
add v7.8h, v7.8h, v29.8h
umlal2 v25.4s, v28.8h, v28.8h
umlal2 v25.4s, v29.8h, v29.8h

subs w5, w5, #8
subs w4, w4, #8

st1 {v6.8h}, [x1], #16
st1 {v7.8h}, [x11], #16
st1 {v22.4s,v23.4s}, [x0], #32
st1 {v24.4s,v25.4s}, [x10], #32

b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
tst w5, #2 // LR_HAVE_RIGHT
mov v0.16b, v1.16b
mov v16.16b, v17.16b
ld1 {v1.8h}, [x3], #16
ld1 {v17.8h}, [x12], #16
ld1 {v1.8h}, [x3], #16

b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.

9:
subs w6, w6, #2
b.le 0f
// Jump to the next row and loop horizontally
add x0, x0, x9, lsl #1
add x10, x10, x9, lsl #1
add x1, x1, x9
add x11, x11, x9
add x3, x3, x4
add x12, x12, x4
mov w5, w8
b 1b
ret
endfunc

// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                       int32_t *sumsq5, int16_t *sum5,
//                                       const pixel (*left)[4],
//                                       const pixel *src, const int w,
//                                       const enum LrEdgeFlags edges);
function sgr_box35_row_h_16bpc_neon, export=1
add w6, w6, #2 // w += 2

tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
cbnz x4, 0f

// LR_HAVE_LEFT && left == NULL
sub x5, x5, #6
ld1 {v0.8h, v1.8h}, [x5], #32
b 2f

0:
// LR_HAVE_LEFT, left != NULL
ld1 {v0.8h, v1.8h}, [x5], #32
ld1 {v2.d}[1], [x4], #8
// Move x3 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub x5, x5, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10
b 2f

1:
ld1 {v0.8h, v1.8h}, [x5], #32
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
// and shift v0/v1 to have 3x the first pixel at the front.
dup v2.8h, v0.h[0]
// Move x5 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub x5, x5, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10

2:
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w13, w6, #(2 + 16 - 3 + 1)
ldr h30, [x5, w13, sxtw #1]
// Fill v30 with the right padding pixel
dup v30.8h, v30.h[0]
3: // !LR_HAVE_RIGHT

// Check whether we need to pad the right edge
cmp w6, #11
b.ge 4f // If w >= 11, all used input pixels are valid

// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.

// Insert padding in v0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel x13, right_ext_mask, -1
sub x13, x13, w6, uxtw #1
ld1 {v28.16b, v29.16b}, [x13]

bit v0.16b, v30.16b, v28.16b
bit v1.16b, v30.16b, v29.16b

4: // Loop horizontally
ext v16.16b, v0.16b, v1.16b, #2
ext v17.16b, v0.16b, v1.16b, #4
ext v19.16b, v0.16b, v1.16b, #8
ext v18.16b, v0.16b, v1.16b, #6

add v20.8h, v16.8h, v17.8h
add v21.8h, v0.8h, v19.8h
add v20.8h, v20.8h, v18.8h

umull v22.4s, v16.4h, v16.4h
umlal v22.4s, v17.4h, v17.4h
umlal v22.4s, v18.4h, v18.4h

umull2 v23.4s, v16.8h, v16.8h
umlal2 v23.4s, v17.8h, v17.8h
umlal2 v23.4s, v18.8h, v18.8h

add v21.8h, v21.8h, v20.8h
st1 {v20.8h}, [x1], #16
st1 {v22.4s,v23.4s}, [x0], #32

umlal v22.4s, v0.4h, v0.4h
umlal v22.4s, v19.4h, v19.4h

umlal2 v23.4s, v0.8h, v0.8h
umlal2 v23.4s, v19.8h, v19.8h

subs w6, w6, #8

st1 {v21.8h}, [x3], #16
st1 {v22.4s,v23.4s}, [x2], #32

b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
mov v0.16b, v1.16b
ld1 {v1.8h}, [x5], #16

b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.

9:
ret
endfunc
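For orientation: the row pass above produces, for each output position, the horizontal 3-tap (box3) and 5-tap (box5) sums of pixels and of squared pixels. A minimal scalar sketch of the combined box35 variant, assuming <stdint.h>; the helper name and plain-C form are illustrative, not part of this change:

    // Scalar sketch of the combined 3/5-tap row sums (name illustrative).
    static void sgr_box35_row_h_c(int32_t *sumsq3, int16_t *sum3,
                                  int32_t *sumsq5, int16_t *sum5,
                                  const uint16_t *src, int w)
    {
        for (int x = 0; x < w; x++) {
            int32_t a3 = 0, a5 = 0;
            int b3 = 0, b5 = 0;
            for (int i = 0; i < 5; i++) {
                const int32_t v = src[x + i];
                a5 += v * v;            // 5-tap sum of squares
                b5 += v;                // 5-tap sum
                if (i >= 1 && i <= 3) { // the middle 3 taps
                    a3 += v * v;
                    b3 += v;
                }
            }
            sumsq3[x] = a3; sum3[x] = (int16_t)b3;
            sumsq5[x] = a5; sum5[x] = (int16_t)b5;
        }
    }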

@ -28,332 +28,29 @@
#include "src/arm/asm.S"
#include "util.S"

#define SUM_STRIDE (384+16)
// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
function sgr_box3_vert_neon, export=1
stp d8, d9, [sp, #-0x30]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]

// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
//                            const int w, const int h,
//                            const enum LrEdgeFlags edges);
function sgr_box3_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #2 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
add w4, w4, #2
clz w9, w6 // bitdepth_max
dup v28.4s, w5 // strength

tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:
ldp x5, x6, [x0]
ldr x0, [x0, #16]
ldp x7, x8, [x1]
ldr x1, [x1, #16]

tst w4, #8 // LR_HAVE_BOTTOM
b.eq 1f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Sum all h+2 lines with the main loop
add w11, w11, #2
1:
mov w9, w3 // Backup of h for next loops
movi v31.4s, #9 // n

1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v21 and v24-v26 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v24.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v18.4s, v19.4s}, [x5], x7
ld1 {v25.8h}, [x6], x8
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v25.16b, v24.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v26.16b, v24.16b

3:
subs w3, w3, #1
.macro add3
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v24.8h, v24.8h, v25.8h
add v16.4s, v16.4s, v20.4s
add v17.4s, v17.4s, v21.4s
add v24.8h, v24.8h, v26.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v24.8h}, [x1], x8
.endm
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v25.16b, v26.16b
b.le 4f
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3b

4:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 5f
// !LR_HAVE_BOTTOM
// Produce two more rows, extending the already loaded rows.
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
add3

5: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b

0:
ret
.purgem add3
endfunc
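The add3 macro above is a plain three-row vertical sum over the per-row box sums; as a scalar sketch (the helper name and buffer layout are illustrative only):

    // Scalar equivalent of one add3 step: sum three vertically
    // adjacent rows of box sums into an output row.
    static void box3_vert_sum_c(int32_t *out_sumsq, int16_t *out_sum,
                                const int32_t *const sumsq[3],
                                const int16_t *const sum[3], int w)
    {
        for (int x = 0; x < w; x++) {
            out_sumsq[x] = sumsq[0][x] + sumsq[1][x] + sumsq[2][x];
            out_sum[x]   = (int16_t)(sum[0][x] + sum[1][x] + sum[2][x]);
        }
    }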

// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
//                            const int w, const int h,
//                            const enum LrEdgeFlags edges);
function sgr_box5_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #8 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride

tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:

tst w4, #8 // LR_HAVE_BOTTOM
b.eq 0f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Handle h+2 lines with the main loop
add w11, w11, #2
b 1f
0:
// !LR_HAVE_BOTTOM
sub w3, w3, #1 // Handle h-1 lines with the main loop
1:
mov w9, w3 // Backup of h for next loops

1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v25 and v26-v30 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v28.8h}, [x6], x8
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v28.16b, v26.16b
mov v22.16b, v16.16b
mov v23.16b, v17.16b
mov v29.16b, v26.16b

3:
cbz w3, 4f
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8

3:
// Start of vertical loop
subs w3, w3, #2
.macro add5
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v26.8h, v26.8h, v27.8h
add v0.4s, v20.4s, v22.4s
add v1.4s, v21.4s, v23.4s
add v2.8h, v28.8h, v29.8h
add v16.4s, v16.4s, v24.4s
add v17.4s, v17.4s, v25.4s
add v26.8h, v26.8h, v30.8h
add v16.4s, v16.4s, v0.4s
add v17.4s, v17.4s, v1.4s
add v26.8h, v26.8h, v2.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v26.8h}, [x1], x8
.endm
add5
.macro shift2
mov v16.16b, v20.16b
mov v17.16b, v21.16b
mov v26.16b, v28.16b
mov v18.16b, v22.16b
mov v19.16b, v23.16b
mov v27.16b, v29.16b
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v28.16b, v30.16b
.endm
shift2
add x0, x0, x7
add x1, x1, x8
b.le 5f
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8
b 3b

4:
// h == 1, !LR_HAVE_BOTTOM.
// Pad the last row with the only content row, and add.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
add5
b 6f

5:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 6f
// !LR_HAVE_BOTTOM
cbnz w3, 5f
// The intended three edge rows left; output the one at h-2 and
// the past-edge one at h.
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
// Pad the past-edge row from the last content row.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
// The last two rows are already padded properly here.
add5
b 6f

5:
// w3 == -1, two rows left, output one.
// Pad the last two rows from the mid one.
mov v22.16b, v20.16b
mov v23.16b, v21.16b
mov v29.16b, v28.16b
mov v24.16b, v20.16b
mov v25.16b, v21.16b
mov v30.16b, v28.16b
add5
add x0, x0, x7
add x1, x1, x8
b 6f

6: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b

0:
ret
.purgem add5
endfunc

// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
//                              const int w, const int h, const int strength,
//                              const int bitdepth_max);
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
//                              const int w, const int h, const int strength,
//                              const int bitdepth_max);
function sgr_calc_ab1_neon, export=1
clz w9, w5
add x3, x3, #2 // h += 2
movi v31.4s, #9 // n
mov x5, #455
mov x8, #SUM_STRIDE
b sgr_calc_ab_neon
endfunc

function sgr_calc_ab2_neon, export=1
clz w9, w5
add x3, x3, #3 // h += 3
asr x3, x3, #1 // h /= 2
movi v31.4s, #25 // n
mov x5, #164
mov x8, #(2*SUM_STRIDE)
endfunc
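Note on the constants loaded above: 455 and 164 are the one_by_x reciprocal factors for the two box areas, i.e. round(2^12 / n) for n = 9 (3x3) and n = 25 (5x5):

    4096 / 9  = 455.1...  -> 455
    4096 / 25 = 163.84    -> 164

so the multiply by one_by_x followed by the srshr #12 further down divides the x * BB[i] product by the box area.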

function sgr_calc_ab_neon
sub w9, w9, #24 // -bitdepth_min_8
movrel x12, X(sgr_x_by_x)
mov w13, #455 // one_by_x
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
dup v6.8h, w9 // -bitdepth_min_8
movi v19.16b, #5
@ -363,70 +60,213 @@ function sgr_calc_ab_neon
movi v23.8b, #169 // idx of last 2
movi v24.8b, #254 // idx of last 1
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
add x2, x2, #2 // w += 2
add x7, x2, #7
bic x7, x7, #7 // aligned w
sub x7, x8, x7 // increment between rows
movi v29.8h, #1, lsl #8
dup v28.4s, w4
dup v30.4s, w5 // one_by_x
sub x0, x0, #(4*(SUM_STRIDE))
sub x1, x1, #(2*(SUM_STRIDE))
mov x6, x2 // backup of w
dup v30.4s, w13 // one_by_x

sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b

ld1 {v8.4s, v9.4s}, [x5], #32
ld1 {v10.4s, v11.4s}, [x6], #32
ld1 {v12.8h}, [x7], #16
ld1 {v13.8h}, [x8], #16
ld1 {v0.4s, v1.4s}, [x0], #32
ld1 {v2.8h}, [x1], #16
1:
subs x2, x2, #8
ld1 {v0.4s, v1.4s}, [x0] // a
ld1 {v2.8h}, [x1] // b
srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)

cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v1.8b, v1.8b, v5.8b
add v1.8b, v1.8b, v25.8b
uxtl v1.8h, v1.8b // x
add v8.4s, v8.4s, v10.4s
add v9.4s, v9.4s, v11.4s

umull v3.4s, v1.4h, v2.4h // x * BB[i]
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v2.8h, v29.8h, v1.8h // 256 - x
add v12.8h, v12.8h, v13.8h

st1 {v3.4s, v4.4s}, [x0], #32
st1 {v2.8h}, [x1], #16
subs w4, w4, #8
add v0.4s, v0.4s, v8.4s
add v1.4s, v1.4s, v9.4s
add v2.8h, v2.8h, v12.8h

srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
ld1 {v8.4s, v9.4s}, [x5], #32
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
ld1 {v10.4s, v11.4s}, [x6], #32
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)

ld1 {v12.8h}, [x7], #16

cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v5.8b, v1.8b, v5.8b
ld1 {v13.8h}, [x8], #16
add v5.8b, v5.8b, v25.8b
ld1 {v0.4s, v1.4s}, [x0], #32
uxtl v5.8h, v5.8b // x

umull v3.4s, v5.4h, v2.4h // x * BB[i]
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v5.8h, v29.8h, v5.8h // 256 - x
ld1 {v2.8h}, [x1], #16

st1 {v3.4s, v4.4s}, [x2], #32
st1 {v5.8h}, [x3], #16
b.gt 1b

subs x3, x3, #1
b.le 0f
add x0, x0, x7, lsl #2
add x1, x1, x7, lsl #1
mov x2, x6
b 1b
0:
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x30
ret
endfunc
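Following the inline comments in the loop above, the per-pixel a/b update amounts to the sketch below. The helper name and the exact placement of the roundings are illustrative; the NEON code folds them into the uqshrn/uqrshrn/srshr steps, and the initial bitdepth shifts (srshl by -bitdepth_min_8) are omitted here as in the 8bpc case:

    // Scalar sketch of one a/b update (8bpc; names illustrative).
    static void calc_ab_px(int32_t a, int b, int n, int s, int one_by_x,
                           const uint8_t x_by_x[256],
                           int32_t *AA, int16_t *BB)
    {
        const int p = a * n - b * b > 0 ? a * n - b * b : 0; // imax(.., 0)
        const int z = (p * s + (1 << 19)) >> 20;             // uqshrn + uqrshrn
        const int x = x_by_x[z > 255 ? 255 : z];             // imin(z, 255)
        *AA = (x * b * one_by_x + (1 << 11)) >> 12;          // srshr #12
        *BB = (int16_t)(256 - x);
    }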

// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
function sgr_box5_vert_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]

add w4, w4, #2
clz w15, w6 // bitdepth_max
dup v28.4s, w5 // strength

ldp x5, x6, [x0]
ldp x7, x8, [x0, #16]
ldr x0, [x0, #32]
ldp x9, x10, [x1]
ldp x11, x12, [x1, #16]
ldr x1, [x1, #32]

movi v31.4s, #25 // n

sub w15, w15, #24 // -bitdepth_min_8
movrel x13, X(sgr_x_by_x)
mov w14, #164 // one_by_x
ld1 {v16.16b, v17.16b, v18.16b}, [x13]
dup v6.8h, w15 // -bitdepth_min_8
movi v19.16b, #5
movi v24.8b, #254 // idx of last 1
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
movi v29.8h, #1, lsl #8
dup v30.4s, w14 // one_by_x

sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b

ld1 {v8.4s, v9.4s}, [x5], #32
ld1 {v10.4s, v11.4s}, [x6], #32
ld1 {v12.4s, v13.4s}, [x7], #32
ld1 {v14.4s, v15.4s}, [x8], #32
ld1 {v20.8h}, [x9], #16
ld1 {v21.8h}, [x10], #16
ld1 {v22.8h}, [x11], #16
ld1 {v23.8h}, [x12], #16
ld1 {v0.4s, v1.4s}, [x0], #32
ld1 {v2.8h}, [x1], #16

1:
add v8.4s, v8.4s, v10.4s
add v9.4s, v9.4s, v11.4s
add v12.4s, v12.4s, v14.4s
add v13.4s, v13.4s, v15.4s

add v20.8h, v20.8h, v21.8h
add v22.8h, v22.8h, v23.8h

add v0.4s, v0.4s, v8.4s
add v1.4s, v1.4s, v9.4s
add v2.8h, v2.8h, v20.8h

add v0.4s, v0.4s, v12.4s
add v1.4s, v1.4s, v13.4s
add v2.8h, v2.8h, v22.8h

subs w4, w4, #8

movi v20.8b, #55 // idx of last 5
movi v21.8b, #72 // idx of last 4
movi v22.8b, #101 // idx of last 3
movi v23.8b, #169 // idx of last 2

srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
ld1 {v8.4s, v9.4s}, [x5], #32
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
ld1 {v10.4s, v11.4s}, [x6], #32
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)

ld1 {v12.4s, v13.4s}, [x7], #32

cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
ld1 {v14.4s, v15.4s}, [x8], #32
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
ld1 {v20.8h}, [x9], #16
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
ld1 {v21.8h}, [x10], #16
add v5.8b, v1.8b, v5.8b
ld1 {v22.8h}, [x11], #16
add v5.8b, v5.8b, v25.8b
ld1 {v23.8h}, [x12], #16
uxtl v5.8h, v5.8b // x

ld1 {v0.4s, v1.4s}, [x0], #32
umull v3.4s, v5.4h, v2.4h // x * BB[i]
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v5.8h, v29.8h, v5.8h // 256 - x
ld1 {v2.8h}, [x1], #16

st1 {v3.4s, v4.4s}, [x2], #32
st1 {v5.8h}, [x3], #16
b.gt 1b

ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
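sgr_box5_vert_neon above fuses the five-row vertical reduction with the same a/b computation, just with n = 25 and one_by_x = 164. The vertical part on its own reduces to this illustrative scalar helper (names are not from this change):

    // Five-row vertical reduction feeding the a/b step.
    static void box5_vert_sum_c(int32_t *out_sumsq, int16_t *out_sum,
                                const int32_t *const sumsq[5],
                                const int16_t *const sum[5], int w)
    {
        for (int x = 0; x < w; x++) {
            int32_t a = 0;
            int b = 0;
            for (int i = 0; i < 5; i++) {
                a += sumsq[i][x];
                b += sum[i][x];
            }
            out_sumsq[x] = a;
            out_sum[x] = (int16_t)b;
        }
    }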

692
third_party/dav1d/src/arm/64/looprestoration_tmpl.S
vendored
@ -30,52 +30,224 @@
#define FILTER_OUT_STRIDE 384

.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
//                                         const pixel *src, const ptrdiff_t stride,
//                                         const int32_t *a, const int16_t *b,
//                                         const int w, const int h);
function sgr_finish_filter1_\bpc\()bpc_neon, export=1
sub x7, x3, #(4*SUM_STRIDE)
add x8, x3, #(4*SUM_STRIDE)
sub x9, x4, #(2*SUM_STRIDE)
add x10, x4, #(2*SUM_STRIDE)
mov x11, #SUM_STRIDE
mov x12, #FILTER_OUT_STRIDE
add x13, x5, #7
bic x13, x13, #7 // Aligned width
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t src_stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]

ldp x7, x8, [x3]
ldp x9, x3, [x3, #16]
ldp x10, x11, [x4]
ldp x12, x4, [x4, #16]

mov x13, #FILTER_OUT_STRIDE
cmp w6, #1
add x2, x1, x2 // src + stride
csel x2, x1, x2, le // if (h <= 1) x2 = x1
add x13, x0, x13, lsl #1

movi v30.8h, #3
movi v31.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x10], #32
ld1 {v2.8h, v3.8h}, [x11], #32
ld1 {v4.8h, v5.8h}, [x12], #32
ld1 {v6.8h, v7.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x9], #48
ld1 {v25.4s, v26.4s, v27.4s}, [x3], #48

2:
ext v8.16b, v0.16b, v1.16b, #2 // [0][1]
ext v9.16b, v2.16b, v3.16b, #2 // [1][1]
ext v10.16b, v4.16b, v5.16b, #2 // [2][1]
ext v11.16b, v0.16b, v1.16b, #4 // [0][2]
ext v12.16b, v2.16b, v3.16b, #4 // [1][2]
ext v13.16b, v4.16b, v5.16b, #4 // [2][2]

add v14.8h, v2.8h, v8.8h // [1][0] + [0][1]
add v15.8h, v9.8h, v10.8h // [1][1] + [2][1]

add v28.8h, v0.8h, v11.8h // [0][0] + [0][2]
add v14.8h, v14.8h, v12.8h // () + [1][2]
add v29.8h, v4.8h, v13.8h // [2][0] + [2][2]

ext v8.16b, v6.16b, v7.16b, #2 // [3][1]
ext v11.16b, v6.16b, v7.16b, #4 // [3][2]

add v14.8h, v14.8h, v15.8h // mid
add v15.8h, v28.8h, v29.8h // corners

add v28.8h, v4.8h, v9.8h // [2][0] + [1][1]
add v29.8h, v10.8h, v8.8h // [2][1] + [3][1]

add v2.8h, v2.8h, v12.8h // [1][0] + [1][2]
add v28.8h, v28.8h, v13.8h // () + [2][2]
add v4.8h, v6.8h, v11.8h // [3][0] + [3][2]

add v0.8h, v28.8h, v29.8h // mid
add v2.8h, v2.8h, v4.8h // corners

shl v4.8h, v14.8h, #2
mla v4.8h, v15.8h, v30.8h // * 3 -> a

shl v0.8h, v0.8h, #2
mla v0.8h, v2.8h, v30.8h // * 3 -> a

ext v8.16b, v16.16b, v17.16b, #4 // [0][1]
ext v9.16b, v17.16b, v18.16b, #4
ext v10.16b, v16.16b, v17.16b, #8 // [0][2]
ext v11.16b, v17.16b, v18.16b, #8
ext v12.16b, v19.16b, v20.16b, #4 // [1][1]
ext v13.16b, v20.16b, v21.16b, #4
add v8.4s, v8.4s, v19.4s // [0][1] + [1][0]
add v9.4s, v9.4s, v20.4s
add v16.4s, v16.4s, v10.4s // [0][0] + [0][2]
add v17.4s, v17.4s, v11.4s
ext v14.16b, v19.16b, v20.16b, #8 // [1][2]
ext v15.16b, v20.16b, v21.16b, #8
add v16.4s, v16.4s, v22.4s // () + [2][0]
add v17.4s, v17.4s, v23.4s
add v28.4s, v12.4s, v14.4s // [1][1] + [1][2]
add v29.4s, v13.4s, v15.4s
ext v10.16b, v22.16b, v23.16b, #4 // [2][1]
ext v11.16b, v23.16b, v24.16b, #4
add v8.4s, v8.4s, v28.4s // mid (incomplete)
add v9.4s, v9.4s, v29.4s

add v19.4s, v19.4s, v14.4s // [1][0] + [1][2]
add v20.4s, v20.4s, v15.4s
add v14.4s, v22.4s, v12.4s // [2][0] + [1][1]
add v15.4s, v23.4s, v13.4s

ext v12.16b, v22.16b, v23.16b, #8 // [2][2]
ext v13.16b, v23.16b, v24.16b, #8
ext v28.16b, v25.16b, v26.16b, #4 // [3][1]
ext v29.16b, v26.16b, v27.16b, #4
add v8.4s, v8.4s, v10.4s // () + [2][1] = mid
add v9.4s, v9.4s, v11.4s
add v14.4s, v14.4s, v10.4s // () + [2][1]
add v15.4s, v15.4s, v11.4s
ext v10.16b, v25.16b, v26.16b, #8 // [3][2]
ext v11.16b, v26.16b, v27.16b, #8
add v16.4s, v16.4s, v12.4s // () + [2][2] = corner
add v17.4s, v17.4s, v13.4s

add v12.4s, v12.4s, v28.4s // [2][2] + [3][1]
add v13.4s, v13.4s, v29.4s
add v25.4s, v25.4s, v10.4s // [3][0] + [3][2]
add v26.4s, v26.4s, v11.4s

add v14.4s, v14.4s, v12.4s // mid
add v15.4s, v15.4s, v13.4s
add v19.4s, v19.4s, v25.4s // corner
add v20.4s, v20.4s, v26.4s

.if \bpc == 8
sub x2, x2, x13
ld1 {v25.8b}, [x1], #8 // src
ld1 {v26.8b}, [x2], #8
.else
sub x2, x2, x13, lsl #1
ld1 {v25.8h}, [x1], #16 // src
ld1 {v26.8h}, [x2], #16
.endif
sub x12, x12, x13
sub x11, x11, x13
sub x11, x11, #4 // We read 4 extra elements from a
sub x14, x11, #4 // We read 8 extra elements from b
mov x13, x5

shl v8.4s, v8.4s, #2
shl v9.4s, v9.4s, #2
mla v8.4s, v16.4s, v31.4s // * 3 -> b
mla v9.4s, v17.4s, v31.4s

.if \bpc == 8
uxtl v25.8h, v25.8b // src
uxtl v26.8h, v26.8b
.endif

shl v14.4s, v14.4s, #2
shl v15.4s, v15.4s, #2
mla v14.4s, v19.4s, v31.4s // * 3 -> b
mla v15.4s, v20.4s, v31.4s

umlal v8.4s, v4.4h, v25.4h // b + a * src
umlal2 v9.4s, v4.8h, v25.8h
umlal v14.4s, v0.4h, v26.4h // b + a * src
umlal2 v15.4s, v0.8h, v26.8h
mov v0.16b, v1.16b
rshrn v8.4h, v8.4s, #9
rshrn2 v8.8h, v9.4s, #9
mov v2.16b, v3.16b
rshrn v14.4h, v14.4s, #9
rshrn2 v14.8h, v15.4s, #9
subs w5, w5, #8
mov v4.16b, v5.16b
st1 {v8.8h}, [x0], #16
mov v6.16b, v7.16b
st1 {v14.8h}, [x13], #16

b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
mov v22.16b, v24.16b
mov v25.16b, v27.16b
ld1 {v1.8h}, [x10], #16
ld1 {v3.8h}, [x11], #16
ld1 {v5.8h}, [x12], #16
ld1 {v7.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x8], #32
ld1 {v23.4s, v24.4s}, [x9], #32
ld1 {v26.4s, v27.4s}, [x3], #32
b 2b

3:
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
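Per the lane comments above, the 3x3-class weighting that finish_filter1 applies is 4x the cross (centre included) plus 3x the corners, on both the 16-bit sums (giving the multiplier a) and the 32-bit sums (giving the offset b), followed by the b + a*src combine with a 9-bit rounding shift. As a scalar sketch (index and stride names illustrative):

    // One output sample of finish_filter1 (B: 16-bit sums, A: 32-bit sums).
    static void finish_filter1_px(int16_t *tmp, const uint16_t *src,
                                  const int32_t *A, const int16_t *B,
                                  int stride, int i)
    {
        const int a = 4 * (B[i] + B[i-1] + B[i+1] + B[i-stride] + B[i+stride])
                    + 3 * (B[i-stride-1] + B[i-stride+1] +
                           B[i+stride-1] + B[i+stride+1]);
        const int32_t b = 4 * (A[i] + A[i-1] + A[i+1] +
                               A[i-stride] + A[i+stride])
                        + 3 * (A[i-stride-1] + A[i-stride+1] +
                               A[i+stride-1] + A[i+stride+1]);
        tmp[i] = (int16_t)((b + a * src[i] + (1 << 8)) >> 9); // rshrn #9
    }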

// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
//                                           const int32_t **a, const int16_t **b,
//                                           const int w, const int w1,
//                                           const int bitdepth_max);
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
ldp x7, x8, [x1]
ldr x1, [x1, #16]
ldp x9, x10, [x2]
ldr x2, [x2, #16]

dup v31.8h, w4
dup v30.8h, w5

movi v6.8h, #3
movi v7.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x4], #32
ld1 {v4.8h, v5.8h}, [x10], #32
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x10], #32
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x1], #48

2:
subs x5, x5, #8
ext v25.16b, v0.16b, v1.16b, #2 // -stride
ext v26.16b, v2.16b, v3.16b, #2 // 0
ext v27.16b, v4.16b, v5.16b, #2 // +stride
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
ext v29.16b, v2.16b, v3.16b, #4 // +1
ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
add v2.8h, v2.8h, v25.8h // -1, -stride
ext v25.16b, v4.16b, v5.16b, #4 // +1+stride
add v26.8h, v26.8h, v27.8h // 0, +stride
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
add v2.8h, v2.8h, v26.8h
add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
add v4.8h, v4.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v2.8h, v29.8h // +1
add v0.8h, v0.8h, v4.8h

@ -85,7 +257,7 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
ext v28.16b, v17.16b, v18.16b, #8
ext v29.16b, v19.16b, v20.16b, #4 // 0
ext v30.16b, v20.16b, v21.16b, #4
ext v4.16b, v20.16b, v21.16b, #4
mla v2.8h, v0.8h, v6.8h // * 3 -> a
add v25.4s, v25.4s, v19.4s // -stride, -1
add v26.4s, v26.4s, v20.4s
@ -96,22 +268,22 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
add v16.4s, v16.4s, v22.4s // -1+stride
add v17.4s, v17.4s, v23.4s
add v29.4s, v29.4s, v27.4s // 0, +1
add v30.4s, v30.4s, v28.4s
add v4.4s, v4.4s, v28.4s
add v25.4s, v25.4s, v29.4s
add v26.4s, v26.4s, v30.4s
add v26.4s, v26.4s, v4.4s
ext v27.16b, v22.16b, v23.16b, #4 // +stride
ext v28.16b, v23.16b, v24.16b, #4
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
ext v30.16b, v23.16b, v24.16b, #8
ext v4.16b, v23.16b, v24.16b, #8
.if \bpc == 8
ld1 {v19.8b}, [x1], #8 // src
ld1 {v19.8b}, [x0] // src
.else
ld1 {v19.8h}, [x1], #16 // src
ld1 {v19.8h}, [x0] // src
.endif
add v25.4s, v25.4s, v27.4s // +stride
add v26.4s, v26.4s, v28.4s
add v16.4s, v16.4s, v29.4s // +1+stride
add v17.4s, v17.4s, v30.4s
add v17.4s, v17.4s, v4.4s
shl v25.4s, v25.4s, #2
shl v26.4s, v26.4s, #2
mla v25.4s, v16.4s, v7.4s // * 3 -> b
@ -125,61 +297,68 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
mov v2.16b, v3.16b
rshrn v25.4h, v25.4s, #9
rshrn2 v25.8h, v26.4s, #9
mov v4.16b, v5.16b
st1 {v25.8h}, [x0], #16

b.le 3f
subs w3, w3, #8

// weighted1
shl v19.8h, v19.8h, #4 // u
mov v4.16b, v5.16b

sub v25.8h, v25.8h, v19.8h // t1 - u
ld1 {v1.8h}, [x9], #16
ushll v26.4s, v19.4h, #7 // u << 7
ushll2 v27.4s, v19.8h, #7 // u << 7
ld1 {v3.8h}, [x10], #16
smlal v26.4s, v25.4h, v31.4h // v
smlal2 v27.4s, v25.8h, v31.8h // v
ld1 {v5.8h}, [x2], #16
.if \bpc == 8
rshrn v26.4h, v26.4s, #11
rshrn2 v26.8h, v27.4s, #11
mov v16.16b, v18.16b
sqxtun v26.8b, v26.8h
mov v19.16b, v21.16b
mov v22.16b, v24.16b
ld1 {v1.8h}, [x9], #16
ld1 {v3.8h}, [x4], #16
ld1 {v5.8h}, [x10], #16
st1 {v26.8b}, [x0], #8
.else
sqrshrun v26.4h, v26.4s, #11
sqrshrun2 v26.8h, v27.4s, #11
mov v16.16b, v18.16b
umin v26.8h, v26.8h, v30.8h
mov v19.16b, v21.16b
mov v22.16b, v24.16b
st1 {v26.8h}, [x0], #16
.endif

b.le 3f
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x3], #32
ld1 {v23.4s, v24.4s}, [x8], #32
ld1 {v20.4s, v21.4s}, [x8], #32
ld1 {v23.4s, v24.4s}, [x1], #32
b 2b

3:
subs x6, x6, #1
b.le 0f
mov x5, x13
add x0, x0, x12, lsl #1
add x1, x1, x2
add x3, x3, x11, lsl #2
add x7, x7, x11, lsl #2
add x8, x8, x11, lsl #2
add x4, x4, x14, lsl #1
add x9, x9, x14, lsl #1
add x10, x10, x14, lsl #1
b 1b
0:
ret
endfunc

// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
//                                         const pixel *src, const ptrdiff_t stride,
//                                         const int32_t *a, const int16_t *b,
//                                         const int w, const int h);
function sgr_finish_filter2_\bpc\()bpc_neon, export=1
add x7, x3, #(4*(SUM_STRIDE))
sub x3, x3, #(4*(SUM_STRIDE))
add x8, x4, #(2*(SUM_STRIDE))
sub x4, x4, #(2*(SUM_STRIDE))
mov x9, #(2*SUM_STRIDE)
// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]

ldp x3, x7, [x3]
ldp x4, x8, [x4]
mov x10, #FILTER_OUT_STRIDE
add x11, x5, #7
bic x11, x11, #7 // Aligned width
.if \bpc == 8
sub x2, x2, x11
.else
sub x2, x2, x11, lsl #1
.endif
sub x10, x10, x11
sub x9, x9, x11
sub x9, x9, #4 // We read 4 extra elements from a
sub x12, x9, #4 // We read 8 extra elements from b
mov x11, x5
cmp w6, #1
add x2, x1, x2 // src + stride
csel x2, x1, x2, le // if (h <= 1) x2 = x1
add x10, x0, x10, lsl #1
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
@ -191,7 +370,6 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
subs x5, x5, #8
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
@ -201,6 +379,9 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h

mul v8.8h, v25.8h, v4.8h // * 5
mla v8.8h, v23.8h, v6.8h // * 6

ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
@ -213,8 +394,10 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v31.8b}, [x1], #8
ld1 {v30.8b}, [x2], #8
.else
ld1 {v31.8h}, [x1], #16
ld1 {v30.8h}, [x2], #16
.endif
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
@ -223,6 +406,11 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s

mul v9.4s, v19.4s, v5.4s // * 5
mla v9.4s, v24.4s, v7.4s // * 6
mul v10.4s, v20.4s, v5.4s // * 5
mla v10.4s, v25.4s, v7.4s // * 6

add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
@ -234,16 +422,23 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1

.if \bpc == 8
uxtl v31.8h, v31.8b
uxtl v30.8h, v30.8b
.endif
umlal v16.4s, v0.4h, v31.4h // b + a * src
umlal2 v17.4s, v0.8h, v31.8h
umlal v9.4s, v8.4h, v30.4h // b + a * src
umlal2 v10.4s, v8.8h, v30.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
rshrn v9.4h, v9.4s, #8
rshrn2 v9.8h, v10.4s, #8
subs w5, w5, #8
mov v2.16b, v3.16b
st1 {v16.8h}, [x0], #16
st1 {v16.8h}, [x0], #16
st1 {v9.8h}, [x10], #16

b.le 3f
b.le 9f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
ld1 {v1.8h}, [x4], #16
@ -252,201 +447,160 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b

3:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
add x3, x3, x9, lsl #2
add x7, x7, x9, lsl #2
add x4, x4, x12, lsl #1
add x8, x8, x12, lsl #1
mov x13, x3
mov x14, x4

ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48

4:
subs x5, x5, #8
ext v23.16b, v0.16b, v1.16b, #4 // +1
ext v22.16b, v0.16b, v1.16b, #2 // 0
add v0.8h, v0.8h, v23.8h // -1, +1

ext v24.16b, v16.16b, v17.16b, #4 // 0
ext v25.16b, v17.16b, v18.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1
ext v27.16b, v17.16b, v18.16b, #8
mul v2.8h, v22.8h, v6.8h // * 6
mla v2.8h, v0.8h, v4.8h // * 5 -> a
.if \bpc == 8
ld1 {v31.8b}, [x1], #8
.else
ld1 {v31.8h}, [x1], #16
.endif
add v16.4s, v16.4s, v26.4s // -1, +1
add v17.4s, v17.4s, v27.4s
.if \bpc == 8
uxtl v31.8h, v31.8b
.endif
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v24.4s, v24.4s, v7.4s // * 6
mla v24.4s, v16.4s, v5.4s // * 5 -> b
mul v25.4s, v25.4s, v7.4s // * 6
mla v25.4s, v17.4s, v5.4s // * 5 -> b

umlal v24.4s, v2.4h, v31.4h // b + a * src
umlal2 v25.4s, v2.8h, v31.8h
mov v0.16b, v1.16b
rshrn v24.4h, v24.4s, #8
rshrn2 v24.8h, v25.4s, #8
mov v16.16b, v18.16b
st1 {v24.8h}, [x0], #16

b.le 5f
ld1 {v1.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x3], #32
b 4b

5:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
mov x3, x13 // Rewind x3/x4 to where they started
mov x4, x14
b 1b
0:
9:
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
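The 5x5-class filter above uses the weights 6 and 5 instead: full rows combine 6x the vertical neighbours with 5x the diagonals (9-bit rounding shift), while the in-between rows combine 6x the centre with 5x its horizontal neighbours (8-bit shift). Scalar sketch with illustrative names (A: 32-bit sums, B: 16-bit sums, both taken from the nearest full row):

    // finish_filter2 weighting, one sample (illustrative).
    static void finish_filter2_px(int16_t *tmp, const uint16_t *src,
                                  const int32_t *A, const int16_t *B,
                                  int stride, int i, int odd_row)
    {
        if (!odd_row) {
            const int a = 6 * (B[i-stride] + B[i+stride])
                        + 5 * (B[i-stride-1] + B[i-stride+1] +
                               B[i+stride-1] + B[i+stride+1]);
            const int32_t b = 6 * (A[i-stride] + A[i+stride])
                            + 5 * (A[i-stride-1] + A[i-stride+1] +
                                   A[i+stride-1] + A[i+stride+1]);
            tmp[i] = (int16_t)((b + a * src[i] + (1 << 8)) >> 9);
        } else {
            const int a = 6 * B[i] + 5 * (B[i-1] + B[i+1]);
            const int32_t b = 6 * A[i] + 5 * (A[i-1] + A[i+1]);
            tmp[i] = (int16_t)((b + a * src[i] + (1 << 7)) >> 8);
        }
    }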

// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
//                                    const pixel *src, const ptrdiff_t src_stride,
//                                    const int16_t *t1, const int w, const int h,
//                                    const int wt, const int bitdepth_max);
function sgr_weighted1_\bpc\()bpc_neon, export=1
.if \bpc == 16
ldr w8, [sp]
.endif
dup v31.8h, w7
cmp x6, #2
.if \bpc == 16
dup v30.8h, w8
.endif
add x9, x0, x1
add x10, x2, x3
add x11, x4, #2*FILTER_OUT_STRIDE
mov x7, #(4*FILTER_OUT_STRIDE)
lsl x1, x1, #1
lsl x3, x3, #1
add x8, x5, #7
bic x8, x8, #7 // Aligned width
.if \bpc == 8
sub x1, x1, x8
sub x3, x3, x8
.else
sub x1, x1, x8, lsl #1
sub x3, x3, x8, lsl #1
.endif
sub x7, x7, x8, lsl #1
mov x8, x5
b.lt 2f
1:
.if \bpc == 8
ld1 {v0.8b}, [x2], #8
ld1 {v4.8b}, [x10], #8
.else
ld1 {v0.8h}, [x2], #16
ld1 {v4.8h}, [x10], #16
.endif
ld1 {v1.8h}, [x4], #16
ld1 {v5.8h}, [x11], #16
subs x5, x5, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
ushll v4.8h, v4.8b, #4 // u
.else
shl v0.8h, v0.8h, #4 // u
shl v4.8h, v4.8h, #4 // u
.endif
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v5.8h, v5.8h, v4.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
ushll v6.4s, v4.4h, #7 // u << 7
ushll2 v7.4s, v4.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
smlal v6.4s, v5.4h, v31.4h // v
smlal2 v7.4s, v5.8h, v31.8h // v
.if \bpc == 8
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
rshrn v6.4h, v6.4s, #11
rshrn2 v6.8h, v7.4s, #11
sqxtun v2.8b, v2.8h
sqxtun v6.8b, v6.8h
st1 {v2.8b}, [x0], #8
st1 {v6.8b}, [x9], #8
.else
sqrshrun v2.4h, v2.4s, #11
sqrshrun2 v2.8h, v3.4s, #11
sqrshrun v6.4h, v6.4s, #11
sqrshrun2 v6.8h, v7.4s, #11
umin v2.8h, v2.8h, v30.8h
umin v6.8h, v6.8h, v30.8h
st1 {v2.8h}, [x0], #16
st1 {v6.8h}, [x9], #16
.endif
b.gt 1b
// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                           const int32_t **a,
//                                           const int16_t **b,
//                                           const int w, const int h,
//                                           const int w1,
//                                           const int bitdepth_max);
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x30]!
str d10, [sp, #0x10]
stp d14, d15, [sp, #0x20]

sub x6, x6, #2
cmp x6, #1
b.lt 0f
mov x5, x8
add x0, x0, x1
add x9, x9, x1
add x2, x2, x3
add x10, x10, x3
add x4, x4, x7
add x11, x11, x7
b.eq 2f
b 1b
dup v14.8h, w6
dup v15.8h, w7

ldp x2, x7, [x2]
ldp x3, x8, [x3]
cmp w5, #1
add x1, x0, x1 // src + stride
// if (h <= 1), set the pointer to the second row to any dummy buffer
// we can clobber (x2 in this case)
csel x1, x2, x1, le
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
movi v7.4s, #6
1:
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x2], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h

mul v8.8h, v25.8h, v4.8h // * 5
mla v8.8h, v23.8h, v6.8h // * 6

ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
ext v25.16b, v20.16b, v21.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
ext v27.16b, v17.16b, v18.16b, #8
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
ext v29.16b, v20.16b, v21.16b, #8
mul v0.8h, v0.8h, v4.8h // * 5
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v0.8b}, [x2], #8
ld1 {v31.8b}, [x0] // src
ld1 {v30.8b}, [x1]
.else
ld1 {v0.8h}, [x2], #16
ld1 {v31.8h}, [x0] // src
ld1 {v30.8h}, [x1]
.endif
ld1 {v1.8h}, [x4], #16
subs x5, x5, #8
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
add v20.4s, v20.4s, v29.4s
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s

mul v9.4s, v19.4s, v5.4s // * 5
mla v9.4s, v24.4s, v7.4s // * 6
mul v10.4s, v20.4s, v5.4s // * 5
mla v10.4s, v25.4s, v7.4s // * 6

add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v16.4s, v16.4s, v5.4s // * 5
mla v16.4s, v22.4s, v7.4s // * 6
mul v17.4s, v17.4s, v5.4s // * 5
mla v17.4s, v23.4s, v7.4s // * 6

.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
.else
shl v0.8h, v0.8h, #4 // u
uxtl v31.8h, v31.8b
uxtl v30.8h, v30.8b
.endif
sub v1.8h, v1.8h, v0.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
umlal v16.4s, v0.4h, v31.4h // b + a * src
umlal2 v17.4s, v0.8h, v31.8h
umlal v9.4s, v8.4h, v30.4h // b + a * src
umlal2 v10.4s, v8.8h, v30.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
rshrn v9.4h, v9.4s, #8
rshrn2 v9.8h, v10.4s, #8

subs w4, w4, #8

// weighted1
shl v31.8h, v31.8h, #4 // u
shl v30.8h, v30.8h, #4
mov v2.16b, v3.16b

sub v16.8h, v16.8h, v31.8h // t1 - u
sub v9.8h, v9.8h, v30.8h
ld1 {v1.8h}, [x3], #16
ushll v22.4s, v31.4h, #7 // u << 7
ushll2 v23.4s, v31.8h, #7
ushll v24.4s, v30.4h, #7
ushll2 v25.4s, v30.8h, #7
ld1 {v3.8h}, [x8], #16
smlal v22.4s, v16.4h, v14.4h // v
smlal2 v23.4s, v16.8h, v14.8h
mov v16.16b, v18.16b
smlal v24.4s, v9.4h, v14.4h
smlal2 v25.4s, v9.8h, v14.8h
mov v19.16b, v21.16b
.if \bpc == 8
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
sqxtun v2.8b, v2.8h
st1 {v2.8b}, [x0], #8
rshrn v22.4h, v22.4s, #11
rshrn2 v22.8h, v23.4s, #11
rshrn v23.4h, v24.4s, #11
rshrn2 v23.8h, v25.4s, #11
sqxtun v22.8b, v22.8h
sqxtun v23.8b, v23.8h
st1 {v22.8b}, [x0], #8
st1 {v23.8b}, [x1], #8
.else
sqrshrun v2.4h, v2.4s, #11
sqrshrun2 v2.8h, v3.4s, #11
umin v2.8h, v2.8h, v30.8h
st1 {v2.8h}, [x0], #16
sqrshrun v22.4h, v22.4s, #11
sqrshrun2 v22.8h, v23.4s, #11
sqrshrun v23.4h, v24.4s, #11
sqrshrun2 v23.8h, v25.4s, #11
umin v22.8h, v22.8h, v15.8h
umin v23.8h, v23.8h, v15.8h
st1 {v22.8h}, [x0], #16
st1 {v23.8h}, [x1], #16
.endif
b.gt 2b
0:

b.le 3f
ld1 {v17.4s, v18.4s}, [x2], #32
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b

3:
ldp d14, d15, [sp, #0x20]
ldr d10, [sp, #0x10]
ldp d8, d9, [sp], 0x30
ret
endfunc
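The "weighted1" epilogue above blends the filtered value t back into the source sample: u = src << 4, then v = (u << 7) + w1 * (t - u), and the result is (v + (1 << 10)) >> 11, clamped to the pixel range (sqxtun for 8bpc, umin against bitdepth_max otherwise). Scalar sketch (helper name illustrative):

    // One sample of the w1 blend; clamp matches the sqxtun/umin step.
    static uint16_t weighted1_px(uint16_t px, int16_t t, int w1, int bd_max)
    {
        const int u = px << 4;                      // shl/ushll #4
        const int32_t v = (u << 7) + w1 * (t - u);  // smlal
        int r = (int)((v + (1 << 10)) >> 11);       // rshrn/sqrshrun #11
        if (r < 0) r = 0;
        if (r > bd_max) r = bd_max;
        return (uint16_t)r;
    }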

@ -461,7 +615,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
.else
ldp x8, x9, [sp]
.endif
cmp x7, #2
cmp w7, #2
add x10, x0, x1
add x11, x2, x3
add x12, x4, #2*FILTER_OUT_STRIDE
@ -483,7 +637,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
sub x3, x3, x9, lsl #1
.endif
sub x8, x8, x9, lsl #1
mov x9, x6
mov w9, w6
b.lt 2f
1:
.if \bpc == 8
@ -497,7 +651,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
ld1 {v17.8h}, [x12], #16
ld1 {v2.8h}, [x5], #16
ld1 {v18.8h}, [x13], #16
subs x6, x6, #8
subs w6, w6, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
ushll v16.8h, v16.8b, #4 // u
@ -542,10 +696,10 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
.endif
b.gt 1b

subs x7, x7, #2
cmp x7, #1
subs w7, w7, #2
cmp w7, #1
b.lt 0f
mov x6, x9
mov w6, w9
add x0, x0, x1
add x10, x10, x1
add x2, x2, x3
@ -565,7 +719,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
.endif
ld1 {v1.8h}, [x4], #16
ld1 {v2.8h}, [x5], #16
subs x6, x6, #8
subs w6, w6, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
.else

201
third_party/dav1d/src/arm/64/refmvs.S
vendored
@ -89,3 +89,204 @@ L(splat_tbl):
.hword L(splat_tbl) - 20b
.hword L(splat_tbl) - 10b
endfunc

const mv_tbls, align=4
.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
.byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
.byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
.byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

const mask_mult, align=4
.byte 1, 2, 1, 2, 0, 0, 0, 0
endconst

// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                           refmvs_block **rr, const uint8_t *ref_sign,
//                           int col_end8, int row_end8,
//                           int col_start8, int row_start8)
function save_tmvs_neon, export=1
AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp, #-16]!
mov x29, sp

movi v30.8b, #0
ld1 {v31.8b}, [x3]
adr x8, L(save_tmvs_tbl)
movrel x16, mask_mult
movrel x13, mv_tbls
ld1 {v29.8b}, [x16]
ext v31.8b, v30.8b, v31.8b, #7 // [0, ref_sign]
mov w15, #5
mov w14, #12*2
sxtw x4, w4
sxtw x6, w6
mul w1, w1, w15 // stride *= 5
sub w5, w5, w7 // h = row_end8 - row_start8
lsl w7, w7, #1 // row_start8 <<= 1
1:
mov w15, #5
and w9, w7, #30 // (y & 15) * 2
ldr x9, [x2, w9, uxtw #3] // b = rr[(y & 15) * 2]
add x9, x9, #12 // &b[... + 1]
madd x10, x4, x14, x9 // end_cand_b = &b[col_end8*2 + 1]
madd x9, x6, x14, x9 // cand_b = &b[x*2 + 1]

madd x3, x6, x15, x0 // &rp[x]

2:
ldrb w11, [x9, #10] // cand_b->bs
ld1 {v0.16b}, [x9] // cand_b->mv
add x11, x8, w11, uxtw #2
ldr h1, [x9, #8] // cand_b->ref
ldrh w12, [x11] // bw8
mov x15, x8
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
cmp x9, x10
mov v2.8b, v0.8b
b.ge 3f

ldrb w15, [x9, #10] // cand_b->bs
add x16, x9, #8
ld1 {v4.16b}, [x9] // cand_b->mv
add x15, x8, w15, uxtw #2
ld1 {v1.h}[1], [x16] // cand_b->ref
ldrh w12, [x15] // bw8
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
trn1 v2.2d, v0.2d, v4.2d

3:
abs v2.8h, v2.8h // abs(mv[].xy)
tbl v1.8b, {v31.16b}, v1.8b // ref_sign[ref]
ushr v2.8h, v2.8h, #12 // abs(mv[].xy) >> 12
umull v1.8h, v1.8b, v29.8b // ref_sign[ref] * {1, 2}
cmeq v2.4s, v2.4s, #0 // abs(mv[].xy) <= 4096
xtn v2.4h, v2.4s // abs() condition to 16 bit
and v1.8b, v1.8b, v2.8b // h[0-3] contains conditions for mv[0-1]
addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0]
umov w16, v1.h[0] // Extract case for first block
umov w17, v1.h[1]
ldrh w11, [x11, #2] // Fetch jump table entry
ldrh w15, [x15, #2]
ldr q1, [x13, w16, uxtw #4] // Load permutation table based on case
ldr q5, [x13, w17, uxtw #4]
sub x11, x8, w11, uxtw // Find jump table target
sub x15, x8, w15, uxtw
tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
tbl v4.16b, {v4.16b}, v5.16b

// v1 follows on v0, with another 3 full repetitions of the pattern.
ext v1.16b, v0.16b, v0.16b, #1
ext v5.16b, v4.16b, v4.16b, #1
// v2 ends with 3 complete repetitions of the pattern.
ext v2.16b, v0.16b, v1.16b, #4
ext v6.16b, v4.16b, v5.16b, #4

blr x11
b.ge 4f // if (cand_b >= end)
mov v0.16b, v4.16b
mov v1.16b, v5.16b
mov v2.16b, v6.16b
cmp x9, x10
blr x15
b.lt 2b // if (cand_b < end)

4:
subs w5, w5, #1 // h--
add w7, w7, #2 // y += 2
add x0, x0, x1 // rp += stride
b.gt 1b

ldp x29, x30, [sp], #16
AARCH64_VALIDATE_LINK_REGISTER
ret

10:
AARCH64_VALID_JUMP_TARGET
add x16, x3, #4
st1 {v0.s}[0], [x3]
st1 {v0.b}[4], [x16]
add x3, x3, #5
ret
20:
AARCH64_VALID_JUMP_TARGET
add x16, x3, #8
st1 {v0.d}[0], [x3]
st1 {v0.h}[4], [x16]
add x3, x3, #2*5
ret
40:
AARCH64_VALID_JUMP_TARGET
st1 {v0.16b}, [x3]
str s1, [x3, #16]
add x3, x3, #4*5
ret
80:
AARCH64_VALID_JUMP_TARGET
// This writes 6 full entries plus 2 extra bytes
st1 {v0.16b, v1.16b}, [x3]
// Write the last few, overlapping with the first write.
stur q2, [x3, #(8*5-16)]
add x3, x3, #8*5
ret
160:
AARCH64_VALID_JUMP_TARGET
add x16, x3, #6*5
add x17, x3, #12*5
// This writes 6 full entries plus 2 extra bytes
st1 {v0.16b, v1.16b}, [x3]
// Write another 6 full entries, slightly overlapping with the first set
st1 {v0.16b, v1.16b}, [x16]
// Write 8 bytes (one full entry) after the first 12
st1 {v0.8b}, [x17]
// Write the last 3 entries
str q2, [x3, #(16*5-16)]
add x3, x3, #16*5
ret

L(save_tmvs_tbl):
.hword 16 * 12
.hword L(save_tmvs_tbl) - 160b
.hword 16 * 12
.hword L(save_tmvs_tbl) - 160b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
endfunc
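The jump-table targets above all emit the same packed layout: each 8x8 column gets one 5-byte refmvs_temporal_block ({mv, ref}), replicated bw8 times, which is why the permuted vectors repeat the pattern and the wide stores are allowed to overlap. A scalar model of one store run (types simplified, helper name illustrative, assuming <string.h>):

    // Scalar model of one jump-table store run.
    static void store_tmv_run(uint8_t *rp, const uint8_t mv[4],
                              uint8_t ref, int bw8)
    {
        for (int i = 0; i < bw8; i++) {
            memcpy(rp, mv, 4); // motion vector, 4 bytes
            rp[4] = ref;       // reference index, 1 byte
            rp += 5;           // one packed refmvs_temporal_block
        }
    }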
|
||||
|
848
third_party/dav1d/src/arm/looprestoration.h
vendored
848
third_party/dav1d/src/arm/looprestoration.h
vendored
@ -105,6 +105,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
}
#endif

#if ARCH_ARM
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
                                const pixel (*left)[4],
                                const pixel *src, const ptrdiff_t stride,
@ -246,6 +247,853 @@ static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
                                  tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
}

#else
static void rotate(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
    int32_t *tmp32 = sumsq_ptrs[0];
    int16_t *tmp16 = sum_ptrs[0];
    for (int i = 0; i < n - 1; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i+1];
        sum_ptrs[i] = sum_ptrs[i+1];
    }
    sumsq_ptrs[n - 1] = tmp32;
    sum_ptrs[n - 1] = tmp16;
}
static void rotate5_x2(int32_t **sumsq_ptrs, int16_t **sum_ptrs) {
    int32_t *tmp32[2];
    int16_t *tmp16[2];
    for (int i = 0; i < 2; i++) {
        tmp32[i] = sumsq_ptrs[i];
        tmp16[i] = sum_ptrs[i];
    }
    for (int i = 0; i < 3; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i+2];
        sum_ptrs[i] = sum_ptrs[i+2];
    }
    for (int i = 0; i < 2; i++) {
        sumsq_ptrs[3 + i] = tmp32[i];
        sum_ptrs[3 + i] = tmp16[i];
    }
}

static void rotate_ab_3(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 3);
}

static void rotate_ab_2(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 2);
}

static void rotate_ab_4(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 4);
}
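A self-contained sketch (illustration only, not part of dav1d) of what rotate() does to the ring of row pointers: the oldest row's buffers move to the back so they can be overwritten by the next incoming row, while the remaining rows shift forward.

#include <stdint.h>
#include <stdio.h>

/* Copy of the rotation logic above, renamed for the demo. */
static void rotate_demo(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
    int32_t *tmp32 = sumsq_ptrs[0];
    int16_t *tmp16 = sum_ptrs[0];
    for (int i = 0; i < n - 1; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
        sum_ptrs[i] = sum_ptrs[i + 1];
    }
    sumsq_ptrs[n - 1] = tmp32;
    sum_ptrs[n - 1] = tmp16;
}

int main(void) {
    int32_t r0, r1, r2;
    int16_t s0, s1, s2;
    int32_t *sumsq[3] = { &r0, &r1, &r2 };
    int16_t *sum[3]   = { &s0, &s1, &s2 };
    rotate_demo(sumsq, sum, 3);
    /* The oldest row (&r0/&s0) is now last and will be overwritten next. */
    printf("%d\n", sumsq[2] == &r0 && sum[2] == &s0); /* prints 1 */
    return 0;
}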
void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum,
                                    const pixel (*left)[4],
                                    const pixel *src, const int w,
                                    const enum LrEdgeFlags edges);
void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum,
                                    const pixel (*left)[4],
                                    const pixel *src, const int w,
                                    const enum LrEdgeFlags edges);
void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3,
                                     int32_t *sumsq5, int16_t *sum5,
                                     const pixel (*left)[4],
                                     const pixel *src, const int w,
                                     const enum LrEdgeFlags edges);

void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
                              int32_t *AA, int16_t *BB,
                              const int w, const int s,
                              const int bitdepth_max);
void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
                              int32_t *AA, int16_t *BB,
                              const int w, const int s,
                              const int bitdepth_max);

void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst,
                                          int32_t **A_ptrs, int16_t **B_ptrs,
                                          const int w, const int w1
                                          HIGHBD_DECL_SUFFIX);
void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride,
                                          int32_t **A_ptrs, int16_t **B_ptrs,
                                          const int w, const int h,
                                          const int w1 HIGHBD_DECL_SUFFIX);

void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src,
                                              const ptrdiff_t src_stride,
                                              int32_t **A_ptrs,
                                              int16_t **B_ptrs,
                                              const int w, const int h);
void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src,
                                              const ptrdiff_t src_stride,
                                              int32_t **A_ptrs, int16_t **B_ptrs,
                                              const int w, const int h);
void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
                                   const pixel *src, const ptrdiff_t src_stride,
                                   const int16_t *t1, const int16_t *t2,
                                   const int w, const int h,
                                   const int16_t wt[2] HIGHBD_DECL_SUFFIX);

static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w, int s, int bitdepth_max) {
    // box3_v + calc_ab1
    dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
    rotate(sumsq, sum, 3);
}

static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w, int s, int bitdepth_max) {
    // box5_v + calc_ab2
    dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
    rotate5_x2(sumsq, sum);
}

static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum,
                             int32_t *AA, int16_t *BB,
                             const pixel (*left)[4],
                             const pixel *src, const int w,
                             const int s,
                             const enum LrEdgeFlags edges,
                             const int bitdepth_max) {
    BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges);
    sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max);
}


static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride,
                             int32_t **A_ptrs, int16_t **B_ptrs, const int w,
                             const int w1 HIGHBD_DECL_SUFFIX) {
    BF(dav1d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs,
                                         w, w1 HIGHBD_TAIL_SUFFIX);
    *dst += PXSTRIDE(stride);
    rotate_ab_3(A_ptrs, B_ptrs);
}

static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride,
                             int32_t **A_ptrs, int16_t **B_ptrs,
                             const int w, const int h, const int w1
                             HIGHBD_DECL_SUFFIX) {
    BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs,
                                         w, h, w1 HIGHBD_TAIL_SUFFIX);
    *dst += 2*PXSTRIDE(stride);
    rotate_ab_2(A_ptrs, B_ptrs);
}

static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride,
                                int32_t **A5_ptrs, int16_t **B5_ptrs,
                                int32_t **A3_ptrs, int16_t **B3_ptrs,
                                const int w, const int h,
                                const int w0, const int w1 HIGHBD_DECL_SUFFIX) {
#define FILTER_OUT_STRIDE 384
    ALIGN_STK_16(int16_t, tmp5, 2*FILTER_OUT_STRIDE,);
    ALIGN_STK_16(int16_t, tmp3, 2*FILTER_OUT_STRIDE,);

    BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride,
                                             A5_ptrs, B5_ptrs, w, h);
    BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride,
                                             A3_ptrs, B3_ptrs, w, h);
    const int16_t wt[2] = { w0, w1 };
    BF(dav1d_sgr_weighted2, neon)(*dst, stride, *dst, stride,
                                  tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX);
    *dst += h*PXSTRIDE(stride);
    rotate_ab_2(A5_ptrs, B5_ptrs);
    rotate_ab_4(A3_ptrs, B3_ptrs);
}


static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
#define BUF_STRIDE (384 + 16)
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 3 + 16,);
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
    int16_t *sum_ptrs[3], *sum_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 3 + 16,);
    int32_t *A_ptrs[3];
    int16_t *B_ptrs[3];
    for (int i = 0; i < 3; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[1];
        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[1];
        sum_ptrs[2] = sum_rows[2];

        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_1;

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;
    } else {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];

        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[2] = sumsq_rows[1];
        sum_ptrs[2] = sum_rows[1];

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[2] = sum_rows[2];
    }

    do {
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);

        sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    lpf_bottom += PXSTRIDE(stride);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

output_1:
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_3(A_ptrs, B_ptrs);
    goto output_1;
}
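In scalar form, the vert_1/vert_2 padding paths above amount to a 3-tap vertical box sum with edge replication. A toy model (illustration only, not dav1d code; it ignores the squared sums and the a/b calculation, and assumes row_sums holds one horizontal sum per row with h >= 1):

#include <stdint.h>

static void box3_vert_scalar(const int32_t *row_sums, int h, int32_t *out) {
    for (int y = 0; y < h; y++) {
        const int32_t above = row_sums[y > 0     ? y - 1 : 0    ]; /* replicate top */
        const int32_t below = row_sums[y < h - 1 ? y + 1 : h - 1]; /* replicate bottom */
        out[y] = above + row_sums[y] + below;
    }
}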
static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
    int16_t *sum_ptrs[5], *sum_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A_ptrs[2];
    int16_t *B_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[1];
        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[1];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    } else {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sumsq_ptrs[3] = sumsq_rows[0];
        sumsq_ptrs[4] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];
        sum_ptrs[3] = sum_rows[0];
        sum_ptrs[4] = sum_rows[0];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[4] = sumsq_rows[1];
        sum_ptrs[4] = sum_rows[1];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    }

    do {
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                   NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                   NULL, lpf_bottom, w, edges);

output_2:
    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];
    goto output_2;

odd:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    // Output only one row
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    rotate_ab_2(A_ptrs, B_ptrs);

    goto output_1;
}

static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(int16_t, sum5_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
    int16_t *sum5_ptrs[5], *sum5_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, sum3_buf, BUF_STRIDE * 3 + 16,);
    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
    int16_t *sum3_ptrs[3], *sum3_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(int16_t, B5_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A5_ptrs[2];
    int16_t *B5_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
    ALIGN_STK_16(int16_t, B3_buf, BUF_STRIDE * 4 + 16,);
    int32_t *A3_ptrs[4];
    int16_t *B3_ptrs[4];
    for (int i = 0; i < 4; i++) {
        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq5_ptrs[0] = sumsq5_rows[0];
        sumsq5_ptrs[1] = sumsq5_rows[0];
        sumsq5_ptrs[2] = sumsq5_rows[1];
        sumsq5_ptrs[3] = sumsq5_rows[2];
        sumsq5_ptrs[4] = sumsq5_rows[3];
        sum5_ptrs[0] = sum5_rows[0];
        sum5_ptrs[1] = sum5_rows[0];
        sum5_ptrs[2] = sum5_rows[1];
        sum5_ptrs[3] = sum5_rows[2];
        sum5_ptrs[4] = sum5_rows[3];

        sumsq3_ptrs[0] = sumsq3_rows[0];
        sumsq3_ptrs[1] = sumsq3_rows[1];
        sumsq3_ptrs[2] = sumsq3_rows[2];
        sum3_ptrs[0] = sum3_rows[0];
        sum3_ptrs[1] = sum3_rows[1];
        sum3_ptrs[2] = sum3_rows[2];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
                                        sumsq5_rows[0], sum5_rows[0],
                                        NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
                                        sumsq5_rows[1], sum5_rows[1],
                                        NULL, lpf, w, edges);

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
                                        sumsq5_rows[2], sum5_rows[2],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_1;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_rows[3], sum5_rows[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A5_ptrs, B5_ptrs);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq5_ptrs[3] = sumsq5_rows[4];
        sum5_ptrs[3] = sum5_rows[4];
    } else {
        sumsq5_ptrs[0] = sumsq5_rows[0];
        sumsq5_ptrs[1] = sumsq5_rows[0];
        sumsq5_ptrs[2] = sumsq5_rows[0];
        sumsq5_ptrs[3] = sumsq5_rows[0];
        sumsq5_ptrs[4] = sumsq5_rows[0];
        sum5_ptrs[0] = sum5_rows[0];
        sum5_ptrs[1] = sum5_rows[0];
        sum5_ptrs[2] = sum5_rows[0];
        sum5_ptrs[3] = sum5_rows[0];
        sum5_ptrs[4] = sum5_rows[0];

        sumsq3_ptrs[0] = sumsq3_rows[0];
        sumsq3_ptrs[1] = sumsq3_rows[0];
        sumsq3_ptrs[2] = sumsq3_rows[0];
        sum3_ptrs[0] = sum3_rows[0];
        sum3_ptrs[1] = sum3_rows[0];
        sum3_ptrs[2] = sum3_rows[0];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
                                        sumsq5_rows[0], sum5_rows[0],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_1;

        sumsq5_ptrs[4] = sumsq5_rows[1];
        sum5_ptrs[4] = sum5_rows[1];

        sumsq3_ptrs[2] = sumsq3_rows[1];
        sum3_ptrs[2] = sum3_rows[1];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
                                        sumsq5_rows[1], sum5_rows[1],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A5_ptrs, B5_ptrs);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_2;

        sumsq5_ptrs[3] = sumsq5_rows[2];
        sumsq5_ptrs[4] = sumsq5_rows[3];
        sum5_ptrs[3] = sum5_rows[2];
        sum5_ptrs[4] = sum5_rows[3];

        sumsq3_ptrs[2] = sumsq3_rows[2];
        sum3_ptrs[2] = sum3_rows[2];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
                                        sumsq5_rows[2], sum5_rows[2],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_rows[3], sum5_rows[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                            w, 2, params->sgr.w0, params->sgr.w1
                            HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq5_ptrs[3] = sumsq5_rows[4];
        sum5_ptrs[3] = sum5_rows[4];
    }

    do {
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_ptrs[3], sum5_ptrs[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_ptrs[4], sum5_ptrs[4],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                            w, 2, params->sgr.w0, params->sgr.w1
                            HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                    sumsq5_ptrs[3], sum5_ptrs[3],
                                    NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                    sumsq5_ptrs[4], sum5_ptrs[4],
                                    NULL, lpf_bottom, w, edges);

output_2:
    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 2, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    sum5_ptrs[3] = sum5_ptrs[2];
    sum5_ptrs[4] = sum5_ptrs[2];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    goto output_2;

odd:
    // Copy the last row as padding once
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    sum5_ptrs[4] = sum5_ptrs[3];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 2, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    sum5_ptrs[3] = sum5_ptrs[2];
    sum5_ptrs[4] = sum5_ptrs[2];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);
    // Output only one row
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 1, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Copy the last row as padding once
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    sum5_ptrs[4] = sum5_ptrs[3];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    rotate_ab_2(A5_ptrs, B5_ptrs);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    goto output_1;
}

#endif


static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
    const unsigned flags = dav1d_get_cpu_flags();
2
third_party/dav1d/src/arm/refmvs.h
vendored
@ -28,6 +28,7 @@
#include "src/cpu.h"
#include "src/refmvs.h"

decl_save_tmvs_fn(dav1d_save_tmvs_neon);
decl_splat_mv_fn(dav1d_splat_mv_neon);

static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
@ -35,5 +36,6 @@ static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {

    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

    c->save_tmvs = dav1d_save_tmvs_neon;
    c->splat_mv = dav1d_splat_mv_neon;
}
15
third_party/dav1d/src/data.c
vendored
@ -44,7 +44,7 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
    validate_input_or_ret(buf != NULL, NULL);

    if (sz > SIZE_MAX / 2) return NULL;
    buf->ref = dav1d_ref_create(sz);
    buf->ref = dav1d_ref_create(ALLOC_DAV1DDATA, sz);
    if (!buf->ref) return NULL;
    buf->data = buf->ref->const_data;
    buf->sz = sz;
@ -65,7 +65,7 @@ int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
    validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));

    if (sz > SIZE_MAX / 2) return DAV1D_ERR(EINVAL);
    Dav1dRef *const ref = malloc(sizeof(Dav1dRef));
    Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef));
    if (!ref) return DAV1D_ERR(ENOMEM);

    buf->ref = dav1d_ref_init(ref, ptr, free_callback, cookie, 1);
@ -86,7 +86,7 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
    validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
    validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));

    Dav1dRef *const ref = malloc(sizeof(Dav1dRef));
    Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef));
    if (!ref) return DAV1D_ERR(ENOMEM);

    buf->m.user_data.ref = dav1d_ref_init(ref, user_data, free_callback, cookie, 1);
@ -95,14 +95,13 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
    return 0;
}

void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
    validate_input(dst != NULL);
    validate_input(dst->data == NULL);
    validate_input(src != NULL);
    assert(dst != NULL);
    assert(dst->data == NULL);
    assert(src != NULL);

    if (src->ref) {
        validate_input(src->data != NULL);
        assert(src->data != NULL);
        dav1d_ref_inc(src->ref);
    }
    if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
71
third_party/dav1d/src/decode.c
vendored
@ -2932,8 +2932,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
    int retval = DAV1D_ERR(ENOMEM);

    if (f->sbh > f->lf.start_of_tile_row_sz) {
        free(f->lf.start_of_tile_row);
        f->lf.start_of_tile_row = malloc(f->sbh * sizeof(uint8_t));
        dav1d_free(f->lf.start_of_tile_row);
        f->lf.start_of_tile_row = dav1d_malloc(ALLOC_TILE, f->sbh * sizeof(uint8_t));
        if (!f->lf.start_of_tile_row) {
            f->lf.start_of_tile_row_sz = 0;
            goto error;
@ -2950,24 +2950,24 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
    const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
    if (n_ts != f->n_ts) {
        if (c->n_fc > 1) {
            freep(&f->frame_thread.tile_start_off);
            dav1d_free(f->frame_thread.tile_start_off);
            f->frame_thread.tile_start_off =
                malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts);
                dav1d_malloc(ALLOC_TILE, sizeof(*f->frame_thread.tile_start_off) * n_ts);
            if (!f->frame_thread.tile_start_off) {
                f->n_ts = 0;
                goto error;
            }
        }
        dav1d_free_aligned(f->ts);
        f->ts = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32);
        f->ts = dav1d_alloc_aligned(ALLOC_TILE, sizeof(*f->ts) * n_ts, 32);
        if (!f->ts) goto error;
        f->n_ts = n_ts;
    }

    const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1));
    if (a_sz != f->a_sz) {
        freep(&f->a);
        f->a = malloc(sizeof(*f->a) * a_sz);
        dav1d_free(f->a);
        f->a = dav1d_malloc(ALLOC_TILE, sizeof(*f->a) * a_sz);
        if (!f->a) {
            f->a_sz = 0;
            goto error;
@ -2993,9 +2993,10 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh;
    if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) {
        free(f->tile_thread.lowest_pixel_mem);
        dav1d_free(f->tile_thread.lowest_pixel_mem);
        f->tile_thread.lowest_pixel_mem =
            malloc(lowest_pixel_mem_sz * sizeof(*f->tile_thread.lowest_pixel_mem));
            dav1d_malloc(ALLOC_TILE, lowest_pixel_mem_sz *
                         sizeof(*f->tile_thread.lowest_pixel_mem));
        if (!f->tile_thread.lowest_pixel_mem) {
            f->tile_thread.lowest_pixel_mem_sz = 0;
            goto error;
@ -3016,9 +3017,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
    if (cf_sz != f->frame_thread.cf_sz) {
        dav1d_freep_aligned(&f->frame_thread.cf);
        dav1d_free_aligned(f->frame_thread.cf);
        f->frame_thread.cf =
            dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 64);
            dav1d_alloc_aligned(ALLOC_COEF, (size_t)cf_sz * 128 * 128 / 2, 64);
        if (!f->frame_thread.cf) {
            f->frame_thread.cf_sz = 0;
            goto error;
@ -3029,9 +3030,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    if (f->frame_hdr->allow_screen_content_tools) {
        if (num_sb128 != f->frame_thread.pal_sz) {
            dav1d_freep_aligned(&f->frame_thread.pal);
            dav1d_free_aligned(f->frame_thread.pal);
            f->frame_thread.pal =
                dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) *
                dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) *
                                    num_sb128 * 16 * 16, 64);
            if (!f->frame_thread.pal) {
                f->frame_thread.pal_sz = 0;
@ -3042,9 +3043,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    const int pal_idx_sz = num_sb128 * size_mul[1];
    if (pal_idx_sz != f->frame_thread.pal_idx_sz) {
        dav1d_freep_aligned(&f->frame_thread.pal_idx);
        dav1d_free_aligned(f->frame_thread.pal_idx);
        f->frame_thread.pal_idx =
            dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) *
            dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
                                pal_idx_sz * 128 * 128 / 4, 64);
        if (!f->frame_thread.pal_idx) {
            f->frame_thread.pal_idx_sz = 0;
@ -3072,7 +3073,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
        size_t alloc_sz = 64;
        alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh << need_cdef_lpf_copy;
        alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh << need_cdef_lpf_copy;
        uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32);
        uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(ALLOC_CDEF, alloc_sz, 32);
        if (!ptr) {
            f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0;
            goto error;
@ -3132,7 +3133,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
        size_t alloc_sz = 128;
        alloc_sz += (size_t)llabs(y_stride) * num_lines;
        alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2;
        uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(alloc_sz, 64);
        uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(ALLOC_LR, alloc_sz, 64);
        if (!ptr) {
            f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0;
            goto error;
@ -3158,23 +3159,23 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    // update allocation for loopfilter masks
    if (num_sb128 != f->lf.mask_sz) {
        freep(&f->lf.mask);
        freep(&f->lf.level);
        f->lf.mask = malloc(sizeof(*f->lf.mask) * num_sb128);
        dav1d_free(f->lf.mask);
        dav1d_free(f->lf.level);
        f->lf.mask = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.mask) * num_sb128);
        // over-allocate by 3 bytes since some of the SIMD implementations
        // index this from the level type and can thus over-read by up to 3
        f->lf.level = malloc(sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
        f->lf.level = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
        if (!f->lf.mask || !f->lf.level) {
            f->lf.mask_sz = 0;
            goto error;
        }
        if (c->n_fc > 1) {
            freep(&f->frame_thread.b);
            freep(&f->frame_thread.cbi);
            f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) *
                                       num_sb128 * 32 * 32);
            f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) *
                                         num_sb128 * 32 * 32);
            dav1d_free(f->frame_thread.b);
            dav1d_free(f->frame_thread.cbi);
            f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) *
                                             num_sb128 * 32 * 32);
            f->frame_thread.cbi = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
                                               num_sb128 * 32 * 32);
            if (!f->frame_thread.b || !f->frame_thread.cbi) {
                f->lf.mask_sz = 0;
                goto error;
@ -3186,8 +3187,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
    f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;
    const int lr_mask_sz = f->sr_sb128w * f->sb128h;
    if (lr_mask_sz != f->lf.lr_mask_sz) {
        freep(&f->lf.lr_mask);
        f->lf.lr_mask = malloc(sizeof(*f->lf.lr_mask) * lr_mask_sz);
        dav1d_free(f->lf.lr_mask);
        f->lf.lr_mask = dav1d_malloc(ALLOC_LR, sizeof(*f->lf.lr_mask) * lr_mask_sz);
        if (!f->lf.lr_mask) {
            f->lf.lr_mask_sz = 0;
            goto error;
@ -3207,9 +3208,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    const int ipred_edge_sz = f->sbh * f->sb128w << hbd;
    if (ipred_edge_sz != f->ipred_edge_sz) {
        dav1d_freep_aligned(&f->ipred_edge[0]);
        dav1d_free_aligned(f->ipred_edge[0]);
        uint8_t *ptr = f->ipred_edge[0] =
            dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 64);
            dav1d_alloc_aligned(ALLOC_IPRED, ipred_edge_sz * 128 * 3, 64);
        if (!ptr) {
            f->ipred_edge_sz = 0;
            goto error;
@ -3221,8 +3222,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    const int re_sz = f->sb128h * f->frame_hdr->tiling.cols;
    if (re_sz != f->lf.re_sz) {
        freep(&f->lf.tx_lpf_right_edge[0]);
        f->lf.tx_lpf_right_edge[0] = malloc(re_sz * 32 * 2);
        dav1d_free(f->lf.tx_lpf_right_edge[0]);
        f->lf.tx_lpf_right_edge[0] = dav1d_malloc(ALLOC_LF, re_sz * 32 * 2);
        if (!f->lf.tx_lpf_right_edge[0]) {
            f->lf.re_sz = 0;
            goto error;
@ -3656,9 +3657,9 @@ int dav1d_submit_frame(Dav1dContext *const c) {

    // FIXME qsort so tiles are in order (for frame threading)
    if (f->n_tile_data_alloc < c->n_tile_data) {
        freep(&f->tile);
        dav1d_free(f->tile);
        assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile));
        f->tile = malloc(c->n_tile_data * sizeof(*f->tile));
        f->tile = dav1d_malloc(ALLOC_TILE, c->n_tile_data * sizeof(*f->tile));
        if (!f->tile) {
            f->n_tile_data_alloc = f->n_tile_data = 0;
            res = DAV1D_ERR(ENOMEM);
71
third_party/dav1d/src/lib.c
vendored
@ -63,6 +63,12 @@ COLD const char *dav1d_version(void) {
    return DAV1D_VERSION;
}

COLD unsigned dav1d_version_api(void) {
    return (DAV1D_API_VERSION_MAJOR << 16) |
           (DAV1D_API_VERSION_MINOR <<  8) |
           (DAV1D_API_VERSION_PATCH <<  0);
}
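Callers can unpack the returned value with shifts mirroring the packing above. A short usage sketch (assumes the program links against this libdav1d; with the API version bumped to 7.0.0 in this update it would print "dav1d API 7.0.0"):

#include <stdio.h>
#include <dav1d/dav1d.h>

int main(void) {
    const unsigned v = dav1d_version_api();
    printf("dav1d API %u.%u.%u\n",
           (v >> 16) & 0xff,  /* major */
           (v >>  8) & 0xff,  /* minor */
            v        & 0xff); /* patch */
    return 0;
}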

COLD void dav1d_default_settings(Dav1dSettings *const s) {
    s->n_threads = 0;
    s->max_frame_delay = 0;
@ -155,7 +161,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {

    pthread_attr_setstacksize(&thread_attr, stack_size);

    Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 64);
    Dav1dContext *const c = *c_out = dav1d_alloc_aligned(ALLOC_COMMON_CTX, sizeof(*c), 64);
    if (!c) goto error;
    memset(c, 0, sizeof(*c));

@ -172,12 +178,12 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {

    dav1d_data_props_set_defaults(&c->cached_error_props);

    if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
        dav1d_mem_pool_init(&c->frame_hdr_pool) ||
        dav1d_mem_pool_init(&c->segmap_pool) ||
        dav1d_mem_pool_init(&c->refmvs_pool) ||
        dav1d_mem_pool_init(&c->pic_ctx_pool) ||
        dav1d_mem_pool_init(&c->cdf_pool))
    if (dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->seq_hdr_pool) ||
        dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->frame_hdr_pool) ||
        dav1d_mem_pool_init(ALLOC_SEGMAP, &c->segmap_pool) ||
        dav1d_mem_pool_init(ALLOC_REFMVS, &c->refmvs_pool) ||
        dav1d_mem_pool_init(ALLOC_PIC_CTX, &c->pic_ctx_pool) ||
        dav1d_mem_pool_init(ALLOC_CDF, &c->cdf_pool))
    {
        goto error;
    }
@ -186,7 +192,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
        c->allocator.release_picture_callback == dav1d_default_picture_release)
    {
        if (c->allocator.cookie) goto error;
        if (dav1d_mem_pool_init(&c->picture_pool)) goto error;
        if (dav1d_mem_pool_init(ALLOC_PIC, &c->picture_pool)) goto error;
        c->allocator.cookie = c->picture_pool;
    } else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc ||
               c->allocator.release_picture_callback == dav1d_default_picture_release)
@ -210,11 +216,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {

    get_num_threads(c, s, &c->n_tc, &c->n_fc);

    c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32);
    c->fc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->fc) * c->n_fc, 32);
    if (!c->fc) goto error;
    memset(c->fc, 0, sizeof(*c->fc) * c->n_fc);

    c->tc = dav1d_alloc_aligned(sizeof(*c->tc) * c->n_tc, 64);
    c->tc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->tc) * c->n_tc, 64);
    if (!c->tc) goto error;
    memset(c->tc, 0, sizeof(*c->tc) * c->n_tc);
    if (c->n_tc > 1) {
@ -235,9 +241,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
    }

    if (c->n_fc > 1) {
        const size_t out_delayed_sz = sizeof(*c->frame_thread.out_delayed) * c->n_fc;
        c->frame_thread.out_delayed =
            calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
            dav1d_malloc(ALLOC_THREAD_CTX, out_delayed_sz);
        if (!c->frame_thread.out_delayed) goto error;
        memset(c->frame_thread.out_delayed, 0, out_delayed_sz);
    }
    for (unsigned n = 0; n < c->n_fc; n++) {
        Dav1dFrameContext *const f = &c->fc[n];
@ -592,6 +600,9 @@ void dav1d_flush(Dav1dContext *const c) {

COLD void dav1d_close(Dav1dContext **const c_out) {
    validate_input(c_out != NULL);
#if TRACK_HEAP_ALLOCATIONS
    dav1d_log_alloc_stats(*c_out);
#endif
    close_internal(c_out, 1);
}

@ -628,31 +639,31 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {

        // clean-up threading stuff
        if (c->n_fc > 1) {
            freep(&f->tile_thread.lowest_pixel_mem);
            freep(&f->frame_thread.b);
            dav1d_freep_aligned(&f->frame_thread.pal_idx);
            dav1d_freep_aligned(&f->frame_thread.cf);
            freep(&f->frame_thread.tile_start_off);
            dav1d_freep_aligned(&f->frame_thread.pal);
            freep(&f->frame_thread.cbi);
            dav1d_free(f->tile_thread.lowest_pixel_mem);
            dav1d_free(f->frame_thread.b);
            dav1d_free_aligned(f->frame_thread.pal_idx);
            dav1d_free_aligned(f->frame_thread.cf);
            dav1d_free(f->frame_thread.tile_start_off);
            dav1d_free_aligned(f->frame_thread.pal);
            dav1d_free(f->frame_thread.cbi);
        }
        if (c->n_tc > 1) {
            pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
            pthread_cond_destroy(&f->task_thread.cond);
            pthread_mutex_destroy(&f->task_thread.lock);
        }
        freep(&f->frame_thread.frame_progress);
        freep(&f->task_thread.tasks);
        freep(&f->task_thread.tile_tasks[0]);
        dav1d_free(f->frame_thread.frame_progress);
        dav1d_free(f->task_thread.tasks);
        dav1d_free(f->task_thread.tile_tasks[0]);
        dav1d_free_aligned(f->ts);
        dav1d_free_aligned(f->ipred_edge[0]);
        free(f->a);
        free(f->tile);
        free(f->lf.mask);
        free(f->lf.lr_mask);
        free(f->lf.level);
        free(f->lf.tx_lpf_right_edge[0]);
        free(f->lf.start_of_tile_row);
        dav1d_free(f->a);
        dav1d_free(f->tile);
        dav1d_free(f->lf.mask);
        dav1d_free(f->lf.level);
        dav1d_free(f->lf.lr_mask);
        dav1d_free(f->lf.tx_lpf_right_edge[0]);
        dav1d_free(f->lf.start_of_tile_row);
        dav1d_refmvs_clear(&f->rf);
        dav1d_free_aligned(f->lf.cdef_line_buf);
        dav1d_free_aligned(f->lf.lr_line_buf);
@ -662,11 +673,11 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
        for (unsigned n = 0; n < c->n_fc; n++)
            if (c->frame_thread.out_delayed[n].p.frame_hdr)
                dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
        free(c->frame_thread.out_delayed);
        dav1d_free(c->frame_thread.out_delayed);
    }
    for (int n = 0; n < c->n_tile_data; n++)
        dav1d_data_unref_internal(&c->tile[n].data);
    free(c->tile);
    dav1d_free(c->tile);
    for (int n = 0; n < 8; n++) {
        dav1d_cdf_thread_unref(&c->cdf[n]);
        if (c->refs[n].p.p.frame_hdr)
2
third_party/dav1d/src/log.c
vendored
@ -44,7 +44,7 @@ COLD void dav1d_log_default_callback(void *const cookie,
}

COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) {
    validate_input(c != NULL);
    assert(c != NULL);

    if (!c->logger.callback)
        return;
217
third_party/dav1d/src/mem.c
vendored
@ -31,9 +31,208 @@

#include "src/internal.h"

#if TRACK_HEAP_ALLOCATIONS
#include <stdio.h>

#include "src/log.h"

#define DEFAULT_ALIGN 16

typedef struct {
    size_t sz;
    unsigned align;
    enum AllocationType type;
} Dav1dAllocationData;

typedef struct {
    size_t curr_sz;
    size_t peak_sz;
    unsigned num_allocs;
    unsigned num_reuses;
} AllocStats;

static AllocStats tracked_allocs[N_ALLOC_TYPES];
static size_t curr_total_sz;
static size_t peak_total_sz;
static pthread_mutex_t track_alloc_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *track_alloc(const enum AllocationType type, char *ptr,
                         const size_t sz, const size_t align)
{
    assert(align >= sizeof(Dav1dAllocationData));
    if (ptr) {
        ptr += align;
        Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1];
        AllocStats *const s = &tracked_allocs[type];

        d->sz = sz;
        d->align = (unsigned)align;
        d->type = type;

        pthread_mutex_lock(&track_alloc_mutex);
        s->num_allocs++;
        s->curr_sz += sz;
        if (s->curr_sz > s->peak_sz)
            s->peak_sz = s->curr_sz;

        curr_total_sz += sz;
        if (curr_total_sz > peak_total_sz)
            peak_total_sz = curr_total_sz;
        pthread_mutex_unlock(&track_alloc_mutex);
    }
    return ptr;
}

static void *track_free(char *const ptr) {
    const Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1];
    const size_t sz = d->sz;

    pthread_mutex_lock(&track_alloc_mutex);
    tracked_allocs[d->type].curr_sz -= sz;
    curr_total_sz -= sz;
    pthread_mutex_unlock(&track_alloc_mutex);

    return ptr - d->align;
}
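track_alloc() and track_free() rely on over-allocating by the alignment and stashing a Dav1dAllocationData header in the padding just below the pointer handed back to the caller. A standalone sketch of the same trick (wrap_malloc/wrap_free, Header, and PAD are illustrative names, not dav1d API):

#include <stdlib.h>

typedef struct { size_t sz; } Header;
#define PAD 16 /* must be >= sizeof(Header); plays the role of DEFAULT_ALIGN */

static void *wrap_malloc(const size_t sz) {
    char *const raw = malloc(sz + PAD);
    if (!raw) return NULL;
    char *const user = raw + PAD;
    ((Header *)user)[-1].sz = sz; /* header sits just below the user pointer */
    return user;
}

static void wrap_free(void *const user) {
    if (user) free((char *)user - PAD); /* recover the raw allocation */
}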

static void dav1d_track_reuse(const enum AllocationType type) {
    pthread_mutex_lock(&track_alloc_mutex);
    tracked_allocs[type].num_reuses++;
    pthread_mutex_unlock(&track_alloc_mutex);
}

void *dav1d_malloc(const enum AllocationType type, const size_t sz) {
    void *const ptr = malloc(sz + DEFAULT_ALIGN);
    return track_alloc(type, ptr, sz, DEFAULT_ALIGN);
}

void *dav1d_alloc_aligned(const enum AllocationType type,
                          const size_t sz, const size_t align)
{
    assert(!(align & (align - 1)));
    void *ptr;
#ifdef _WIN32
    ptr = _aligned_malloc(sz + align, align);
#elif defined(HAVE_POSIX_MEMALIGN)
    if (posix_memalign(&ptr, align, sz + align)) return NULL;
#else
    ptr = memalign(align, sz + align);
#endif

    return track_alloc(type, ptr, sz, align);
}

void *dav1d_realloc(const enum AllocationType type,
                    void *ptr, const size_t sz)
{
    if (!ptr)
        return dav1d_malloc(type, sz);
    ptr = realloc((char*)ptr - DEFAULT_ALIGN, sz + DEFAULT_ALIGN);
    if (ptr)
        ptr = track_free((char*)ptr + DEFAULT_ALIGN);
    return track_alloc(type, ptr, sz, DEFAULT_ALIGN);
}

void dav1d_free(void *ptr) {
    if (ptr)
        free(track_free(ptr));
}

void dav1d_free_aligned(void *ptr) {
    if (ptr) {
        ptr = track_free(ptr);
#ifdef _WIN32
        _aligned_free(ptr);
#else
        free(ptr);
#endif
    }
}

static COLD int cmp_stats(const void *const a, const void *const b) {
    const size_t a_sz = ((const AllocStats*)a)->peak_sz;
    const size_t b_sz = ((const AllocStats*)b)->peak_sz;
    return a_sz < b_sz ? -1 : a_sz > b_sz;
}

/* Insert spaces as thousands separators for better readability */
static COLD int format_tsep(char *const s, const size_t n, const size_t value) {
    if (value < 1000)
        return snprintf(s, n, "%u", (unsigned)value);

    const int len = format_tsep(s, n, value / 1000);
    assert((size_t)len < n);
    return len + snprintf(s + len, n - len, " %03u", (unsigned)(value % 1000));
}
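For example, assuming a sufficiently large buffer, a value of 1234567 comes back as "1 234 567", with each recursion level emitting one 3-digit group:

char buf[32];
const int len = format_tsep(buf, sizeof(buf), 1234567);
/* buf now holds "1 234 567"; len is 9 */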

COLD void dav1d_log_alloc_stats(Dav1dContext *const c) {
    static const char *const type_names[N_ALLOC_TYPES] = {
        [ALLOC_BLOCK     ] = "Block data",
        [ALLOC_CDEF      ] = "CDEF line buffers",
        [ALLOC_CDF       ] = "CDF contexts",
        [ALLOC_COEF      ] = "Coefficient data",
        [ALLOC_COMMON_CTX] = "Common context data",
        [ALLOC_DAV1DDATA ] = "Dav1dData",
        [ALLOC_IPRED     ] = "Intra pred edges",
        [ALLOC_LF        ] = "Loopfilter data",
        [ALLOC_LR        ] = "Looprestoration data",
        [ALLOC_OBU_HDR   ] = "OBU headers",
        [ALLOC_OBU_META  ] = "OBU metadata",
        [ALLOC_PAL       ] = "Palette data",
        [ALLOC_PIC       ] = "Picture buffers",
        [ALLOC_PIC_CTX   ] = "Picture context data",
        [ALLOC_REFMVS    ] = "Reference mv data",
        [ALLOC_SEGMAP    ] = "Segmentation maps",
        [ALLOC_THREAD_CTX] = "Thread context data",
        [ALLOC_TILE      ] = "Tile data",
    };

    struct {
        AllocStats stats;
        enum AllocationType type;
    } data[N_ALLOC_TYPES];
    unsigned total_allocs = 0;
    unsigned total_reuses = 0;

    pthread_mutex_lock(&track_alloc_mutex);
    for (int i = 0; i < N_ALLOC_TYPES; i++) {
        AllocStats *const s = &data[i].stats;
        *s = tracked_allocs[i];
        data[i].type = i;
        total_allocs += s->num_allocs;
        total_reuses += s->num_reuses;
    }
    size_t total_sz = peak_total_sz;
    pthread_mutex_unlock(&track_alloc_mutex);

    /* Sort types by memory usage */
    qsort(&data, N_ALLOC_TYPES, sizeof(*data), cmp_stats);

    const double inv_total_share = 100.0 / total_sz;
    char total_sz_buf[32];
    const int sz_len = 4 + format_tsep(total_sz_buf, sizeof(total_sz_buf), total_sz);

    dav1d_log(c, "\n Type                  Allocs  Reuses  Share  Peak size\n"
                 "---------------------------------------------------------------------\n");
    for (int i = N_ALLOC_TYPES - 1; i >= 0; i--) {
        const AllocStats *const s = &data[i].stats;
        if (s->num_allocs) {
            const double share = s->peak_sz * inv_total_share;
            char sz_buf[32];
            format_tsep(sz_buf, sizeof(sz_buf), s->peak_sz);
            dav1d_log(c, " %-20s%10u%10u%8.1f%%%*s\n", type_names[data[i].type],
                      s->num_allocs, s->num_reuses, share, sz_len, sz_buf);
        }
    }
    dav1d_log(c, "---------------------------------------------------------------------\n"
                 "%31u%10u %s\n",
              total_allocs, total_reuses, total_sz_buf);
}
#endif /* TRACK_HEAP_ALLOCATIONS */

static COLD void mem_pool_destroy(Dav1dMemPool *const pool) {
    pthread_mutex_destroy(&pool->lock);
    free(pool);
    dav1d_free(pool);
}

void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) {
@ -66,10 +265,14 @@ Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t si
            dav1d_free_aligned(data);
            goto alloc;
        }
#if TRACK_HEAP_ALLOCATIONS
        dav1d_track_reuse(pool->type);
#endif
    } else {
        pthread_mutex_unlock(&pool->lock);
alloc:
        data = dav1d_alloc_aligned(size + sizeof(Dav1dMemPoolBuffer), 64);
        data = dav1d_alloc_aligned(pool->type,
                                   size + sizeof(Dav1dMemPoolBuffer), 64);
        if (!data) {
            pthread_mutex_lock(&pool->lock);
            const int ref_cnt = --pool->ref_cnt;
@ -84,13 +287,19 @@ alloc:
    return buf;
}

COLD int dav1d_mem_pool_init(Dav1dMemPool **const ppool) {
    Dav1dMemPool *const pool = malloc(sizeof(Dav1dMemPool));
COLD int dav1d_mem_pool_init(const enum AllocationType type,
                             Dav1dMemPool **const ppool)
{
    Dav1dMemPool *const pool = dav1d_malloc(ALLOC_COMMON_CTX,
                                            sizeof(Dav1dMemPool));
    if (pool) {
        if (!pthread_mutex_init(&pool->lock, NULL)) {
            pool->buf = NULL;
            pool->ref_cnt = 1;
            pool->end = 0;
#if TRACK_HEAP_ALLOCATIONS
            pool->type = type;
#endif
            *ppool = pool;
            return 0;
        }
86
third_party/dav1d/src/mem.h
vendored
86
third_party/dav1d/src/mem.h
vendored
@ -28,16 +28,42 @@
|
||||
#ifndef DAV1D_SRC_MEM_H
|
||||
#define DAV1D_SRC_MEM_H
|
||||
|
||||
#define TRACK_HEAP_ALLOCATIONS 0
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
|
||||
#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN)
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
#include "dav1d/dav1d.h"
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
#include "src/thread.h"
|
||||
|
||||
enum AllocationType {
|
||||
ALLOC_BLOCK,
|
||||
ALLOC_CDEF,
|
||||
ALLOC_CDF,
|
||||
ALLOC_COEF,
|
||||
ALLOC_COMMON_CTX,
|
||||
ALLOC_DAV1DDATA,
|
||||
ALLOC_IPRED,
|
||||
ALLOC_LF,
|
||||
ALLOC_LR,
|
||||
ALLOC_OBU_HDR,
|
||||
ALLOC_OBU_META,
|
||||
ALLOC_PAL,
|
||||
ALLOC_PIC,
|
||||
ALLOC_PIC_CTX,
|
||||
ALLOC_REFMVS,
|
||||
ALLOC_SEGMAP,
|
||||
ALLOC_THREAD_CTX,
|
||||
ALLOC_TILE,
|
||||
N_ALLOC_TYPES,
|
||||
};
|
||||
|
||||
typedef struct Dav1dMemPoolBuffer {
|
||||
void *data;
|
||||
struct Dav1dMemPoolBuffer *next;
|
||||
@ -48,43 +74,59 @@ typedef struct Dav1dMemPool {
    Dav1dMemPoolBuffer *buf;
    int ref_cnt;
    int end;
#if TRACK_HEAP_ALLOCATIONS
    enum AllocationType type;
#endif
} Dav1dMemPool;

void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size);
int dav1d_mem_pool_init(Dav1dMemPool **pool);
void dav1d_mem_pool_end(Dav1dMemPool *pool);

#if TRACK_HEAP_ALLOCATIONS
void *dav1d_malloc(enum AllocationType type, size_t sz);
void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz);
void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align);
void dav1d_free(void *ptr);
void dav1d_free_aligned(void *ptr);
void dav1d_log_alloc_stats(Dav1dContext *c);
#else
#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
#define dav1d_malloc(type, sz) malloc(sz)
#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
#define dav1d_free(ptr) free(ptr)

/*
 * Allocate align-byte aligned memory. The return value can be released
 * by calling the dav1d_free_aligned() function.
 */
static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) {
    assert(!(align & (align - 1)));
#ifdef HAVE_POSIX_MEMALIGN
#ifdef _WIN32
    return _aligned_malloc(sz, align);
#elif defined(HAVE_POSIX_MEMALIGN)
    void *ptr;
    if (posix_memalign(&ptr, align, sz)) return NULL;
    return ptr;
#elif defined(HAVE_ALIGNED_MALLOC)
    return _aligned_malloc(sz, align);
#elif defined(HAVE_MEMALIGN)
    return memalign(align, sz);
#else
#error Missing aligned alloc implementation
    return memalign(align, sz);
#endif
}
#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align)

static inline void dav1d_free_aligned(void* ptr) {
#ifdef HAVE_POSIX_MEMALIGN
    free(ptr);
#elif defined(HAVE_ALIGNED_MALLOC)
static inline void dav1d_free_aligned(void *ptr) {
#ifdef _WIN32
    _aligned_free(ptr);
#elif defined(HAVE_MEMALIGN)
#else
    free(ptr);
#endif
}

static inline void dav1d_freep_aligned(void* ptr) {
#endif /* TRACK_HEAP_ALLOCATIONS */

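A short sketch of why the two helpers must stay in sync (the buffer use is illustrative only): whichever branch of the #ifdef ladder performed the allocation, dav1d_free_aligned() has to take the matching branch, which is why both now test _WIN32 first:

    void *buf = dav1d_alloc_aligned(ALLOC_LR, 4096, 64); /* _aligned_malloc() under _WIN32 */
    if (buf) dav1d_free_aligned(buf);                    /* _aligned_free() under _WIN32, free() on the POSIX branches */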
void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size);
int dav1d_mem_pool_init(enum AllocationType type, Dav1dMemPool **pool);
void dav1d_mem_pool_end(Dav1dMemPool *pool);

static inline void dav1d_freep_aligned(void *ptr) {
    void **mem = (void **) ptr;
    if (*mem) {
        dav1d_free_aligned(*mem);
@ -92,12 +134,4 @@ static inline void dav1d_freep_aligned(void* ptr) {
    }
}

static inline void freep(void *ptr) {
    void **mem = (void **) ptr;
    if (*mem) {
        free(*mem);
        *mem = NULL;
    }
}

#endif /* DAV1D_SRC_MEM_H */
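With TRACK_HEAP_ALLOCATIONS left at its default of 0, the type tags compile away entirely; a hedged illustration of what the fallback macros above do:

    void *p = dav1d_malloc(ALLOC_OBU_META, 64);   /* expands to malloc(64) */
    p = dav1d_realloc(ALLOC_OBU_META, p, 128);    /* expands to realloc(p, 128) */
    dav1d_free(p);                                /* expands to free(p) */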
27
third_party/dav1d/src/obu.c
vendored
@ -304,7 +304,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
{
    validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
    validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL));
    validate_input_or_ret(sz > 0, DAV1D_ERR(EINVAL));
    validate_input_or_ret(sz > 0 && sz <= SIZE_MAX / 2, DAV1D_ERR(EINVAL));

    GetBits gb;
    dav1d_init_get_bits(&gb, ptr, sz);
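A caller-side sketch of the tightened entry point (buffer names are placeholders; the size bound comes from the new check above):

    Dav1dSequenceHeader seq;
    const int err = dav1d_parse_sequence_header(&seq, obu_data, obu_size);
    if (err) /* DAV1D_ERR(EINVAL) for NULL pointers, zero size, or size > SIZE_MAX / 2 */
        return err;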
@ -609,8 +609,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
            if (!hdr->frame_ref_short_signaling)
                hdr->refidx[i] = dav1d_get_bits(gb, 3);
            if (seqhdr->frame_id_numbers_present) {
                const int delta_ref_frame_id_minus_1 = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits);
                const int ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) & ((1 << seqhdr->frame_id_n_bits) - 1);
                const unsigned delta_ref_frame_id = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits) + 1;
                const unsigned ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id) & ((1 << seqhdr->frame_id_n_bits) - 1);
                Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr;
                if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error;
            }
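A worked example of the wrap-around recovery above, assuming frame_id_n_bits = 8 and a coded 2-bit delta of 2 (so delta_ref_frame_id = 3): with hdr->frame_id = 1, ref_frame_id = (1 + 256 - 3) & 255 = 254, i.e. the subtraction correctly wraps modulo 2^8 instead of going negative.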
@ -705,7 +705,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
                goto error;
            hdr->tiling.n_bytes = dav1d_get_bits(gb, 2) + 1;
        } else {
            hdr->tiling.n_bytes = hdr->tiling.update = 0;
            hdr->tiling.n_bytes = 0;
            hdr->tiling.update = 0;
        }
#if DEBUG_FRAME_HDR
        printf("HDR: post-tiling: off=%td\n",
@ -739,7 +740,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
        hdr->quant.qm_y = dav1d_get_bits(gb, 4);
        hdr->quant.qm_u = dav1d_get_bits(gb, 4);
        hdr->quant.qm_v =
            seqhdr->separate_uv_delta_q ? (int)dav1d_get_bits(gb, 4) :
            seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 4) :
                                          hdr->quant.qm_u;
    }
#if DEBUG_FRAME_HDR
@ -1366,7 +1367,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
            if (!c->frame_hdr) goto error;
            if (c->n_tile_data_alloc < c->n_tile_data + 1) {
                if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
                struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile));
                struct Dav1dTileGroup *tile = dav1d_realloc(ALLOC_TILE, c->tile,
                                                            (c->n_tile_data + 1) * sizeof(*c->tile));
                if (!tile) goto error;
                c->tile = tile;
                memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
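The guard-then-grow pair above is the pattern this patch applies to every growable array; a generic sketch (array and type names hypothetical):

    if ((n + 1) > INT_MAX / (int)sizeof(*arr)) goto error;  /* (n + 1) * sizeof(*arr) must not overflow */
    elem_t *grown = dav1d_realloc(ALLOC_TILE, arr, (n + 1) * sizeof(*arr));
    if (!grown) goto error;
    arr = grown;  /* only overwrite on success so the old block isn't leaked */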
@ -1406,7 +1408,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {

        switch (meta_type) {
        case OBU_META_HDR_CLL: {
            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
            Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META,
                                             sizeof(Dav1dContentLightLevel));
            if (!ref) return DAV1D_ERR(ENOMEM);
            Dav1dContentLightLevel *const content_light = ref->data;

@ -1434,7 +1437,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
            break;
        }
        case OBU_META_HDR_MDCV: {
            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
            Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META,
                                             sizeof(Dav1dMasteringDisplay));
            if (!ref) return DAV1D_ERR(ENOMEM);
            Dav1dMasteringDisplay *const mastering_display = ref->data;

@ -1503,7 +1507,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
            }

            if ((c->n_itut_t35 + 1) > INT_MAX / (int)sizeof(*c->itut_t35)) goto error;
            struct Dav1dITUTT35 *itut_t35 = realloc(c->itut_t35, (c->n_itut_t35 + 1) * sizeof(*c->itut_t35));
            struct Dav1dITUTT35 *itut_t35 = dav1d_realloc(ALLOC_OBU_META, c->itut_t35,
                                                          (c->n_itut_t35 + 1) * sizeof(*c->itut_t35));
            if (!itut_t35) goto error;
            c->itut_t35 = itut_t35;
            memset(c->itut_t35 + c->n_itut_t35, 0, sizeof(*c->itut_t35));
@ -1511,7 +1516,7 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
            struct itut_t35_ctx_context *itut_t35_ctx;
            if (!c->n_itut_t35) {
                assert(!c->itut_t35_ref);
                itut_t35_ctx = malloc(sizeof(struct itut_t35_ctx_context));
                itut_t35_ctx = dav1d_malloc(ALLOC_OBU_META, sizeof(struct itut_t35_ctx_context));
                if (!itut_t35_ctx) goto error;
                c->itut_t35_ref = dav1d_ref_init(&itut_t35_ctx->ref, c->itut_t35,
                                                 dav1d_picture_free_itut_t35, itut_t35_ctx, 0);
@ -1524,7 +1529,7 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
            itut_t35_ctx->n_itut_t35 = c->n_itut_t35 + 1;

            Dav1dITUTT35 *const itut_t35_metadata = &c->itut_t35[c->n_itut_t35];
            itut_t35_metadata->payload = malloc(payload_size);
            itut_t35_metadata->payload = dav1d_malloc(ALLOC_OBU_META, payload_size);
            if (!itut_t35_metadata->payload) goto error;

            itut_t35_metadata->country_code = country_code;
22
third_party/dav1d/src/picture.c
vendored
@ -106,9 +106,9 @@ void dav1d_picture_free_itut_t35(const uint8_t *const data, void *const user_dat
    struct itut_t35_ctx_context *itut_t35_ctx = user_data;

    for (size_t i = 0; i < itut_t35_ctx->n_itut_t35; i++)
        free(itut_t35_ctx->itut_t35[i].payload);
    free(itut_t35_ctx->itut_t35);
    free(itut_t35_ctx);
        dav1d_free(itut_t35_ctx->itut_t35[i].payload);
    dav1d_free(itut_t35_ctx->itut_t35);
    dav1d_free(itut_t35_ctx);
}

static int picture_alloc_with_edges(Dav1dContext *const c,
@ -249,12 +249,12 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con
}

void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
    validate_input(dst != NULL);
    validate_input(dst->data[0] == NULL);
    validate_input(src != NULL);
    assert(dst != NULL);
    assert(dst->data[0] == NULL);
    assert(src != NULL);

    if (src->ref) {
        validate_input(src->data[0] != NULL);
        assert(src->data[0] != NULL);
        dav1d_ref_inc(src->ref);
    }
    if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
@ -267,12 +267,12 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
}

void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
    validate_input(dst != NULL);
    validate_input(dst->data[0] == NULL);
    validate_input(src != NULL);
    assert(dst != NULL);
    assert(dst->data[0] == NULL);
    assert(src != NULL);

    if (src->ref)
        validate_input(src->data[0] != NULL);
        assert(src->data[0] != NULL);

    *dst = *src;
    memset(src, 0, sizeof(*src));
6
third_party/dav1d/src/ref.c
vendored
@ -34,10 +34,10 @@ static void default_free_callback(const uint8_t *const data, void *const user_da
    dav1d_free_aligned(user_data);
}

Dav1dRef *dav1d_ref_create(size_t size) {
Dav1dRef *dav1d_ref_create(const enum AllocationType type, size_t size) {
    size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);

    uint8_t *const data = dav1d_alloc_aligned(size + sizeof(Dav1dRef), 64);
    uint8_t *const data = dav1d_alloc_aligned(type, size + sizeof(Dav1dRef), 64);
    if (!data) return NULL;

    Dav1dRef *const res = (Dav1dRef*)(data + size);
@ -81,6 +81,6 @@ void dav1d_ref_dec(Dav1dRef **const pref) {
    if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
        const int free_ref = ref->free_ref;
        ref->free_callback(ref->const_data, ref->user_data);
        if (free_ref) free(ref);
        if (free_ref) dav1d_free(ref);
    }
}
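A worked instance of the size round-up in dav1d_ref_create(): with 8-byte pointers and size = 13, (13 + 7) & ~7 = 16, so the Dav1dRef header placed at data + size starts on a pointer-aligned boundary.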
6
third_party/dav1d/src/ref.h
vendored
@ -45,7 +45,11 @@ struct Dav1dRef {
    void *user_data;
};

Dav1dRef *dav1d_ref_create(size_t size);
#if !TRACK_HEAP_ALLOCATIONS
#define dav1d_ref_create(type, size) dav1d_ref_create(size)
#endif

Dav1dRef *dav1d_ref_create(enum AllocationType type, size_t size);
Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size);
void dav1d_ref_dec(Dav1dRef **ref);
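A hedged illustration of the call rewriting: with TRACK_HEAP_ALLOCATIONS at 0 the macro strips the tag, so every two-argument call site compiles against the untagged single-argument function:

    Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META, 64); /* preprocesses to dav1d_ref_create(64) */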
4
third_party/dav1d/src/refmvs.c
vendored
@ -817,7 +817,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
    if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
        if (rf->r) dav1d_freep_aligned(&rf->r);
        const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
        rf->r = dav1d_alloc_aligned(sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
        rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
        if (!rf->r) return DAV1D_ERR(ENOMEM);
        rf->r_stride = r_stride;
    }
@ -825,7 +825,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
    const ptrdiff_t rp_stride = r_stride >> 1;
    if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
        if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
        rf->rp_proj = dav1d_alloc_aligned(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
        rf->rp_proj = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
        if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
        rf->rp_stride = rp_stride;
    }
1
third_party/dav1d/src/thread.h
vendored
@ -33,6 +33,7 @@
#include <limits.h>
#include <windows.h>

#define PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT
#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT

typedef struct {
8
third_party/dav1d/src/thread_task.c
vendored
@ -224,7 +224,7 @@ static int create_filter_sbrow(Dav1dFrameContext *const f,
    int num_tasks = f->sbh * (1 + uses_2pass);
    if (num_tasks > f->task_thread.num_tasks) {
        const size_t size = sizeof(Dav1dTask) * num_tasks;
        tasks = realloc(f->task_thread.tasks, size);
        tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tasks, size);
        if (!tasks) return -1;
        memset(tasks, 0, size);
        f->task_thread.tasks = tasks;
@ -237,8 +237,8 @@ static int create_filter_sbrow(Dav1dFrameContext *const f,
    } else {
        const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
        if (prog_sz > f->frame_thread.prog_sz) {
            atomic_uint *const prog = realloc(f->frame_thread.frame_progress,
                                              2 * prog_sz * sizeof(*prog));
            atomic_uint *const prog = dav1d_realloc(ALLOC_COMMON_CTX, f->frame_thread.frame_progress,
                                                    2 * prog_sz * sizeof(*prog));
            if (!prog) return -1;
            f->frame_thread.frame_progress = prog;
            f->frame_thread.copy_lpf_progress = prog + prog_sz;
@ -275,7 +275,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
    int alloc_num_tasks = num_tasks * (1 + uses_2pass);
    if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
        const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
        tasks = realloc(f->task_thread.tile_tasks[0], size);
        tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tile_tasks[0], size);
        if (!tasks) return -1;
        memset(tasks, 0, size);
        f->task_thread.tile_tasks[0] = tasks;
224
third_party/dav1d/src/x86/refmvs.asm
vendored
@ -47,6 +47,10 @@ SECTION_RODATA 64
%endmacro

%if ARCH_X86_64
mv_proj:       dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
               dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092
               dw 1024, 963, 910, 862, 819, 780, 744, 712
               dw 682, 655, 630, 606, 585, 564, 546, 528
splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
               db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7
               db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
@ -61,6 +65,7 @@ cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3
save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128:     times 16 db 128
pq_8192:    dq 8192

save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
                       SAVE_TMVS_TABLE 4, 8, ssse3
@ -329,6 +334,225 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    RET

%if ARCH_X86_64
INIT_XMM sse4
; refmvs_frame *rf, int tile_row_idx,
; int col_start8, int col_end8, int row_start8, int row_end8
cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
                                    stride, rp_proj, roff, troff, \
                                    xendi, xstarti, iw8, ih8, dst
    xor r14d, r14d
    cmp dword [rfq+212], 1 ; n_tile_threads
    mov ih8d, [rfq+20]     ; rf->ih8
    mov iw8d, [rfq+16]     ; rf->iw8
    mov xstartd, xstartd
    mov xendd, xendd
    cmove tridxd, r14d
    lea xstartid, [xstartq-8]
    lea xendid, [xendq+8]
    mov strideq, [rfq+184]
    mov rp_projq, [rfq+176]
    cmp ih8d, yendd
    mov [rsp+0x30], strideq
    cmovs yendd, ih8d
    test xstartid, xstartid
    cmovs xstartid, r14d
    cmp iw8d, xendid
    cmovs xendid, iw8d
    mov troffq, strideq
    shl troffq, 4
    imul troffq, tridxq
    mov dstd, ystartd
    and dstd, 15
    imul dstq, strideq
    add dstq, troffq       ; (16 * tridx + (ystart & 15)) * stride
    lea dstq, [dstq*5]
    add dstq, rp_projq
    lea troffq, [troffq*5] ; 16 * tridx * stride * 5
    lea r13d, [xendq*5]
    lea r12, [strideq*5]
    DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
                _, troff, xendi, xstarti, stride5, _, dst
    lea w5d, [xstartq*5]
    add r7, troffq         ; rp_proj + tile_row_offset
    mov hd, yendd
    mov [rsp+0x28], r7
    add dstq, r13
    sub w5q, r13
    sub hd, ystartd
.init_xloop_start:
    mov x5q, w5q
    test w5b, 1
    jz .init_2blk
    mov dword [dstq+x5q], 0x80008000
    add x5q, 5
    jz .init_next_row
.init_2blk:
    mov dword [dstq+x5q+0], 0x80008000
    mov dword [dstq+x5q+5], 0x80008000
    add x5q, 10
    jl .init_2blk
.init_next_row:
    add dstq, stride5q
    dec hd
    jg .init_xloop_start
    DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
                _, _, xendi, xstarti, stride5, _, n
    mov r13d, [rfq+152]    ; rf->n_mfmvs
    test r13d, r13d
    jz .ret
    mov [rsp+0x0c], r13d
    mov strideq, [rsp+0x30]
    movddup m3, [pq_8192]
    mov r9d, ystartd
    mov [rsp+0x38], yendd
    mov [rsp+0x20], xstartid
    xor nd, nd
    xor n7d, n7d
    imul r9, strideq       ; ystart * stride
    mov [rsp+0x48], rfq
    mov [rsp+0x18], stride5q
    lea r7, [r9*5]
    mov [rsp+0x24], ystartd
    mov [rsp+0x00], r7
.nloop:
    DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
                ref, rp_ref, xendi, xstarti, _, _, n
    mov rfq, [rsp+0x48]
    mov refd, [rfq+56+nq*4]      ; ref2cur
    cmp refd, 0x80000000
    je .next_n
    mov [rsp+0x40], refd
    mov offq, [rsp+0x00]         ; ystart * stride * 5
    movzx refd, byte [rfq+53+nq] ; rf->mfmv_ref[n]
    lea refsignq, [refq-4]
    mov rp_refq, [rfq+168]
    movq m2, refsignq
    add offq, [rp_refq+refq*8]   ; r = rp_ref[ref] + row_offset
    mov [rsp+0x14], nd
    mov yd, ystartd
.yloop:
    mov r11d, [rsp+0x24]   ; ystart
    mov r12d, [rsp+0x38]   ; yend
    mov r14d, yd
    and r14d, ~7           ; y_sb_align
    cmp r11d, r14d
    cmovs r11d, r14d       ; imax(y_sb_align, ystart)
    mov [rsp+0x44], r11d   ; y_proj_start
    add r14d, 8
    cmp r12d, r14d
    cmovs r14d, r12d       ; imin(y_sb_align + 8, yend)
    mov [rsp+0x3c], r14d   ; y_proj_end
    DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
                ref, x, xendi, mvx, mvy, rb, ref2ref
    mov xd, [rsp+0x20]     ; xstarti
.xloop:
    lea rbd, [xq*5]
    add rbq, srcq
    movsx refd, byte [rbq+4]
    test refd, refd
    jz .next_x_bad_ref
    mov rfq, [rsp+0x48]
    lea r14d, [16+n7q+refq]
    mov ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1]
    test ref2refd, ref2refd
    jz .next_x_bad_ref
    lea fracq, [mv_proj]
    movzx fracd, word [fracq+ref2refq*2]
    mov mvd, [rbq]
    imul fracd, [rsp+0x40] ; ref2cur
    pmovsxwq m0, [rbq]
    movd m1, fracd
    punpcklqdq m1, m1
    pmuldq m0, m1          ; mv * frac
    pshufd m1, m0, q3311
    paddd m0, m3
    paddd m0, m1
    psrad m0, 14           ; offset = (xy + (xy >> 31) + 8192) >> 14
    pabsd m1, m0
    packssdw m0, m0
    psrld m1, 6
    packuswb m1, m1
    pxor m0, m2            ; offset ^ ref_sign
    psignd m1, m0          ; apply_sign(abs(offset) >> 6, offset ^ refsign)
    movq mvxq, m1
    lea mvyd, [mvxq+yq]    ; ypos
    sar mvxq, 32
    DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
                ref, x, xendi, mvx, ypos, rb, ref2ref
    cmp yposd, [rsp+0x44]  ; y_proj_start
    jl .next_x_bad_pos_y
    cmp yposd, [rsp+0x3c]  ; y_proj_end
    jge .next_x_bad_pos_y
    and yposd, 15
    add mvxq, xq           ; xpos
    imul yposq, [rsp+0x30] ; pos = (ypos & 15) * stride
    DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
                ref, x, xendi, xpos, pos, rb, ref2ref
    mov dstq, [rsp+0x28]   ; dst = rp_proj + tile_row_offset
    add posq, xposq        ; pos += xpos
    lea posq, [posq*5]
    add dstq, posq         ; dst += pos5
    jmp .write_loop_entry
.write_loop:
    add rbq, 5
    cmp refb, byte [rbq+4]
    jne .xloop
    cmp mvd, [rbq]
    jne .xloop
    add dstq, 5
    inc xposd
.write_loop_entry:
    mov r12d, xd
    and r12d, ~7
    lea r5d, [r12-8]
    cmp r5d, xstartd
    cmovs r5d, xstartd     ; x_proj_start
    cmp xposd, r5d
    jl .next_xpos
    add r12d, 16
    cmp xendd, r12d
    cmovs r12d, xendd      ; x_proj_end
    cmp xposd, r12d
    jge .next_xpos
    mov [dstq+0], mvd
    mov byte [dstq+4], ref2refb
.next_xpos:
    inc xd
    cmp xd, xendid
    jl .write_loop
.next_y:
    DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
    add srcq, [rsp+0x18]   ; stride5
    inc yd
    cmp yd, [rsp+0x38]     ; yend
    jne .yloop
    mov nd, [rsp+0x14]
    mov ystartd, [rsp+0x24]
.next_n:
    add n7d, 7
    inc nd
    cmp nd, [rsp+0x0c]     ; n_mfmvs
    jne .nloop
.ret:
    RET
.next_x:
    DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
    add rbq, 5
    cmp refb, byte [rbq+4]
    jne .xloop
    cmp mvd, [rbq]
    jne .xloop
.next_x_bad_pos_y:
    inc xd
    cmp xd, xendid
    jl .next_x
    jmp .next_y
.next_x_bad_ref:
    inc xd
    cmp xd, xendid
    jl .xloop
    jmp .next_y

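Before the projection passes, the .init_* loops above clear the active rows of rp_proj with the 0x80008000 invalid-MV pattern. A hedged C equivalent of that initialization (local names are illustrative; the field layout comes from the checkasm test further down):

    for (int y = 0; y < yend - ystart; y++)
        for (int x = xstart; x < xend; x++)
            rp_proj[y * stride + x].mv.n = 0x80008000u; /* INVALID_MV sentinel */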
INIT_YMM avx2
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
5
third_party/dav1d/src/x86/refmvs.h
vendored
@ -28,6 +28,8 @@
#include "src/cpu.h"
#include "src/refmvs.h"

decl_load_tmvs_fn(dav1d_load_tmvs_sse4);

decl_save_tmvs_fn(dav1d_save_tmvs_ssse3);
decl_save_tmvs_fn(dav1d_save_tmvs_avx2);
decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl);
@ -47,7 +49,10 @@ static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {

    c->save_tmvs = dav1d_save_tmvs_ssse3;

    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if ARCH_X86_64
    c->load_tmvs = dav1d_load_tmvs_sse4;

    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

    c->save_tmvs = dav1d_save_tmvs_avx2;
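Read as a whole, the init function is a tiered ladder where each missing CPU flag returns early; a condensed sketch under the assumption that the SSSE3 gate sits just above this hunk (the flag names are real dav1d CPU flags):

    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;  /* keep C fallbacks */
    c->save_tmvs = dav1d_save_tmvs_ssse3;
    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if ARCH_X86_64
    c->load_tmvs = dav1d_load_tmvs_sse4;              /* 64-bit only */
    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
    c->save_tmvs = dav1d_save_tmvs_avx2;
#endif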
185
third_party/dav1d/tests/checkasm/refmvs.c
vendored
@ -39,6 +39,190 @@ static inline int gen_mv(const int total_bits, int spel_bits) {
    return rnd() & 1 ? -bits : bits;
}

#define ARRAY_SIZE(n) (sizeof(n)/sizeof(*(n)))

static inline int get_min_mv_val(const int idx) {
    if (idx <= 9) return idx;
    else if (idx <= 18) return (idx - 9) * 10;
    else if (idx <= 27) return (idx - 18) * 100;
    else if (idx <= 36) return (idx - 27) * 1000;
    else return (idx - 36) * 10000;
}
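A worked mapping for the bucket function above: indices 0-9 return themselves, so get_min_mv_val(12) = (12 - 9) * 10 = 30 and get_min_mv_val(13) = 40, which gives the 30-39 magnitude bucket its lower and upper bounds.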
static inline void gen_tmv(refmvs_temporal_block *const rb, const int *ref2ref) {
    rb->ref = rnd() % 7;
    if (!rb->ref) return;
    static const int x_prob[] = {
        26447556, 6800591, 3708783, 2198592, 1635940, 1145901, 1052602, 1261759,
        1099739, 755108, 6075404, 4355916, 3254908, 2897157, 2273676, 2154432,
        1937436, 1694818, 1466863, 10203087, 5241546, 3328819, 2187483, 1458997,
        1030842, 806863, 587219, 525024, 1858953, 422368, 114626, 16992
    };
    static const int y_prob[] = {
        33845001, 7591218, 6425971, 4115838, 4032161, 2515962, 2614601, 2343656,
        2898897, 1397254, 10125350, 5124449, 3232914, 2185499, 1608775, 1342585,
        980208, 795714, 649665, 3369250, 1298716, 486002, 279588, 235990,
        110318, 89372, 66895, 46980, 153322, 32960, 4500, 389
    };
    const int prob = rnd() % 100000000;
    int acc = 0;
    for (unsigned i = 0; i < ARRAY_SIZE(x_prob); i++) {
        acc += x_prob[i];
        if (prob < acc) {
            const int min = get_min_mv_val(i);
            const int max = get_min_mv_val(i + 1);
            const int val = min + rnd() % (max - min);
            rb->mv.x = iclip(val * ref2ref[rb->ref], -(1 << 15), (1 << 15) - 1);
            break;
        }
    }
    acc = 0;
    for (unsigned i = 0; i < ARRAY_SIZE(y_prob); i++) {
        acc += y_prob[i];
        if (prob < acc) {
            const int min = get_min_mv_val(i);
            const int max = get_min_mv_val(i + 1);
            const int val = min + rnd() % (max - min);
            rb->mv.y = iclip(val * ref2ref[rb->ref], -(1 << 15), (1 << 15) - 1);
            break;
        }
    }
}
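The two loops above are cumulative-weight draws over empirical histograms. A minimal generic sketch of the same technique (pick_bucket() is a hypothetical helper; rnd() is checkasm's PRNG):

    static int pick_bucket(const int *const weights, const int n, const int total) {
        const int p = rnd() % total;   /* uniform draw in [0, total) */
        int acc = 0;
        for (int i = 0; i < n; i++)
            if (p < (acc += weights[i])) return i;
        return n - 1;                  /* only reached if the weights sum to less than total */
    }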
static inline int get_ref2cur(void) {
    const int prob = rnd() % 100;
    static const uint8_t ref2cur[11] = { 35, 55, 67, 73, 78, 83, 84, 87, 90, 93, 100 };
    for (int i = 0; i < 11; i++)
        if (prob < ref2cur[i])
            return rnd() & 1 ? -(i + 1) : i + 1;
    return 0;
}
static inline int get_seqlen(void) {
    int len = 0, max_len;
    const int prob = rnd() % 100000;
    // =1 =2  =3  =4 <8  =8 <16 =16  <32 =32 <48 =48  <64  =64  >64 eq240
    //  5 17 1.5  16  5  10   5   7    4   3 1.5   2    1    2   20    15 chimera blocks
    // 25 38 2.5  19 3.5 5.5  2 1.87 .86  .4 .18  .2 .067 .165 .478  .28 chimera sequences

    if (prob < 25000) len = 1;            // =1    5%
    else if (prob < 63000) len = 2;       // =2   17%
    else if (prob < 65500) len = 3;       // =3  1.5%
    else if (prob < 84500) len = 4;       // =4   16%
    else if (prob < 88000) max_len = 7;   // <8    5% (43.5% tot <8)
    else if (prob < 93500) len = 8;       // =8   10%
    else if (prob < 95500) max_len = 15;  // <16   5%
    else if (prob < 97370) len = 16;      // =16   7%
    else if (prob < 98230) max_len = 31;  // <32   4%
    else if (prob < 98630) len = 32;      // =32   3%
    else if (prob < 98810) max_len = 47;  // <48 1.5%
    else if (prob < 99010) len = 48;      // =48   2%
    else if (prob < 99077) max_len = 63;  // <64   1%
    else if (prob < 99242) len = 64;      // =64   2%
    else if (prob < 99720) max_len = 239; // <240  5%
    else len = 240;                       // =240 15%

    if (!len) len = 1 + rnd() % max_len;
    return len;
}
static inline void init_rp_ref(refmvs_frame const *const rf,
                               const int col_start8, const int col_end8,
                               const int row_start8, const int row_end8)
{
    const int col_start8i = imax(col_start8 - 8, 0);
    const int col_end8i = imin(col_end8 + 8, rf->iw8);
    for (int n = 0; n < rf->n_mfmvs; n++) {
        refmvs_temporal_block *rp_ref = rf->rp_ref[rf->mfmv_ref[n]];
        for (int i = row_start8; i < imin(row_end8, rf->ih8); i++) {
            for (int j = col_start8i; j < col_end8i;) {
                refmvs_temporal_block rb;
                gen_tmv(&rb, rf->mfmv_ref2ref[n]);
                for (int k = get_seqlen(); k && j < col_end8i; k--, j++)
                    rp_ref[i * rf->iw8 + j] = rb;
            }
        }
    }
}
static void check_load_tmvs(const Dav1dRefmvsDSPContext *const c) {
    refmvs_temporal_block *rp_ref[7] = {0};
    refmvs_temporal_block c_rp_proj[240 * 63];
    refmvs_temporal_block a_rp_proj[240 * 63];
    refmvs_frame rf = {
        .rp_ref = rp_ref,
        .rp_stride = 240, .iw8 = 240, .ih8 = 63,
        .n_mfmvs = 3
    };
    const size_t rp_ref_sz = rf.ih8 * rf.rp_stride * sizeof(refmvs_temporal_block);

    declare_func(void, const refmvs_frame *rf, int tile_row_idx,
                 int col_start8, int col_end8, int row_start8, int row_end8);

    if (check_func(c->load_tmvs, "load_tmvs")) {
        const int row_start8 = (rnd() & 3) << 4;
        const int row_end8 = row_start8 + 16;
        const int col_start8 = rnd() & 31;
        const int col_end8 = rf.iw8 - (rnd() & 31);

        for (int n = 0; n < rf.n_mfmvs; n++) {
            rf.mfmv_ref[n] = rnd() % 7;
            rf.mfmv_ref2cur[n] = get_ref2cur();
            for (int r = 0; r < 7; r++)
                rf.mfmv_ref2ref[n][r] = rnd() & 31;
        }
        for (int n = 0; n < rf.n_mfmvs; n++) {
            refmvs_temporal_block **p_rp_ref = &rp_ref[rf.mfmv_ref[n]];
            if (!*p_rp_ref)
                *p_rp_ref = malloc(rp_ref_sz);
        }
        init_rp_ref(&rf, 0, rf.iw8, row_start8, row_end8);
        for (int i = 0; i < rf.iw8 * rf.ih8; i++) {
            c_rp_proj[i].mv.n = a_rp_proj[i].mv.n = 0xdeadbeef;
            c_rp_proj[i].ref = a_rp_proj[i].ref = 0xdd;
        }

        rf.n_tile_threads = 1;

        rf.rp_proj = c_rp_proj;
        call_ref(&rf, 0, col_start8, col_end8, row_start8, row_end8);
        rf.rp_proj = a_rp_proj;
        call_new(&rf, 0, col_start8, col_end8, row_start8, row_end8);

        for (int i = 0; i < rf.ih8; i++)
            for (int j = 0; j < rf.iw8; j++)
                if (c_rp_proj[i * rf.iw8 + j].mv.n != a_rp_proj[i * rf.iw8 + j].mv.n ||
                    (c_rp_proj[i * rf.iw8 + j].ref != a_rp_proj[i * rf.iw8 + j].ref &&
                     c_rp_proj[i * rf.iw8 + j].mv.n != INVALID_MV))
                {
                    if (fail()) {
                        fprintf(stderr, "[%d][%d] c_rp.mv.x = 0x%x a_rp.mv.x = 0x%x\n",
                                i, j, c_rp_proj[i * rf.iw8 + j].mv.x, a_rp_proj[i * rf.iw8 + j].mv.x);
                        fprintf(stderr, "[%d][%d] c_rp.mv.y = 0x%x a_rp.mv.y = 0x%x\n",
                                i, j, c_rp_proj[i * rf.iw8 + j].mv.y, a_rp_proj[i * rf.iw8 + j].mv.y);
                        fprintf(stderr, "[%d][%d] c_rp.ref = %u a_rp.ref = %u\n",
                                i, j, c_rp_proj[i * rf.iw8 + j].ref, a_rp_proj[i * rf.iw8 + j].ref);
                    }
                }

        if (checkasm_bench_func()) {
            for (int n = 0; n < rf.n_mfmvs; n++) {
                rf.mfmv_ref2cur[n] = 1;
                for (int r = 0; r < 7; r++)
                    rf.mfmv_ref2ref[n][r] = 1;
            }
            bench_new(&rf, 0, 0, rf.iw8, row_start8, row_end8);
        }

        for (int n = 0; n < rf.n_mfmvs; n++) {
            free(rp_ref[rf.mfmv_ref[n]]);
            rp_ref[rf.mfmv_ref[n]] = NULL;
        }
    }

    report("load_tmvs");
}

static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) {
    refmvs_block *rr[31];
    refmvs_block r[31 * 256];
@ -162,6 +346,7 @@ void checkasm_check_refmvs(void) {
    Dav1dRefmvsDSPContext c;
    dav1d_refmvs_dsp_init(&c);

    check_load_tmvs(&c);
    check_save_tmvs(&c);
    check_splat_mv(&c);
}