Bug 1841624 - Update dav1d to 616bfd1506a8a75c6a358e578cbec9ca11931502 r=chunmin

Differential Revision: https://phabricator.services.mozilla.com/D182716
This commit is contained in:
Updatebot 2023-07-07 16:33:42 +00:00
parent 115775cd64
commit 1f101a78be
33 changed files with 3213 additions and 1332 deletions

View File

@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 8b419c16bf1e37bc98044089da58f06824462cb9 (2023-06-02T00:00:12.000+02:00).
release: 616bfd1506a8a75c6a358e578cbec9ca11931502 (2023-07-01T11:36:39.000+03:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 8b419c16bf1e37bc98044089da58f06824462cb9
revision: 616bfd1506a8a75c6a358e578cbec9ca11931502
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View File

@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "8b419c16bf1e37bc98044089da58f06824462cb9"
#define DAV1D_VERSION "616bfd1506a8a75c6a358e578cbec9ca11931502"

View File

@@ -27,8 +27,8 @@
#ifndef DAV1D_VERSION_H
#define DAV1D_VERSION_H
#define DAV1D_API_VERSION_MAJOR 6
#define DAV1D_API_VERSION_MINOR 9
#define DAV1D_API_VERSION_MAJOR 7
#define DAV1D_API_VERSION_MINOR 0
#define DAV1D_API_VERSION_PATCH 0
#endif /* DAV1D_VERSION_H */

View File

@@ -32,24 +32,26 @@
#include <stdlib.h>
#if defined(NDEBUG)
#define debug_abort()
#define debug_print(...) do {} while (0)
#define debug_abort() do {} while (0)
#else
#define debug_print(...) fprintf(stderr, __VA_ARGS__)
#define debug_abort abort
#endif
#define validate_input_or_ret_with_msg(x, r, ...) \
if (!(x)) { \
fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
#x, __func__); \
fprintf(stderr, __VA_ARGS__); \
debug_print("Input validation check \'%s\' failed in %s!\n", \
#x, __func__); \
debug_print(__VA_ARGS__); \
debug_abort(); \
return r; \
}
#define validate_input_or_ret(x, r) \
if (!(x)) { \
fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
#x, __func__); \
debug_print("Input validation check \'%s\' failed in %s!\n", \
#x, __func__); \
debug_abort(); \
return r; \
}
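The net effect: in NDEBUG builds a failed validation check now returns quietly instead of still printing to stderr, while debug builds keep the message and abort(). A minimal self-contained sketch of the resulting behaviour (the set_speed wrapper is illustrative, not a dav1d function):

#include <stdio.h>
#include <stdlib.h>

#if defined(NDEBUG)
#define debug_print(...) do {} while (0)
#define debug_abort()    do {} while (0)
#else
#define debug_print(...) fprintf(stderr, __VA_ARGS__)
#define debug_abort      abort
#endif

int set_speed(int speed) {
    /* Mirrors validate_input_or_ret(speed > 0, -1): a release build just
     * returns -1; a debug build prints the message and calls abort(). */
    if (!(speed > 0)) {
        debug_print("Input validation check '%s' failed in %s!\n",
                    "speed > 0", __func__);
        debug_abort();
        return -1;
    }
    return speed;
}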

View File

@@ -103,6 +103,15 @@ typedef struct Dav1dSettings {
*/
DAV1D_API const char *dav1d_version(void);
/**
* Get library API version.
*
* @return A value in the format 0x00XXYYZZ, where XX is the major version,
* YY the minor version, and ZZ the patch version.
* @see DAV1D_API_MAJOR, DAV1D_API_MINOR, DAV1D_API_PATCH
*/
DAV1D_API unsigned dav1d_version_api(void);
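A minimal usage sketch combining the new call with the DAV1D_API_MAJOR/MINOR/PATCH helpers added to version.h further down in this diff; the runtime major version is what matters for ABI compatibility, so a mismatch against the headers the caller was compiled with is worth flagging (the function name is illustrative):

#include <stdio.h>
#include <dav1d/dav1d.h>
#include <dav1d/version.h>

int dav1d_abi_matches(void) {
    const unsigned v = dav1d_version_api();   /* 0x00XXYYZZ */
    printf("runtime dav1d API %u.%u.%u, headers %d.%d.%d\n",
           DAV1D_API_MAJOR(v), DAV1D_API_MINOR(v), DAV1D_API_PATCH(v),
           DAV1D_API_VERSION_MAJOR, DAV1D_API_VERSION_MINOR,
           DAV1D_API_VERSION_PATCH);
    /* Different major versions mean the loaded library's ABI differs
     * from the headers this was built against. */
    return DAV1D_API_MAJOR(v) == (unsigned)DAV1D_API_VERSION_MAJOR;
}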
/**
* Initialize settings to default values.
*

View File

@@ -182,8 +182,8 @@ enum Dav1dChromaSamplePosition {
};
typedef struct Dav1dContentLightLevel {
int max_content_light_level;
int max_frame_average_light_level;
uint16_t max_content_light_level;
uint16_t max_frame_average_light_level;
} Dav1dContentLightLevel;
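Narrowing these fields to uint16_t matches the 16-bit max_cll/max_fall syntax elements in the AV1 metadata OBU; the values are unchanged, only the struct layout shrinks. A small consumer-side sketch, assuming a decoded Dav1dPicture pic whose content_light pointer is non-NULL, as exposed through dav1d's picture API:

#include <stdio.h>
#include <dav1d/dav1d.h>

static void print_cll(const Dav1dPicture *pic) {
    /* content_light is NULL when the stream carries no CLL metadata. */
    if (pic->content_light)
        printf("MaxCLL %u nits, MaxFALL %u nits\n",
               (unsigned)pic->content_light->max_content_light_level,
               (unsigned)pic->content_light->max_frame_average_light_level);
}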
typedef struct Dav1dMasteringDisplay {
@@ -210,7 +210,7 @@ typedef struct Dav1dSequenceHeader {
* 1 for 8-10 bits/component 4:4:4; 2 for 4:2:2 at any bits/component,
* or 12 bits/component at any chroma subsampling.
*/
int profile;
uint8_t profile;
/**
* Maximum dimensions for this stream. In non-scalable streams, these
* are often the actual dimensions of the stream, although that is not
@@ -229,60 +229,60 @@ typedef struct Dav1dSequenceHeader {
* (twelve_bit) to distinguish between 10 and 12 bits/component. To get
* the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2.
*/
int hbd;
uint8_t hbd;
/**
* Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of
* MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma).
*/
int color_range;
uint8_t color_range;
int num_operating_points;
uint8_t num_operating_points;
struct Dav1dSequenceHeaderOperatingPoint {
int major_level, minor_level;
int initial_display_delay;
int idc;
int tier;
int decoder_model_param_present;
int display_model_param_present;
uint8_t major_level, minor_level;
uint8_t initial_display_delay;
uint16_t idc;
uint8_t tier;
uint8_t decoder_model_param_present;
uint8_t display_model_param_present;
} operating_points[DAV1D_MAX_OPERATING_POINTS];
int still_picture;
int reduced_still_picture_header;
int timing_info_present;
int num_units_in_tick;
int time_scale;
int equal_picture_interval;
unsigned num_ticks_per_picture;
int decoder_model_info_present;
int encoder_decoder_buffer_delay_length;
int num_units_in_decoding_tick;
int buffer_removal_delay_length;
int frame_presentation_delay_length;
int display_model_info_present;
int width_n_bits, height_n_bits;
int frame_id_numbers_present;
int delta_frame_id_n_bits;
int frame_id_n_bits;
int sb128;
int filter_intra;
int intra_edge_filter;
int inter_intra;
int masked_compound;
int warped_motion;
int dual_filter;
int order_hint;
int jnt_comp;
int ref_frame_mvs;
uint8_t still_picture;
uint8_t reduced_still_picture_header;
uint8_t timing_info_present;
uint32_t num_units_in_tick;
uint32_t time_scale;
uint8_t equal_picture_interval;
uint32_t num_ticks_per_picture;
uint8_t decoder_model_info_present;
uint8_t encoder_decoder_buffer_delay_length;
uint32_t num_units_in_decoding_tick;
uint8_t buffer_removal_delay_length;
uint8_t frame_presentation_delay_length;
uint8_t display_model_info_present;
uint8_t width_n_bits, height_n_bits;
uint8_t frame_id_numbers_present;
uint8_t delta_frame_id_n_bits;
uint8_t frame_id_n_bits;
uint8_t sb128;
uint8_t filter_intra;
uint8_t intra_edge_filter;
uint8_t inter_intra;
uint8_t masked_compound;
uint8_t warped_motion;
uint8_t dual_filter;
uint8_t order_hint;
uint8_t jnt_comp;
uint8_t ref_frame_mvs;
enum Dav1dAdaptiveBoolean screen_content_tools;
enum Dav1dAdaptiveBoolean force_integer_mv;
int order_hint_n_bits;
int super_res;
int cdef;
int restoration;
int ss_hor, ss_ver, monochrome;
int color_description_present;
int separate_uv_delta_q;
int film_grain_present;
uint8_t order_hint_n_bits;
uint8_t super_res;
uint8_t cdef;
uint8_t restoration;
uint8_t ss_hor, ss_ver, monochrome;
uint8_t color_description_present;
uint8_t separate_uv_delta_q;
uint8_t film_grain_present;
// Dav1dSequenceHeaders of the same sequence are required to be
// bit-identical until this offset. See 7.5 "Ordering of OBUs":
@@ -291,29 +291,29 @@ typedef struct Dav1dSequenceHeader {
// sequence header appears except for the contents of
// operating_parameters_info.
struct Dav1dSequenceHeaderOperatingParameterInfo {
int decoder_buffer_delay;
int encoder_buffer_delay;
int low_delay_mode;
uint32_t decoder_buffer_delay;
uint32_t encoder_buffer_delay;
uint8_t low_delay_mode;
} operating_parameter_info[DAV1D_MAX_OPERATING_POINTS];
} Dav1dSequenceHeader;
typedef struct Dav1dSegmentationData {
int delta_q;
int delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
int ref;
int skip;
int globalmv;
int16_t delta_q;
int8_t delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
int8_t ref;
uint8_t skip;
uint8_t globalmv;
} Dav1dSegmentationData;
typedef struct Dav1dSegmentationDataSet {
Dav1dSegmentationData d[DAV1D_MAX_SEGMENTS];
int preskip;
int last_active_segid;
uint8_t preskip;
int8_t last_active_segid;
} Dav1dSegmentationDataSet;
typedef struct Dav1dLoopfilterModeRefDeltas {
int mode_delta[2 /* is_zeromv */];
int ref_delta[DAV1D_TOTAL_REFS_PER_FRAME];
int8_t mode_delta[2 /* is_zeromv */];
int8_t ref_delta[DAV1D_TOTAL_REFS_PER_FRAME];
} Dav1dLoopfilterModeRefDeltas;
typedef struct Dav1dFilmGrainData {
@@ -339,100 +339,101 @@ typedef struct Dav1dFilmGrainData {
typedef struct Dav1dFrameHeader {
struct {
Dav1dFilmGrainData data;
int present, update;
uint8_t present, update;
} film_grain; ///< film grain parameters
enum Dav1dFrameType frame_type; ///< type of the picture
int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
int frame_offset; ///< frame number
int temporal_id; ///< temporal id of the frame for SVC
int spatial_id; ///< spatial id of the frame for SVC
uint8_t frame_offset; ///< frame number
uint8_t temporal_id; ///< temporal id of the frame for SVC
uint8_t spatial_id; ///< spatial id of the frame for SVC
int show_existing_frame;
int existing_frame_idx;
int frame_id;
int frame_presentation_delay;
int show_frame;
int showable_frame;
int error_resilient_mode;
int disable_cdf_update;
int allow_screen_content_tools;
int force_integer_mv;
int frame_size_override;
int primary_ref_frame;
int buffer_removal_time_present;
uint8_t show_existing_frame;
uint8_t existing_frame_idx;
uint32_t frame_id;
uint32_t frame_presentation_delay;
uint8_t show_frame;
uint8_t showable_frame;
uint8_t error_resilient_mode;
uint8_t disable_cdf_update;
uint8_t allow_screen_content_tools;
uint8_t force_integer_mv;
uint8_t frame_size_override;
uint8_t primary_ref_frame;
uint8_t buffer_removal_time_present;
struct Dav1dFrameHeaderOperatingPoint {
int buffer_removal_time;
uint32_t buffer_removal_time;
} operating_points[DAV1D_MAX_OPERATING_POINTS];
int refresh_frame_flags;
uint8_t refresh_frame_flags;
int render_width, render_height;
struct {
int width_scale_denominator;
int enabled;
uint8_t width_scale_denominator;
uint8_t enabled;
} super_res;
int have_render_size;
int allow_intrabc;
int frame_ref_short_signaling;
int refidx[DAV1D_REFS_PER_FRAME];
int hp;
uint8_t have_render_size;
uint8_t allow_intrabc;
uint8_t frame_ref_short_signaling;
int8_t refidx[DAV1D_REFS_PER_FRAME];
uint8_t hp;
enum Dav1dFilterMode subpel_filter_mode;
int switchable_motion_mode;
int use_ref_frame_mvs;
int refresh_context;
uint8_t switchable_motion_mode;
uint8_t use_ref_frame_mvs;
uint8_t refresh_context;
struct {
int uniform;
unsigned n_bytes;
int min_log2_cols, max_log2_cols, log2_cols, cols;
int min_log2_rows, max_log2_rows, log2_rows, rows;
uint8_t uniform;
uint8_t n_bytes;
uint8_t min_log2_cols, max_log2_cols, log2_cols, cols;
uint8_t min_log2_rows, max_log2_rows, log2_rows, rows;
uint16_t col_start_sb[DAV1D_MAX_TILE_COLS + 1];
uint16_t row_start_sb[DAV1D_MAX_TILE_ROWS + 1];
int update;
uint16_t update;
} tiling;
struct {
int yac;
int ydc_delta;
int udc_delta, uac_delta, vdc_delta, vac_delta;
int qm, qm_y, qm_u, qm_v;
uint8_t yac;
int8_t ydc_delta;
int8_t udc_delta, uac_delta, vdc_delta, vac_delta;
uint8_t qm, qm_y, qm_u, qm_v;
} quant;
struct {
int enabled, update_map, temporal, update_data;
uint8_t enabled, update_map, temporal, update_data;
Dav1dSegmentationDataSet seg_data;
int lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS];
uint8_t lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS];
} segmentation;
struct {
struct {
int present;
int res_log2;
uint8_t present;
uint8_t res_log2;
} q;
struct {
int present;
int res_log2;
int multi;
uint8_t present;
uint8_t res_log2;
uint8_t multi;
} lf;
} delta;
int all_lossless;
uint8_t all_lossless;
struct {
int level_y[2 /* dir */];
int level_u, level_v;
int mode_ref_delta_enabled;
int mode_ref_delta_update;
uint8_t level_y[2 /* dir */];
uint8_t level_u, level_v;
uint8_t mode_ref_delta_enabled;
uint8_t mode_ref_delta_update;
Dav1dLoopfilterModeRefDeltas mode_ref_deltas;
int sharpness;
uint8_t sharpness;
} loopfilter;
struct {
int damping;
int n_bits;
int y_strength[DAV1D_MAX_CDEF_STRENGTHS];
int uv_strength[DAV1D_MAX_CDEF_STRENGTHS];
uint8_t damping;
uint8_t n_bits;
uint8_t y_strength[DAV1D_MAX_CDEF_STRENGTHS];
uint8_t uv_strength[DAV1D_MAX_CDEF_STRENGTHS];
} cdef;
struct {
enum Dav1dRestorationType type[3 /* plane */];
int unit_size[2 /* y, uv */];
uint8_t unit_size[2 /* y, uv */];
} restoration;
enum Dav1dTxfmMode txfm_mode;
int switchable_comp_refs;
int skip_mode_allowed, skip_mode_enabled, skip_mode_refs[2];
int warp_motion;
int reduced_txtp_set;
uint8_t switchable_comp_refs;
uint8_t skip_mode_allowed, skip_mode_enabled;
int8_t skip_mode_refs[2];
uint8_t warp_motion;
uint8_t reduced_txtp_set;
Dav1dWarpedMotionParams gmv[DAV1D_REFS_PER_FRAME];
} Dav1dFrameHeader;
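All of these narrowed fields still cover their AV1 syntax ranges (refresh_frame_flags is an 8-bit mask, operating_point_idc is 12 bits, and so on), but the struct layouts change, which is why the soname is bumped to 7.0.0 later in this diff; anything that serialized these headers byte-for-byte must be rebuilt. A hedged compile-time sanity check a consumer might add, assuming C11:

#include <assert.h>
#include <dav1d/headers.h>

/* sizeof on a member through a null pointer is fine: sizeof does not
 * evaluate its operand. */
static_assert(sizeof(((Dav1dFrameHeader *)0)->refresh_frame_flags) == 1,
              "refresh_frame_flags is a uint8_t mask as of dav1d API 7.0");
static_assert(sizeof(((Dav1dSequenceHeader *)0)->operating_points[0].idc) == 2,
              "operating_point_idc (12 bits) is stored as uint16_t");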

View File

@@ -91,7 +91,7 @@ typedef struct Dav1dPicture {
*/
size_t n_itut_t35;
uintptr_t reserved[3]; ///< reserved for future use
uintptr_t reserved[4]; ///< reserved for future use
struct Dav1dRef *frame_hdr_ref; ///< Dav1dFrameHeader allocation origin
struct Dav1dRef *seq_hdr_ref; ///< Dav1dSequenceHeader allocation origin

View File

@@ -35,6 +35,14 @@ extern "C" {
#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@
/**
* Extract version components from the value returned by
* dav1d_version_api()
*/
#define DAV1D_API_MAJOR(v) (((v) >> 16) & 0xFF)
#define DAV1D_API_MINOR(v) (((v) >> 8) & 0xFF)
#define DAV1D_API_PATCH(v) (((v) >> 0) & 0xFF)
#ifdef __cplusplus
} /* extern "C" */
#endif

View File

@@ -30,7 +30,7 @@ project('dav1d', ['c'],
'b_ndebug=if-release'],
meson_version: '>= 0.49.0')
dav1d_soname_version = '6.9.0'
dav1d_soname_version = '7.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@@ -149,6 +149,10 @@ else
endif
cdata.set('HAVE_CLOCK_GETTIME', 1)
endif
if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
cdata.set('HAVE_POSIX_MEMALIGN', 1)
endif
endif
# check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
@@ -226,14 +230,6 @@ else
getopt_dependency = []
endif
if cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args)
cdata.set('HAVE_ALIGNED_MALLOC', 1)
elif cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
cdata.set('HAVE_POSIX_MEMALIGN', 1)
elif cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
cdata.set('HAVE_MEMALIGN', 1)
endif
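With this change the posix_memalign probe moves up next to the clock_gettime check, so it is detected on its own rather than only as a fallback behind _aligned_malloc in the chain removed here. A sketch of the kind of wrapper these HAVE_* config defines typically select between; the function name is illustrative, not dav1d's internal helper:

#include <stdlib.h>
#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
#include <malloc.h>
#endif

static void *alloc_aligned(size_t sz, size_t align) {
#if defined(HAVE_POSIX_MEMALIGN)
    void *ptr;
    return posix_memalign(&ptr, align, sz) ? NULL : ptr;
#elif defined(HAVE_ALIGNED_MALLOC)
    return _aligned_malloc(sz, align);      /* Windows CRT */
#elif defined(HAVE_MEMALIGN)
    return memalign(align, sz);
#else
    return malloc(sz);   /* assumes align <= malloc's natural alignment */
#endif
}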
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm') or
host_machine.cpu() == 'ppc64le')

View File

@@ -95,3 +95,209 @@ L(splat_tbl):
bgt 1b
pop {r4, pc}
endfunc
const mv_tbls, align=4
.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
.byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
.byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
.byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst
const mask_mult, align=4
.byte 1, 2, 1, 2, 0, 0, 0, 0
endconst
// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
// refmvs_block **rr, const uint8_t *ref_sign,
// int col_end8, int row_end8,
// int col_start8, int row_start8)
function save_tmvs_neon, export=1
push {r4-r11,lr}
ldrd r4, r5, [sp, #36]
ldrd r6, r7, [sp, #44]
vmov.i8 d30, #0
vld1.8 {d31}, [r3]
adr r8, L(save_tmvs_tbl)
movrel_local lr, mask_mult
movrel_local r12, mv_tbls
vld1.8 {d29}, [lr]
vext.8 d31, d30, d31, #7 // [0, ref_sign]
mov r3, #5
mul r1, r1, r3 // stride *= 5
sub r5, r5, r7 // h = row_end8 - row_start8
lsl r7, r7, #1 // row_start8 <<= 1
1:
mov r3, #5
mov r11, #12*2
and r9, r7, #30 // (y & 15) * 2
ldr r9, [r2, r9, lsl #2] // b = rr[(y & 15) * 2]
add r9, r9, #12 // &b[... + 1]
mla r10, r4, r11, r9 // end_cand_b = &b[col_end8*2 + 1]
mla r9, r6, r11, r9 // cand_b = &b[x*2 + 1]
mla r3, r6, r3, r0 // &rp[x]
push {r2,r4,r6}
2:
ldrb r11, [r9, #10] // cand_b->bs
add lr, r9, #8
vld1.8 {d0, d1}, [r9] // cand_b->mv
add r11, r8, r11, lsl #3
vld1.16 {d2[]}, [lr] // cand_b->ref
ldrh lr, [r11] // bw8
mov r2, r8
add r9, r9, lr, lsl #1 // cand_b += bw8*2
cmp r9, r10
vmov d4, d0
bge 3f
ldrb r2, [r9, #10] // cand_b->bs
add lr, r9, #8
vld1.8 {d6, d7}, [r9] // cand_b->mv
add r2, r8, r2, lsl #3
vld1.16 {d2[1]}, [lr] // cand_b->ref
ldrh lr, [r2] // bw8
add r9, r9, lr, lsl #1 // cand_b += bw8*2
vmov d5, d6
3:
vabs.s16 q2, q2 // abs(mv[].xy)
vtbl.8 d2, {d31}, d2 // ref_sign[ref]
vshr.u16 q2, q2, #12 // abs(mv[].xy) >> 12
vmull.u8 q1, d2, d29 // ref_sign[ref] * {1, 2}
vceq.i32 q2, q2, #0 // abs(mv[].xy) <= 4096
vmovn.i32 d4, q2 // abs() condition to 16 bit
vand d2, d2, d4 // h[0-3] contains conditions for mv[0-1]
vpadd.i16 d2, d2, d2 // Combine condition for [1] and [0]
vmov.u16 r4, d2[0] // Extract case for first block
vmov.u16 r6, d2[1]
ldr r11, [r11, #4] // Fetch jump table entry
ldr r2, [r2, #4]
add r4, r12, r4, lsl #4
add r6, r12, r6, lsl #4
vld1.8 {d2, d3}, [r4] // Load permutation table base on case
vld1.8 {d4, d5}, [r6]
add r11, r8, r11 // Find jump table target
add r2, r8, r2
vtbl.8 d16, {d0, d1}, d2 // Permute cand_b to output refmvs_temporal_block
vtbl.8 d17, {d0, d1}, d3
vtbl.8 d18, {d6, d7}, d4
vtbl.8 d19, {d6, d7}, d5
vmov q0, q8
// q1 follows on q0 (q8), with another 3 full repetitions of the pattern.
vext.8 q1, q8, q8, #1
vext.8 q10, q9, q9, #1
// q2 ends with 3 complete repetitions of the pattern.
vext.8 q2, q8, q1, #4
vext.8 q11, q9, q10, #4
blx r11
bge 4f // if (cand_b >= end)
vmov q0, q9
vmov q1, q10
vmov q2, q11
cmp r9, r10
blx r2
blt 2b // if (cand_b < end)
4:
pop {r2,r4,r6}
subs r5, r5, #1 // h--
add r7, r7, #2 // y += 2
add r0, r0, r1 // rp += stride
bgt 1b
pop {r4-r11,pc}
.align 2
L(save_tmvs_tbl):
.word 16 * 12
.word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 16 * 12
.word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 8 * 12
.word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 8 * 12
.word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 8 * 12
.word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 8 * 12
.word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 4 * 12
.word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 4 * 12
.word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 4 * 12
.word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 4 * 12
.word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 2 * 12
.word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 2 * 12
.word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 2 * 12
.word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 2 * 12
.word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 2 * 12
.word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 1 * 12
.word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 1 * 12
.word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 1 * 12
.word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 1 * 12
.word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 1 * 12
.word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 1 * 12
.word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
.word 1 * 12
.word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
10:
add r4, r3, #4
vst1.32 {d0[0]}, [r3]
vst1.8 {d0[4]}, [r4]
add r3, r3, #5
bx lr
20:
add r4, r3, #8
vst1.8 {d0}, [r3]
vst1.16 {d1[0]}, [r4]
add r3, r3, #2*5
bx lr
40:
add r4, r3, #16
vst1.8 {q0}, [r3]
vst1.32 {d2[0]}, [r4]
add r3, r3, #4*5
bx lr
80:
add r4, r3, #(8*5-16)
// This writes 6 full entries plus 2 extra bytes
vst1.8 {q0, q1}, [r3]
// Write the last few, overlapping with the first write.
vst1.8 {q2}, [r4]
add r3, r3, #8*5
bx lr
160:
add r4, r3, #6*5
add r6, r3, #12*5
// This writes 6 full entries plus 2 extra bytes
vst1.8 {q0, q1}, [r3]
// Write another 6 full entries, slightly overlapping with the first set
vst1.8 {q0, q1}, [r4]
add r4, r3, #(16*5-16)
// Write 8 bytes (one full entry) after the first 12
vst1.8 {d0}, [r6]
// Write the last 3 entries
vst1.8 {q2}, [r4]
add r3, r3, #16*5
bx lr
endfunc
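A simplified scalar sketch of the store pattern this routine vectorizes, as read from the comments above: entries are 5-byte refmvs_temporal_block values (hence stride *= 5), and a candidate motion vector is kept only when its reference is flagged in ref_sign and both components pass the abs(mv) >> 12 == 0 test. The struct layouts below are illustrative stand-ins, not dav1d's real refmvs types:

#include <stdint.h>
#include <stdlib.h>

typedef struct { int16_t y, x; } mv_t;           /* stand-in */
typedef struct { mv_t mv; int8_t ref; } tmv_t;   /* 5 bytes when packed */

/* ref == 0 means "no usable reference" (the table lookup above prepends a
 * zero to ref_sign for exactly this case); otherwise require the reference
 * to be signalled and the motion vector to fit in 12 bits per component. */
static int mv_usable(mv_t mv, int ref, const uint8_t *ref_sign) {
    return ref > 0 && ref_sign[ref - 1] &&
           ((abs(mv.x) | abs(mv.y)) < 4096);
}

/* Each kept candidate is replicated across bw8 consecutive 8x8 units,
 * matching the bw8 value fetched from the jump table above. */
static void store_candidate(tmv_t *rp, int x, int bw8, mv_t mv, int ref) {
    for (int n = 0; n < bw8; n++)
        rp[x + n] = (tmv_t){ .mv = mv, .ref = (int8_t)ref };
}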

View File

@@ -965,371 +965,338 @@ function wiener_filter5_hv_8bpc_neon
ret
endfunc
#define SUM_STRIDE (384+16)
#include "looprestoration_tmpl.S"
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
add w5, w5, #2 // w += 2
// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
function sgr_box3_row_h_8bpc_neon, export=1
add w4, w4, #2 // w += 2
// Set up pointers for reading/writing alternate rows
add x10, x0, #(4*SUM_STRIDE) // sumsq
add x11, x1, #(2*SUM_STRIDE) // sum
add x12, x3, x4 // src
lsl x4, x4, #1
mov x9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
add w13, w5, #7
bic w13, w13, #7
sub x9, x9, w13, uxtw #1
// Store the width for the vertical loop
mov w8, w5
// Subtract the number of pixels read from the input from the stride
add w13, w13, #8
sub x4, x4, w13, uxtw
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 2f
// LR_HAVE_LEFT
tst w5, #1 // LR_HAVE_LEFT
b.eq 1f
cbnz x2, 0f
// left == NULL
// LR_HAVE_LEFT && left == NULL
sub x3, x3, #2
sub x12, x12, #2
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 2 pixels from the src pointer,
// but shift it as if we had done that.
add x4, x4, #2
ld1 {v0.16b}, [x3], #16
b 2f
1: // Loop vertically
ld1 {v0.16b}, [x3], #16
ld1 {v4.16b}, [x12], #16
tst w7, #1 // LR_HAVE_LEFT
b.eq 0f
cbz x2, 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v1.s}[3], [x2], #4
// Move x3/x12 back to account for the last 2 bytes we loaded earlier,
ld1 {v0.16b}, [x3], #16
ld1 {v1.s}[3], [x2]
// Move x3 back to account for the last 2 bytes we loaded earlier,
// which we'll shift out.
sub x3, x3, #2
sub x12, x12, #2
ld1 {v5.s}[3], [x2], #4
ext v0.16b, v1.16b, v0.16b, #14
ext v4.16b, v5.16b, v4.16b, #14
b 2f
0:
1:
ld1 {v0.16b}, [x3], #16
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 2x the first byte at the front.
dup v1.16b, v0.b[0]
dup v5.16b, v4.b[0]
// Move x3 back to account for the last 2 bytes we loaded before,
// which we shifted out.
sub x3, x3, #2
sub x12, x12, #2
ext v0.16b, v1.16b, v0.16b, #14
ext v4.16b, v5.16b, v4.16b, #14
2:
umull v1.8h, v0.8b, v0.8b
umull2 v2.8h, v0.16b, v0.16b
umull v5.8h, v4.8b, v4.8b
umull2 v6.8h, v4.16b, v4.16b
tst w7, #2 // LR_HAVE_RIGHT
tst w5, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub w13, w5, #(2 + 16 - 2 + 1)
sub w13, w4, #(2 + 16 - 2 + 1)
ldr b30, [x3, w13, sxtw]
ldr b31, [x12, w13, sxtw]
// Fill v30/v31 with the right padding pixel
// Fill v30 with the right padding pixel
dup v30.16b, v30.b[0]
dup v31.16b, v31.b[0]
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w5, #10
cmp w4, #10
b.ge 4f // If w >= 10, all used input pixels are valid
// 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.
// Insert padding in v0/4.b[w] onwards
// Insert padding in v0.b[w] onwards
movrel x13, right_ext_mask
sub x13, x13, w5, uxtw
sub x13, x13, w4, uxtw
ld1 {v29.16b}, [x13]
bit v0.16b, v30.16b, v29.16b
bit v4.16b, v31.16b, v29.16b
// Update the precalculated squares
umull v1.8h, v0.8b, v0.8b
umull2 v2.8h, v0.16b, v0.16b
umull v5.8h, v4.8b, v4.8b
umull2 v6.8h, v4.16b, v4.16b
4: // Loop horizontally
ext v16.16b, v0.16b, v0.16b, #1
ext v17.16b, v0.16b, v0.16b, #2
ext v18.16b, v4.16b, v4.16b, #1
ext v19.16b, v4.16b, v4.16b, #2
uaddl v3.8h, v0.8b, v16.8b
uaddw v3.8h, v3.8h, v17.8b
uaddl v7.8h, v4.8b, v18.8b
uaddw v7.8h, v7.8h, v19.8b
ext v20.16b, v1.16b, v2.16b, #2
uaddw v3.8h, v3.8h, v17.8b
ext v21.16b, v1.16b, v2.16b, #4
ext v22.16b, v5.16b, v6.16b, #2
ext v23.16b, v5.16b, v6.16b, #4
uaddl v26.4s, v1.4h, v20.4h
uaddl2 v27.4s, v1.8h, v20.8h
uaddw v26.4s, v26.4s, v21.4h
uaddw2 v27.4s, v27.4s, v21.8h
uaddl v28.4s, v5.4h, v22.4h
uaddl2 v29.4s, v5.8h, v22.8h
uaddw v28.4s, v28.4s, v23.4h
uaddw2 v29.4s, v29.4s, v23.8h
subs w5, w5, #8
subs w4, w4, #8
st1 {v3.8h}, [x1], #16
st1 {v7.8h}, [x11], #16
st1 {v26.4s,v27.4s}, [x0], #32
st1 {v28.4s,v29.4s}, [x10], #32
b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
tst w5, #2 // LR_HAVE_RIGHT
ld1 {v3.8b}, [x3], #8
ld1 {v7.8b}, [x12], #8
mov v1.16b, v2.16b
mov v5.16b, v6.16b
ext v0.16b, v0.16b, v3.16b, #8
ext v4.16b, v4.16b, v7.16b, #8
umull v2.8h, v3.8b, v3.8b
umull v6.8h, v7.8b, v7.8b
b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
subs w6, w6, #2
b.le 0f
// Jump to the next row and loop horizontally
add x0, x0, x9, lsl #1
add x10, x10, x9, lsl #1
add x1, x1, x9
add x11, x11, x9
add x3, x3, x4
add x12, x12, x4
mov w5, w8
b 1b
0:
ret
endfunc
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
add w5, w5, #2 // w += 2
// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
function sgr_box5_row_h_8bpc_neon, export=1
add w4, w4, #2 // w += 2
// Set up pointers for reading/writing alternate rows
add x10, x0, #(4*SUM_STRIDE) // sumsq
add x11, x1, #(2*SUM_STRIDE) // sum
add x12, x3, x4 // src
lsl x4, x4, #1
mov x9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
add w13, w5, #7
bic w13, w13, #7
sub x9, x9, w13, uxtw #1
add w13, w13, #8
sub x4, x4, w13, uxtw
// Store the width for the vertical loop
mov w8, w5
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 2f
// LR_HAVE_LEFT
tst w5, #1 // LR_HAVE_LEFT
b.eq 1f
cbnz x2, 0f
// left == NULL
// LR_HAVE_LEFT && left == NULL
sub x3, x3, #3
sub x12, x12, #3
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 3 pixels from the src pointer,
// but shift it as if we had done that.
add x4, x4, #3
ld1 {v0.16b}, [x3], #16
b 2f
1: // Loop vertically
ld1 {v0.16b}, [x3], #16
ld1 {v4.16b}, [x12], #16
tst w7, #1 // LR_HAVE_LEFT
b.eq 0f
cbz x2, 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v1.s}[3], [x2], #4
// Move x3/x12 back to account for the last 3 bytes we loaded earlier,
ld1 {v0.16b}, [x3], #16
ld1 {v1.s}[3], [x2], #4
// Move x3 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub x3, x3, #3
sub x12, x12, #3
ld1 {v5.s}[3], [x2], #4
ext v0.16b, v1.16b, v0.16b, #13
ext v4.16b, v5.16b, v4.16b, #13
b 2f
0:
1:
ld1 {v0.16b}, [x3], #16
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 3x the first byte at the front.
dup v1.16b, v0.b[0]
dup v5.16b, v4.b[0]
// Move x3 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub x3, x3, #3
sub x12, x12, #3
ext v0.16b, v1.16b, v0.16b, #13
ext v4.16b, v5.16b, v4.16b, #13
2:
umull v1.8h, v0.8b, v0.8b
umull2 v2.8h, v0.16b, v0.16b
umull v5.8h, v4.8b, v4.8b
umull2 v6.8h, v4.16b, v4.16b
tst w7, #2 // LR_HAVE_RIGHT
tst w5, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub w13, w5, #(2 + 16 - 3 + 1)
sub w13, w4, #(2 + 16 - 3 + 1)
ldr b30, [x3, w13, sxtw]
ldr b31, [x12, w13, sxtw]
// Fill v30/v31 with the right padding pixel
// Fill v30 with the right padding pixel
dup v30.16b, v30.b[0]
dup v31.16b, v31.b[0]
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w5, #11
cmp w4, #11
b.ge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in v0/4.b[w+1] onwards; fuse the +1 into the
// Insert padding in v0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel x13, right_ext_mask, -1
sub x13, x13, w5, uxtw
sub x13, x13, w4, uxtw
ld1 {v29.16b}, [x13]
bit v0.16b, v30.16b, v29.16b
bit v4.16b, v31.16b, v29.16b
// Update the precalculated squares
umull v1.8h, v0.8b, v0.8b
umull2 v2.8h, v0.16b, v0.16b
umull v5.8h, v4.8b, v4.8b
umull2 v6.8h, v4.16b, v4.16b
4: // Loop horizontally
ext v16.16b, v0.16b, v0.16b, #1
ext v17.16b, v0.16b, v0.16b, #2
ext v18.16b, v0.16b, v0.16b, #3
ext v19.16b, v0.16b, v0.16b, #4
ext v20.16b, v4.16b, v4.16b, #1
ext v21.16b, v4.16b, v4.16b, #2
ext v22.16b, v4.16b, v4.16b, #3
ext v23.16b, v4.16b, v4.16b, #4
uaddl v3.8h, v0.8b, v16.8b
uaddl v24.8h, v17.8b, v18.8b
uaddl v7.8h, v4.8b, v20.8b
uaddw v3.8h, v3.8h, v19.8b
uaddl v25.8h, v21.8b, v22.8b
uaddw v7.8h, v7.8h, v23.8b
add v3.8h, v3.8h, v24.8h
add v7.8h, v7.8h, v25.8h
ext v16.16b, v1.16b, v2.16b, #2
ext v17.16b, v1.16b, v2.16b, #4
ext v18.16b, v1.16b, v2.16b, #6
ext v19.16b, v1.16b, v2.16b, #8
ext v20.16b, v5.16b, v6.16b, #2
ext v21.16b, v5.16b, v6.16b, #4
ext v22.16b, v5.16b, v6.16b, #6
ext v23.16b, v5.16b, v6.16b, #8
uaddl v26.4s, v1.4h, v16.4h
uaddl2 v27.4s, v1.8h, v16.8h
uaddl v16.4s, v17.4h, v18.4h
uaddl2 v17.4s, v17.8h, v18.8h
uaddl v28.4s, v5.4h, v20.4h
uaddl2 v29.4s, v5.8h, v20.8h
uaddw v26.4s, v26.4s, v19.4h
uaddw2 v27.4s, v27.4s, v19.8h
uaddl v20.4s, v21.4h, v22.4h
uaddl2 v21.4s, v21.8h, v22.8h
uaddw v28.4s, v28.4s, v23.4h
uaddw2 v29.4s, v29.4s, v23.8h
add v26.4s, v26.4s, v16.4s
add v27.4s, v27.4s, v17.4s
add v28.4s, v28.4s, v20.4s
add v29.4s, v29.4s, v21.4s
subs w5, w5, #8
subs w4, w4, #8
st1 {v3.8h}, [x1], #16
st1 {v7.8h}, [x11], #16
st1 {v26.4s,v27.4s}, [x0], #32
st1 {v28.4s,v29.4s}, [x10], #32
b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
tst w5, #2 // LR_HAVE_RIGHT
ld1 {v3.8b}, [x3], #8
ld1 {v7.8b}, [x12], #8
mov v1.16b, v2.16b
mov v5.16b, v6.16b
ext v0.16b, v0.16b, v3.16b, #8
ext v4.16b, v4.16b, v7.16b, #8
umull v2.8h, v3.8b, v3.8b
umull v6.8h, v7.8b, v7.8b
b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
subs w6, w6, #2
b.le 0f
// Jump to the next row and loop horizontally
add x0, x0, x9, lsl #1
add x10, x10, x9, lsl #1
add x1, x1, x9
add x11, x11, x9
add x3, x3, x4
add x12, x12, x4
mov w5, w8
b 1b
ret
endfunc
// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
// int32_t *sumsq5, int16_t *sum5,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
function sgr_box35_row_h_8bpc_neon, export=1
add w6, w6, #2 // w += 2
tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
cbnz x4, 0f
// LR_HAVE_LEFT && left == NULL
sub x5, x5, #3
ld1 {v0.16b}, [x5], #16
b 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v0.16b}, [x5], #16
ld1 {v1.s}[3], [x4], #4
// Move x5 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub x5, x5, #3
ext v0.16b, v1.16b, v0.16b, #13
b 2f
1:
ld1 {v0.16b}, [x5], #16
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 3x the first byte at the front.
dup v1.16b, v0.b[0]
// Move x5 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub x5, x5, #3
ext v0.16b, v1.16b, v0.16b, #13
2:
umull v1.8h, v0.8b, v0.8b
umull2 v2.8h, v0.16b, v0.16b
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub w13, w6, #(2 + 16 - 3 + 1)
ldr b30, [x5, w13, sxtw]
// Fill v30 with the right padding pixel
dup v30.16b, v30.b[0]
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w6, #11
b.ge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in v0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel x13, right_ext_mask, -1
sub x13, x13, w6, uxtw
ld1 {v29.16b}, [x13]
bit v0.16b, v30.16b, v29.16b
// Update the precalculated squares
umull v1.8h, v0.8b, v0.8b
umull2 v2.8h, v0.16b, v0.16b
4: // Loop horizontally
ext v16.16b, v0.16b, v0.16b, #1
ext v17.16b, v0.16b, v0.16b, #2
ext v19.16b, v0.16b, v0.16b, #4
ext v18.16b, v0.16b, v0.16b, #3
uaddl v3.8h, v16.8b, v17.8b
uaddl v24.8h, v0.8b, v19.8b
uaddw v3.8h, v3.8h, v18.8b
ext v16.16b, v1.16b, v2.16b, #2
ext v17.16b, v1.16b, v2.16b, #4
ext v19.16b, v1.16b, v2.16b, #8
ext v18.16b, v1.16b, v2.16b, #6
st1 {v3.8h}, [x1], #16
add v3.8h, v3.8h, v24.8h
uaddl v26.4s, v16.4h, v17.4h
uaddl2 v27.4s, v16.8h, v17.8h
uaddl v16.4s, v1.4h, v19.4h
uaddl2 v17.4s, v1.8h, v19.8h
uaddw v26.4s, v26.4s, v18.4h
uaddw2 v27.4s, v27.4s, v18.8h
st1 {v26.4s,v27.4s}, [x0], #32
add v26.4s, v26.4s, v16.4s
add v27.4s, v27.4s, v17.4s
subs w6, w6, #8
st1 {v3.8h}, [x3], #16
st1 {v26.4s,v27.4s}, [x2], #32
b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
ld1 {v3.8b}, [x5], #8
mov v1.16b, v2.16b
ext v0.16b, v0.16b, v3.16b, #8
umull v2.8h, v3.8b, v3.8b
b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
ret
endfunc
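These rewritten horizontal helpers now emit a single row per call (the old *_h functions processed two interleaved rows), and the new box35 variant produces both the 3-wide and 5-wide sums in one pass over the source row. A scalar sketch of what one output position holds, ignoring the left/right edge padding the assembly handles; the names are illustrative:

#include <stdint.h>

/* src must provide w + 4 readable pixels; the 3-tap window covers offsets
 * 1..3 and the 5-tap window offsets 0..4, matching the ext #1..#4 shifts
 * in the NEON loop above. */
static void box35_row_h_scalar(int32_t *sumsq3, int16_t *sum3,
                               int32_t *sumsq5, int16_t *sum5,
                               const uint8_t *src, int w) {
    for (int i = 0; i < w; i++) {
        int s3 = 0, q3 = 0, s5 = 0, q5 = 0;
        for (int j = 0; j < 5; j++) {
            const int p = src[i + j];
            s5 += p;
            q5 += p * p;
            if (j >= 1 && j <= 3) {
                s3 += p;
                q3 += p * p;
            }
        }
        sum3[i] = (int16_t)s3; sumsq3[i] = q3;
        sum5[i] = (int16_t)s5; sumsq5[i] = q5;
    }
}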

View File

@@ -1070,349 +1070,318 @@ function wiener_filter5_hv_16bpc_neon
ret
endfunc
#define SUM_STRIDE (384+16)
#include "looprestoration_tmpl.S"
// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_16bpc_neon, export=1
add w5, w5, #2 // w += 2
// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
function sgr_box3_row_h_16bpc_neon, export=1
add w4, w4, #2 // w += 2
// Set up pointers for reading/writing alternate rows
add x10, x0, #(4*SUM_STRIDE) // sumsq
add x11, x1, #(2*SUM_STRIDE) // sum
add x12, x3, x4 // src
lsl x4, x4, #1
mov x9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
add w13, w5, #7
bic w13, w13, #7
sub x9, x9, w13, uxtw #1
// Store the width for the vertical loop
mov w8, w5
// Subtract the number of pixels read from the input from the stride
add w13, w13, #8
sub x4, x4, w13, uxtw #1
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 2f
// LR_HAVE_LEFT
tst w5, #1 // LR_HAVE_LEFT
b.eq 1f
cbnz x2, 0f
// left == NULL
// LR_HAVE_LEFT && left == NULL
sub x3, x3, #4
sub x12, x12, #4
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 2 pixels from the src pointer,
// but shift it as if we had done that.
add x4, x4, #4
ld1 {v0.8h, v1.8h}, [x3], #32
b 2f
1: // Loop vertically
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v16.8h, v17.8h}, [x12], #32
tst w7, #1 // LR_HAVE_LEFT
b.eq 0f
cbz x2, 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v2.d}[1], [x2], #8
// Move x3/x12 back to account for the last 2 pixels we loaded earlier,
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v2.d}[1], [x2]
// Move x3 back to account for the last 2 pixels we loaded earlier,
// which we'll shift out.
sub x3, x3, #4
sub x12, x12, #4
ld1 {v18.d}[1], [x2], #8
ext v1.16b, v0.16b, v1.16b, #12
ext v0.16b, v2.16b, v0.16b, #12
ext v17.16b, v16.16b, v17.16b, #12
ext v16.16b, v18.16b, v16.16b, #12
ext v1.16b, v0.16b, v1.16b, #12
ext v0.16b, v2.16b, v0.16b, #12
b 2f
0:
1:
ld1 {v0.8h, v1.8h}, [x3], #32
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
// and shift v0/v1 to have 2x the first pixel at the front.
dup v2.8h, v0.h[0]
dup v18.8h, v16.h[0]
dup v2.8h, v0.h[0]
// Move x3 back to account for the last 2 pixels we loaded before,
// which we shifted out.
sub x3, x3, #4
sub x12, x12, #4
ext v1.16b, v0.16b, v1.16b, #12
ext v0.16b, v2.16b, v0.16b, #12
ext v17.16b, v16.16b, v17.16b, #12
ext v16.16b, v18.16b, v16.16b, #12
ext v1.16b, v0.16b, v1.16b, #12
ext v0.16b, v2.16b, v0.16b, #12
2:
tst w7, #2 // LR_HAVE_RIGHT
tst w5, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w13, w5, #(2 + 16 - 2 + 1)
sub w13, w4, #(2 + 16 - 2 + 1)
ldr h30, [x3, w13, sxtw #1]
ldr h31, [x12, w13, sxtw #1]
// Fill v30/v31 with the right padding pixel
// Fill v30 with the right padding pixel
dup v30.8h, v30.h[0]
dup v31.8h, v31.h[0]
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w5, #10
cmp w4, #10
b.ge 4f // If w >= 10, all used input pixels are valid
// 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called
// 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.
// Insert padding in v0/1.h[w] onwards
// Insert padding in v0.b[w] onwards
movrel x13, right_ext_mask
sub x13, x13, w5, uxtw #1
sub x13, x13, w4, uxtw #1
ld1 {v28.16b, v29.16b}, [x13]
bit v0.16b, v30.16b, v28.16b
bit v1.16b, v30.16b, v29.16b
bit v16.16b, v31.16b, v28.16b
bit v17.16b, v31.16b, v29.16b
4: // Loop horizontally
ext v26.16b, v0.16b, v1.16b, #2
ext v28.16b, v16.16b, v17.16b, #2
ext v27.16b, v0.16b, v1.16b, #4
ext v29.16b, v16.16b, v17.16b, #4
add v6.8h, v0.8h, v26.8h
umull v22.4s, v0.4h, v0.4h
umlal v22.4s, v26.4h, v26.4h
umlal v22.4s, v27.4h, v27.4h
add v7.8h, v16.8h, v28.8h
umull v24.4s, v16.4h, v16.4h
umlal v24.4s, v28.4h, v28.4h
umlal v24.4s, v29.4h, v29.4h
add v6.8h, v6.8h, v27.8h
umull2 v23.4s, v0.8h, v0.8h
umlal2 v23.4s, v26.8h, v26.8h
umlal2 v23.4s, v27.8h, v27.8h
add v7.8h, v7.8h, v29.8h
umull2 v25.4s, v16.8h, v16.8h
umlal2 v25.4s, v28.8h, v28.8h
umlal2 v25.4s, v29.8h, v29.8h
subs w5, w5, #8
subs w4, w4, #8
st1 {v6.8h}, [x1], #16
st1 {v7.8h}, [x11], #16
st1 {v22.4s,v23.4s}, [x0], #32
st1 {v24.4s,v25.4s}, [x10], #32
b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
tst w5, #2 // LR_HAVE_RIGHT
mov v0.16b, v1.16b
mov v16.16b, v17.16b
ld1 {v1.8h}, [x3], #16
ld1 {v17.8h}, [x12], #16
b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
subs w6, w6, #2
b.le 0f
// Jump to the next row and loop horizontally
add x0, x0, x9, lsl #1
add x10, x10, x9, lsl #1
add x1, x1, x9
add x11, x11, x9
add x3, x3, x4
add x12, x12, x4
mov w5, w8
b 1b
0:
ret
endfunc
// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_16bpc_neon, export=1
add w5, w5, #2 // w += 2
// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
function sgr_box5_row_h_16bpc_neon, export=1
add w4, w4, #2 // w += 2
// Set up pointers for reading/writing alternate rows
add x10, x0, #(4*SUM_STRIDE) // sumsq
add x11, x1, #(2*SUM_STRIDE) // sum
add x12, x3, x4 // src
lsl x4, x4, #1
mov x9, #(2*2*SUM_STRIDE) // double sum stride
// Subtract the aligned width from the output stride.
add w13, w5, #7
bic w13, w13, #7
sub x9, x9, w13, uxtw #1
add w13, w13, #8
sub x4, x4, w13, uxtw #1
// Store the width for the vertical loop
mov w8, w5
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 2f
// LR_HAVE_LEFT
tst w5, #1 // LR_HAVE_LEFT
b.eq 1f
cbnz x2, 0f
// left == NULL
// LR_HAVE_LEFT && left == NULL
sub x3, x3, #6
sub x12, x12, #6
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 3 pixels from the src pointer,
// but shift it as if we had done that.
add x4, x4, #6
ld1 {v0.8h, v1.8h}, [x3], #32
b 2f
1: // Loop vertically
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v16.8h, v17.8h}, [x12], #32
tst w7, #1 // LR_HAVE_LEFT
b.eq 0f
cbz x2, 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v2.d}[1], [x2], #8
// Move x3/x12 back to account for the last 3 pixels we loaded earlier,
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v2.d}[1], [x2], #8
// Move x3 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub x3, x3, #6
sub x12, x12, #6
ld1 {v18.d}[1], [x2], #8
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10
ext v17.16b, v16.16b, v17.16b, #10
ext v16.16b, v18.16b, v16.16b, #10
b 2f
0:
1:
ld1 {v0.8h, v1.8h}, [x3], #32
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
// and shift v0/v1 to have 3x the first pixel at the front.
dup v2.8h, v0.h[0]
dup v18.8h, v16.h[0]
// Move x3 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub x3, x3, #6
sub x12, x12, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10
ext v17.16b, v16.16b, v17.16b, #10
ext v16.16b, v18.16b, v16.16b, #10
2:
tst w7, #2 // LR_HAVE_RIGHT
tst w5, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w13, w5, #(2 + 16 - 3 + 1)
sub w13, w4, #(2 + 16 - 3 + 1)
ldr h30, [x3, w13, sxtw #1]
ldr h31, [x12, w13, sxtw #1]
// Fill v30/v31 with the right padding pixel
// Fill v30 with the right padding pixel
dup v30.8h, v30.h[0]
dup v31.8h, v31.h[0]
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w5, #11
cmp w4, #11
b.ge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10,
// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the
// Insert padding in v0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel x13, right_ext_mask, -2
sub x13, x13, w5, uxtw #1
movrel x13, right_ext_mask, -1
sub x13, x13, w4, uxtw #1
ld1 {v28.16b, v29.16b}, [x13]
bit v0.16b, v30.16b, v28.16b
bit v1.16b, v30.16b, v29.16b
bit v16.16b, v31.16b, v28.16b
bit v17.16b, v31.16b, v29.16b
4: // Loop horizontally
ext v26.16b, v0.16b, v1.16b, #2
ext v28.16b, v16.16b, v17.16b, #2
ext v27.16b, v0.16b, v1.16b, #4
ext v29.16b, v16.16b, v17.16b, #4
add v6.8h, v0.8h, v26.8h
umull v22.4s, v0.4h, v0.4h
umlal v22.4s, v26.4h, v26.4h
umlal v22.4s, v27.4h, v27.4h
add v7.8h, v16.8h, v28.8h
umull v24.4s, v16.4h, v16.4h
umlal v24.4s, v28.4h, v28.4h
umlal v24.4s, v29.4h, v29.4h
add v6.8h, v6.8h, v27.8h
umull2 v23.4s, v0.8h, v0.8h
umlal2 v23.4s, v26.8h, v26.8h
umlal2 v23.4s, v27.8h, v27.8h
add v7.8h, v7.8h, v29.8h
umull2 v25.4s, v16.8h, v16.8h
umlal2 v25.4s, v28.8h, v28.8h
umlal2 v25.4s, v29.8h, v29.8h
ext v26.16b, v0.16b, v1.16b, #6
ext v28.16b, v16.16b, v17.16b, #6
ext v27.16b, v0.16b, v1.16b, #8
ext v29.16b, v16.16b, v17.16b, #8
add v6.8h, v6.8h, v26.8h
umlal v22.4s, v26.4h, v26.4h
umlal v22.4s, v27.4h, v27.4h
add v7.8h, v7.8h, v28.8h
umlal v24.4s, v28.4h, v28.4h
umlal v24.4s, v29.4h, v29.4h
add v6.8h, v6.8h, v27.8h
umlal2 v23.4s, v26.8h, v26.8h
umlal2 v23.4s, v27.8h, v27.8h
add v7.8h, v7.8h, v29.8h
umlal2 v25.4s, v28.8h, v28.8h
umlal2 v25.4s, v29.8h, v29.8h
subs w5, w5, #8
subs w4, w4, #8
st1 {v6.8h}, [x1], #16
st1 {v7.8h}, [x11], #16
st1 {v22.4s,v23.4s}, [x0], #32
st1 {v24.4s,v25.4s}, [x10], #32
b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
tst w5, #2 // LR_HAVE_RIGHT
mov v0.16b, v1.16b
mov v16.16b, v17.16b
ld1 {v1.8h}, [x3], #16
ld1 {v17.8h}, [x12], #16
ld1 {v1.8h}, [x3], #16
b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
subs w6, w6, #2
b.le 0f
// Jump to the next row and loop horizontally
add x0, x0, x9, lsl #1
add x10, x10, x9, lsl #1
add x1, x1, x9
add x11, x11, x9
add x3, x3, x4
add x12, x12, x4
mov w5, w8
b 1b
ret
endfunc
// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
// int32_t *sumsq5, int16_t *sum5,
// const pixel (*left)[4],
// const pixel *src, const int w,
// const enum LrEdgeFlags edges);
function sgr_box35_row_h_16bpc_neon, export=1
add w6, w6, #2 // w += 2
tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
cbnz x4, 0f
// LR_HAVE_LEFT && left == NULL
sub x5, x5, #6
ld1 {v0.8h, v1.8h}, [x5], #32
b 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v0.8h, v1.8h}, [x5], #32
ld1 {v2.d}[1], [x4], #8
// Move x5 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub x5, x5, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10
b 2f
1:
ld1 {v0.8h, v1.8h}, [x5], #32
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
// and shift v0/v1 to have 3x the first pixel at the front.
dup v2.8h, v0.h[0]
// Move x5 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub x5, x5, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10
2:
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w13, w6, #(2 + 16 - 3 + 1)
ldr h30, [x5, w13, sxtw #1]
// Fill v30 with the right padding pixel
dup v30.8h, v30.h[0]
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w6, #11
b.ge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in v0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel x13, right_ext_mask, -1
sub x13, x13, w6, uxtw #1
ld1 {v28.16b, v29.16b}, [x13]
bit v0.16b, v30.16b, v28.16b
bit v1.16b, v30.16b, v29.16b
4: // Loop horizontally
ext v16.16b, v0.16b, v1.16b, #2
ext v17.16b, v0.16b, v1.16b, #4
ext v19.16b, v0.16b, v1.16b, #8
ext v18.16b, v0.16b, v1.16b, #6
add v20.8h, v16.8h, v17.8h
add v21.8h, v0.8h, v19.8h
add v20.8h, v20.8h, v18.8h
umull v22.4s, v16.4h, v16.4h
umlal v22.4s, v17.4h, v17.4h
umlal v22.4s, v18.4h, v18.4h
umull2 v23.4s, v16.8h, v16.8h
umlal2 v23.4s, v17.8h, v17.8h
umlal2 v23.4s, v18.8h, v18.8h
add v21.8h, v21.8h, v20.8h
st1 {v20.8h}, [x1], #16
st1 {v22.4s,v23.4s}, [x0], #32
umlal v22.4s, v0.4h, v0.4h
umlal v22.4s, v19.4h, v19.4h
umlal2 v23.4s, v0.8h, v0.8h
umlal2 v23.4s, v19.8h, v19.8h
subs w6, w6, #8
st1 {v21.8h}, [x3], #16
st1 {v22.4s,v23.4s}, [x2], #32
b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
mov v0.16b, v1.16b
ld1 {v1.8h}, [x5], #16
b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
9:
ret
endfunc

View File

@@ -28,332 +28,29 @@
#include "src/arm/asm.S"
#include "util.S"
#define SUM_STRIDE (384+16)
// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
// int32_t *AA, int16_t *BB,
// const int w, const int s,
// const int bitdepth_max);
function sgr_box3_vert_neon, export=1
stp d8, d9, [sp, #-0x30]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #2 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
add w4, w4, #2
clz w9, w6 // bitdepth_max
dup v28.4s, w5 // strength
tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:
ldp x5, x6, [x0]
ldr x0, [x0, #16]
ldp x7, x8, [x1]
ldr x1, [x1, #16]
tst w4, #8 // LR_HAVE_BOTTOM
b.eq 1f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Sum all h+2 lines with the main loop
add w11, w11, #2
1:
mov w9, w3 // Backup of h for next loops
movi v31.4s, #9 // n
1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v21 and v24-v26 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v24.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v18.4s, v19.4s}, [x5], x7
ld1 {v25.8h}, [x6], x8
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v25.16b, v24.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v26.16b, v24.16b
3:
subs w3, w3, #1
.macro add3
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v24.8h, v24.8h, v25.8h
add v16.4s, v16.4s, v20.4s
add v17.4s, v17.4s, v21.4s
add v24.8h, v24.8h, v26.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v24.8h}, [x1], x8
.endm
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v25.16b, v26.16b
b.le 4f
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3b
4:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 5f
// !LR_HAVE_BOTTOM
// Produce two more rows, extending the already loaded rows.
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
add3
5: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b
0:
ret
.purgem add3
endfunc
// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #8 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:
tst w4, #8 // LR_HAVE_BOTTOM
b.eq 0f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Handle h+2 lines with the main loop
add w11, w11, #2
b 1f
0:
// !LR_HAVE_BOTTOM
sub w3, w3, #1 // Handle h-1 lines with the main loop
1:
mov w9, w3 // Backup of h for next loops
1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v25 and v26-v30 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v28.8h}, [x6], x8
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v28.16b, v26.16b
mov v22.16b, v16.16b
mov v23.16b, v17.16b
mov v29.16b, v26.16b
3:
cbz w3, 4f
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8
3:
// Start of vertical loop
subs w3, w3, #2
.macro add5
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v26.8h, v26.8h, v27.8h
add v0.4s, v20.4s, v22.4s
add v1.4s, v21.4s, v23.4s
add v2.8h, v28.8h, v29.8h
add v16.4s, v16.4s, v24.4s
add v17.4s, v17.4s, v25.4s
add v26.8h, v26.8h, v30.8h
add v16.4s, v16.4s, v0.4s
add v17.4s, v17.4s, v1.4s
add v26.8h, v26.8h, v2.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v26.8h}, [x1], x8
.endm
add5
.macro shift2
mov v16.16b, v20.16b
mov v17.16b, v21.16b
mov v26.16b, v28.16b
mov v18.16b, v22.16b
mov v19.16b, v23.16b
mov v27.16b, v29.16b
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v28.16b, v30.16b
.endm
shift2
add x0, x0, x7
add x1, x1, x8
b.le 5f
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8
b 3b
4:
// h == 1, !LR_HAVE_BOTTOM.
// Pad the last row with the only content row, and add.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
add5
b 6f
5:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 6f
// !LR_HAVE_BOTTOM
cbnz w3, 5f
// The intended three edge rows left; output the one at h-2 and
// the past edge one at h.
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
// Pad the past-edge row from the last content row.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
// The last two rows are already padded properly here.
add5
b 6f
5:
// w3 == -1, two rows left, output one.
// Pad the last two rows from the mid one.
mov v22.16b, v20.16b
mov v23.16b, v21.16b
mov v29.16b, v28.16b
mov v24.16b, v20.16b
mov v25.16b, v21.16b
mov v30.16b, v28.16b
add5
add x0, x0, x7
add x1, x1, x8
b 6f
6: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b
0:
ret
.purgem add5
endfunc
// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength,
// const int bitdepth_max);
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength,
// const int bitdepth_max);
function sgr_calc_ab1_neon, export=1
clz w9, w5
add x3, x3, #2 // h += 2
movi v31.4s, #9 // n
mov x5, #455
mov x8, #SUM_STRIDE
b sgr_calc_ab_neon
endfunc
function sgr_calc_ab2_neon, export=1
clz w9, w5
add x3, x3, #3 // h += 3
asr x3, x3, #1 // h /= 2
movi v31.4s, #25 // n
mov x5, #164
mov x8, #(2*SUM_STRIDE)
endfunc
function sgr_calc_ab_neon
sub w9, w9, #24 // -bitdepth_min_8
movrel x12, X(sgr_x_by_x)
mov w13, #455 // one_by_x
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
dup v6.8h, w9 // -bitdepth_min_8
movi v19.16b, #5
@@ -363,70 +60,213 @@ function sgr_calc_ab_neon
movi v23.8b, #169 // idx of last 2
movi v24.8b, #254 // idx of last 1
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
add x2, x2, #2 // w += 2
add x7, x2, #7
bic x7, x7, #7 // aligned w
sub x7, x8, x7 // increment between rows
movi v29.8h, #1, lsl #8
dup v28.4s, w4
dup v30.4s, w5 // one_by_x
sub x0, x0, #(4*(SUM_STRIDE))
sub x1, x1, #(2*(SUM_STRIDE))
mov x6, x2 // backup of w
dup v30.4s, w13 // one_by_x
sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b
ld1 {v8.4s, v9.4s}, [x5], #32
ld1 {v10.4s, v11.4s}, [x6], #32
ld1 {v12.8h}, [x7], #16
ld1 {v13.8h}, [x8], #16
ld1 {v0.4s, v1.4s}, [x0], #32
ld1 {v2.8h}, [x1], #16
1:
subs x2, x2, #8
ld1 {v0.4s, v1.4s}, [x0] // a
ld1 {v2.8h}, [x1] // b
srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v1.8b, v1.8b, v5.8b
add v1.8b, v1.8b, v25.8b
uxtl v1.8h, v1.8b // x
add v8.4s, v8.4s, v10.4s
add v9.4s, v9.4s, v11.4s
umull v3.4s, v1.4h, v2.4h // x * BB[i]
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v2.8h, v29.8h, v1.8h // 256 - x
add v12.8h, v12.8h, v13.8h
st1 {v3.4s, v4.4s}, [x0], #32
st1 {v2.8h}, [x1], #16
subs w4, w4, #8
add v0.4s, v0.4s, v8.4s
add v1.4s, v1.4s, v9.4s
add v2.8h, v2.8h, v12.8h
srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
ld1 {v8.4s, v9.4s}, [x5], #32
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
ld1 {v10.4s, v11.4s}, [x6], #32
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
ld1 {v12.8h}, [x7], #16
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v5.8b, v1.8b, v5.8b
ld1 {v13.8h}, [x8], #16
add v5.8b, v5.8b, v25.8b
ld1 {v0.4s, v1.4s}, [x0], #32
uxtl v5.8h, v5.8b // x
umull v3.4s, v5.4h, v2.4h // x * BB[i]
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v5.8h, v29.8h, v5.8h // 256 - x
ld1 {v2.8h}, [x1], #16
st1 {v3.4s, v4.4s}, [x2], #32
st1 {v5.8h}, [x3], #16
b.gt 1b
subs x3, x3, #1
b.le 0f
add x0, x0, x7, lsl #2
add x1, x1, x7, lsl #1
mov x2, x6
b 1b
0:
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x30
ret
endfunc
// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
// int32_t *AA, int16_t *BB,
// const int w, const int s,
// const int bitdepth_max);
function sgr_box5_vert_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
add w4, w4, #2
clz w15, w6 // bitdepth_max
dup v28.4s, w5 // strength
ldp x5, x6, [x0]
ldp x7, x8, [x0, #16]
ldr x0, [x0, #32]
ldp x9, x10, [x1]
ldp x11, x12, [x1, #16]
ldr x1, [x1, #32]
movi v31.4s, #25 // n
sub w15, w15, #24 // -bitdepth_min_8
movrel x13, X(sgr_x_by_x)
mov w14, #164 // one_by_x
ld1 {v16.16b, v17.16b, v18.16b}, [x13]
dup v6.8h, w15 // -bitdepth_min_8
movi v19.16b, #5
movi v24.8b, #254 // idx of last 1
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
movi v29.8h, #1, lsl #8
dup v30.4s, w14 // one_by_x
sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b
ld1 {v8.4s, v9.4s}, [x5], #32
ld1 {v10.4s, v11.4s}, [x6], #32
ld1 {v12.4s, v13.4s}, [x7], #32
ld1 {v14.4s, v15.4s}, [x8], #32
ld1 {v20.8h}, [x9], #16
ld1 {v21.8h}, [x10], #16
ld1 {v22.8h}, [x11], #16
ld1 {v23.8h}, [x12], #16
ld1 {v0.4s, v1.4s}, [x0], #32
ld1 {v2.8h}, [x1], #16
1:
add v8.4s, v8.4s, v10.4s
add v9.4s, v9.4s, v11.4s
add v12.4s, v12.4s, v14.4s
add v13.4s, v13.4s, v15.4s
add v20.8h, v20.8h, v21.8h
add v22.8h, v22.8h, v23.8h
add v0.4s, v0.4s, v8.4s
add v1.4s, v1.4s, v9.4s
add v2.8h, v2.8h, v20.8h
add v0.4s, v0.4s, v12.4s
add v1.4s, v1.4s, v13.4s
add v2.8h, v2.8h, v22.8h
subs w4, w4, #8
movi v20.8b, #55 // idx of last 5
movi v21.8b, #72 // idx of last 4
movi v22.8b, #101 // idx of last 3
movi v23.8b, #169 // idx of last 2
srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
ld1 {v8.4s, v9.4s}, [x5], #32
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
ld1 {v10.4s, v11.4s}, [x6], #32
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
ld1 {v12.4s, v13.4s}, [x7], #32
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
ld1 {v14.4s, v15.4s}, [x8], #32
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
ld1 {v20.8h}, [x9], #16
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
ld1 {v21.8h}, [x10], #16
add v5.8b, v1.8b, v5.8b
ld1 {v22.8h}, [x11], #16
add v5.8b, v5.8b, v25.8b
ld1 {v23.8h}, [x12], #16
uxtl v5.8h, v5.8b // x
ld1 {v0.4s, v1.4s}, [x0], #32
umull v3.4s, v5.4h, v2.4h // x * BB[i]
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v5.8h, v29.8h, v5.8h // 256 - x
ld1 {v2.8h}, [x1], #16
st1 {v3.4s, v4.4s}, [x2], #32
st1 {v5.8h}, [x3], #16
b.gt 1b
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
View File
@ -30,52 +30,224 @@
#define FILTER_OUT_STRIDE 384
.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_\bpc\()bpc_neon, export=1
sub x7, x3, #(4*SUM_STRIDE)
add x8, x3, #(4*SUM_STRIDE)
sub x9, x4, #(2*SUM_STRIDE)
add x10, x4, #(2*SUM_STRIDE)
mov x11, #SUM_STRIDE
mov x12, #FILTER_OUT_STRIDE
add x13, x5, #7
bic x13, x13, #7 // Aligned width
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
// const pixel *src,
// const ptrdiff_t src_stride,
// const int32_t **a,
// const int16_t **b,
// const int w, const int h);
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp x7, x8, [x3]
ldp x9, x3, [x3, #16]
ldp x10, x11, [x4]
ldp x12, x4, [x4, #16]
mov x13, #FILTER_OUT_STRIDE
cmp w6, #1
add x2, x1, x2 // src + stride
csel x2, x1, x2, le // if (h <= 1) x2 = x1
add x13, x0, x13, lsl #1
movi v30.8h, #3
movi v31.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x10], #32
ld1 {v2.8h, v3.8h}, [x11], #32
ld1 {v4.8h, v5.8h}, [x12], #32
ld1 {v6.8h, v7.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x9], #48
ld1 {v25.4s, v26.4s, v27.4s}, [x3], #48
2:
ext v8.16b, v0.16b, v1.16b, #2 // [0][1]
ext v9.16b, v2.16b, v3.16b, #2 // [1][1]
ext v10.16b, v4.16b, v5.16b, #2 // [2][1]
ext v11.16b, v0.16b, v1.16b, #4 // [0][2]
ext v12.16b, v2.16b, v3.16b, #4 // [1][2]
ext v13.16b, v4.16b, v5.16b, #4 // [2][2]
add v14.8h, v2.8h, v8.8h // [1][0] + [0][1]
add v15.8h, v9.8h, v10.8h // [1][1] + [2][1]
add v28.8h, v0.8h, v11.8h // [0][0] + [0][2]
add v14.8h, v14.8h, v12.8h // () + [1][2]
add v29.8h, v4.8h, v13.8h // [2][0] + [2][2]
ext v8.16b, v6.16b, v7.16b, #2 // [3][1]
ext v11.16b, v6.16b, v7.16b, #4 // [3][2]
add v14.8h, v14.8h, v15.8h // mid
add v15.8h, v28.8h, v29.8h // corners
add v28.8h, v4.8h, v9.8h // [2][0] + [1][1]
add v29.8h, v10.8h, v8.8h // [2][1] + [3][1]
add v2.8h, v2.8h, v12.8h // [1][0] + [1][2]
add v28.8h, v28.8h, v13.8h // () + [2][2]
add v4.8h, v6.8h, v11.8h // [3][0] + [3][2]
add v0.8h, v28.8h, v29.8h // mid
add v2.8h, v2.8h, v4.8h // corners
shl v4.8h, v14.8h, #2
mla v4.8h, v15.8h, v30.8h // * 3 -> a
shl v0.8h, v0.8h, #2
mla v0.8h, v2.8h, v30.8h // * 3 -> a
ext v8.16b, v16.16b, v17.16b, #4 // [0][1]
ext v9.16b, v17.16b, v18.16b, #4
ext v10.16b, v16.16b, v17.16b, #8 // [0][2]
ext v11.16b, v17.16b, v18.16b, #8
ext v12.16b, v19.16b, v20.16b, #4 // [1][1]
ext v13.16b, v20.16b, v21.16b, #4
add v8.4s, v8.4s, v19.4s // [0][1] + [1][0]
add v9.4s, v9.4s, v20.4s
add v16.4s, v16.4s, v10.4s // [0][0] + [0][2]
add v17.4s, v17.4s, v11.4s
ext v14.16b, v19.16b, v20.16b, #8 // [1][2]
ext v15.16b, v20.16b, v21.16b, #8
add v16.4s, v16.4s, v22.4s // () + [2][0]
add v17.4s, v17.4s, v23.4s
add v28.4s, v12.4s, v14.4s // [1][1] + [1][2]
add v29.4s, v13.4s, v15.4s
ext v10.16b, v22.16b, v23.16b, #4 // [2][1]
ext v11.16b, v23.16b, v24.16b, #4
add v8.4s, v8.4s, v28.4s // mid (incomplete)
add v9.4s, v9.4s, v29.4s
add v19.4s, v19.4s, v14.4s // [1][0] + [1][2]
add v20.4s, v20.4s, v15.4s
add v14.4s, v22.4s, v12.4s // [2][0] + [1][1]
add v15.4s, v23.4s, v13.4s
ext v12.16b, v22.16b, v23.16b, #8 // [2][2]
ext v13.16b, v23.16b, v24.16b, #8
ext v28.16b, v25.16b, v26.16b, #4 // [3][1]
ext v29.16b, v26.16b, v27.16b, #4
add v8.4s, v8.4s, v10.4s // () + [2][1] = mid
add v9.4s, v9.4s, v11.4s
add v14.4s, v14.4s, v10.4s // () + [2][1]
add v15.4s, v15.4s, v11.4s
ext v10.16b, v25.16b, v26.16b, #8 // [3][2]
ext v11.16b, v26.16b, v27.16b, #8
add v16.4s, v16.4s, v12.4s // () + [2][2] = corner
add v17.4s, v17.4s, v13.4s
add v12.4s, v12.4s, v28.4s // [2][2] + [3][1]
add v13.4s, v13.4s, v29.4s
add v25.4s, v25.4s, v10.4s // [3][0] + [3][2]
add v26.4s, v26.4s, v11.4s
add v14.4s, v14.4s, v12.4s // mid
add v15.4s, v15.4s, v13.4s
add v19.4s, v19.4s, v25.4s // corner
add v20.4s, v20.4s, v26.4s
.if \bpc == 8
sub x2, x2, x13
ld1 {v25.8b}, [x1], #8 // src
ld1 {v26.8b}, [x2], #8
.else
sub x2, x2, x13, lsl #1
ld1 {v25.8h}, [x1], #16 // src
ld1 {v26.8h}, [x2], #16
.endif
sub x12, x12, x13
sub x11, x11, x13
sub x11, x11, #4 // We read 4 extra elements from a
sub x14, x11, #4 // We read 8 extra elements from b
mov x13, x5
shl v8.4s, v8.4s, #2
shl v9.4s, v9.4s, #2
mla v8.4s, v16.4s, v31.4s // * 3 -> b
mla v9.4s, v17.4s, v31.4s
.if \bpc == 8
uxtl v25.8h, v25.8b // src
uxtl v26.8h, v26.8b
.endif
shl v14.4s, v14.4s, #2
shl v15.4s, v15.4s, #2
mla v14.4s, v19.4s, v31.4s // * 3 -> b
mla v15.4s, v20.4s, v31.4s
umlal v8.4s, v4.4h, v25.4h // b + a * src
umlal2 v9.4s, v4.8h, v25.8h
umlal v14.4s, v0.4h, v26.4h // b + a * src
umlal2 v15.4s, v0.8h, v26.8h
mov v0.16b, v1.16b
rshrn v8.4h, v8.4s, #9
rshrn2 v8.8h, v9.4s, #9
mov v2.16b, v3.16b
rshrn v14.4h, v14.4s, #9
rshrn2 v14.8h, v15.4s, #9
subs w5, w5, #8
mov v4.16b, v5.16b
st1 {v8.8h}, [x0], #16
mov v6.16b, v7.16b
st1 {v14.8h}, [x13], #16
b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
mov v22.16b, v24.16b
mov v25.16b, v27.16b
ld1 {v1.8h}, [x10], #16
ld1 {v3.8h}, [x11], #16
ld1 {v5.8h}, [x12], #16
ld1 {v7.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x8], #32
ld1 {v23.4s, v24.4s}, [x9], #32
ld1 {v26.4s, v27.4s}, [x3], #32
b 2b
3:
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
// const int32_t **a, const int16_t **b,
// const int w, const int w1,
// const int bitdepth_max);
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
ldp x7, x8, [x1]
ldr x1, [x1, #16]
ldp x9, x10, [x2]
ldr x2, [x2, #16]
dup v31.8h, w4
dup v30.8h, w5
movi v6.8h, #3
movi v7.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x4], #32
ld1 {v4.8h, v5.8h}, [x10], #32
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x10], #32
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x1], #48
2:
subs x5, x5, #8
ext v25.16b, v0.16b, v1.16b, #2 // -stride
ext v26.16b, v2.16b, v3.16b, #2 // 0
ext v27.16b, v4.16b, v5.16b, #2 // +stride
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
ext v29.16b, v2.16b, v3.16b, #4 // +1
ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
add v2.8h, v2.8h, v25.8h // -1, -stride
ext v25.16b, v4.16b, v5.16b, #4 // +1+stride
add v26.8h, v26.8h, v27.8h // 0, +stride
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
add v2.8h, v2.8h, v26.8h
add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
add v4.8h, v4.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v2.8h, v29.8h // +1
add v0.8h, v0.8h, v4.8h
@ -85,7 +257,7 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
ext v28.16b, v17.16b, v18.16b, #8
ext v29.16b, v19.16b, v20.16b, #4 // 0
ext v30.16b, v20.16b, v21.16b, #4
ext v4.16b, v20.16b, v21.16b, #4
mla v2.8h, v0.8h, v6.8h // * 3 -> a
add v25.4s, v25.4s, v19.4s // -stride, -1
add v26.4s, v26.4s, v20.4s
@ -96,22 +268,22 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
add v16.4s, v16.4s, v22.4s // -1+stride
add v17.4s, v17.4s, v23.4s
add v29.4s, v29.4s, v27.4s // 0, +1
add v30.4s, v30.4s, v28.4s
add v4.4s, v4.4s, v28.4s
add v25.4s, v25.4s, v29.4s
add v26.4s, v26.4s, v30.4s
add v26.4s, v26.4s, v4.4s
ext v27.16b, v22.16b, v23.16b, #4 // +stride
ext v28.16b, v23.16b, v24.16b, #4
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
ext v30.16b, v23.16b, v24.16b, #8
ext v4.16b, v23.16b, v24.16b, #8
.if \bpc == 8
ld1 {v19.8b}, [x1], #8 // src
ld1 {v19.8b}, [x0] // src
.else
ld1 {v19.8h}, [x1], #16 // src
ld1 {v19.8h}, [x0] // src
.endif
add v25.4s, v25.4s, v27.4s // +stride
add v26.4s, v26.4s, v28.4s
add v16.4s, v16.4s, v29.4s // +1+stride
add v17.4s, v17.4s, v30.4s
add v17.4s, v17.4s, v4.4s
shl v25.4s, v25.4s, #2
shl v26.4s, v26.4s, #2
mla v25.4s, v16.4s, v7.4s // * 3 -> b
@ -125,61 +297,68 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
mov v2.16b, v3.16b
rshrn v25.4h, v25.4s, #9
rshrn2 v25.8h, v26.4s, #9
mov v4.16b, v5.16b
st1 {v25.8h}, [x0], #16
b.le 3f
subs w3, w3, #8
// weighted1
shl v19.8h, v19.8h, #4 // u
mov v4.16b, v5.16b
sub v25.8h, v25.8h, v19.8h // t1 - u
ld1 {v1.8h}, [x9], #16
ushll v26.4s, v19.4h, #7 // u << 7
ushll2 v27.4s, v19.8h, #7 // u << 7
ld1 {v3.8h}, [x10], #16
smlal v26.4s, v25.4h, v31.4h // v
smlal2 v27.4s, v25.8h, v31.8h // v
ld1 {v5.8h}, [x2], #16
.if \bpc == 8
rshrn v26.4h, v26.4s, #11
rshrn2 v26.8h, v27.4s, #11
mov v16.16b, v18.16b
sqxtun v26.8b, v26.8h
mov v19.16b, v21.16b
mov v22.16b, v24.16b
ld1 {v1.8h}, [x9], #16
ld1 {v3.8h}, [x4], #16
ld1 {v5.8h}, [x10], #16
st1 {v26.8b}, [x0], #8
.else
sqrshrun v26.4h, v26.4s, #11
sqrshrun2 v26.8h, v27.4s, #11
mov v16.16b, v18.16b
umin v26.8h, v26.8h, v30.8h
mov v19.16b, v21.16b
mov v22.16b, v24.16b
st1 {v26.8h}, [x0], #16
.endif
b.le 3f
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x3], #32
ld1 {v23.4s, v24.4s}, [x8], #32
ld1 {v20.4s, v21.4s}, [x8], #32
ld1 {v23.4s, v24.4s}, [x1], #32
b 2b
3:
subs x6, x6, #1
b.le 0f
mov x5, x13
add x0, x0, x12, lsl #1
add x1, x1, x2
add x3, x3, x11, lsl #2
add x7, x7, x11, lsl #2
add x8, x8, x11, lsl #2
add x4, x4, x14, lsl #1
add x9, x9, x14, lsl #1
add x10, x10, x14, lsl #1
b 1b
0:
ret
endfunc
// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_\bpc\()bpc_neon, export=1
add x7, x3, #(4*(SUM_STRIDE))
sub x3, x3, #(4*(SUM_STRIDE))
add x8, x4, #(2*(SUM_STRIDE))
sub x4, x4, #(2*(SUM_STRIDE))
mov x9, #(2*SUM_STRIDE)
// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
// const pixel *src,
// const ptrdiff_t stride,
// const int32_t **a,
// const int16_t **b,
// const int w, const int h);
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp x3, x7, [x3]
ldp x4, x8, [x4]
mov x10, #FILTER_OUT_STRIDE
add x11, x5, #7
bic x11, x11, #7 // Aligned width
.if \bpc == 8
sub x2, x2, x11
.else
sub x2, x2, x11, lsl #1
.endif
sub x10, x10, x11
sub x9, x9, x11
sub x9, x9, #4 // We read 4 extra elements from a
sub x12, x9, #4 // We read 8 extra elements from b
mov x11, x5
cmp w6, #1
add x2, x1, x2 // src + stride
csel x2, x1, x2, le // if (h <= 1) x2 = x1
add x10, x0, x10, lsl #1
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
@ -191,7 +370,6 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
subs x5, x5, #8
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
@ -201,6 +379,9 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h
mul v8.8h, v25.8h, v4.8h // * 5
mla v8.8h, v23.8h, v6.8h // * 6
ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
@ -213,8 +394,10 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v31.8b}, [x1], #8
ld1 {v30.8b}, [x2], #8
.else
ld1 {v31.8h}, [x1], #16
ld1 {v30.8h}, [x2], #16
.endif
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
@ -223,6 +406,11 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s
mul v9.4s, v19.4s, v5.4s // * 5
mla v9.4s, v24.4s, v7.4s // * 6
mul v10.4s, v20.4s, v5.4s // * 5
mla v10.4s, v25.4s, v7.4s // * 6
add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
@ -234,16 +422,23 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
.if \bpc == 8
uxtl v31.8h, v31.8b
uxtl v30.8h, v30.8b
.endif
umlal v16.4s, v0.4h, v31.4h // b + a * src
umlal2 v17.4s, v0.8h, v31.8h
umlal v9.4s, v8.4h, v30.4h // b + a * src
umlal2 v10.4s, v8.8h, v30.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
rshrn v9.4h, v9.4s, #8
rshrn2 v9.8h, v10.4s, #8
subs w5, w5, #8
mov v2.16b, v3.16b
st1 {v16.8h}, [x0], #16
st1 {v16.8h}, [x0], #16
st1 {v9.8h}, [x10], #16
b.le 3f
b.le 9f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
ld1 {v1.8h}, [x4], #16
@ -252,201 +447,160 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b
3:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
add x3, x3, x9, lsl #2
add x7, x7, x9, lsl #2
add x4, x4, x12, lsl #1
add x8, x8, x12, lsl #1
mov x13, x3
mov x14, x4
ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
4:
subs x5, x5, #8
ext v23.16b, v0.16b, v1.16b, #4 // +1
ext v22.16b, v0.16b, v1.16b, #2 // 0
add v0.8h, v0.8h, v23.8h // -1, +1
ext v24.16b, v16.16b, v17.16b, #4 // 0
ext v25.16b, v17.16b, v18.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1
ext v27.16b, v17.16b, v18.16b, #8
mul v2.8h, v22.8h, v6.8h // * 6
mla v2.8h, v0.8h, v4.8h // * 5 -> a
.if \bpc == 8
ld1 {v31.8b}, [x1], #8
.else
ld1 {v31.8h}, [x1], #16
.endif
add v16.4s, v16.4s, v26.4s // -1, +1
add v17.4s, v17.4s, v27.4s
.if \bpc == 8
uxtl v31.8h, v31.8b
.endif
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v24.4s, v24.4s, v7.4s // * 6
mla v24.4s, v16.4s, v5.4s // * 5 -> b
mul v25.4s, v25.4s, v7.4s // * 6
mla v25.4s, v17.4s, v5.4s // * 5 -> b
umlal v24.4s, v2.4h, v31.4h // b + a * src
umlal2 v25.4s, v2.8h, v31.8h
mov v0.16b, v1.16b
rshrn v24.4h, v24.4s, #8
rshrn2 v24.8h, v25.4s, #8
mov v16.16b, v18.16b
st1 {v24.8h}, [x0], #16
b.le 5f
ld1 {v1.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x3], #32
b 4b
5:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
mov x3, x13 // Rewind x3/x4 to where they started
mov x4, x14
b 1b
0:
9:
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int w, const int h,
// const int wt, const int bitdepth_max);
function sgr_weighted1_\bpc\()bpc_neon, export=1
.if \bpc == 16
ldr w8, [sp]
.endif
dup v31.8h, w7
cmp x6, #2
.if \bpc == 16
dup v30.8h, w8
.endif
add x9, x0, x1
add x10, x2, x3
add x11, x4, #2*FILTER_OUT_STRIDE
mov x7, #(4*FILTER_OUT_STRIDE)
lsl x1, x1, #1
lsl x3, x3, #1
add x8, x5, #7
bic x8, x8, #7 // Aligned width
.if \bpc == 8
sub x1, x1, x8
sub x3, x3, x8
.else
sub x1, x1, x8, lsl #1
sub x3, x3, x8, lsl #1
.endif
sub x7, x7, x8, lsl #1
mov x8, x5
b.lt 2f
1:
.if \bpc == 8
ld1 {v0.8b}, [x2], #8
ld1 {v4.8b}, [x10], #8
.else
ld1 {v0.8h}, [x2], #16
ld1 {v4.8h}, [x10], #16
.endif
ld1 {v1.8h}, [x4], #16
ld1 {v5.8h}, [x11], #16
subs x5, x5, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
ushll v4.8h, v4.8b, #4 // u
.else
shl v0.8h, v0.8h, #4 // u
shl v4.8h, v4.8h, #4 // u
.endif
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v5.8h, v5.8h, v4.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
ushll v6.4s, v4.4h, #7 // u << 7
ushll2 v7.4s, v4.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
smlal v6.4s, v5.4h, v31.4h // v
smlal2 v7.4s, v5.8h, v31.8h // v
.if \bpc == 8
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
rshrn v6.4h, v6.4s, #11
rshrn2 v6.8h, v7.4s, #11
sqxtun v2.8b, v2.8h
sqxtun v6.8b, v6.8h
st1 {v2.8b}, [x0], #8
st1 {v6.8b}, [x9], #8
.else
sqrshrun v2.4h, v2.4s, #11
sqrshrun2 v2.8h, v3.4s, #11
sqrshrun v6.4h, v6.4s, #11
sqrshrun2 v6.8h, v7.4s, #11
umin v2.8h, v2.8h, v30.8h
umin v6.8h, v6.8h, v30.8h
st1 {v2.8h}, [x0], #16
st1 {v6.8h}, [x9], #16
.endif
b.gt 1b
// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
// const int32_t **a,
// const int16_t **b,
// const int w, const int h,
// const int w1,
// const int bitdepth_max);
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x30]!
str d10, [sp, #0x10]
stp d14, d15, [sp, #0x20]
sub x6, x6, #2
cmp x6, #1
b.lt 0f
mov x5, x8
add x0, x0, x1
add x9, x9, x1
add x2, x2, x3
add x10, x10, x3
add x4, x4, x7
add x11, x11, x7
b.eq 2f
b 1b
dup v14.8h, w6
dup v15.8h, w7
ldp x2, x7, [x2]
ldp x3, x8, [x3]
cmp w5, #1
add x1, x0, x1 // src + stride
// if (h <= 1), set the pointer to the second row to any dummy buffer
// we can clobber (x2 in this case)
csel x1, x2, x1, le
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
movi v7.4s, #6
1:
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x2], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h
mul v8.8h, v25.8h, v4.8h // * 5
mla v8.8h, v23.8h, v6.8h // * 6
ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
ext v25.16b, v20.16b, v21.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
ext v27.16b, v17.16b, v18.16b, #8
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
ext v29.16b, v20.16b, v21.16b, #8
mul v0.8h, v0.8h, v4.8h // * 5
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v0.8b}, [x2], #8
ld1 {v31.8b}, [x0]
ld1 {v30.8b}, [x1]
.else
ld1 {v0.8h}, [x2], #16
ld1 {v31.8h}, [x0]
ld1 {v30.8h}, [x1]
.endif
ld1 {v1.8h}, [x4], #16
subs x5, x5, #8
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
add v20.4s, v20.4s, v29.4s
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s
mul v9.4s, v19.4s, v5.4s // * 5
mla v9.4s, v24.4s, v7.4s // * 6
mul v10.4s, v20.4s, v5.4s // * 5
mla v10.4s, v25.4s, v7.4s // * 6
add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v16.4s, v16.4s, v5.4s // * 5
mla v16.4s, v22.4s, v7.4s // * 6
mul v17.4s, v17.4s, v5.4s // * 5
mla v17.4s, v23.4s, v7.4s // * 6
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
.else
shl v0.8h, v0.8h, #4 // u
uxtl v31.8h, v31.8b
uxtl v30.8h, v30.8b
.endif
sub v1.8h, v1.8h, v0.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
umlal v16.4s, v0.4h, v31.4h // b + a * src
umlal2 v17.4s, v0.8h, v31.8h
umlal v9.4s, v8.4h, v30.4h // b + a * src
umlal2 v10.4s, v8.8h, v30.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
rshrn v9.4h, v9.4s, #8
rshrn2 v9.8h, v10.4s, #8
subs w4, w4, #8
// weighted1
shl v31.8h, v31.8h, #4 // u
shl v30.8h, v30.8h, #4
mov v2.16b, v3.16b
sub v16.8h, v16.8h, v31.8h // t1 - u
sub v9.8h, v9.8h, v30.8h
ld1 {v1.8h}, [x3], #16
ushll v22.4s, v31.4h, #7 // u << 7
ushll2 v23.4s, v31.8h, #7
ushll v24.4s, v30.4h, #7
ushll2 v25.4s, v30.8h, #7
ld1 {v3.8h}, [x8], #16
smlal v22.4s, v16.4h, v14.4h // v
smlal2 v23.4s, v16.8h, v14.8h
mov v16.16b, v18.16b
smlal v24.4s, v9.4h, v14.4h
smlal2 v25.4s, v9.8h, v14.8h
mov v19.16b, v21.16b
.if \bpc == 8
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
sqxtun v2.8b, v2.8h
st1 {v2.8b}, [x0], #8
rshrn v22.4h, v22.4s, #11
rshrn2 v22.8h, v23.4s, #11
rshrn v23.4h, v24.4s, #11
rshrn2 v23.8h, v25.4s, #11
sqxtun v22.8b, v22.8h
sqxtun v23.8b, v23.8h
st1 {v22.8b}, [x0], #8
st1 {v23.8b}, [x1], #8
.else
sqrshrun v2.4h, v2.4s, #11
sqrshrun2 v2.8h, v3.4s, #11
umin v2.8h, v2.8h, v30.8h
st1 {v2.8h}, [x0], #16
sqrshrun v22.4h, v22.4s, #11
sqrshrun2 v22.8h, v23.4s, #11
sqrshrun v23.4h, v24.4s, #11
sqrshrun2 v23.8h, v25.4s, #11
umin v22.8h, v22.8h, v15.8h
umin v23.8h, v23.8h, v15.8h
st1 {v22.8h}, [x0], #16
st1 {v23.8h}, [x1], #16
.endif
b.gt 2b
0:
b.le 3f
ld1 {v17.4s, v18.4s}, [x2], #32
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b
3:
ldp d14, d15, [sp, #0x20]
ldr d10, [sp, #0x10]
ldp d8, d9, [sp], 0x30
ret
endfunc
@ -461,7 +615,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
.else
ldp x8, x9, [sp]
.endif
cmp x7, #2
cmp w7, #2
add x10, x0, x1
add x11, x2, x3
add x12, x4, #2*FILTER_OUT_STRIDE
@ -483,7 +637,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
sub x3, x3, x9, lsl #1
.endif
sub x8, x8, x9, lsl #1
mov x9, x6
mov w9, w6
b.lt 2f
1:
.if \bpc == 8
@ -497,7 +651,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
ld1 {v17.8h}, [x12], #16
ld1 {v2.8h}, [x5], #16
ld1 {v18.8h}, [x13], #16
subs x6, x6, #8
subs w6, w6, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
ushll v16.8h, v16.8b, #4 // u
@ -542,10 +696,10 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
.endif
b.gt 1b
subs x7, x7, #2
cmp x7, #1
subs w7, w7, #2
cmp w7, #1
b.lt 0f
mov x6, x9
mov w6, w9
add x0, x0, x1
add x10, x10, x1
add x2, x2, x3
@ -565,7 +719,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
.endif
ld1 {v1.8h}, [x4], #16
ld1 {v2.8h}, [x5], #16
subs x6, x6, #8
subs w6, w6, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
.else
View File
@ -89,3 +89,204 @@ L(splat_tbl):
.hword L(splat_tbl) - 20b
.hword L(splat_tbl) - 10b
endfunc
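// Byte-shuffle tables for save_tmvs below: each row repeats a 5-byte
// { mv, ref } pattern so that a single tbl can expand one candidate block
// into several packed refmvs_temporal_block entries. Row 1 picks mv[0]/ref[0],
// rows 2-3 pick mv[1]/ref[1], and the all-255 row yields zero bytes for
// blocks where neither mv is usable.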
const mv_tbls, align=4
.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
.byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
.byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
.byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst
const mask_mult, align=4
.byte 1, 2, 1, 2, 0, 0, 0, 0
endconst
// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
// refmvs_block **rr, const uint8_t *ref_sign,
// int col_end8, int row_end8,
// int col_start8, int row_start8)
function save_tmvs_neon, export=1
AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp, #-16]!
mov x29, sp
movi v30.8b, #0
ld1 {v31.8b}, [x3]
adr x8, L(save_tmvs_tbl)
movrel x16, mask_mult
movrel x13, mv_tbls
ld1 {v29.8b}, [x16]
ext v31.8b, v30.8b, v31.8b, #7 // [0, ref_sign]
mov w15, #5
mov w14, #12*2
sxtw x4, w4
sxtw x6, w6
mul w1, w1, w15 // stride *= 5
sub w5, w5, w7 // h = row_end8 - row_start8
lsl w7, w7, #1 // row_start8 <<= 1
1:
mov w15, #5
and w9, w7, #30 // (y & 15) * 2
ldr x9, [x2, w9, uxtw #3] // b = rr[(y & 15) * 2]
add x9, x9, #12 // &b[... + 1]
madd x10, x4, x14, x9 // end_cand_b = &b[col_end8*2 + 1]
madd x9, x6, x14, x9 // cand_b = &b[x*2 + 1]
madd x3, x6, x15, x0 // &rp[x]
2:
ldrb w11, [x9, #10] // cand_b->bs
ld1 {v0.16b}, [x9] // cand_b->mv
add x11, x8, w11, uxtw #2
ldr h1, [x9, #8] // cand_b->ref
ldrh w12, [x11] // bw8
mov x15, x8
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
cmp x9, x10
mov v2.8b, v0.8b
b.ge 3f
ldrb w15, [x9, #10] // cand_b->bs
add x16, x9, #8
ld1 {v4.16b}, [x9] // cand_b->mv
add x15, x8, w15, uxtw #2
ld1 {v1.h}[1], [x16] // cand_b->ref
ldrh w12, [x15] // bw8
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
trn1 v2.2d, v0.2d, v4.2d
3:
abs v2.8h, v2.8h // abs(mv[].xy)
tbl v1.8b, {v31.16b}, v1.8b // ref_sign[ref]
ushr v2.8h, v2.8h, #12 // abs(mv[].xy) >> 12
umull v1.8h, v1.8b, v29.8b // ref_sign[ref] * {1, 2}
cmeq v2.4s, v2.4s, #0 // abs(mv[].xy) <= 4096
xtn v2.4h, v2.4s // abs() condition to 16 bit
and v1.8b, v1.8b, v2.8b // h[0-3] contains conditions for mv[0-1]
addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0]
umov w16, v1.h[0] // Extract case for first block
umov w17, v1.h[1]
ldrh w11, [x11, #2] // Fetch jump table entry
ldrh w15, [x15, #2]
ldr q1, [x13, w16, uxtw #4] // Load permutation table based on case
ldr q5, [x13, w17, uxtw #4]
sub x11, x8, w11, uxtw // Find jump table target
sub x15, x8, w15, uxtw
tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
tbl v4.16b, {v4.16b}, v5.16b
// v1 follows on v0, with another 3 full repetitions of the pattern.
ext v1.16b, v0.16b, v0.16b, #1
ext v5.16b, v4.16b, v4.16b, #1
// v2 ends with 3 complete repetitions of the pattern.
ext v2.16b, v0.16b, v1.16b, #4
ext v6.16b, v4.16b, v5.16b, #4
blr x11
b.ge 4f // if (cand_b >= end)
mov v0.16b, v4.16b
mov v1.16b, v5.16b
mov v2.16b, v6.16b
cmp x9, x10
blr x15
b.lt 2b // if (cand_b < end)
4:
subs w5, w5, #1 // h--
add w7, w7, #2 // y += 2
add x0, x0, x1 // rp += stride
b.gt 1b
ldp x29, x30, [sp], #16
AARCH64_VALIDATE_LINK_REGISTER
ret
10:
AARCH64_VALID_JUMP_TARGET
add x16, x3, #4
st1 {v0.s}[0], [x3]
st1 {v0.b}[4], [x16]
add x3, x3, #5
ret
20:
AARCH64_VALID_JUMP_TARGET
add x16, x3, #8
st1 {v0.d}[0], [x3]
st1 {v0.h}[4], [x16]
add x3, x3, #2*5
ret
40:
AARCH64_VALID_JUMP_TARGET
st1 {v0.16b}, [x3]
str s1, [x3, #16]
add x3, x3, #4*5
ret
80:
AARCH64_VALID_JUMP_TARGET
// This writes 6 full entries plus 2 extra bytes
st1 {v0.16b, v1.16b}, [x3]
// Write the last few, overlapping with the first write.
stur q2, [x3, #(8*5-16)]
add x3, x3, #8*5
ret
160:
AARCH64_VALID_JUMP_TARGET
add x16, x3, #6*5
add x17, x3, #12*5
// This writes 6 full entries plus 2 extra bytes
st1 {v0.16b, v1.16b}, [x3]
// Write another 6 full entries, slightly overlapping with the first set
st1 {v0.16b, v1.16b}, [x16]
// Write 8 bytes (one full entry) after the first 12
st1 {v0.8b}, [x17]
// Write the last 3 entries
str q2, [x3, #(16*5-16)]
add x3, x3, #16*5
ret
L(save_tmvs_tbl):
.hword 16 * 12
.hword L(save_tmvs_tbl) - 160b
.hword 16 * 12
.hword L(save_tmvs_tbl) - 160b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
endfunc
View File
@ -105,6 +105,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
}
#endif
#if ARCH_ARM
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
@ -246,6 +247,853 @@ static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
}
#else
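// Ring-buffer helpers: the row-based SGR functions below keep small windows
// of row pointers (box sums and a/b coefficients) and advance them by
// rotating the pointers instead of copying row data.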
static void rotate(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
int32_t *tmp32 = sumsq_ptrs[0];
int16_t *tmp16 = sum_ptrs[0];
for (int i = 0; i < n - 1; i++) {
sumsq_ptrs[i] = sumsq_ptrs[i+1];
sum_ptrs[i] = sum_ptrs[i+1];
}
sumsq_ptrs[n - 1] = tmp32;
sum_ptrs[n - 1] = tmp16;
}
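// Rotate a 5-entry pointer window by two positions; the box5 vertical pass
// consumes two new rows per iteration.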
static void rotate5_x2(int32_t **sumsq_ptrs, int16_t **sum_ptrs) {
int32_t *tmp32[2];
int16_t *tmp16[2];
for (int i = 0; i < 2; i++) {
tmp32[i] = sumsq_ptrs[i];
tmp16[i] = sum_ptrs[i];
}
for (int i = 0; i < 3; i++) {
sumsq_ptrs[i] = sumsq_ptrs[i+2];
sum_ptrs[i] = sum_ptrs[i+2];
}
for (int i = 0; i < 2; i++) {
sumsq_ptrs[3 + i] = tmp32[i];
sum_ptrs[3 + i] = tmp16[i];
}
}
static void rotate_ab_3(int32_t **A_ptrs, int16_t **B_ptrs) {
rotate(A_ptrs, B_ptrs, 3);
}
static void rotate_ab_2(int32_t **A_ptrs, int16_t **B_ptrs) {
rotate(A_ptrs, B_ptrs, 2);
}
static void rotate_ab_4(int32_t **A_ptrs, int16_t **B_ptrs) {
rotate(A_ptrs, B_ptrs, 4);
}
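// Row-based assembly helpers: the *_row_h functions compute horizontal box
// sums for a single row, the *_vert functions compute the vertical sums and
// the a/b coefficients, and the finish/weighted functions produce filtered
// output for one or two rows at a time.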
void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const int w,
const enum LrEdgeFlags edges);
void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const int w,
const enum LrEdgeFlags edges);
void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3,
int32_t *sumsq5, int16_t *sum5,
const pixel (*left)[4],
const pixel *src, const int w,
const enum LrEdgeFlags edges);
void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
int32_t *AA, int16_t *BB,
const int w, const int s,
const int bitdepth_max);
void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
int32_t *AA, int16_t *BB,
const int w, const int s,
const int bitdepth_max);
void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst,
int32_t **A_ptrs, int16_t **B_ptrs,
const int w, const int w1
HIGHBD_DECL_SUFFIX);
void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride,
int32_t **A_ptrs, int16_t **B_ptrs,
const int w, const int h,
const int w1 HIGHBD_DECL_SUFFIX);
void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src,
const ptrdiff_t src_stride,
int32_t **A_ptrs,
int16_t **B_ptrs,
const int w, const int h);
void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src,
const ptrdiff_t src_stride,
int32_t **A_ptrs, int16_t **B_ptrs,
const int w, const int h);
void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const int16_t *t1, const int16_t *t2,
const int w, const int h,
const int16_t wt[2] HIGHBD_DECL_SUFFIX);
static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
int32_t *sumsq_out, int16_t *sum_out,
const int w, int s, int bitdepth_max) {
// box3_v + calc_ab1
dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
rotate(sumsq, sum, 3);
}
static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
int32_t *sumsq_out, int16_t *sum_out,
const int w, int s, int bitdepth_max) {
// box5_v + calc_ab2
dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
rotate5_x2(sumsq, sum);
}
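// Horizontal box3 sums for one new row into the bottom ring-buffer slot,
// followed by the vertical pass producing a/b for the current output row.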
static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum,
int32_t *AA, int16_t *BB,
const pixel (*left)[4],
const pixel *src, const int w,
const int s,
const enum LrEdgeFlags edges,
const int bitdepth_max) {
BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges);
sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max);
}
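// Produce one weighted output row from the buffered a/b rows, then advance
// the destination pointer and the a/b ring buffers.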
static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride,
int32_t **A_ptrs, int16_t **B_ptrs, const int w,
const int w1 HIGHBD_DECL_SUFFIX) {
BF(dav1d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs,
w, w1 HIGHBD_TAIL_SUFFIX);
*dst += PXSTRIDE(stride);
rotate_ab_3(A_ptrs, B_ptrs);
}
static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride,
int32_t **A_ptrs, int16_t **B_ptrs,
const int w, const int h, const int w1
HIGHBD_DECL_SUFFIX) {
BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs,
w, h, w1 HIGHBD_TAIL_SUFFIX);
*dst += 2*PXSTRIDE(stride);
rotate_ab_2(A_ptrs, B_ptrs);
}
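// Mixed filter output: run the 5x5 and 3x3 finish passes into temporary
// buffers for up to two rows, then blend them into dst with the w0/w1
// weights.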
static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride,
int32_t **A5_ptrs, int16_t **B5_ptrs,
int32_t **A3_ptrs, int16_t **B3_ptrs,
const int w, const int h,
const int w0, const int w1 HIGHBD_DECL_SUFFIX) {
#define FILTER_OUT_STRIDE 384
ALIGN_STK_16(int16_t, tmp5, 2*FILTER_OUT_STRIDE,);
ALIGN_STK_16(int16_t, tmp3, 2*FILTER_OUT_STRIDE,);
BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride,
A5_ptrs, B5_ptrs, w, h);
BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride,
A3_ptrs, B3_ptrs, w, h);
const int16_t wt[2] = { w0, w1 };
BF(dav1d_sgr_weighted2, neon)(*dst, stride, *dst, stride,
tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX);
*dst += h*PXSTRIDE(stride);
rotate_ab_2(A5_ptrs, B5_ptrs);
rotate_ab_4(A3_ptrs, B3_ptrs);
}
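// 3x3 self-guided filter, processed one row at a time with three rows of box
// sums and three rows of a/b coefficients live at any point.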
static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride,
const pixel (*left)[4], const pixel *lpf,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
#define BUF_STRIDE (384 + 16)
ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 3 + 16,);
int32_t *sumsq_ptrs[3], *sumsq_rows[3];
int16_t *sum_ptrs[3], *sum_rows[3];
for (int i = 0; i < 3; i++) {
sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
sum_rows[i] = &sum_buf[i * BUF_STRIDE];
}
ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 3 + 16,);
int32_t *A_ptrs[3];
int16_t *B_ptrs[3];
for (int i = 0; i < 3; i++) {
A_ptrs[i] = &A_buf[i * BUF_STRIDE];
B_ptrs[i] = &B_buf[i * BUF_STRIDE];
}
const pixel *src = dst;
const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
if (edges & LR_HAVE_TOP) {
sumsq_ptrs[0] = sumsq_rows[0];
sumsq_ptrs[1] = sumsq_rows[1];
sumsq_ptrs[2] = sumsq_rows[2];
sum_ptrs[0] = sum_rows[0];
sum_ptrs[1] = sum_rows[1];
sum_ptrs[2] = sum_rows[2];
BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
NULL, lpf, w, edges);
lpf += PXSTRIDE(stride);
BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1],
NULL, lpf, w, edges);
sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
left++;
src += PXSTRIDE(stride);
rotate_ab_3(A_ptrs, B_ptrs);
if (--h <= 0)
goto vert_1;
sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                 left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
left++;
src += PXSTRIDE(stride);
rotate_ab_3(A_ptrs, B_ptrs);
if (--h <= 0)
goto vert_2;
} else {
sumsq_ptrs[0] = sumsq_rows[0];
sumsq_ptrs[1] = sumsq_rows[0];
sumsq_ptrs[2] = sumsq_rows[0];
sum_ptrs[0] = sum_rows[0];
sum_ptrs[1] = sum_rows[0];
sum_ptrs[2] = sum_rows[0];
BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_3(A_ptrs, B_ptrs);
if (--h <= 0)
goto vert_1;
sumsq_ptrs[2] = sumsq_rows[1];
sum_ptrs[2] = sum_rows[1];
sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
left++;
src += PXSTRIDE(stride);
rotate_ab_3(A_ptrs, B_ptrs);
if (--h <= 0)
goto vert_2;
sumsq_ptrs[2] = sumsq_rows[2];
sum_ptrs[2] = sum_rows[2];
}
do {
sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
left++;
src += PXSTRIDE(stride);
sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
} while (--h > 0);
if (!(edges & LR_HAVE_BOTTOM))
goto vert_2;
sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
lpf_bottom += PXSTRIDE(stride);
sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
return;
vert_2:
sumsq_ptrs[2] = sumsq_ptrs[1];
sum_ptrs[2] = sum_ptrs[1];
sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
output_1:
sumsq_ptrs[2] = sumsq_ptrs[1];
sum_ptrs[2] = sum_ptrs[1];
sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
return;
vert_1:
sumsq_ptrs[2] = sumsq_ptrs[1];
sum_ptrs[2] = sum_ptrs[1];
sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_3(A_ptrs, B_ptrs);
goto output_1;
}
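// 5x5 self-guided filter; rows are processed in pairs since the vertical
// pass consumes two new rows per output step, with the vert_*/odd paths
// padding the final rows when the height runs out mid-pair.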
static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride,
const pixel (*left)[4], const pixel *lpf,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 5 + 16,);
int32_t *sumsq_ptrs[5], *sumsq_rows[5];
int16_t *sum_ptrs[5], *sum_rows[5];
for (int i = 0; i < 5; i++) {
sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
sum_rows[i] = &sum_buf[i * BUF_STRIDE];
}
ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 2 + 16,);
int32_t *A_ptrs[2];
int16_t *B_ptrs[2];
for (int i = 0; i < 2; i++) {
A_ptrs[i] = &A_buf[i * BUF_STRIDE];
B_ptrs[i] = &B_buf[i * BUF_STRIDE];
}
const pixel *src = dst;
const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
if (edges & LR_HAVE_TOP) {
sumsq_ptrs[0] = sumsq_rows[0];
sumsq_ptrs[1] = sumsq_rows[0];
sumsq_ptrs[2] = sumsq_rows[1];
sumsq_ptrs[3] = sumsq_rows[2];
sumsq_ptrs[4] = sumsq_rows[3];
sum_ptrs[0] = sum_rows[0];
sum_ptrs[1] = sum_rows[0];
sum_ptrs[2] = sum_rows[1];
sum_ptrs[3] = sum_rows[2];
sum_ptrs[4] = sum_rows[3];
BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
NULL, lpf, w, edges);
lpf += PXSTRIDE(stride);
BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
NULL, lpf, w, edges);
BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
if (--h <= 0)
goto vert_1;
BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
rotate_ab_2(A_ptrs, B_ptrs);
if (--h <= 0)
goto vert_2;
// ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
// one of them to point at the previously unused rows[4].
sumsq_ptrs[3] = sumsq_rows[4];
sum_ptrs[3] = sum_rows[4];
} else {
sumsq_ptrs[0] = sumsq_rows[0];
sumsq_ptrs[1] = sumsq_rows[0];
sumsq_ptrs[2] = sumsq_rows[0];
sumsq_ptrs[3] = sumsq_rows[0];
sumsq_ptrs[4] = sumsq_rows[0];
sum_ptrs[0] = sum_rows[0];
sum_ptrs[1] = sum_rows[0];
sum_ptrs[2] = sum_rows[0];
sum_ptrs[3] = sum_rows[0];
sum_ptrs[4] = sum_rows[0];
BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
if (--h <= 0)
goto vert_1;
sumsq_ptrs[4] = sumsq_rows[1];
sum_ptrs[4] = sum_rows[1];
BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
rotate_ab_2(A_ptrs, B_ptrs);
if (--h <= 0)
goto vert_2;
sumsq_ptrs[3] = sumsq_rows[2];
sumsq_ptrs[4] = sumsq_rows[3];
sum_ptrs[3] = sum_rows[2];
sum_ptrs[4] = sum_rows[3];
BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
if (--h <= 0)
goto odd;
BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
if (--h <= 0)
goto vert_2;
// ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
// one of them to point at the previously unused rows[4].
sumsq_ptrs[3] = sumsq_rows[4];
sum_ptrs[3] = sum_rows[4];
}
do {
BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
if (--h <= 0)
goto odd;
BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
} while (--h > 0);
if (!(edges & LR_HAVE_BOTTOM))
goto vert_2;
BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
NULL, lpf_bottom, w, edges);
lpf_bottom += PXSTRIDE(stride);
BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
NULL, lpf_bottom, w, edges);
output_2:
sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
return;
vert_2:
// Duplicate the last row twice more
sumsq_ptrs[3] = sumsq_ptrs[2];
sumsq_ptrs[4] = sumsq_ptrs[2];
sum_ptrs[3] = sum_ptrs[2];
sum_ptrs[4] = sum_ptrs[2];
goto output_2;
odd:
// Copy the last row as padding once
sumsq_ptrs[4] = sumsq_ptrs[3];
sum_ptrs[4] = sum_ptrs[3];
sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
output_1:
// Duplicate the last row twice more
sumsq_ptrs[3] = sumsq_ptrs[2];
sumsq_ptrs[4] = sumsq_ptrs[2];
sum_ptrs[3] = sum_ptrs[2];
sum_ptrs[4] = sum_ptrs[2];
sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
// Output only one row
sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
return;
vert_1:
// Copy the last row as padding once
sumsq_ptrs[4] = sumsq_ptrs[3];
sum_ptrs[4] = sum_ptrs[3];
sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
rotate_ab_2(A_ptrs, B_ptrs);
goto output_1;
}
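// Combined 5x5 + 3x3 filter: maintains both sets of sum/coefficient ring
// buffers and blends the two filter outputs two rows at a time.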
static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride,
const pixel (*left)[4], const pixel *lpf,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
ALIGN_STK_16(int16_t, sum5_buf, BUF_STRIDE * 5 + 16,);
int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
int16_t *sum5_ptrs[5], *sum5_rows[5];
for (int i = 0; i < 5; i++) {
sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
}
ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
ALIGN_STK_16(int16_t, sum3_buf, BUF_STRIDE * 3 + 16,);
int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
int16_t *sum3_ptrs[3], *sum3_rows[3];
for (int i = 0; i < 3; i++) {
sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
}
ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
ALIGN_STK_16(int16_t, B5_buf, BUF_STRIDE * 2 + 16,);
int32_t *A5_ptrs[2];
int16_t *B5_ptrs[2];
for (int i = 0; i < 2; i++) {
A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
}
ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
ALIGN_STK_16(int16_t, B3_buf, BUF_STRIDE * 4 + 16,);
int32_t *A3_ptrs[4];
int16_t *B3_ptrs[4];
for (int i = 0; i < 4; i++) {
A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
}
const pixel *src = dst;
const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
if (edges & LR_HAVE_TOP) {
sumsq5_ptrs[0] = sumsq5_rows[0];
sumsq5_ptrs[1] = sumsq5_rows[0];
sumsq5_ptrs[2] = sumsq5_rows[1];
sumsq5_ptrs[3] = sumsq5_rows[2];
sumsq5_ptrs[4] = sumsq5_rows[3];
sum5_ptrs[0] = sum5_rows[0];
sum5_ptrs[1] = sum5_rows[0];
sum5_ptrs[2] = sum5_rows[1];
sum5_ptrs[3] = sum5_rows[2];
sum5_ptrs[4] = sum5_rows[3];
sumsq3_ptrs[0] = sumsq3_rows[0];
sumsq3_ptrs[1] = sumsq3_rows[1];
sumsq3_ptrs[2] = sumsq3_rows[2];
sum3_ptrs[0] = sum3_rows[0];
sum3_ptrs[1] = sum3_rows[1];
sum3_ptrs[2] = sum3_rows[2];
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
sumsq5_rows[0], sum5_rows[0],
NULL, lpf, w, edges);
lpf += PXSTRIDE(stride);
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
sumsq5_rows[1], sum5_rows[1],
NULL, lpf, w, edges);
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
sumsq5_rows[2], sum5_rows[2],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_4(A3_ptrs, B3_ptrs);
if (--h <= 0)
goto vert_1;
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
sumsq5_rows[3], sum5_rows[3],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
rotate_ab_2(A5_ptrs, B5_ptrs);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_4(A3_ptrs, B3_ptrs);
if (--h <= 0)
goto vert_2;
// ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
// one of them to point at the previously unused rows[4].
sumsq5_ptrs[3] = sumsq5_rows[4];
sum5_ptrs[3] = sum5_rows[4];
} else {
sumsq5_ptrs[0] = sumsq5_rows[0];
sumsq5_ptrs[1] = sumsq5_rows[0];
sumsq5_ptrs[2] = sumsq5_rows[0];
sumsq5_ptrs[3] = sumsq5_rows[0];
sumsq5_ptrs[4] = sumsq5_rows[0];
sum5_ptrs[0] = sum5_rows[0];
sum5_ptrs[1] = sum5_rows[0];
sum5_ptrs[2] = sum5_rows[0];
sum5_ptrs[3] = sum5_rows[0];
sum5_ptrs[4] = sum5_rows[0];
sumsq3_ptrs[0] = sumsq3_rows[0];
sumsq3_ptrs[1] = sumsq3_rows[0];
sumsq3_ptrs[2] = sumsq3_rows[0];
sum3_ptrs[0] = sum3_rows[0];
sum3_ptrs[1] = sum3_rows[0];
sum3_ptrs[2] = sum3_rows[0];
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
sumsq5_rows[0], sum5_rows[0],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_4(A3_ptrs, B3_ptrs);
if (--h <= 0)
goto vert_1;
sumsq5_ptrs[4] = sumsq5_rows[1];
sum5_ptrs[4] = sum5_rows[1];
sumsq3_ptrs[2] = sumsq3_rows[1];
sum3_ptrs[2] = sum3_rows[1];
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
sumsq5_rows[1], sum5_rows[1],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
rotate_ab_2(A5_ptrs, B5_ptrs);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_4(A3_ptrs, B3_ptrs);
if (--h <= 0)
goto vert_2;
sumsq5_ptrs[3] = sumsq5_rows[2];
sumsq5_ptrs[4] = sumsq5_rows[3];
sum5_ptrs[3] = sum5_rows[2];
sum5_ptrs[4] = sum5_rows[3];
sumsq3_ptrs[2] = sumsq3_rows[2];
sum3_ptrs[2] = sum3_rows[2];
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
sumsq5_rows[2], sum5_rows[2],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_4(A3_ptrs, B3_ptrs);
if (--h <= 0)
goto odd;
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
sumsq5_rows[3], sum5_rows[3],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
w, 2, params->sgr.w0, params->sgr.w1
HIGHBD_TAIL_SUFFIX);
if (--h <= 0)
goto vert_2;
// ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
// one of them to point at the previously unused rows[4].
sumsq5_ptrs[3] = sumsq5_rows[4];
sum5_ptrs[3] = sum5_rows[4];
}
do {
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
sumsq5_ptrs[3], sum5_ptrs[3],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_4(A3_ptrs, B3_ptrs);
if (--h <= 0)
goto odd;
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
sumsq5_ptrs[4], sum5_ptrs[4],
left, src, w, edges);
left++;
src += PXSTRIDE(stride);
sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
w, 2, params->sgr.w0, params->sgr.w1
HIGHBD_TAIL_SUFFIX);
} while (--h > 0);
if (!(edges & LR_HAVE_BOTTOM))
goto vert_2;
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
sumsq5_ptrs[3], sum5_ptrs[3],
NULL, lpf_bottom, w, edges);
lpf_bottom += PXSTRIDE(stride);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_4(A3_ptrs, B3_ptrs);
BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
sumsq5_ptrs[4], sum5_ptrs[4],
NULL, lpf_bottom, w, edges);
output_2:
sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
w, 2, params->sgr.w0, params->sgr.w1
HIGHBD_TAIL_SUFFIX);
return;
vert_2:
// Duplicate the last row twice more
sumsq5_ptrs[3] = sumsq5_ptrs[2];
sumsq5_ptrs[4] = sumsq5_ptrs[2];
sum5_ptrs[3] = sum5_ptrs[2];
sum5_ptrs[4] = sum5_ptrs[2];
sumsq3_ptrs[2] = sumsq3_ptrs[1];
sum3_ptrs[2] = sum3_ptrs[1];
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_4(A3_ptrs, B3_ptrs);
sumsq3_ptrs[2] = sumsq3_ptrs[1];
sum3_ptrs[2] = sum3_ptrs[1];
goto output_2;
odd:
// Copy the last row as padding once
sumsq5_ptrs[4] = sumsq5_ptrs[3];
sum5_ptrs[4] = sum5_ptrs[3];
sumsq3_ptrs[2] = sumsq3_ptrs[1];
sum3_ptrs[2] = sum3_ptrs[1];
sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
w, 2, params->sgr.w0, params->sgr.w1
HIGHBD_TAIL_SUFFIX);
output_1:
// Duplicate the last row twice more
sumsq5_ptrs[3] = sumsq5_ptrs[2];
sumsq5_ptrs[4] = sumsq5_ptrs[2];
sum5_ptrs[3] = sum5_ptrs[2];
sum5_ptrs[4] = sum5_ptrs[2];
sumsq3_ptrs[2] = sumsq3_ptrs[1];
sum3_ptrs[2] = sum3_ptrs[1];
sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_4(A3_ptrs, B3_ptrs);
// Output only one row
sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
w, 1, params->sgr.w0, params->sgr.w1
HIGHBD_TAIL_SUFFIX);
return;
vert_1:
// Copy the last row as padding once
sumsq5_ptrs[4] = sumsq5_ptrs[3];
sum5_ptrs[4] = sum5_ptrs[3];
sumsq3_ptrs[2] = sumsq3_ptrs[1];
sum3_ptrs[2] = sum3_ptrs[1];
sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
w, params->sgr.s0, BITDEPTH_MAX);
rotate_ab_2(A5_ptrs, B5_ptrs);
sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
w, params->sgr.s1, BITDEPTH_MAX);
rotate_ab_4(A3_ptrs, B3_ptrs);
goto output_1;
}
#endif
static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
const unsigned flags = dav1d_get_cpu_flags();
View File
@ -28,6 +28,7 @@
#include "src/cpu.h"
#include "src/refmvs.h"
decl_save_tmvs_fn(dav1d_save_tmvs_neon);
decl_splat_mv_fn(dav1d_splat_mv_neon);
static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
@ -35,5 +36,6 @@ static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
c->save_tmvs = dav1d_save_tmvs_neon;
c->splat_mv = dav1d_splat_mv_neon;
}
View File
@ -44,7 +44,7 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
validate_input_or_ret(buf != NULL, NULL);
if (sz > SIZE_MAX / 2) return NULL;
buf->ref = dav1d_ref_create(sz);
buf->ref = dav1d_ref_create(ALLOC_DAV1DDATA, sz);
if (!buf->ref) return NULL;
buf->data = buf->ref->const_data;
buf->sz = sz;
@ -65,7 +65,7 @@ int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
if (sz > SIZE_MAX / 2) return DAV1D_ERR(EINVAL);
Dav1dRef *const ref = malloc(sizeof(Dav1dRef));
Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef));
if (!ref) return DAV1D_ERR(ENOMEM);
buf->ref = dav1d_ref_init(ref, ptr, free_callback, cookie, 1);
@ -86,7 +86,7 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
Dav1dRef *const ref = malloc(sizeof(Dav1dRef));
Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef));
if (!ref) return DAV1D_ERR(ENOMEM);
buf->m.user_data.ref = dav1d_ref_init(ref, user_data, free_callback, cookie, 1);
@ -95,14 +95,13 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
return 0;
}
void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
validate_input(dst != NULL);
validate_input(dst->data == NULL);
validate_input(src != NULL);
assert(dst != NULL);
assert(dst->data == NULL);
assert(src != NULL);
if (src->ref) {
validate_input(src->data != NULL);
assert(src->data != NULL);
dav1d_ref_inc(src->ref);
}
if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
View File
@ -2932,8 +2932,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
int retval = DAV1D_ERR(ENOMEM);
if (f->sbh > f->lf.start_of_tile_row_sz) {
free(f->lf.start_of_tile_row);
f->lf.start_of_tile_row = malloc(f->sbh * sizeof(uint8_t));
dav1d_free(f->lf.start_of_tile_row);
f->lf.start_of_tile_row = dav1d_malloc(ALLOC_TILE, f->sbh * sizeof(uint8_t));
if (!f->lf.start_of_tile_row) {
f->lf.start_of_tile_row_sz = 0;
goto error;
@ -2950,24 +2950,24 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
if (n_ts != f->n_ts) {
if (c->n_fc > 1) {
freep(&f->frame_thread.tile_start_off);
dav1d_free(f->frame_thread.tile_start_off);
f->frame_thread.tile_start_off =
malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts);
dav1d_malloc(ALLOC_TILE, sizeof(*f->frame_thread.tile_start_off) * n_ts);
if (!f->frame_thread.tile_start_off) {
f->n_ts = 0;
goto error;
}
}
dav1d_free_aligned(f->ts);
f->ts = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32);
f->ts = dav1d_alloc_aligned(ALLOC_TILE, sizeof(*f->ts) * n_ts, 32);
if (!f->ts) goto error;
f->n_ts = n_ts;
}
const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1));
if (a_sz != f->a_sz) {
freep(&f->a);
f->a = malloc(sizeof(*f->a) * a_sz);
dav1d_free(f->a);
f->a = dav1d_malloc(ALLOC_TILE, sizeof(*f->a) * a_sz);
if (!f->a) {
f->a_sz = 0;
goto error;
@ -2993,9 +2993,10 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh;
if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) {
free(f->tile_thread.lowest_pixel_mem);
dav1d_free(f->tile_thread.lowest_pixel_mem);
f->tile_thread.lowest_pixel_mem =
malloc(lowest_pixel_mem_sz * sizeof(*f->tile_thread.lowest_pixel_mem));
dav1d_malloc(ALLOC_TILE, lowest_pixel_mem_sz *
sizeof(*f->tile_thread.lowest_pixel_mem));
if (!f->tile_thread.lowest_pixel_mem) {
f->tile_thread.lowest_pixel_mem_sz = 0;
goto error;
@ -3016,9 +3017,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
if (cf_sz != f->frame_thread.cf_sz) {
dav1d_freep_aligned(&f->frame_thread.cf);
dav1d_free_aligned(f->frame_thread.cf);
f->frame_thread.cf =
dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 64);
dav1d_alloc_aligned(ALLOC_COEF, (size_t)cf_sz * 128 * 128 / 2, 64);
if (!f->frame_thread.cf) {
f->frame_thread.cf_sz = 0;
goto error;
@ -3029,9 +3030,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
if (f->frame_hdr->allow_screen_content_tools) {
if (num_sb128 != f->frame_thread.pal_sz) {
dav1d_freep_aligned(&f->frame_thread.pal);
dav1d_free_aligned(f->frame_thread.pal);
f->frame_thread.pal =
dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) *
dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) *
num_sb128 * 16 * 16, 64);
if (!f->frame_thread.pal) {
f->frame_thread.pal_sz = 0;
@ -3042,9 +3043,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
const int pal_idx_sz = num_sb128 * size_mul[1];
if (pal_idx_sz != f->frame_thread.pal_idx_sz) {
dav1d_freep_aligned(&f->frame_thread.pal_idx);
dav1d_free_aligned(f->frame_thread.pal_idx);
f->frame_thread.pal_idx =
dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) *
dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
pal_idx_sz * 128 * 128 / 4, 64);
if (!f->frame_thread.pal_idx) {
f->frame_thread.pal_idx_sz = 0;
@ -3072,7 +3073,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
size_t alloc_sz = 64;
alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh << need_cdef_lpf_copy;
alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh << need_cdef_lpf_copy;
uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32);
uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(ALLOC_CDEF, alloc_sz, 32);
if (!ptr) {
f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0;
goto error;
@ -3132,7 +3133,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
size_t alloc_sz = 128;
alloc_sz += (size_t)llabs(y_stride) * num_lines;
alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2;
uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(alloc_sz, 64);
uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(ALLOC_LR, alloc_sz, 64);
if (!ptr) {
f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0;
goto error;
@ -3158,23 +3159,23 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
// update allocation for loopfilter masks
if (num_sb128 != f->lf.mask_sz) {
freep(&f->lf.mask);
freep(&f->lf.level);
f->lf.mask = malloc(sizeof(*f->lf.mask) * num_sb128);
dav1d_free(f->lf.mask);
dav1d_free(f->lf.level);
f->lf.mask = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.mask) * num_sb128);
// over-allocate by 3 bytes since some of the SIMD implementations
// index this from the level type and can thus over-read by up to 3
f->lf.level = malloc(sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
f->lf.level = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
if (!f->lf.mask || !f->lf.level) {
f->lf.mask_sz = 0;
goto error;
}
if (c->n_fc > 1) {
freep(&f->frame_thread.b);
freep(&f->frame_thread.cbi);
f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) *
num_sb128 * 32 * 32);
f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) *
num_sb128 * 32 * 32);
dav1d_free(f->frame_thread.b);
dav1d_free(f->frame_thread.cbi);
f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) *
num_sb128 * 32 * 32);
f->frame_thread.cbi = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
num_sb128 * 32 * 32);
if (!f->frame_thread.b || !f->frame_thread.cbi) {
f->lf.mask_sz = 0;
goto error;
@ -3186,8 +3187,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;
const int lr_mask_sz = f->sr_sb128w * f->sb128h;
if (lr_mask_sz != f->lf.lr_mask_sz) {
freep(&f->lf.lr_mask);
f->lf.lr_mask = malloc(sizeof(*f->lf.lr_mask) * lr_mask_sz);
dav1d_free(f->lf.lr_mask);
f->lf.lr_mask = dav1d_malloc(ALLOC_LR, sizeof(*f->lf.lr_mask) * lr_mask_sz);
if (!f->lf.lr_mask) {
f->lf.lr_mask_sz = 0;
goto error;
@ -3207,9 +3208,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
const int ipred_edge_sz = f->sbh * f->sb128w << hbd;
if (ipred_edge_sz != f->ipred_edge_sz) {
dav1d_freep_aligned(&f->ipred_edge[0]);
dav1d_free_aligned(f->ipred_edge[0]);
uint8_t *ptr = f->ipred_edge[0] =
dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 64);
dav1d_alloc_aligned(ALLOC_IPRED, ipred_edge_sz * 128 * 3, 64);
if (!ptr) {
f->ipred_edge_sz = 0;
goto error;
@ -3221,8 +3222,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
const int re_sz = f->sb128h * f->frame_hdr->tiling.cols;
if (re_sz != f->lf.re_sz) {
freep(&f->lf.tx_lpf_right_edge[0]);
f->lf.tx_lpf_right_edge[0] = malloc(re_sz * 32 * 2);
dav1d_free(f->lf.tx_lpf_right_edge[0]);
f->lf.tx_lpf_right_edge[0] = dav1d_malloc(ALLOC_LF, re_sz * 32 * 2);
if (!f->lf.tx_lpf_right_edge[0]) {
f->lf.re_sz = 0;
goto error;
@ -3656,9 +3657,9 @@ int dav1d_submit_frame(Dav1dContext *const c) {
// FIXME qsort so tiles are in order (for frame threading)
if (f->n_tile_data_alloc < c->n_tile_data) {
freep(&f->tile);
dav1d_free(f->tile);
assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile));
f->tile = malloc(c->n_tile_data * sizeof(*f->tile));
f->tile = dav1d_malloc(ALLOC_TILE, c->n_tile_data * sizeof(*f->tile));
if (!f->tile) {
f->n_tile_data_alloc = f->n_tile_data = 0;
res = DAV1D_ERR(ENOMEM);

View File

@ -63,6 +63,12 @@ COLD const char *dav1d_version(void) {
return DAV1D_VERSION;
}
COLD unsigned dav1d_version_api(void) {
return (DAV1D_API_VERSION_MAJOR << 16) |
(DAV1D_API_VERSION_MINOR << 8) |
(DAV1D_API_VERSION_PATCH << 0);
}
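For illustration only, a caller could unpack the packed value returned by the new dav1d_version_api() like this (the variable names are hypothetical, not part of the library):
    const unsigned v = dav1d_version_api();
    const int api_major = (v >> 16) & 0xff; /* DAV1D_API_VERSION_MAJOR */
    const int api_minor = (v >>  8) & 0xff; /* DAV1D_API_VERSION_MINOR */
    const int api_patch = (v >>  0) & 0xff; /* DAV1D_API_VERSION_PATCH */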
COLD void dav1d_default_settings(Dav1dSettings *const s) {
s->n_threads = 0;
s->max_frame_delay = 0;
@ -155,7 +161,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
pthread_attr_setstacksize(&thread_attr, stack_size);
Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 64);
Dav1dContext *const c = *c_out = dav1d_alloc_aligned(ALLOC_COMMON_CTX, sizeof(*c), 64);
if (!c) goto error;
memset(c, 0, sizeof(*c));
@ -172,12 +178,12 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
dav1d_data_props_set_defaults(&c->cached_error_props);
if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
dav1d_mem_pool_init(&c->frame_hdr_pool) ||
dav1d_mem_pool_init(&c->segmap_pool) ||
dav1d_mem_pool_init(&c->refmvs_pool) ||
dav1d_mem_pool_init(&c->pic_ctx_pool) ||
dav1d_mem_pool_init(&c->cdf_pool))
if (dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->seq_hdr_pool) ||
dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->frame_hdr_pool) ||
dav1d_mem_pool_init(ALLOC_SEGMAP, &c->segmap_pool) ||
dav1d_mem_pool_init(ALLOC_REFMVS, &c->refmvs_pool) ||
dav1d_mem_pool_init(ALLOC_PIC_CTX, &c->pic_ctx_pool) ||
dav1d_mem_pool_init(ALLOC_CDF, &c->cdf_pool))
{
goto error;
}
@ -186,7 +192,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
c->allocator.release_picture_callback == dav1d_default_picture_release)
{
if (c->allocator.cookie) goto error;
if (dav1d_mem_pool_init(&c->picture_pool)) goto error;
if (dav1d_mem_pool_init(ALLOC_PIC, &c->picture_pool)) goto error;
c->allocator.cookie = c->picture_pool;
} else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc ||
c->allocator.release_picture_callback == dav1d_default_picture_release)
@ -210,11 +216,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
get_num_threads(c, s, &c->n_tc, &c->n_fc);
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32);
c->fc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->fc) * c->n_fc, 32);
if (!c->fc) goto error;
memset(c->fc, 0, sizeof(*c->fc) * c->n_fc);
c->tc = dav1d_alloc_aligned(sizeof(*c->tc) * c->n_tc, 64);
c->tc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->tc) * c->n_tc, 64);
if (!c->tc) goto error;
memset(c->tc, 0, sizeof(*c->tc) * c->n_tc);
if (c->n_tc > 1) {
@ -235,9 +241,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
}
if (c->n_fc > 1) {
const size_t out_delayed_sz = sizeof(*c->frame_thread.out_delayed) * c->n_fc;
c->frame_thread.out_delayed =
calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
dav1d_malloc(ALLOC_THREAD_CTX, out_delayed_sz);
if (!c->frame_thread.out_delayed) goto error;
memset(c->frame_thread.out_delayed, 0, out_delayed_sz);
}
for (unsigned n = 0; n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];
@ -592,6 +600,9 @@ void dav1d_flush(Dav1dContext *const c) {
COLD void dav1d_close(Dav1dContext **const c_out) {
validate_input(c_out != NULL);
#if TRACK_HEAP_ALLOCATIONS
dav1d_log_alloc_stats(*c_out);
#endif
close_internal(c_out, 1);
}
@ -628,31 +639,31 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
// clean-up threading stuff
if (c->n_fc > 1) {
freep(&f->tile_thread.lowest_pixel_mem);
freep(&f->frame_thread.b);
dav1d_freep_aligned(&f->frame_thread.pal_idx);
dav1d_freep_aligned(&f->frame_thread.cf);
freep(&f->frame_thread.tile_start_off);
dav1d_freep_aligned(&f->frame_thread.pal);
freep(&f->frame_thread.cbi);
dav1d_free(f->tile_thread.lowest_pixel_mem);
dav1d_free(f->frame_thread.b);
dav1d_free_aligned(f->frame_thread.pal_idx);
dav1d_free_aligned(f->frame_thread.cf);
dav1d_free(f->frame_thread.tile_start_off);
dav1d_free_aligned(f->frame_thread.pal);
dav1d_free(f->frame_thread.cbi);
}
if (c->n_tc > 1) {
pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
pthread_cond_destroy(&f->task_thread.cond);
pthread_mutex_destroy(&f->task_thread.lock);
}
freep(&f->frame_thread.frame_progress);
freep(&f->task_thread.tasks);
freep(&f->task_thread.tile_tasks[0]);
dav1d_free(f->frame_thread.frame_progress);
dav1d_free(f->task_thread.tasks);
dav1d_free(f->task_thread.tile_tasks[0]);
dav1d_free_aligned(f->ts);
dav1d_free_aligned(f->ipred_edge[0]);
free(f->a);
free(f->tile);
free(f->lf.mask);
free(f->lf.lr_mask);
free(f->lf.level);
free(f->lf.tx_lpf_right_edge[0]);
free(f->lf.start_of_tile_row);
dav1d_free(f->a);
dav1d_free(f->tile);
dav1d_free(f->lf.mask);
dav1d_free(f->lf.level);
dav1d_free(f->lf.lr_mask);
dav1d_free(f->lf.tx_lpf_right_edge[0]);
dav1d_free(f->lf.start_of_tile_row);
dav1d_refmvs_clear(&f->rf);
dav1d_free_aligned(f->lf.cdef_line_buf);
dav1d_free_aligned(f->lf.lr_line_buf);
@ -662,11 +673,11 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
for (unsigned n = 0; n < c->n_fc; n++)
if (c->frame_thread.out_delayed[n].p.frame_hdr)
dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
free(c->frame_thread.out_delayed);
dav1d_free(c->frame_thread.out_delayed);
}
for (int n = 0; n < c->n_tile_data; n++)
dav1d_data_unref_internal(&c->tile[n].data);
free(c->tile);
dav1d_free(c->tile);
for (int n = 0; n < 8; n++) {
dav1d_cdf_thread_unref(&c->cdf[n]);
if (c->refs[n].p.p.frame_hdr)

View File

@ -44,7 +44,7 @@ COLD void dav1d_log_default_callback(void *const cookie,
}
COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) {
validate_input(c != NULL);
assert(c != NULL);
if (!c->logger.callback)
return;

View File

@ -31,9 +31,208 @@
#include "src/internal.h"
#if TRACK_HEAP_ALLOCATIONS
#include <stdio.h>
#include "src/log.h"
#define DEFAULT_ALIGN 16
typedef struct {
size_t sz;
unsigned align;
enum AllocationType type;
} Dav1dAllocationData;
typedef struct {
size_t curr_sz;
size_t peak_sz;
unsigned num_allocs;
unsigned num_reuses;
} AllocStats;
static AllocStats tracked_allocs[N_ALLOC_TYPES];
static size_t curr_total_sz;
static size_t peak_total_sz;
static pthread_mutex_t track_alloc_mutex = PTHREAD_MUTEX_INITIALIZER;
static void *track_alloc(const enum AllocationType type, char *ptr,
const size_t sz, const size_t align)
{
assert(align >= sizeof(Dav1dAllocationData));
if (ptr) {
ptr += align;
Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1];
AllocStats *const s = &tracked_allocs[type];
d->sz = sz;
d->align = (unsigned)align;
d->type = type;
pthread_mutex_lock(&track_alloc_mutex);
s->num_allocs++;
s->curr_sz += sz;
if (s->curr_sz > s->peak_sz)
s->peak_sz = s->curr_sz;
curr_total_sz += sz;
if (curr_total_sz > peak_total_sz)
peak_total_sz = curr_total_sz;
pthread_mutex_unlock(&track_alloc_mutex);
}
return ptr;
}
static void *track_free(char *const ptr) {
const Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1];
const size_t sz = d->sz;
pthread_mutex_lock(&track_alloc_mutex);
tracked_allocs[d->type].curr_sz -= sz;
curr_total_sz -= sz;
pthread_mutex_unlock(&track_alloc_mutex);
return ptr - d->align;
}
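In rough terms (an illustrative reading of the tracking code above, not upstream documentation): each tracked allocation over-allocates by align bytes and stores a Dav1dAllocationData record in the bytes immediately preceding the pointer returned to the caller; track_free() reads that record to update the statistics and recovers the original base pointer as ptr - d->align.
    base pointer                              returned pointer
    |<------------ align bytes -------------->|<----- sz bytes ----->|
    | (padding)        | Dav1dAllocationData  |     caller data      |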
static void dav1d_track_reuse(const enum AllocationType type) {
pthread_mutex_lock(&track_alloc_mutex);
tracked_allocs[type].num_reuses++;
pthread_mutex_unlock(&track_alloc_mutex);
}
void *dav1d_malloc(const enum AllocationType type, const size_t sz) {
void *const ptr = malloc(sz + DEFAULT_ALIGN);
return track_alloc(type, ptr, sz, DEFAULT_ALIGN);
}
void *dav1d_alloc_aligned(const enum AllocationType type,
const size_t sz, const size_t align)
{
assert(!(align & (align - 1)));
void *ptr;
#ifdef _WIN32
ptr = _aligned_malloc(sz + align, align);
#elif defined(HAVE_POSIX_MEMALIGN)
if (posix_memalign(&ptr, align, sz + align)) return NULL;
#else
ptr = memalign(align, sz + align);
#endif
return track_alloc(type, ptr, sz, align);
}
void *dav1d_realloc(const enum AllocationType type,
void *ptr, const size_t sz)
{
if (!ptr)
return dav1d_malloc(type, sz);
ptr = realloc((char*)ptr - DEFAULT_ALIGN, sz + DEFAULT_ALIGN);
if (ptr)
ptr = track_free((char*)ptr + DEFAULT_ALIGN);
return track_alloc(type, ptr, sz, DEFAULT_ALIGN);
}
void dav1d_free(void *ptr) {
if (ptr)
free(track_free(ptr));
}
void dav1d_free_aligned(void *ptr) {
if (ptr) {
ptr = track_free(ptr);
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
}
}
static COLD int cmp_stats(const void *const a, const void *const b) {
const size_t a_sz = ((const AllocStats*)a)->peak_sz;
const size_t b_sz = ((const AllocStats*)b)->peak_sz;
return a_sz < b_sz ? -1 : a_sz > b_sz;
}
/* Insert spaces as thousands separators for better readability */
static COLD int format_tsep(char *const s, const size_t n, const size_t value) {
if (value < 1000)
return snprintf(s, n, "%u", (unsigned)value);
const int len = format_tsep(s, n, value / 1000);
assert((size_t)len < n);
return len + snprintf(s + len, n - len, " %03u", (unsigned)(value % 1000));
}
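For example, format_tsep(buf, sizeof(buf), 1234567) recurses once per group of three digits, writes "1 234 567" into buf, and returns its length, 9.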
COLD void dav1d_log_alloc_stats(Dav1dContext *const c) {
static const char *const type_names[N_ALLOC_TYPES] = {
[ALLOC_BLOCK ] = "Block data",
[ALLOC_CDEF ] = "CDEF line buffers",
[ALLOC_CDF ] = "CDF contexts",
[ALLOC_COEF ] = "Coefficient data",
[ALLOC_COMMON_CTX] = "Common context data",
[ALLOC_DAV1DDATA ] = "Dav1dData",
[ALLOC_IPRED ] = "Intra pred edges",
[ALLOC_LF ] = "Loopfilter data",
[ALLOC_LR ] = "Looprestoration data",
[ALLOC_OBU_HDR ] = "OBU headers",
[ALLOC_OBU_META ] = "OBU metadata",
[ALLOC_PAL ] = "Palette data",
[ALLOC_PIC ] = "Picture buffers",
[ALLOC_PIC_CTX ] = "Picture context data",
[ALLOC_REFMVS ] = "Reference mv data",
[ALLOC_SEGMAP ] = "Segmentation maps",
[ALLOC_THREAD_CTX] = "Thread context data",
[ALLOC_TILE ] = "Tile data",
};
struct {
AllocStats stats;
enum AllocationType type;
} data[N_ALLOC_TYPES];
unsigned total_allocs = 0;
unsigned total_reuses = 0;
pthread_mutex_lock(&track_alloc_mutex);
for (int i = 0; i < N_ALLOC_TYPES; i++) {
AllocStats *const s = &data[i].stats;
*s = tracked_allocs[i];
data[i].type = i;
total_allocs += s->num_allocs;
total_reuses += s->num_reuses;
}
size_t total_sz = peak_total_sz;
pthread_mutex_unlock(&track_alloc_mutex);
/* Sort types by memory usage */
qsort(&data, N_ALLOC_TYPES, sizeof(*data), cmp_stats);
const double inv_total_share = 100.0 / total_sz;
char total_sz_buf[32];
const int sz_len = 4 + format_tsep(total_sz_buf, sizeof(total_sz_buf), total_sz);
dav1d_log(c, "\n Type Allocs Reuses Share Peak size\n"
"---------------------------------------------------------------------\n");
for (int i = N_ALLOC_TYPES - 1; i >= 0; i--) {
const AllocStats *const s = &data[i].stats;
if (s->num_allocs) {
const double share = s->peak_sz * inv_total_share;
char sz_buf[32];
format_tsep(sz_buf, sizeof(sz_buf), s->peak_sz);
dav1d_log(c, " %-20s%10u%10u%8.1f%%%*s\n", type_names[data[i].type],
s->num_allocs, s->num_reuses, share, sz_len, sz_buf);
}
}
dav1d_log(c, "---------------------------------------------------------------------\n"
"%31u%10u %s\n",
total_allocs, total_reuses, total_sz_buf);
}
#endif /* TRACK_HEAP_ALLOCATIONS */
static COLD void mem_pool_destroy(Dav1dMemPool *const pool) {
pthread_mutex_destroy(&pool->lock);
free(pool);
dav1d_free(pool);
}
void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) {
@ -66,10 +265,14 @@ Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t si
dav1d_free_aligned(data);
goto alloc;
}
#if TRACK_HEAP_ALLOCATIONS
dav1d_track_reuse(pool->type);
#endif
} else {
pthread_mutex_unlock(&pool->lock);
alloc:
data = dav1d_alloc_aligned(size + sizeof(Dav1dMemPoolBuffer), 64);
data = dav1d_alloc_aligned(pool->type,
size + sizeof(Dav1dMemPoolBuffer), 64);
if (!data) {
pthread_mutex_lock(&pool->lock);
const int ref_cnt = --pool->ref_cnt;
@ -84,13 +287,19 @@ alloc:
return buf;
}
COLD int dav1d_mem_pool_init(Dav1dMemPool **const ppool) {
Dav1dMemPool *const pool = malloc(sizeof(Dav1dMemPool));
COLD int dav1d_mem_pool_init(const enum AllocationType type,
Dav1dMemPool **const ppool)
{
Dav1dMemPool *const pool = dav1d_malloc(ALLOC_COMMON_CTX,
sizeof(Dav1dMemPool));
if (pool) {
if (!pthread_mutex_init(&pool->lock, NULL)) {
pool->buf = NULL;
pool->ref_cnt = 1;
pool->end = 0;
#if TRACK_HEAP_ALLOCATIONS
pool->type = type;
#endif
*ppool = pool;
return 0;
}

View File

@ -28,16 +28,42 @@
#ifndef DAV1D_SRC_MEM_H
#define DAV1D_SRC_MEM_H
#define TRACK_HEAP_ALLOCATIONS 0
#include <stdlib.h>
#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN)
#include <malloc.h>
#endif
#include "dav1d/dav1d.h"
#include "common/attributes.h"
#include "src/thread.h"
enum AllocationType {
ALLOC_BLOCK,
ALLOC_CDEF,
ALLOC_CDF,
ALLOC_COEF,
ALLOC_COMMON_CTX,
ALLOC_DAV1DDATA,
ALLOC_IPRED,
ALLOC_LF,
ALLOC_LR,
ALLOC_OBU_HDR,
ALLOC_OBU_META,
ALLOC_PAL,
ALLOC_PIC,
ALLOC_PIC_CTX,
ALLOC_REFMVS,
ALLOC_SEGMAP,
ALLOC_THREAD_CTX,
ALLOC_TILE,
N_ALLOC_TYPES,
};
typedef struct Dav1dMemPoolBuffer {
void *data;
struct Dav1dMemPoolBuffer *next;
@ -48,43 +74,59 @@ typedef struct Dav1dMemPool {
Dav1dMemPoolBuffer *buf;
int ref_cnt;
int end;
#if TRACK_HEAP_ALLOCATIONS
enum AllocationType type;
#endif
} Dav1dMemPool;
void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size);
int dav1d_mem_pool_init(Dav1dMemPool **pool);
void dav1d_mem_pool_end(Dav1dMemPool *pool);
#if TRACK_HEAP_ALLOCATIONS
void *dav1d_malloc(enum AllocationType type, size_t sz);
void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz);
void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align);
void dav1d_free(void *ptr);
void dav1d_free_aligned(void *ptr);
void dav1d_log_alloc_stats(Dav1dContext *c);
#else
#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
#define dav1d_malloc(type, sz) malloc(sz)
#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
#define dav1d_free(ptr) free(ptr)
/*
* Allocate align-byte aligned memory. The return value can be released
* by calling the dav1d_free_aligned() function.
*/
static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) {
assert(!(align & (align - 1)));
#ifdef HAVE_POSIX_MEMALIGN
#ifdef _WIN32
return _aligned_malloc(sz, align);
#elif defined(HAVE_POSIX_MEMALIGN)
void *ptr;
if (posix_memalign(&ptr, align, sz)) return NULL;
return ptr;
#elif defined(HAVE_ALIGNED_MALLOC)
return _aligned_malloc(sz, align);
#elif defined(HAVE_MEMALIGN)
return memalign(align, sz);
#else
#error Missing aligned alloc implementation
return memalign(align, sz);
#endif
}
#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align)
static inline void dav1d_free_aligned(void* ptr) {
#ifdef HAVE_POSIX_MEMALIGN
free(ptr);
#elif defined(HAVE_ALIGNED_MALLOC)
static inline void dav1d_free_aligned(void *ptr) {
#ifdef _WIN32
_aligned_free(ptr);
#elif defined(HAVE_MEMALIGN)
#else
free(ptr);
#endif
}
static inline void dav1d_freep_aligned(void* ptr) {
#endif /* TRACK_HEAP_ALLOCATIONS */
void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size);
int dav1d_mem_pool_init(enum AllocationType type, Dav1dMemPool **pool);
void dav1d_mem_pool_end(Dav1dMemPool *pool);
static inline void dav1d_freep_aligned(void *ptr) {
void **mem = (void **) ptr;
if (*mem) {
dav1d_free_aligned(*mem);
@ -92,12 +134,4 @@ static inline void dav1d_freep_aligned(void* ptr) {
}
}
static inline void freep(void *ptr) {
void **mem = (void **) ptr;
if (*mem) {
free(*mem);
*mem = NULL;
}
}
#endif /* DAV1D_SRC_MEM_H */

View File

@ -304,7 +304,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
{
validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(sz > 0, DAV1D_ERR(EINVAL));
validate_input_or_ret(sz > 0 && sz <= SIZE_MAX / 2, DAV1D_ERR(EINVAL));
GetBits gb;
dav1d_init_get_bits(&gb, ptr, sz);
@ -609,8 +609,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
if (!hdr->frame_ref_short_signaling)
hdr->refidx[i] = dav1d_get_bits(gb, 3);
if (seqhdr->frame_id_numbers_present) {
const int delta_ref_frame_id_minus_1 = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits);
const int ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) & ((1 << seqhdr->frame_id_n_bits) - 1);
const unsigned delta_ref_frame_id = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits) + 1;
const unsigned ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id) & ((1 << seqhdr->frame_id_n_bits) - 1);
Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr;
if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error;
}
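Worked example of the wraparound above: with frame_id_n_bits = 8, frame_id = 5 and a parsed delta_ref_frame_id of 10, ref_frame_id = (5 + 256 - 10) & 255 = 251, i.e. the delta is subtracted modulo 2^frame_id_n_bits.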
@ -705,7 +705,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
goto error;
hdr->tiling.n_bytes = dav1d_get_bits(gb, 2) + 1;
} else {
hdr->tiling.n_bytes = hdr->tiling.update = 0;
hdr->tiling.n_bytes = 0;
hdr->tiling.update = 0;
}
#if DEBUG_FRAME_HDR
printf("HDR: post-tiling: off=%td\n",
@ -739,7 +740,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->quant.qm_y = dav1d_get_bits(gb, 4);
hdr->quant.qm_u = dav1d_get_bits(gb, 4);
hdr->quant.qm_v =
seqhdr->separate_uv_delta_q ? (int)dav1d_get_bits(gb, 4) :
seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 4) :
hdr->quant.qm_u;
}
#if DEBUG_FRAME_HDR
@ -1366,7 +1367,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
if (!c->frame_hdr) goto error;
if (c->n_tile_data_alloc < c->n_tile_data + 1) {
if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile));
struct Dav1dTileGroup *tile = dav1d_realloc(ALLOC_TILE, c->tile,
(c->n_tile_data + 1) * sizeof(*c->tile));
if (!tile) goto error;
c->tile = tile;
memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
@ -1406,7 +1408,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
switch (meta_type) {
case OBU_META_HDR_CLL: {
Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META,
sizeof(Dav1dContentLightLevel));
if (!ref) return DAV1D_ERR(ENOMEM);
Dav1dContentLightLevel *const content_light = ref->data;
@ -1434,7 +1437,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
break;
}
case OBU_META_HDR_MDCV: {
Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META,
sizeof(Dav1dMasteringDisplay));
if (!ref) return DAV1D_ERR(ENOMEM);
Dav1dMasteringDisplay *const mastering_display = ref->data;
@ -1503,7 +1507,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
}
if ((c->n_itut_t35 + 1) > INT_MAX / (int)sizeof(*c->itut_t35)) goto error;
struct Dav1dITUTT35 *itut_t35 = realloc(c->itut_t35, (c->n_itut_t35 + 1) * sizeof(*c->itut_t35));
struct Dav1dITUTT35 *itut_t35 = dav1d_realloc(ALLOC_OBU_META, c->itut_t35,
(c->n_itut_t35 + 1) * sizeof(*c->itut_t35));
if (!itut_t35) goto error;
c->itut_t35 = itut_t35;
memset(c->itut_t35 + c->n_itut_t35, 0, sizeof(*c->itut_t35));
@ -1511,7 +1516,7 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
struct itut_t35_ctx_context *itut_t35_ctx;
if (!c->n_itut_t35) {
assert(!c->itut_t35_ref);
itut_t35_ctx = malloc(sizeof(struct itut_t35_ctx_context));
itut_t35_ctx = dav1d_malloc(ALLOC_OBU_META, sizeof(struct itut_t35_ctx_context));
if (!itut_t35_ctx) goto error;
c->itut_t35_ref = dav1d_ref_init(&itut_t35_ctx->ref, c->itut_t35,
dav1d_picture_free_itut_t35, itut_t35_ctx, 0);
@ -1524,7 +1529,7 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
itut_t35_ctx->n_itut_t35 = c->n_itut_t35 + 1;
Dav1dITUTT35 *const itut_t35_metadata = &c->itut_t35[c->n_itut_t35];
itut_t35_metadata->payload = malloc(payload_size);
itut_t35_metadata->payload = dav1d_malloc(ALLOC_OBU_META, payload_size);
if (!itut_t35_metadata->payload) goto error;
itut_t35_metadata->country_code = country_code;

View File

@ -106,9 +106,9 @@ void dav1d_picture_free_itut_t35(const uint8_t *const data, void *const user_dat
struct itut_t35_ctx_context *itut_t35_ctx = user_data;
for (size_t i = 0; i < itut_t35_ctx->n_itut_t35; i++)
free(itut_t35_ctx->itut_t35[i].payload);
free(itut_t35_ctx->itut_t35);
free(itut_t35_ctx);
dav1d_free(itut_t35_ctx->itut_t35[i].payload);
dav1d_free(itut_t35_ctx->itut_t35);
dav1d_free(itut_t35_ctx);
}
static int picture_alloc_with_edges(Dav1dContext *const c,
@ -249,12 +249,12 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con
}
void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
validate_input(dst != NULL);
validate_input(dst->data[0] == NULL);
validate_input(src != NULL);
assert(dst != NULL);
assert(dst->data[0] == NULL);
assert(src != NULL);
if (src->ref) {
validate_input(src->data[0] != NULL);
assert(src->data[0] != NULL);
dav1d_ref_inc(src->ref);
}
if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
@ -267,12 +267,12 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
}
void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
validate_input(dst != NULL);
validate_input(dst->data[0] == NULL);
validate_input(src != NULL);
assert(dst != NULL);
assert(dst->data[0] == NULL);
assert(src != NULL);
if (src->ref)
validate_input(src->data[0] != NULL);
assert(src->data[0] != NULL);
*dst = *src;
memset(src, 0, sizeof(*src));

View File

@ -34,10 +34,10 @@ static void default_free_callback(const uint8_t *const data, void *const user_da
dav1d_free_aligned(user_data);
}
Dav1dRef *dav1d_ref_create(size_t size) {
Dav1dRef *dav1d_ref_create(const enum AllocationType type, size_t size) {
size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
uint8_t *const data = dav1d_alloc_aligned(size + sizeof(Dav1dRef), 64);
uint8_t *const data = dav1d_alloc_aligned(type, size + sizeof(Dav1dRef), 64);
if (!data) return NULL;
Dav1dRef *const res = (Dav1dRef*)(data + size);
@ -81,6 +81,6 @@ void dav1d_ref_dec(Dav1dRef **const pref) {
if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
const int free_ref = ref->free_ref;
ref->free_callback(ref->const_data, ref->user_data);
if (free_ref) free(ref);
if (free_ref) dav1d_free(ref);
}
}

View File

@ -45,7 +45,11 @@ struct Dav1dRef {
void *user_data;
};
Dav1dRef *dav1d_ref_create(size_t size);
#if !TRACK_HEAP_ALLOCATIONS
#define dav1d_ref_create(type, size) dav1d_ref_create(size)
#endif
Dav1dRef *dav1d_ref_create(enum AllocationType type, size_t size);
Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size);
void dav1d_ref_dec(Dav1dRef **ref);

View File

@ -817,7 +817,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
if (rf->r) dav1d_freep_aligned(&rf->r);
const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
rf->r = dav1d_alloc_aligned(sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
if (!rf->r) return DAV1D_ERR(ENOMEM);
rf->r_stride = r_stride;
}
@ -825,7 +825,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
const ptrdiff_t rp_stride = r_stride >> 1;
if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
rf->rp_proj = dav1d_alloc_aligned(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
rf->rp_proj = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
rf->rp_stride = rp_stride;
}

View File

@ -33,6 +33,7 @@
#include <limits.h>
#include <windows.h>
#define PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT
#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT
typedef struct {

View File

@ -224,7 +224,7 @@ static int create_filter_sbrow(Dav1dFrameContext *const f,
int num_tasks = f->sbh * (1 + uses_2pass);
if (num_tasks > f->task_thread.num_tasks) {
const size_t size = sizeof(Dav1dTask) * num_tasks;
tasks = realloc(f->task_thread.tasks, size);
tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tasks, size);
if (!tasks) return -1;
memset(tasks, 0, size);
f->task_thread.tasks = tasks;
@ -237,8 +237,8 @@ static int create_filter_sbrow(Dav1dFrameContext *const f,
} else {
const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
if (prog_sz > f->frame_thread.prog_sz) {
atomic_uint *const prog = realloc(f->frame_thread.frame_progress,
2 * prog_sz * sizeof(*prog));
atomic_uint *const prog = dav1d_realloc(ALLOC_COMMON_CTX, f->frame_thread.frame_progress,
2 * prog_sz * sizeof(*prog));
if (!prog) return -1;
f->frame_thread.frame_progress = prog;
f->frame_thread.copy_lpf_progress = prog + prog_sz;
@ -275,7 +275,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
int alloc_num_tasks = num_tasks * (1 + uses_2pass);
if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
tasks = realloc(f->task_thread.tile_tasks[0], size);
tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tile_tasks[0], size);
if (!tasks) return -1;
memset(tasks, 0, size);
f->task_thread.tile_tasks[0] = tasks;

View File

@ -47,6 +47,10 @@ SECTION_RODATA 64
%endmacro
%if ARCH_X86_64
mv_proj: dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092
dw 1024, 963, 910, 862, 819, 780, 744, 712
dw 682, 655, 630, 606, 585, 564, 546, 528
splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7
db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
@ -61,6 +65,7 @@ cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3
save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128: times 16 db 128
pq_8192: dq 8192
save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
SAVE_TMVS_TABLE 4, 8, ssse3
@ -329,6 +334,225 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
RET
%if ARCH_X86_64
INIT_XMM sse4
; refmvs_frame *rf, int tile_row_idx,
; int col_start8, int col_end8, int row_start8, int row_end8
cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
stride, rp_proj, roff, troff, \
xendi, xstarti, iw8, ih8, dst
xor r14d, r14d
cmp dword [rfq+212], 1 ; n_tile_threads
mov ih8d, [rfq+20] ; rf->ih8
mov iw8d, [rfq+16] ; rf->iw8
mov xstartd, xstartd
mov xendd, xendd
cmove tridxd, r14d
lea xstartid, [xstartq-8]
lea xendid, [xendq+8]
mov strideq, [rfq+184]
mov rp_projq, [rfq+176]
cmp ih8d, yendd
mov [rsp+0x30], strideq
cmovs yendd, ih8d
test xstartid, xstartid
cmovs xstartid, r14d
cmp iw8d, xendid
cmovs xendid, iw8d
mov troffq, strideq
shl troffq, 4
imul troffq, tridxq
mov dstd, ystartd
and dstd, 15
imul dstq, strideq
add dstq, troffq ; (16 * tridx + (ystart & 15)) * stride
lea dstq, [dstq*5]
add dstq, rp_projq
lea troffq, [troffq*5] ; 16 * tridx * stride * 5
lea r13d, [xendq*5]
lea r12, [strideq*5]
DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
_, troff, xendi, xstarti, stride5, _, dst
lea w5d, [xstartq*5]
add r7, troffq ; rp_proj + tile_row_offset
mov hd, yendd
mov [rsp+0x28], r7
add dstq, r13
sub w5q, r13
sub hd, ystartd
.init_xloop_start:
mov x5q, w5q
test w5b, 1
jz .init_2blk
mov dword [dstq+x5q], 0x80008000
add x5q, 5
jz .init_next_row
.init_2blk:
mov dword [dstq+x5q+0], 0x80008000
mov dword [dstq+x5q+5], 0x80008000
add x5q, 10
jl .init_2blk
.init_next_row:
add dstq, stride5q
dec hd
jg .init_xloop_start
DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
_, _, xendi, xstarti, stride5, _, n
mov r13d, [rfq+152] ; rf->n_mfmvs
test r13d, r13d
jz .ret
mov [rsp+0x0c], r13d
mov strideq, [rsp+0x30]
movddup m3, [pq_8192]
mov r9d, ystartd
mov [rsp+0x38], yendd
mov [rsp+0x20], xstartid
xor nd, nd
xor n7d, n7d
imul r9, strideq ; ystart * stride
mov [rsp+0x48], rfq
mov [rsp+0x18], stride5q
lea r7, [r9*5]
mov [rsp+0x24], ystartd
mov [rsp+0x00], r7
.nloop:
DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
ref, rp_ref, xendi, xstarti, _, _, n
mov rfq, [rsp+0x48]
mov refd, [rfq+56+nq*4] ; ref2cur
cmp refd, 0x80000000
je .next_n
mov [rsp+0x40], refd
mov offq, [rsp+0x00] ; ystart * stride * 5
movzx refd, byte [rfq+53+nq] ; rf->mfmv_ref[n]
lea refsignq, [refq-4]
mov rp_refq, [rfq+168]
movq m2, refsignq
add offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset
mov [rsp+0x14], nd
mov yd, ystartd
.yloop:
mov r11d, [rsp+0x24] ; ystart
mov r12d, [rsp+0x38] ; yend
mov r14d, yd
and r14d, ~7 ; y_sb_align
cmp r11d, r14d
cmovs r11d, r14d ; imax(y_sb_align, ystart)
mov [rsp+0x44], r11d ; y_proj_start
add r14d, 8
cmp r12d, r14d
cmovs r14d, r12d ; imin(y_sb_align + 8, yend)
mov [rsp+0x3c], r14d ; y_proj_end
DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
ref, x, xendi, mvx, mvy, rb, ref2ref
mov xd, [rsp+0x20] ; xstarti
.xloop:
lea rbd, [xq*5]
add rbq, srcq
movsx refd, byte [rbq+4]
test refd, refd
jz .next_x_bad_ref
mov rfq, [rsp+0x48]
lea r14d, [16+n7q+refq]
mov ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1]
test ref2refd, ref2refd
jz .next_x_bad_ref
lea fracq, [mv_proj]
movzx fracd, word [fracq+ref2refq*2]
mov mvd, [rbq]
imul fracd, [rsp+0x40] ; ref2cur
pmovsxwq m0, [rbq]
movd m1, fracd
punpcklqdq m1, m1
pmuldq m0, m1 ; mv * frac
pshufd m1, m0, q3311
paddd m0, m3
paddd m0, m1
psrad m0, 14 ; offset = (xy + (xy >> 31) + 8192) >> 14
pabsd m1, m0
packssdw m0, m0
psrld m1, 6
packuswb m1, m1
pxor m0, m2 ; offset ^ ref_sign
psignd m1, m0 ; apply_sign(abs(offset) >> 6, offset ^ refsign)
movq mvxq, m1
lea mvyd, [mvxq+yq] ; ypos
sar mvxq, 32
DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
ref, x, xendi, mvx, ypos, rb, ref2ref
cmp yposd, [rsp+0x44] ; y_proj_start
jl .next_x_bad_pos_y
cmp yposd, [rsp+0x3c] ; y_proj_end
jge .next_x_bad_pos_y
and yposd, 15
add mvxq, xq ; xpos
imul yposq, [rsp+0x30] ; pos = (ypos & 15) * stride
DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
ref, x, xendi, xpos, pos, rb, ref2ref
mov dstq, [rsp+0x28] ; dst = rp_proj + tile_row_offset
add posq, xposq ; pos += xpos
lea posq, [posq*5]
add dstq, posq ; dst += pos5
jmp .write_loop_entry
.write_loop:
add rbq, 5
cmp refb, byte [rbq+4]
jne .xloop
cmp mvd, [rbq]
jne .xloop
add dstq, 5
inc xposd
.write_loop_entry:
mov r12d, xd
and r12d, ~7
lea r5d, [r12-8]
cmp r5d, xstartd
cmovs r5d, xstartd ; x_proj_start
cmp xposd, r5d
jl .next_xpos
add r12d, 16
cmp xendd, r12d
cmovs r12d, xendd ; x_proj_end
cmp xposd, r12d
jge .next_xpos
mov [dstq+0], mvd
mov byte [dstq+4], ref2refb
.next_xpos:
inc xd
cmp xd, xendid
jl .write_loop
.next_y:
DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
add srcq, [rsp+0x18] ; stride5
inc yd
cmp yd, [rsp+0x38] ; yend
jne .yloop
mov nd, [rsp+0x14]
mov ystartd, [rsp+0x24]
.next_n:
add n7d, 7
inc nd
cmp nd, [rsp+0x0c] ; n_mfmvs
jne .nloop
.ret:
RET
.next_x:
DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
add rbq, 5
cmp refb, byte [rbq+4]
jne .xloop
cmp mvd, [rbq]
jne .xloop
.next_x_bad_pos_y:
inc xd
cmp xd, xendid
jl .next_x
jmp .next_y
.next_x_bad_ref:
inc xd
cmp xd, xendid
jl .xloop
jmp .next_y
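A sketch of the projection math in the SSE4 load_tmvs above, inferred from the code rather than stated by it: mv_proj[d] appears to hold floor(16384 / d), so with frac = mv_proj[ref2ref] * ref2cur the value offset = (xy + (xy >> 31) + 8192) >> 14 (where xy = mv * frac) approximates mv * ref2cur / ref2ref with rounding. For example, mv = 64, ref2cur = 2, ref2ref = 4 gives frac = 4096 * 2 = 8192 and offset = (524288 + 0 + 8192) >> 14 = 32, matching 64 * 2 / 4.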
INIT_YMM avx2
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,

View File

@ -28,6 +28,8 @@
#include "src/cpu.h"
#include "src/refmvs.h"
decl_load_tmvs_fn(dav1d_load_tmvs_sse4);
decl_save_tmvs_fn(dav1d_save_tmvs_ssse3);
decl_save_tmvs_fn(dav1d_save_tmvs_avx2);
decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl);
@ -47,7 +49,10 @@ static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
c->save_tmvs = dav1d_save_tmvs_ssse3;
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if ARCH_X86_64
c->load_tmvs = dav1d_load_tmvs_sse4;
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
c->save_tmvs = dav1d_save_tmvs_avx2;

View File

@ -39,6 +39,190 @@ static inline int gen_mv(const int total_bits, int spel_bits) {
return rnd() & 1 ? -bits : bits;
}
#define ARRAY_SIZE(n) (sizeof(n)/sizeof(*(n)))
static inline int get_min_mv_val(const int idx) {
if (idx <= 9) return idx;
else if (idx <= 18) return (idx - 9) * 10;
else if (idx <= 27) return (idx - 18) * 100;
else if (idx <= 36) return (idx - 27) * 1000;
else return (idx - 36) * 10000;
}
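For reference, this maps an index to a bucket lower bound that grows by decades: index 5 -> 5, 12 -> 30, 20 -> 200, 30 -> 3000, 40 -> 40000; gen_tmv() below uses get_min_mv_val(i) and get_min_mv_val(i + 1) as the [min, max) range when drawing a motion-vector magnitude.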
static inline void gen_tmv(refmvs_temporal_block *const rb, const int *ref2ref) {
rb->ref = rnd() % 7;
if (!rb->ref) return;
static const int x_prob[] = {
26447556, 6800591, 3708783, 2198592, 1635940, 1145901, 1052602, 1261759,
1099739, 755108, 6075404, 4355916, 3254908, 2897157, 2273676, 2154432,
1937436, 1694818, 1466863, 10203087, 5241546, 3328819, 2187483, 1458997,
1030842, 806863, 587219, 525024, 1858953, 422368, 114626, 16992
};
static const int y_prob[] = {
33845001, 7591218, 6425971, 4115838, 4032161, 2515962, 2614601, 2343656,
2898897, 1397254, 10125350, 5124449, 3232914, 2185499, 1608775, 1342585,
980208, 795714, 649665, 3369250, 1298716, 486002, 279588, 235990,
110318, 89372, 66895, 46980, 153322, 32960, 4500, 389
};
const int prob = rnd() % 100000000;
int acc = 0;
for (unsigned i = 0; i < ARRAY_SIZE(x_prob); i++) {
acc += x_prob[i];
if (prob < acc) {
const int min = get_min_mv_val(i);
const int max = get_min_mv_val(i + 1);
const int val = min + rnd() % (max - min);
rb->mv.x = iclip(val * ref2ref[rb->ref], -(1 << 15), (1 << 15) - 1);
break;
}
}
acc = 0;
for (unsigned i = 0; i < ARRAY_SIZE(y_prob); i++) {
acc += y_prob[i];
if (prob < acc) {
const int min = get_min_mv_val(i);
const int max = get_min_mv_val(i + 1);
const int val = min + rnd() % (max - min);
rb->mv.y = iclip(val * ref2ref[rb->ref], -(1 << 15), (1 << 15) - 1);
break;
}
}
}
static inline int get_ref2cur(void) {
const int prob = rnd() % 100;
static const uint8_t ref2cur[11] = { 35, 55, 67, 73, 78, 83, 84, 87, 90, 93, 100 };
for (int i = 0; i < 11; i++)
if (prob < ref2cur[i])
return rnd() & 1 ? -(i + 1) : i + 1;
return 0;
}
static inline int get_seqlen(void) {
int len = 0, max_len;
const int prob = rnd() % 100000;
// =1 =2 =3 =4 <8 =8 <16 =16 <32 =32 <48 =48 <64 =64 >64 eq240
// 5 17 1.5 16 5 10 5 7 4 3 1.5 2 1 2 20 15 chimera blocks
// 25 38 2.5 19 3.5 5.5 2 1.87 .86 .4 .18 .2 .067 .165 .478 .28 chimera sequences
if (prob < 25000) len = 1; // =1 5%
else if (prob < 63000) len = 2; // =2 17%
else if (prob < 65500) len = 3; // =3 1.5%
else if (prob < 84500) len = 4; // =4 16%
else if (prob < 88000) max_len = 7; // <8 5% (43.5% tot <8)
else if (prob < 93500) len = 8; // =8 10%
else if (prob < 95500) max_len = 15; // <16 5%
else if (prob < 97370) len = 16; // =16 7%
else if (prob < 98230) max_len = 31; // <32 4%
else if (prob < 98630) len = 32; // =32 3%
else if (prob < 98810) max_len = 47; // <48 1.5%
else if (prob < 99010) len = 48; // =48 2%
else if (prob < 99077) max_len = 63; // <64 1%
else if (prob < 99242) len = 64; // =64 2%
else if (prob < 99720) max_len = 239; // <240 5%
else len = 240; // =240 15%
if (!len) len = 1 + rnd() % max_len;
return len;
}
static inline void init_rp_ref(refmvs_frame const *const rf,
const int col_start8, const int col_end8,
const int row_start8, const int row_end8)
{
const int col_start8i = imax(col_start8 - 8, 0);
const int col_end8i = imin(col_end8 + 8, rf->iw8);
for (int n = 0; n < rf->n_mfmvs; n++) {
refmvs_temporal_block *rp_ref = rf->rp_ref[rf->mfmv_ref[n]];
for (int i = row_start8; i < imin(row_end8, rf->ih8); i++) {
for (int j = col_start8i; j < col_end8i;) {
refmvs_temporal_block rb;
gen_tmv(&rb, rf->mfmv_ref2ref[n]);
for (int k = get_seqlen(); k && j < col_end8i; k--, j++)
rp_ref[i * rf->iw8 + j] = rb;
}
}
}
}
static void check_load_tmvs(const Dav1dRefmvsDSPContext *const c) {
refmvs_temporal_block *rp_ref[7] = {0};
refmvs_temporal_block c_rp_proj[240 * 63];
refmvs_temporal_block a_rp_proj[240 * 63];
refmvs_frame rf = {
.rp_ref = rp_ref,
.rp_stride = 240, .iw8 = 240, .ih8 = 63,
.n_mfmvs = 3
};
const size_t rp_ref_sz = rf.ih8 * rf.rp_stride * sizeof(refmvs_temporal_block);
declare_func(void, const refmvs_frame *rf, int tile_row_idx,
int col_start8, int col_end8, int row_start8, int row_end8);
if (check_func(c->load_tmvs, "load_tmvs")) {
const int row_start8 = (rnd() & 3) << 4;
const int row_end8 = row_start8 + 16;
const int col_start8 = rnd() & 31;
const int col_end8 = rf.iw8 - (rnd() & 31);
for (int n = 0; n < rf.n_mfmvs; n++) {
rf.mfmv_ref[n] = rnd() % 7;
rf.mfmv_ref2cur[n] = get_ref2cur();
for (int r = 0; r < 7; r++)
rf.mfmv_ref2ref[n][r] = rnd() & 31;
}
for (int n = 0; n < rf.n_mfmvs; n++) {
refmvs_temporal_block **p_rp_ref = &rp_ref[rf.mfmv_ref[n]];
if (!*p_rp_ref)
*p_rp_ref = malloc(rp_ref_sz);
}
init_rp_ref(&rf, 0, rf.iw8, row_start8, row_end8);
for (int i = 0; i < rf.iw8 * rf.ih8; i++) {
c_rp_proj[i].mv.n = a_rp_proj[i].mv.n = 0xdeadbeef;
c_rp_proj[i].ref = a_rp_proj[i].ref = 0xdd;
}
rf.n_tile_threads = 1;
rf.rp_proj = c_rp_proj;
call_ref(&rf, 0, col_start8, col_end8, row_start8, row_end8);
rf.rp_proj = a_rp_proj;
call_new(&rf, 0, col_start8, col_end8, row_start8, row_end8);
for (int i = 0; i < rf.ih8; i++)
for (int j = 0; j < rf.iw8; j++)
if (c_rp_proj[i * rf.iw8 + j].mv.n != a_rp_proj[i * rf.iw8 + j].mv.n ||
(c_rp_proj[i * rf.iw8 + j].ref != a_rp_proj[i * rf.iw8 + j].ref &&
c_rp_proj[i * rf.iw8 + j].mv.n != INVALID_MV))
{
if (fail()) {
fprintf(stderr, "[%d][%d] c_rp.mv.x = 0x%x a_rp.mv.x = 0x%x\n",
i, j, c_rp_proj[i * rf.iw8 + j].mv.x, a_rp_proj[i * rf.iw8 + j].mv.x);
fprintf(stderr, "[%d][%d] c_rp.mv.y = 0x%x a_rp.mv.y = 0x%x\n",
i, j, c_rp_proj[i * rf.iw8 + j].mv.y, a_rp_proj[i * rf.iw8 + j].mv.y);
fprintf(stderr, "[%d][%d] c_rp.ref = %u a_rp.ref = %u\n",
i, j, c_rp_proj[i * rf.iw8 + j].ref, a_rp_proj[i * rf.iw8 + j].ref);
}
}
if (checkasm_bench_func()) {
for (int n = 0; n < rf.n_mfmvs; n++) {
rf.mfmv_ref2cur[n] = 1;
for (int r = 0; r < 7; r++)
rf.mfmv_ref2ref[n][r] = 1;
}
bench_new(&rf, 0, 0, rf.iw8, row_start8, row_end8);
}
for (int n = 0; n < rf.n_mfmvs; n++) {
free(rp_ref[rf.mfmv_ref[n]]);
rp_ref[rf.mfmv_ref[n]] = NULL;
}
}
report("load_tmvs");
}
static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) {
refmvs_block *rr[31];
refmvs_block r[31 * 256];
@ -162,6 +346,7 @@ void checkasm_check_refmvs(void) {
Dav1dRefmvsDSPContext c;
dav1d_refmvs_dsp_init(&c);
check_load_tmvs(&c);
check_save_tmvs(&c);
check_splat_mv(&c);
}