Mirror of https://github.com/mozilla/gecko-dev.git (synced 2024-11-23 21:01:08 +00:00)

Bug 1841624 - Update dav1d to 616bfd1506a8a75c6a358e578cbec9ca11931502 r=chunmin

Differential Revision: https://phabricator.services.mozilla.com/D182716

parent 115775cd64 | commit 1f101a78be
@@ -20,11 +20,11 @@ origin:

   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 8b419c16bf1e37bc98044089da58f06824462cb9 (2023-06-02T00:00:12.000+02:00).
+  release: 616bfd1506a8a75c6a358e578cbec9ca11931502 (2023-07-01T11:36:39.000+03:00).

   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 8b419c16bf1e37bc98044089da58f06824462cb9
+  revision: 616bfd1506a8a75c6a358e578cbec9ca11931502

   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "8b419c16bf1e37bc98044089da58f06824462cb9"
+#define DAV1D_VERSION "616bfd1506a8a75c6a358e578cbec9ca11931502"
@@ -27,8 +27,8 @@
 #ifndef DAV1D_VERSION_H
 #define DAV1D_VERSION_H

-#define DAV1D_API_VERSION_MAJOR 6
-#define DAV1D_API_VERSION_MINOR 9
+#define DAV1D_API_VERSION_MAJOR 7
+#define DAV1D_API_VERSION_MINOR 0
 #define DAV1D_API_VERSION_PATCH 0

 #endif /* DAV1D_VERSION_H */
third_party/dav1d/include/common/validate.h (vendored) | 14
@@ -32,24 +32,26 @@
 #include <stdlib.h>

 #if defined(NDEBUG)
-#define debug_abort()
+#define debug_print(...) do {} while (0)
+#define debug_abort() do {} while (0)
 #else
+#define debug_print(...) fprintf(stderr, __VA_ARGS__)
 #define debug_abort abort
 #endif

 #define validate_input_or_ret_with_msg(x, r, ...) \
     if (!(x)) { \
-        fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
-                #x, __func__); \
-        fprintf(stderr, __VA_ARGS__); \
+        debug_print("Input validation check \'%s\' failed in %s!\n", \
+                    #x, __func__); \
+        debug_print(__VA_ARGS__); \
         debug_abort(); \
         return r; \
     }

 #define validate_input_or_ret(x, r) \
     if (!(x)) { \
-        fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
-                #x, __func__); \
+        debug_print("Input validation check \'%s\' failed in %s!\n", \
+                    #x, __func__); \
         debug_abort(); \
         return r; \
     }
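The hunk above routes the validation messages through debug_print, so NDEBUG builds now compile the whole check body away instead of still emitting fprintf calls. A minimal usage sketch follows; the parse_obu function is hypothetical, not part of the patch:

    #include <stddef.h>
    #include <stdint.h>
    #include "common/validate.h"

    /* Hypothetical caller, for illustration only. */
    int parse_obu(const uint8_t *data, size_t size) {
        /* In debug builds a failed check prints and aborts; in NDEBUG
         * builds it silently returns the error value. */
        validate_input_or_ret(data != NULL, -1);
        validate_input_or_ret_with_msg(size > 0, -1,
                                       "size must be nonzero, got %zu\n", size);
        /* ... real parsing would go here ... */
        return 0;
    }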
third_party/dav1d/include/dav1d/dav1d.h (vendored) | 9
@@ -103,6 +103,15 @@ typedef struct Dav1dSettings {
  */
 DAV1D_API const char *dav1d_version(void);

+/**
+ * Get library API version.
+ *
+ * @return A value in the format 0x00XXYYZZ, where XX is the major version,
+ *         YY the minor version, and ZZ the patch version.
+ * @see DAV1D_API_MAJOR, DAV1D_API_MINOR, DAV1D_API_PATCH
+ */
+DAV1D_API unsigned dav1d_version_api(void);
+
 /**
  * Initialize settings to default values.
  *
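As a usage sketch (not part of the patch), the packed value documented above can be split with plain shifts; the main function here is hypothetical:

    #include <stdio.h>
    #include <dav1d/dav1d.h>

    int main(void) {
        /* 0x00XXYYZZ: XX = major, YY = minor, ZZ = patch. */
        const unsigned v = dav1d_version_api();
        printf("dav1d API %u.%u.%u\n",
               (v >> 16) & 0xFF, (v >> 8) & 0xFF, v & 0xFF);
        return 0;
    }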
third_party/dav1d/include/dav1d/headers.h (vendored) | 239
@@ -182,8 +182,8 @@ enum Dav1dChromaSamplePosition {
 };

 typedef struct Dav1dContentLightLevel {
-    int max_content_light_level;
-    int max_frame_average_light_level;
+    uint16_t max_content_light_level;
+    uint16_t max_frame_average_light_level;
 } Dav1dContentLightLevel;

 typedef struct Dav1dMasteringDisplay {
@@ -210,7 +210,7 @@ typedef struct Dav1dSequenceHeader {
      * 1 for 8-10 bits/component 4:4:4; 2 for 4:2:2 at any bits/component,
      * or 12 bits/component at any chroma subsampling.
      */
-    int profile;
+    uint8_t profile;
     /**
      * Maximum dimensions for this stream. In non-scalable streams, these
      * are often the actual dimensions of the stream, although that is not
@@ -229,60 +229,60 @@ typedef struct Dav1dSequenceHeader {
      * (twelve_bit) to distinguish between 10 and 12 bits/component. To get
      * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2.
      */
-    int hbd;
+    uint8_t hbd;
     /**
      * Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of
      * MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma).
      */
-    int color_range;
+    uint8_t color_range;

-    int num_operating_points;
+    uint8_t num_operating_points;
     struct Dav1dSequenceHeaderOperatingPoint {
-        int major_level, minor_level;
-        int initial_display_delay;
-        int idc;
-        int tier;
-        int decoder_model_param_present;
-        int display_model_param_present;
+        uint8_t major_level, minor_level;
+        uint8_t initial_display_delay;
+        uint16_t idc;
+        uint8_t tier;
+        uint8_t decoder_model_param_present;
+        uint8_t display_model_param_present;
     } operating_points[DAV1D_MAX_OPERATING_POINTS];

-    int still_picture;
-    int reduced_still_picture_header;
-    int timing_info_present;
-    int num_units_in_tick;
-    int time_scale;
-    int equal_picture_interval;
-    unsigned num_ticks_per_picture;
-    int decoder_model_info_present;
-    int encoder_decoder_buffer_delay_length;
-    int num_units_in_decoding_tick;
-    int buffer_removal_delay_length;
-    int frame_presentation_delay_length;
-    int display_model_info_present;
-    int width_n_bits, height_n_bits;
-    int frame_id_numbers_present;
-    int delta_frame_id_n_bits;
-    int frame_id_n_bits;
-    int sb128;
-    int filter_intra;
-    int intra_edge_filter;
-    int inter_intra;
-    int masked_compound;
-    int warped_motion;
-    int dual_filter;
-    int order_hint;
-    int jnt_comp;
-    int ref_frame_mvs;
+    uint8_t still_picture;
+    uint8_t reduced_still_picture_header;
+    uint8_t timing_info_present;
+    uint32_t num_units_in_tick;
+    uint32_t time_scale;
+    uint8_t equal_picture_interval;
+    uint32_t num_ticks_per_picture;
+    uint8_t decoder_model_info_present;
+    uint8_t encoder_decoder_buffer_delay_length;
+    uint32_t num_units_in_decoding_tick;
+    uint8_t buffer_removal_delay_length;
+    uint8_t frame_presentation_delay_length;
+    uint8_t display_model_info_present;
+    uint8_t width_n_bits, height_n_bits;
+    uint8_t frame_id_numbers_present;
+    uint8_t delta_frame_id_n_bits;
+    uint8_t frame_id_n_bits;
+    uint8_t sb128;
+    uint8_t filter_intra;
+    uint8_t intra_edge_filter;
+    uint8_t inter_intra;
+    uint8_t masked_compound;
+    uint8_t warped_motion;
+    uint8_t dual_filter;
+    uint8_t order_hint;
+    uint8_t jnt_comp;
+    uint8_t ref_frame_mvs;
     enum Dav1dAdaptiveBoolean screen_content_tools;
     enum Dav1dAdaptiveBoolean force_integer_mv;
-    int order_hint_n_bits;
-    int super_res;
-    int cdef;
-    int restoration;
-    int ss_hor, ss_ver, monochrome;
-    int color_description_present;
-    int separate_uv_delta_q;
-    int film_grain_present;
+    uint8_t order_hint_n_bits;
+    uint8_t super_res;
+    uint8_t cdef;
+    uint8_t restoration;
+    uint8_t ss_hor, ss_ver, monochrome;
+    uint8_t color_description_present;
+    uint8_t separate_uv_delta_q;
+    uint8_t film_grain_present;

     // Dav1dSequenceHeaders of the same sequence are required to be
     // bit-identical until this offset. See 7.5 "Ordering of OBUs":
@@ -291,29 +291,29 @@ typedef struct Dav1dSequenceHeader {
     // sequence header appears except for the contents of
     // operating_parameters_info.
     struct Dav1dSequenceHeaderOperatingParameterInfo {
-        int decoder_buffer_delay;
-        int encoder_buffer_delay;
-        int low_delay_mode;
+        uint32_t decoder_buffer_delay;
+        uint32_t encoder_buffer_delay;
+        uint8_t low_delay_mode;
     } operating_parameter_info[DAV1D_MAX_OPERATING_POINTS];
 } Dav1dSequenceHeader;

 typedef struct Dav1dSegmentationData {
-    int delta_q;
-    int delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
-    int ref;
-    int skip;
-    int globalmv;
+    int16_t delta_q;
+    int8_t delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
+    int8_t ref;
+    uint8_t skip;
+    uint8_t globalmv;
 } Dav1dSegmentationData;

 typedef struct Dav1dSegmentationDataSet {
     Dav1dSegmentationData d[DAV1D_MAX_SEGMENTS];
-    int preskip;
-    int last_active_segid;
+    uint8_t preskip;
+    int8_t last_active_segid;
 } Dav1dSegmentationDataSet;

 typedef struct Dav1dLoopfilterModeRefDeltas {
-    int mode_delta[2 /* is_zeromv */];
-    int ref_delta[DAV1D_TOTAL_REFS_PER_FRAME];
+    int8_t mode_delta[2 /* is_zeromv */];
+    int8_t ref_delta[DAV1D_TOTAL_REFS_PER_FRAME];
 } Dav1dLoopfilterModeRefDeltas;

 typedef struct Dav1dFilmGrainData {
@@ -339,100 +339,101 @@ typedef struct Dav1dFilmGrainData {
 typedef struct Dav1dFrameHeader {
     struct {
         Dav1dFilmGrainData data;
-        int present, update;
+        uint8_t present, update;
     } film_grain; ///< film grain parameters
     enum Dav1dFrameType frame_type; ///< type of the picture
     int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
-    int frame_offset; ///< frame number
-    int temporal_id; ///< temporal id of the frame for SVC
-    int spatial_id; ///< spatial id of the frame for SVC
+    uint8_t frame_offset; ///< frame number
+    uint8_t temporal_id; ///< temporal id of the frame for SVC
+    uint8_t spatial_id; ///< spatial id of the frame for SVC

-    int show_existing_frame;
-    int existing_frame_idx;
-    int frame_id;
-    int frame_presentation_delay;
-    int show_frame;
-    int showable_frame;
-    int error_resilient_mode;
-    int disable_cdf_update;
-    int allow_screen_content_tools;
-    int force_integer_mv;
-    int frame_size_override;
-    int primary_ref_frame;
-    int buffer_removal_time_present;
+    uint8_t show_existing_frame;
+    uint8_t existing_frame_idx;
+    uint32_t frame_id;
+    uint32_t frame_presentation_delay;
+    uint8_t show_frame;
+    uint8_t showable_frame;
+    uint8_t error_resilient_mode;
+    uint8_t disable_cdf_update;
+    uint8_t allow_screen_content_tools;
+    uint8_t force_integer_mv;
+    uint8_t frame_size_override;
+    uint8_t primary_ref_frame;
+    uint8_t buffer_removal_time_present;
     struct Dav1dFrameHeaderOperatingPoint {
-        int buffer_removal_time;
+        uint32_t buffer_removal_time;
     } operating_points[DAV1D_MAX_OPERATING_POINTS];
-    int refresh_frame_flags;
+    uint8_t refresh_frame_flags;
     int render_width, render_height;
     struct {
-        int width_scale_denominator;
-        int enabled;
+        uint8_t width_scale_denominator;
+        uint8_t enabled;
     } super_res;
-    int have_render_size;
-    int allow_intrabc;
-    int frame_ref_short_signaling;
-    int refidx[DAV1D_REFS_PER_FRAME];
-    int hp;
+    uint8_t have_render_size;
+    uint8_t allow_intrabc;
+    uint8_t frame_ref_short_signaling;
+    int8_t refidx[DAV1D_REFS_PER_FRAME];
+    uint8_t hp;
     enum Dav1dFilterMode subpel_filter_mode;
-    int switchable_motion_mode;
-    int use_ref_frame_mvs;
-    int refresh_context;
+    uint8_t switchable_motion_mode;
+    uint8_t use_ref_frame_mvs;
+    uint8_t refresh_context;
     struct {
-        int uniform;
-        unsigned n_bytes;
-        int min_log2_cols, max_log2_cols, log2_cols, cols;
-        int min_log2_rows, max_log2_rows, log2_rows, rows;
+        uint8_t uniform;
+        uint8_t n_bytes;
+        uint8_t min_log2_cols, max_log2_cols, log2_cols, cols;
+        uint8_t min_log2_rows, max_log2_rows, log2_rows, rows;
         uint16_t col_start_sb[DAV1D_MAX_TILE_COLS + 1];
         uint16_t row_start_sb[DAV1D_MAX_TILE_ROWS + 1];
-        int update;
+        uint16_t update;
     } tiling;
     struct {
-        int yac;
-        int ydc_delta;
-        int udc_delta, uac_delta, vdc_delta, vac_delta;
-        int qm, qm_y, qm_u, qm_v;
+        uint8_t yac;
+        int8_t ydc_delta;
+        int8_t udc_delta, uac_delta, vdc_delta, vac_delta;
+        uint8_t qm, qm_y, qm_u, qm_v;
     } quant;
     struct {
-        int enabled, update_map, temporal, update_data;
+        uint8_t enabled, update_map, temporal, update_data;
         Dav1dSegmentationDataSet seg_data;
-        int lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS];
+        uint8_t lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS];
     } segmentation;
     struct {
         struct {
-            int present;
-            int res_log2;
+            uint8_t present;
+            uint8_t res_log2;
         } q;
         struct {
-            int present;
-            int res_log2;
-            int multi;
+            uint8_t present;
+            uint8_t res_log2;
+            uint8_t multi;
         } lf;
     } delta;
-    int all_lossless;
+    uint8_t all_lossless;
     struct {
-        int level_y[2 /* dir */];
-        int level_u, level_v;
-        int mode_ref_delta_enabled;
-        int mode_ref_delta_update;
+        uint8_t level_y[2 /* dir */];
+        uint8_t level_u, level_v;
+        uint8_t mode_ref_delta_enabled;
+        uint8_t mode_ref_delta_update;
         Dav1dLoopfilterModeRefDeltas mode_ref_deltas;
-        int sharpness;
+        uint8_t sharpness;
     } loopfilter;
     struct {
-        int damping;
-        int n_bits;
-        int y_strength[DAV1D_MAX_CDEF_STRENGTHS];
-        int uv_strength[DAV1D_MAX_CDEF_STRENGTHS];
+        uint8_t damping;
+        uint8_t n_bits;
+        uint8_t y_strength[DAV1D_MAX_CDEF_STRENGTHS];
+        uint8_t uv_strength[DAV1D_MAX_CDEF_STRENGTHS];
     } cdef;
     struct {
         enum Dav1dRestorationType type[3 /* plane */];
-        int unit_size[2 /* y, uv */];
+        uint8_t unit_size[2 /* y, uv */];
     } restoration;
     enum Dav1dTxfmMode txfm_mode;
-    int switchable_comp_refs;
-    int skip_mode_allowed, skip_mode_enabled, skip_mode_refs[2];
-    int warp_motion;
-    int reduced_txtp_set;
+    uint8_t switchable_comp_refs;
+    uint8_t skip_mode_allowed, skip_mode_enabled;
+    int8_t skip_mode_refs[2];
+    uint8_t warp_motion;
+    uint8_t reduced_txtp_set;
     Dav1dWarpedMotionParams gmv[DAV1D_REFS_PER_FRAME];
 } Dav1dFrameHeader;
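The narrowing from int to fixed-width types shrinks these headers considerably but changes struct layout, which is why the commit bumps the API major version to 7. As a hedged sketch of consuming the new fields (the helper below is hypothetical, not dav1d API): uint8_t values still promote to int through varargs, so printf-style readers keep working unchanged, but code that bound a field's address to an int pointer must be updated and rebuilt.

    #include <stdio.h>
    #include <dav1d/headers.h>

    /* Hypothetical helper, for illustration only. */
    static void print_seq_info(const Dav1dSequenceHeader *seq) {
        /* The narrow fields promote to int in varargs, so %d remains correct. */
        printf("profile=%d hbd=%d monochrome=%d\n",
               seq->profile, seq->hbd, seq->monochrome);
    }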
third_party/dav1d/include/dav1d/picture.h (vendored) | 2
@@ -91,7 +91,7 @@ typedef struct Dav1dPicture {
      */
     size_t n_itut_t35;

-    uintptr_t reserved[3]; ///< reserved for future use
+    uintptr_t reserved[4]; ///< reserved for future use

     struct Dav1dRef *frame_hdr_ref; ///< Dav1dFrameHeader allocation origin
     struct Dav1dRef *seq_hdr_ref; ///< Dav1dSequenceHeader allocation origin
third_party/dav1d/include/dav1d/version.h.in (vendored) | 8
@@ -35,6 +35,14 @@ extern "C" {
 #define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
 #define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@

+/**
+ * Extract version components from the value returned by
+ * dav1d_version_int()
+ */
+#define DAV1D_API_MAJOR(v) (((v) >> 16) & 0xFF)
+#define DAV1D_API_MINOR(v) (((v) >>  8) & 0xFF)
+#define DAV1D_API_PATCH(v) (((v) >>  0) & 0xFF)
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
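A sketch of a runtime guard built on these macros, pairing them with the dav1d_version_api() call added in dav1d.h above; the helper name is an assumption, not dav1d API:

    #include <dav1d/dav1d.h>
    #include <dav1d/version.h>

    /* Hypothetical guard: nonzero when the loaded library's API major
     * version matches the headers we compiled against. */
    static int dav1d_abi_compatible(void) {
        const unsigned v = dav1d_version_api();
        return DAV1D_API_MAJOR(v) == DAV1D_API_VERSION_MAJOR;
    }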
third_party/dav1d/meson.build (vendored) | 14
@@ -30,7 +30,7 @@ project('dav1d', ['c'],
               'b_ndebug=if-release'],
     meson_version: '>= 0.49.0')

-dav1d_soname_version = '6.9.0'
+dav1d_soname_version = '7.0.0'
 dav1d_api_version_array = dav1d_soname_version.split('.')
 dav1d_api_version_major = dav1d_api_version_array[0]
 dav1d_api_version_minor = dav1d_api_version_array[1]
@@ -149,6 +149,10 @@ else
     endif
     cdata.set('HAVE_CLOCK_GETTIME', 1)
 endif

+if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
+    cdata.set('HAVE_POSIX_MEMALIGN', 1)
+endif
+
 endif

 # check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
@@ -226,14 +230,6 @@ else
     getopt_dependency = []
 endif

-if cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args)
-    cdata.set('HAVE_ALIGNED_MALLOC', 1)
-elif cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
-    cdata.set('HAVE_POSIX_MEMALIGN', 1)
-elif cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
-    cdata.set('HAVE_MEMALIGN', 1)
-endif
-
 if (host_machine.cpu_family() == 'aarch64' or
     host_machine.cpu_family().startswith('arm') or
     host_machine.cpu() == 'ppc64le')
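These checks only set config flags; a sketch of how such HAVE_* flags are commonly consumed downstream (this wrapper is illustrative, not dav1d's actual allocator):

    #include <stdlib.h>
    #if defined(HAVE_MEMALIGN)
    #include <malloc.h>
    #endif

    /* Hypothetical portable aligned allocation keyed on the config flags. */
    static void *aligned_malloc_compat(size_t align, size_t size) {
    #if defined(HAVE_POSIX_MEMALIGN)
        void *buf;
        /* posix_memalign returns 0 on success. */
        if (posix_memalign(&buf, align, size)) return NULL;
        return buf;
    #elif defined(HAVE_MEMALIGN)
        return memalign(align, size);
    #else
        /* Fallback: no alignment guarantee beyond malloc's. */
        (void)align;
        return malloc(size);
    #endif
    }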
third_party/dav1d/src/arm/32/refmvs.S (vendored) | 206
@@ -95,3 +95,209 @@ L(splat_tbl):
         bgt 1b
         pop {r4, pc}
 endfunc
+
+const mv_tbls, align=4
+        .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+        .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
+        .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+        .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+endconst
+
+const mask_mult, align=4
+        .byte 1, 2, 1, 2, 0, 0, 0, 0
+endconst
+
+// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
+//                           refmvs_block **rr, const uint8_t *ref_sign,
+//                           int col_end8, int row_end8,
+//                           int col_start8, int row_start8)
+function save_tmvs_neon, export=1
+        push {r4-r11,lr}
+        ldrd r4, r5, [sp, #36]
+        ldrd r6, r7, [sp, #44]
+
+        vmov.i8 d30, #0
+        vld1.8 {d31}, [r3]
+        adr r8, L(save_tmvs_tbl)
+        movrel_local lr, mask_mult
+        movrel_local r12, mv_tbls
+        vld1.8 {d29}, [lr]
+        vext.8 d31, d30, d31, #7 // [0, ref_sign]
+        mov r3, #5
+        mul r1, r1, r3 // stride *= 5
+        sub r5, r5, r7 // h = row_end8 - row_start8
+        lsl r7, r7, #1 // row_start8 <<= 1
+1:
+        mov r3, #5
+        mov r11, #12*2
+        and r9, r7, #30 // (y & 15) * 2
+        ldr r9, [r2, r9, lsl #2] // b = rr[(y & 15) * 2]
+        add r9, r9, #12 // &b[... + 1]
+        mla r10, r4, r11, r9 // end_cand_b = &b[col_end8*2 + 1]
+        mla r9, r6, r11, r9 // cand_b = &b[x*2 + 1]
+
+        mla r3, r6, r3, r0 // &rp[x]
+
+        push {r2,r4,r6}
+
+2:
+        ldrb r11, [r9, #10] // cand_b->bs
+        add lr, r9, #8
+        vld1.8 {d0, d1}, [r9] // cand_b->mv
+        add r11, r8, r11, lsl #3
+        vld1.16 {d2[]}, [lr] // cand_b->ref
+        ldrh lr, [r11] // bw8
+        mov r2, r8
+        add r9, r9, lr, lsl #1 // cand_b += bw8*2
+        cmp r9, r10
+        vmov d4, d0
+        bge 3f
+
+        ldrb r2, [r9, #10] // cand_b->bs
+        add lr, r9, #8
+        vld1.8 {d6, d7}, [r9] // cand_b->mv
+        add r2, r8, r2, lsl #3
+        vld1.16 {d2[1]}, [lr] // cand_b->ref
+        ldrh lr, [r2] // bw8
+        add r9, r9, lr, lsl #1 // cand_b += bw8*2
+        vmov d5, d6
+
+3:
+        vabs.s16 q2, q2 // abs(mv[].xy)
+        vtbl.8 d2, {d31}, d2 // ref_sign[ref]
+        vshr.u16 q2, q2, #12 // abs(mv[].xy) >> 12
+        vmull.u8 q1, d2, d29 // ref_sign[ref] * {1, 2}
+        vceq.i32 q2, q2, #0 // abs(mv[].xy) <= 4096
+        vmovn.i32 d4, q2 // abs() condition to 16 bit
+        vand d2, d2, d4 // h[0-3] contains conditions for mv[0-1]
+        vpadd.i16 d2, d2, d2 // Combine condition for [1] and [0]
+        vmov.u16 r4, d2[0] // Extract case for first block
+        vmov.u16 r6, d2[1]
+        ldr r11, [r11, #4] // Fetch jump table entry
+        ldr r2, [r2, #4]
+        add r4, r12, r4, lsl #4
+        add r6, r12, r6, lsl #4
+        vld1.8 {d2, d3}, [r4] // Load permutation table base on case
+        vld1.8 {d4, d5}, [r6]
+        add r11, r8, r11 // Find jump table target
+        add r2, r8, r2
+        vtbl.8 d16, {d0, d1}, d2 // Permute cand_b to output refmvs_temporal_block
+        vtbl.8 d17, {d0, d1}, d3
+        vtbl.8 d18, {d6, d7}, d4
+        vtbl.8 d19, {d6, d7}, d5
+        vmov q0, q8
+
+        // q1 follows on q0 (q8), with another 3 full repetitions of the pattern.
+        vext.8 q1, q8, q8, #1
+        vext.8 q10, q9, q9, #1
+        // q2 ends with 3 complete repetitions of the pattern.
+        vext.8 q2, q8, q1, #4
+        vext.8 q11, q9, q10, #4
+
+        blx r11
+        bge 4f // if (cand_b >= end)
+        vmov q0, q9
+        vmov q1, q10
+        vmov q2, q11
+        cmp r9, r10
+        blx r2
+        blt 2b // if (cand_b < end)
+
+4:
+        pop {r2,r4,r6}
+
+        subs r5, r5, #1 // h--
+        add r7, r7, #2 // y += 2
+        add r0, r0, r1 // rp += stride
+        bgt 1b
+
+        pop {r4-r11,pc}
+
+        .align 2
+L(save_tmvs_tbl):
+        .word 16 * 12
+        .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 16 * 12
+        .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 8 * 12
+        .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 8 * 12
+        .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 8 * 12
+        .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 8 * 12
+        .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 4 * 12
+        .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 4 * 12
+        .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 4 * 12
+        .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 4 * 12
+        .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 2 * 12
+        .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 2 * 12
+        .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 2 * 12
+        .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 2 * 12
+        .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 2 * 12
+        .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+        .word 1 * 12
+        .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+
+10:
+        add r4, r3, #4
+        vst1.32 {d0[0]}, [r3]
+        vst1.8 {d0[4]}, [r4]
+        add r3, r3, #5
+        bx lr
+20:
+        add r4, r3, #8
+        vst1.8 {d0}, [r3]
+        vst1.16 {d1[0]}, [r4]
+        add r3, r3, #2*5
+        bx lr
+40:
+        add r4, r3, #16
+        vst1.8 {q0}, [r3]
+        vst1.32 {d2[0]}, [r4]
+        add r3, r3, #4*5
+        bx lr
+80:
+        add r4, r3, #(8*5-16)
+        // This writes 6 full entries plus 2 extra bytes
+        vst1.8 {q0, q1}, [r3]
+        // Write the last few, overlapping with the first write.
+        vst1.8 {q2}, [r4]
+        add r3, r3, #8*5
+        bx lr
+160:
+        add r4, r3, #6*5
+        add r6, r3, #12*5
+        // This writes 6 full entries plus 2 extra bytes
+        vst1.8 {q0, q1}, [r3]
+        // Write another 6 full entries, slightly overlapping with the first set
+        vst1.8 {q0, q1}, [r4]
+        add r4, r3, #(16*5-16)
+        // Write 8 bytes (one full entry) after the first 12
+        vst1.8 {d0}, [r6]
+        // Write the last 3 entries
+        vst1.8 {q2}, [r4]
+        add r3, r3, #16*5
+        bx lr
+endfunc
third_party/dav1d/src/arm/64/looprestoration.S (vendored) | 381
@@ -965,371 +965,338 @@ function wiener_filter5_hv_8bpc_neon
        ret
endfunc

#define SUM_STRIDE (384+16)

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                 const pixel (*left)[4],
//                                 const pixel *src, const ptrdiff_t stride,
//                                 const int w, const int h,
//                                 const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
        add w5, w5, #2 // w += 2
// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
function sgr_box3_row_h_8bpc_neon, export=1
        add w4, w4, #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add x10, x0, #(4*SUM_STRIDE) // sumsq
        add x11, x1, #(2*SUM_STRIDE) // sum
        add x12, x3, x4 // src
        lsl x4, x4, #1
        mov x9, #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add w13, w5, #7
        bic w13, w13, #7
        sub x9, x9, w13, uxtw #1

        // Store the width for the vertical loop
        mov w8, w5

        // Subtract the number of pixels read from the input from the stride
        add w13, w13, #8
        sub x4, x4, w13, uxtw

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst w7, #1 // LR_HAVE_LEFT
        b.eq 2f
        // LR_HAVE_LEFT
        tst w5, #1 // LR_HAVE_LEFT
        b.eq 1f
        cbnz x2, 0f
        // left == NULL

        // LR_HAVE_LEFT && left == NULL
        sub x3, x3, #2
        sub x12, x12, #2
        b 1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 2 pixels from the src pointer,
        // but shift it as if we had done that.
        add x4, x4, #2
        ld1 {v0.16b}, [x3], #16
        b 2f

1:      // Loop vertically
        ld1 {v0.16b}, [x3], #16
        ld1 {v4.16b}, [x12], #16

        tst w7, #1 // LR_HAVE_LEFT
        b.eq 0f
        cbz x2, 2f
0:
        // LR_HAVE_LEFT, left != NULL
        ld1 {v1.s}[3], [x2], #4
        // Move x3/x12 back to account for the last 2 bytes we loaded earlier,
        ld1 {v0.16b}, [x3], #16
        ld1 {v1.s}[3], [x2]
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub x3, x3, #2
        sub x12, x12, #2
        ld1 {v5.s}[3], [x2], #4
        ext v0.16b, v1.16b, v0.16b, #14
        ext v4.16b, v5.16b, v4.16b, #14
        b 2f
0:

1:
        ld1 {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 2x the first byte at the front.
        dup v1.16b, v0.b[0]
        dup v5.16b, v4.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub x3, x3, #2
        sub x12, x12, #2
        ext v0.16b, v1.16b, v0.16b, #14
        ext v4.16b, v5.16b, v4.16b, #14

2:
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b
        umull v5.8h, v4.8b, v4.8b
        umull2 v6.8h, v4.16b, v4.16b

        tst w7, #2 // LR_HAVE_RIGHT
        tst w5, #2 // LR_HAVE_RIGHT
        b.ne 4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub w13, w5, #(2 + 16 - 2 + 1)
        sub w13, w4, #(2 + 16 - 2 + 1)
        ldr b30, [x3, w13, sxtw]
        ldr b31, [x12, w13, sxtw]
        // Fill v30/v31 with the right padding pixel
        // Fill v30 with the right padding pixel
        dup v30.16b, v30.b[0]
        dup v31.16b, v31.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp w5, #10
        cmp w4, #10
        b.ge 4f // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in v0/4.b[w] onwards
        // Insert padding in v0.b[w] onwards
        movrel x13, right_ext_mask
        sub x13, x13, w5, uxtw
        sub x13, x13, w4, uxtw
        ld1 {v29.16b}, [x13]

        bit v0.16b, v30.16b, v29.16b
        bit v4.16b, v31.16b, v29.16b

        // Update the precalculated squares
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b
        umull v5.8h, v4.8b, v4.8b
        umull2 v6.8h, v4.16b, v4.16b

4:      // Loop horizontally
        ext v16.16b, v0.16b, v0.16b, #1
        ext v17.16b, v0.16b, v0.16b, #2
        ext v18.16b, v4.16b, v4.16b, #1
        ext v19.16b, v4.16b, v4.16b, #2
        uaddl v3.8h, v0.8b, v16.8b
        uaddw v3.8h, v3.8h, v17.8b
        uaddl v7.8h, v4.8b, v18.8b
        uaddw v7.8h, v7.8h, v19.8b

        ext v20.16b, v1.16b, v2.16b, #2
        uaddw v3.8h, v3.8h, v17.8b

        ext v21.16b, v1.16b, v2.16b, #4
        ext v22.16b, v5.16b, v6.16b, #2
        ext v23.16b, v5.16b, v6.16b, #4

        uaddl v26.4s, v1.4h, v20.4h
        uaddl2 v27.4s, v1.8h, v20.8h
        uaddw v26.4s, v26.4s, v21.4h
        uaddw2 v27.4s, v27.4s, v21.8h

        uaddl v28.4s, v5.4h, v22.4h
        uaddl2 v29.4s, v5.8h, v22.8h
        uaddw v28.4s, v28.4s, v23.4h
        uaddw2 v29.4s, v29.4s, v23.8h

        subs w5, w5, #8
        subs w4, w4, #8

        st1 {v3.8h}, [x1], #16
        st1 {v7.8h}, [x11], #16
        st1 {v26.4s,v27.4s}, [x0], #32
        st1 {v28.4s,v29.4s}, [x10], #32

        b.le 9f
        tst w7, #2 // LR_HAVE_RIGHT
        tst w5, #2 // LR_HAVE_RIGHT
        ld1 {v3.8b}, [x3], #8
        ld1 {v7.8b}, [x12], #8
        mov v1.16b, v2.16b
        mov v5.16b, v6.16b
        ext v0.16b, v0.16b, v3.16b, #8
        ext v4.16b, v4.16b, v7.16b, #8
        umull v2.8h, v3.8b, v3.8b
        umull v6.8h, v7.8b, v7.8b

        b.ne 4b // If we don't need to pad, just keep summing.
        b 3b // If we need to pad, check how many pixels we have left.

9:
        subs w6, w6, #2
        b.le 0f
        // Jump to the next row and loop horizontally
        add x0, x0, x9, lsl #1
        add x10, x10, x9, lsl #1
        add x1, x1, x9
        add x11, x11, x9
        add x3, x3, x4
        add x12, x12, x4
        mov w5, w8
        b 1b
0:
        ret
endfunc

// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                 const pixel (*left)[4],
//                                 const pixel *src, const ptrdiff_t stride,
//                                 const int w, const int h,
//                                 const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
        add w5, w5, #2 // w += 2
// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
function sgr_box5_row_h_8bpc_neon, export=1
        add w4, w4, #2 // w += 2

        // Set up pointers for reading/writing alternate rows
        add x10, x0, #(4*SUM_STRIDE) // sumsq
        add x11, x1, #(2*SUM_STRIDE) // sum
        add x12, x3, x4 // src
        lsl x4, x4, #1
        mov x9, #(2*2*SUM_STRIDE) // double sum stride

        // Subtract the aligned width from the output stride.
        add w13, w5, #7
        bic w13, w13, #7
        sub x9, x9, w13, uxtw #1
        add w13, w13, #8
        sub x4, x4, w13, uxtw

        // Store the width for the vertical loop
        mov w8, w5

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst w7, #1 // LR_HAVE_LEFT
        b.eq 2f
        // LR_HAVE_LEFT
        tst w5, #1 // LR_HAVE_LEFT
        b.eq 1f
        cbnz x2, 0f
        // left == NULL

        // LR_HAVE_LEFT && left == NULL
        sub x3, x3, #3
        sub x12, x12, #3
        b 1f
0:      // LR_HAVE_LEFT, left != NULL
2:      // !LR_HAVE_LEFT, increase the stride.
        // For this case we don't read the left 3 pixels from the src pointer,
        // but shift it as if we had done that.
        add x4, x4, #3
        ld1 {v0.16b}, [x3], #16
        b 2f

1:      // Loop vertically
        ld1 {v0.16b}, [x3], #16
        ld1 {v4.16b}, [x12], #16

        tst w7, #1 // LR_HAVE_LEFT
        b.eq 0f
        cbz x2, 2f
0:
        // LR_HAVE_LEFT, left != NULL
        ld1 {v1.s}[3], [x2], #4
        // Move x3/x12 back to account for the last 3 bytes we loaded earlier,
        ld1 {v0.16b}, [x3], #16
        ld1 {v1.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub x3, x3, #3
        sub x12, x12, #3
        ld1 {v5.s}[3], [x2], #4
        ext v0.16b, v1.16b, v0.16b, #13
        ext v4.16b, v5.16b, v4.16b, #13
        b 2f
0:

1:
        ld1 {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup v1.16b, v0.b[0]
        dup v5.16b, v4.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub x3, x3, #3
        sub x12, x12, #3
        ext v0.16b, v1.16b, v0.16b, #13
        ext v4.16b, v5.16b, v4.16b, #13

2:
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b
        umull v5.8h, v4.8b, v4.8b
        umull2 v6.8h, v4.16b, v4.16b

        tst w7, #2 // LR_HAVE_RIGHT
        tst w5, #2 // LR_HAVE_RIGHT
        b.ne 4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub w13, w5, #(2 + 16 - 3 + 1)
        sub w13, w4, #(2 + 16 - 3 + 1)
        ldr b30, [x3, w13, sxtw]
        ldr b31, [x12, w13, sxtw]
        // Fill v30/v31 with the right padding pixel
        // Fill v30 with the right padding pixel
        dup v30.16b, v30.b[0]
        dup v31.16b, v31.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp w5, #11
        cmp w4, #11
        b.ge 4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0/4.b[w+1] onwards; fuse the +1 into the
        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel x13, right_ext_mask, -1
        sub x13, x13, w5, uxtw
        sub x13, x13, w4, uxtw
        ld1 {v29.16b}, [x13]

        bit v0.16b, v30.16b, v29.16b
        bit v4.16b, v31.16b, v29.16b

        // Update the precalculated squares
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b
        umull v5.8h, v4.8b, v4.8b
        umull2 v6.8h, v4.16b, v4.16b

4:      // Loop horizontally
        ext v16.16b, v0.16b, v0.16b, #1
        ext v17.16b, v0.16b, v0.16b, #2
        ext v18.16b, v0.16b, v0.16b, #3
        ext v19.16b, v0.16b, v0.16b, #4
        ext v20.16b, v4.16b, v4.16b, #1
        ext v21.16b, v4.16b, v4.16b, #2
        ext v22.16b, v4.16b, v4.16b, #3
        ext v23.16b, v4.16b, v4.16b, #4
        uaddl v3.8h, v0.8b, v16.8b
        uaddl v24.8h, v17.8b, v18.8b
        uaddl v7.8h, v4.8b, v20.8b
        uaddw v3.8h, v3.8h, v19.8b
        uaddl v25.8h, v21.8b, v22.8b
        uaddw v7.8h, v7.8h, v23.8b
        add v3.8h, v3.8h, v24.8h
        add v7.8h, v7.8h, v25.8h

        ext v16.16b, v1.16b, v2.16b, #2
        ext v17.16b, v1.16b, v2.16b, #4
        ext v18.16b, v1.16b, v2.16b, #6
        ext v19.16b, v1.16b, v2.16b, #8
        ext v20.16b, v5.16b, v6.16b, #2
        ext v21.16b, v5.16b, v6.16b, #4
        ext v22.16b, v5.16b, v6.16b, #6
        ext v23.16b, v5.16b, v6.16b, #8

        uaddl v26.4s, v1.4h, v16.4h
        uaddl2 v27.4s, v1.8h, v16.8h
        uaddl v16.4s, v17.4h, v18.4h
        uaddl2 v17.4s, v17.8h, v18.8h
        uaddl v28.4s, v5.4h, v20.4h
        uaddl2 v29.4s, v5.8h, v20.8h
        uaddw v26.4s, v26.4s, v19.4h
        uaddw2 v27.4s, v27.4s, v19.8h
        uaddl v20.4s, v21.4h, v22.4h
        uaddl2 v21.4s, v21.8h, v22.8h
        uaddw v28.4s, v28.4s, v23.4h
        uaddw2 v29.4s, v29.4s, v23.8h
        add v26.4s, v26.4s, v16.4s
        add v27.4s, v27.4s, v17.4s
        add v28.4s, v28.4s, v20.4s
        add v29.4s, v29.4s, v21.4s

        subs w5, w5, #8
        subs w4, w4, #8

        st1 {v3.8h}, [x1], #16
        st1 {v7.8h}, [x11], #16
        st1 {v26.4s,v27.4s}, [x0], #32
        st1 {v28.4s,v29.4s}, [x10], #32

        b.le 9f
        tst w7, #2 // LR_HAVE_RIGHT
        tst w5, #2 // LR_HAVE_RIGHT
        ld1 {v3.8b}, [x3], #8
        ld1 {v7.8b}, [x12], #8
        mov v1.16b, v2.16b
        mov v5.16b, v6.16b
        ext v0.16b, v0.16b, v3.16b, #8
        ext v4.16b, v4.16b, v7.16b, #8
        umull v2.8h, v3.8b, v3.8b
        umull v6.8h, v7.8b, v7.8b

        b.ne 4b // If we don't need to pad, just keep summing.
        b 3b // If we need to pad, check how many pixels we have left.

9:
        subs w6, w6, #2
        b.le 0f
        // Jump to the next row and loop horizontally
        add x0, x0, x9, lsl #1
        add x10, x10, x9, lsl #1
        add x1, x1, x9
        add x11, x11, x9
        add x3, x3, x4
        add x12, x12, x4
        mov w5, w8
        b 1b
        ret
endfunc

// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                      int32_t *sumsq5, int16_t *sum5,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
function sgr_box35_row_h_8bpc_neon, export=1
        add w6, w6, #2 // w += 2

        tst w7, #1 // LR_HAVE_LEFT
        b.eq 1f
        cbnz x4, 0f

        // LR_HAVE_LEFT && left == NULL
        sub x5, x5, #3
        ld1 {v0.16b}, [x5], #16
        b 2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1 {v0.16b}, [x5], #16
        ld1 {v1.s}[3], [x4], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub x5, x5, #3
        ext v0.16b, v1.16b, v0.16b, #13
        b 2f

1:
        ld1 {v0.16b}, [x5], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup v1.16b, v0.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub x5, x5, #3
        ext v0.16b, v1.16b, v0.16b, #13

2:
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b

        tst w7, #2 // LR_HAVE_RIGHT
        b.ne 4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub w13, w6, #(2 + 16 - 3 + 1)
        ldr b30, [x5, w13, sxtw]
        // Fill v30 with the right padding pixel
        dup v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp w6, #11
        b.ge 4f // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel x13, right_ext_mask, -1
        sub x13, x13, w6, uxtw
        ld1 {v29.16b}, [x13]

        bit v0.16b, v30.16b, v29.16b

        // Update the precalculated squares
        umull v1.8h, v0.8b, v0.8b
        umull2 v2.8h, v0.16b, v0.16b

4:      // Loop horizontally
        ext v16.16b, v0.16b, v0.16b, #1
        ext v17.16b, v0.16b, v0.16b, #2
        ext v19.16b, v0.16b, v0.16b, #4
        ext v18.16b, v0.16b, v0.16b, #3
        uaddl v3.8h, v16.8b, v17.8b
        uaddl v24.8h, v0.8b, v19.8b
        uaddw v3.8h, v3.8h, v18.8b

        ext v16.16b, v1.16b, v2.16b, #2
        ext v17.16b, v1.16b, v2.16b, #4
        ext v19.16b, v1.16b, v2.16b, #8
        ext v18.16b, v1.16b, v2.16b, #6

        st1 {v3.8h}, [x1], #16
        add v3.8h, v3.8h, v24.8h

        uaddl v26.4s, v16.4h, v17.4h
        uaddl2 v27.4s, v16.8h, v17.8h
        uaddl v16.4s, v1.4h, v19.4h
        uaddl2 v17.4s, v1.8h, v19.8h
        uaddw v26.4s, v26.4s, v18.4h
        uaddw2 v27.4s, v27.4s, v18.8h

        st1 {v26.4s,v27.4s}, [x0], #32
        add v26.4s, v26.4s, v16.4s
        add v27.4s, v27.4s, v17.4s

        subs w6, w6, #8

        st1 {v3.8h}, [x3], #16
        st1 {v26.4s,v27.4s}, [x2], #32

        b.le 9f
        tst w7, #2 // LR_HAVE_RIGHT
        ld1 {v3.8b}, [x5], #8
        mov v1.16b, v2.16b
        ext v0.16b, v0.16b, v3.16b, #8
        umull v2.8h, v3.8b, v3.8b

        b.ne 4b // If we don't need to pad, just keep summing.
        b 3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc
third_party/dav1d/src/arm/64/looprestoration16.S (vendored) | 379
@ -1070,349 +1070,318 @@ function wiener_filter5_hv_16bpc_neon
|
||||
ret
|
||||
endfunc
|
||||
|
||||
#define SUM_STRIDE (384+16)
|
||||
|
||||
#include "looprestoration_tmpl.S"
|
||||
|
||||
// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box3_h_16bpc_neon, export=1
|
||||
add w5, w5, #2 // w += 2
|
||||
// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const int w,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box3_row_h_16bpc_neon, export=1
|
||||
add w4, w4, #2 // w += 2
|
||||
|
||||
// Set up pointers for reading/writing alternate rows
|
||||
add x10, x0, #(4*SUM_STRIDE) // sumsq
|
||||
add x11, x1, #(2*SUM_STRIDE) // sum
|
||||
add x12, x3, x4 // src
|
||||
lsl x4, x4, #1
|
||||
mov x9, #(2*2*SUM_STRIDE) // double sum stride
|
||||
|
||||
// Subtract the aligned width from the output stride.
|
||||
add w13, w5, #7
|
||||
bic w13, w13, #7
|
||||
sub x9, x9, w13, uxtw #1
|
||||
|
||||
// Store the width for the vertical loop
|
||||
mov w8, w5
|
||||
|
||||
// Subtract the number of pixels read from the input from the stride
|
||||
add w13, w13, #8
|
||||
sub x4, x4, w13, uxtw #1
|
||||
|
||||
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
|
||||
tst w7, #1 // LR_HAVE_LEFT
|
||||
b.eq 2f
|
||||
// LR_HAVE_LEFT
|
||||
tst w5, #1 // LR_HAVE_LEFT
|
||||
b.eq 1f
|
||||
cbnz x2, 0f
|
||||
// left == NULL
|
||||
|
||||
// LR_HAVE_LEFT && left == NULL
|
||||
sub x3, x3, #4
|
||||
sub x12, x12, #4
|
||||
b 1f
|
||||
0: // LR_HAVE_LEFT, left != NULL
|
||||
2: // !LR_HAVE_LEFT, increase the stride.
|
||||
// For this case we don't read the left 2 pixels from the src pointer,
|
||||
// but shift it as if we had done that.
|
||||
add x4, x4, #4
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
b 2f
|
||||
|
||||
|
||||
1: // Loop vertically
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
ld1 {v16.8h, v17.8h}, [x12], #32
|
||||
|
||||
tst w7, #1 // LR_HAVE_LEFT
|
||||
b.eq 0f
|
||||
cbz x2, 2f
|
||||
0:
|
||||
// LR_HAVE_LEFT, left != NULL
|
||||
ld1 {v2.d}[1], [x2], #8
|
||||
// Move x3/x12 back to account for the last 2 pixels we loaded earlier,
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
ld1 {v2.d}[1], [x2]
|
||||
// Move x3 back to account for the last 2 pixels we loaded earlier,
|
||||
// which we'll shift out.
|
||||
sub x3, x3, #4
|
||||
sub x12, x12, #4
|
||||
ld1 {v18.d}[1], [x2], #8
|
||||
ext v1.16b, v0.16b, v1.16b, #12
|
||||
ext v0.16b, v2.16b, v0.16b, #12
|
||||
ext v17.16b, v16.16b, v17.16b, #12
|
||||
ext v16.16b, v18.16b, v16.16b, #12
|
||||
ext v1.16b, v0.16b, v1.16b, #12
|
||||
ext v0.16b, v2.16b, v0.16b, #12
|
||||
b 2f
|
||||
0:
|
||||
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
|
||||
// and shift v0/v1 to have 2x the first pixel at the front.
|
||||
dup v2.8h, v0.h[0]
|
||||
dup v18.8h, v16.h[0]
|
||||
dup v2.8h, v0.h[0]
|
||||
// Move x3 back to account for the last 2 pixels we loaded before,
|
||||
// which we shifted out.
|
||||
sub x3, x3, #4
|
||||
sub x12, x12, #4
|
||||
ext v1.16b, v0.16b, v1.16b, #12
|
||||
ext v0.16b, v2.16b, v0.16b, #12
|
||||
ext v17.16b, v16.16b, v17.16b, #12
|
||||
ext v16.16b, v18.16b, v16.16b, #12
|
||||
ext v1.16b, v0.16b, v1.16b, #12
|
||||
ext v0.16b, v2.16b, v0.16b, #12
|
||||
|
||||
2:
|
||||
tst w7, #2 // LR_HAVE_RIGHT
|
||||
tst w5, #2 // LR_HAVE_RIGHT
|
||||
b.ne 4f
|
||||
// If we'll need to pad the right edge, load that pixel to pad with
|
||||
// here since we can find it pretty easily from here.
|
||||
sub w13, w5, #(2 + 16 - 2 + 1)
|
||||
sub w13, w4, #(2 + 16 - 2 + 1)
|
||||
ldr h30, [x3, w13, sxtw #1]
|
||||
ldr h31, [x12, w13, sxtw #1]
|
||||
// Fill v30/v31 with the right padding pixel
|
||||
// Fill v30 with the right padding pixel
|
||||
dup v30.8h, v30.h[0]
|
||||
dup v31.8h, v31.h[0]
|
||||
3: // !LR_HAVE_RIGHT
|
||||
|
||||
// Check whether we need to pad the right edge
|
||||
cmp w5, #10
|
||||
cmp w4, #10
|
||||
b.ge 4f // If w >= 10, all used input pixels are valid
|
||||
|
||||
// 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called
|
||||
// 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
|
||||
// again; it's not strictly needed in those cases (we pad enough here),
|
||||
// but keeping the code as simple as possible.
|
||||
|
||||
// Insert padding in v0/1.h[w] onwards
|
||||
// Insert padding in v0.b[w] onwards
|
||||
movrel x13, right_ext_mask
|
||||
sub x13, x13, w5, uxtw #1
|
||||
sub x13, x13, w4, uxtw #1
|
||||
ld1 {v28.16b, v29.16b}, [x13]
|
||||
|
||||
bit v0.16b, v30.16b, v28.16b
|
||||
bit v1.16b, v30.16b, v29.16b
|
||||
bit v16.16b, v31.16b, v28.16b
|
||||
bit v17.16b, v31.16b, v29.16b
|
||||
|
||||
4: // Loop horizontally
|
||||
ext v26.16b, v0.16b, v1.16b, #2
|
||||
ext v28.16b, v16.16b, v17.16b, #2
|
||||
ext v27.16b, v0.16b, v1.16b, #4
|
||||
ext v29.16b, v16.16b, v17.16b, #4
|
||||
|
||||
add v6.8h, v0.8h, v26.8h
|
||||
umull v22.4s, v0.4h, v0.4h
|
||||
umlal v22.4s, v26.4h, v26.4h
|
||||
umlal v22.4s, v27.4h, v27.4h
|
||||
add v7.8h, v16.8h, v28.8h
|
||||
umull v24.4s, v16.4h, v16.4h
|
||||
umlal v24.4s, v28.4h, v28.4h
|
||||
umlal v24.4s, v29.4h, v29.4h
|
||||
add v6.8h, v6.8h, v27.8h
|
||||
umull2 v23.4s, v0.8h, v0.8h
|
||||
umlal2 v23.4s, v26.8h, v26.8h
|
||||
umlal2 v23.4s, v27.8h, v27.8h
|
||||
add v7.8h, v7.8h, v29.8h
|
||||
umull2 v25.4s, v16.8h, v16.8h
|
||||
umlal2 v25.4s, v28.8h, v28.8h
|
||||
umlal2 v25.4s, v29.8h, v29.8h
|
||||
|
||||
subs w5, w5, #8
|
||||
subs w4, w4, #8
|
||||
|
||||
st1 {v6.8h}, [x1], #16
|
||||
st1 {v7.8h}, [x11], #16
|
||||
st1 {v22.4s,v23.4s}, [x0], #32
|
||||
st1 {v24.4s,v25.4s}, [x10], #32
|
||||
|
||||
b.le 9f
|
||||
tst w7, #2 // LR_HAVE_RIGHT
|
||||
tst w5, #2 // LR_HAVE_RIGHT
|
||||
mov v0.16b, v1.16b
|
||||
mov v16.16b, v17.16b
|
||||
ld1 {v1.8h}, [x3], #16
|
||||
ld1 {v17.8h}, [x12], #16
|
||||
|
||||
b.ne 4b // If we don't need to pad, just keep summing.
|
||||
b 3b // If we need to pad, check how many pixels we have left.
|
||||
|
||||
9:
|
||||
subs w6, w6, #2
|
||||
b.le 0f
|
||||
// Jump to the next row and loop horizontally
|
||||
add x0, x0, x9, lsl #1
|
||||
add x10, x10, x9, lsl #1
|
||||
add x1, x1, x9
|
||||
add x11, x11, x9
|
||||
add x3, x3, x4
|
||||
add x12, x12, x4
|
||||
mov w5, w8
|
||||
b 1b
|
||||
0:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const ptrdiff_t stride,
|
||||
// const int w, const int h,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_h_16bpc_neon, export=1
|
||||
add w5, w5, #2 // w += 2
|
||||
// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
|
||||
// const pixel (*left)[4],
|
||||
// const pixel *src, const int w,
|
||||
// const enum LrEdgeFlags edges);
|
||||
function sgr_box5_row_h_16bpc_neon, export=1
|
||||
add w4, w4, #2 // w += 2
|
||||
|
||||
// Set up pointers for reading/writing alternate rows
|
||||
add x10, x0, #(4*SUM_STRIDE) // sumsq
|
||||
add x11, x1, #(2*SUM_STRIDE) // sum
|
||||
add x12, x3, x4 // src
|
||||
lsl x4, x4, #1
|
||||
mov x9, #(2*2*SUM_STRIDE) // double sum stride
|
||||
|
||||
// Subtract the aligned width from the output stride.
|
||||
add w13, w5, #7
|
||||
bic w13, w13, #7
|
||||
sub x9, x9, w13, uxtw #1
|
||||
add w13, w13, #8
|
||||
sub x4, x4, w13, uxtw #1
|
||||
|
||||
// Store the width for the vertical loop
|
||||
mov w8, w5
|
||||
|
||||
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
|
||||
tst w7, #1 // LR_HAVE_LEFT
|
||||
b.eq 2f
|
||||
// LR_HAVE_LEFT
|
||||
tst w5, #1 // LR_HAVE_LEFT
|
||||
b.eq 1f
|
||||
cbnz x2, 0f
|
||||
// left == NULL
|
||||
|
||||
// LR_HAVE_LEFT && left == NULL
|
||||
sub x3, x3, #6
|
||||
sub x12, x12, #6
|
||||
b 1f
|
||||
0: // LR_HAVE_LEFT, left != NULL
|
||||
2: // !LR_HAVE_LEFT, increase the stride.
|
||||
// For this case we don't read the left 3 pixels from the src pointer,
|
||||
// but shift it as if we had done that.
|
||||
add x4, x4, #6
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
b 2f
|
||||
|
||||
1: // Loop vertically
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
ld1 {v16.8h, v17.8h}, [x12], #32
|
||||
|
||||
tst w7, #1 // LR_HAVE_LEFT
|
||||
b.eq 0f
|
||||
cbz x2, 2f
|
||||
0:
|
||||
// LR_HAVE_LEFT, left != NULL
|
||||
ld1 {v2.d}[1], [x2], #8
|
||||
// Move x3/x12 back to account for the last 3 pixels we loaded earlier,
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
ld1 {v2.d}[1], [x2], #8
|
||||
// Move x3 back to account for the last 3 pixels we loaded earlier,
|
||||
// which we'll shift out.
|
||||
sub x3, x3, #6
|
||||
sub x12, x12, #6
|
||||
ld1 {v18.d}[1], [x2], #8
|
||||
ext v1.16b, v0.16b, v1.16b, #10
|
||||
ext v0.16b, v2.16b, v0.16b, #10
|
||||
ext v17.16b, v16.16b, v17.16b, #10
|
||||
ext v16.16b, v18.16b, v16.16b, #10
|
||||
b 2f
|
||||
0:
|
||||
|
||||
1:
|
||||
ld1 {v0.8h, v1.8h}, [x3], #32
|
||||
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
|
||||
// and shift v0/v1 to have 3x the first pixel at the front.
|
||||
dup v2.8h, v0.h[0]
|
||||
dup v18.8h, v16.h[0]
|
||||
// Move x3 back to account for the last 3 pixels we loaded before,
|
||||
// which we shifted out.
|
||||
sub x3, x3, #6
|
||||
sub x12, x12, #6
|
||||
ext v1.16b, v0.16b, v1.16b, #10
|
||||
ext v0.16b, v2.16b, v0.16b, #10
|
||||
ext v17.16b, v16.16b, v17.16b, #10
|
||||
ext v16.16b, v18.16b, v16.16b, #10
|
||||
|
||||
2:
|
||||
tst w7, #2 // LR_HAVE_RIGHT
|
||||
tst w5, #2 // LR_HAVE_RIGHT
|
||||
b.ne 4f
|
||||
// If we'll need to pad the right edge, load that pixel to pad with
|
||||
// here since we can find it pretty easily from here.
|
||||
sub w13, w5, #(2 + 16 - 3 + 1)
|
||||
sub w13, w4, #(2 + 16 - 3 + 1)
|
||||
ldr h30, [x3, w13, sxtw #1]
|
||||
ldr h31, [x12, w13, sxtw #1]
|
||||
// Fill v30/v31 with the right padding pixel
|
||||
// Fill v30 with the right padding pixel
|
||||
dup v30.8h, v30.h[0]
|
||||
dup v31.8h, v31.h[0]
|
||||
3: // !LR_HAVE_RIGHT
|
||||
|
||||
// Check whether we need to pad the right edge
|
||||
cmp w5, #11
|
||||
cmp w4, #11
|
||||
b.ge 4f // If w >= 11, all used input pixels are valid
|
||||
|
||||
// 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10,
|
||||
// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
|
||||
// this ends up called again; it's not strictly needed in those
|
||||
// cases (we pad enough here), but keeping the code as simple as possible.
|
||||
|
||||
// Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the
|
||||
// Insert padding in v0.b[w+1] onwards; fuse the +1 into the
|
||||
// buffer pointer.
|
||||
movrel x13, right_ext_mask, -2
|
||||
sub x13, x13, w5, uxtw #1
|
||||
movrel x13, right_ext_mask, -1
|
||||
sub x13, x13, w4, uxtw #1
|
||||
ld1 {v28.16b, v29.16b}, [x13]
|
||||
|
||||
bit v0.16b, v30.16b, v28.16b
|
||||
bit v1.16b, v30.16b, v29.16b
|
||||
bit v16.16b, v31.16b, v28.16b
|
||||
bit v17.16b, v31.16b, v29.16b
|
||||
|
||||
4: // Loop horizontally
|
||||
ext v26.16b, v0.16b, v1.16b, #2
|
||||
ext v28.16b, v16.16b, v17.16b, #2
|
||||
ext v27.16b, v0.16b, v1.16b, #4
|
||||
ext v29.16b, v16.16b, v17.16b, #4
|
||||
|
||||
add v6.8h, v0.8h, v26.8h
|
||||
umull v22.4s, v0.4h, v0.4h
|
||||
umlal v22.4s, v26.4h, v26.4h
umlal v22.4s, v27.4h, v27.4h
add v7.8h, v16.8h, v28.8h
umull v24.4s, v16.4h, v16.4h
umlal v24.4s, v28.4h, v28.4h
umlal v24.4s, v29.4h, v29.4h
add v6.8h, v6.8h, v27.8h
umull2 v23.4s, v0.8h, v0.8h
umlal2 v23.4s, v26.8h, v26.8h
umlal2 v23.4s, v27.8h, v27.8h
add v7.8h, v7.8h, v29.8h
umull2 v25.4s, v16.8h, v16.8h
umlal2 v25.4s, v28.8h, v28.8h
umlal2 v25.4s, v29.8h, v29.8h

ext v26.16b, v0.16b, v1.16b, #6
ext v28.16b, v16.16b, v17.16b, #6
ext v27.16b, v0.16b, v1.16b, #8
ext v29.16b, v16.16b, v17.16b, #8

add v6.8h, v6.8h, v26.8h
umlal v22.4s, v26.4h, v26.4h
umlal v22.4s, v27.4h, v27.4h
add v7.8h, v7.8h, v28.8h
umlal v24.4s, v28.4h, v28.4h
umlal v24.4s, v29.4h, v29.4h
add v6.8h, v6.8h, v27.8h
umlal2 v23.4s, v26.8h, v26.8h
umlal2 v23.4s, v27.8h, v27.8h
add v7.8h, v7.8h, v29.8h
umlal2 v25.4s, v28.8h, v28.8h
umlal2 v25.4s, v29.8h, v29.8h

subs w5, w5, #8
subs w4, w4, #8

st1 {v6.8h}, [x1], #16
st1 {v7.8h}, [x11], #16
st1 {v22.4s,v23.4s}, [x0], #32
st1 {v24.4s,v25.4s}, [x10], #32

b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
tst w5, #2 // LR_HAVE_RIGHT
mov v0.16b, v1.16b
mov v16.16b, v17.16b
ld1 {v1.8h}, [x3], #16
ld1 {v17.8h}, [x12], #16
ld1 {v1.8h}, [x3], #16

b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.

9:
subs w6, w6, #2
b.le 0f
// Jump to the next row and loop horizontally
add x0, x0, x9, lsl #1
add x10, x10, x9, lsl #1
add x1, x1, x9
add x11, x11, x9
add x3, x3, x4
add x12, x12, x4
mov w5, w8
b 1b
ret
endfunc

// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                       int32_t *sumsq5, int16_t *sum5,
//                                       const pixel (*left)[4],
//                                       const pixel *src, const int w,
//                                       const enum LrEdgeFlags edges);
function sgr_box35_row_h_16bpc_neon, export=1
add w6, w6, #2 // w += 2

tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
cbnz x4, 0f

// LR_HAVE_LEFT && left == NULL
sub x5, x5, #6
ld1 {v0.8h, v1.8h}, [x5], #32
b 2f

0:
// LR_HAVE_LEFT, left != NULL
ld1 {v0.8h, v1.8h}, [x5], #32
ld1 {v2.d}[1], [x4], #8
// Move x3 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub x5, x5, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10
b 2f

1:
ld1 {v0.8h, v1.8h}, [x5], #32
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
// and shift v0/v1 to have 3x the first pixel at the front.
dup v2.8h, v0.h[0]
// Move x5 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub x5, x5, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10

2:
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w13, w6, #(2 + 16 - 3 + 1)
ldr h30, [x5, w13, sxtw #1]
// Fill v30 with the right padding pixel
dup v30.8h, v30.h[0]
3: // !LR_HAVE_RIGHT

// Check whether we need to pad the right edge
cmp w6, #11
b.ge 4f // If w >= 11, all used input pixels are valid

// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.

// Insert padding in v0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel x13, right_ext_mask, -1
sub x13, x13, w6, uxtw #1
ld1 {v28.16b, v29.16b}, [x13]

bit v0.16b, v30.16b, v28.16b
bit v1.16b, v30.16b, v29.16b

4: // Loop horizontally
ext v16.16b, v0.16b, v1.16b, #2
ext v17.16b, v0.16b, v1.16b, #4
ext v19.16b, v0.16b, v1.16b, #8
ext v18.16b, v0.16b, v1.16b, #6

add v20.8h, v16.8h, v17.8h
add v21.8h, v0.8h, v19.8h
add v20.8h, v20.8h, v18.8h

umull v22.4s, v16.4h, v16.4h
umlal v22.4s, v17.4h, v17.4h
umlal v22.4s, v18.4h, v18.4h

umull2 v23.4s, v16.8h, v16.8h
umlal2 v23.4s, v17.8h, v17.8h
umlal2 v23.4s, v18.8h, v18.8h

add v21.8h, v21.8h, v20.8h
st1 {v20.8h}, [x1], #16
st1 {v22.4s,v23.4s}, [x0], #32

umlal v22.4s, v0.4h, v0.4h
umlal v22.4s, v19.4h, v19.4h

umlal2 v23.4s, v0.8h, v0.8h
umlal2 v23.4s, v19.8h, v19.8h

subs w6, w6, #8

st1 {v21.8h}, [x3], #16
st1 {v22.4s,v23.4s}, [x2], #32

b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
mov v0.16b, v1.16b
ld1 {v1.8h}, [x5], #16

b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.

9:
ret
endfunc
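For orientation: the row pass above produces, for each output position, the horizontal 3-tap (box3) and 5-tap (box5) sums of pixels and of squared pixels. A minimal scalar sketch of the combined box35 variant, assuming <stdint.h>; the helper name and plain-C form are illustrative, not part of this change:

    // Scalar sketch of the combined 3/5-tap row sums (name illustrative).
    static void sgr_box35_row_h_c(int32_t *sumsq3, int16_t *sum3,
                                  int32_t *sumsq5, int16_t *sum5,
                                  const uint16_t *src, int w)
    {
        for (int x = 0; x < w; x++) {
            int32_t a3 = 0, a5 = 0;
            int b3 = 0, b5 = 0;
            for (int i = 0; i < 5; i++) {
                const int32_t v = src[x + i];
                a5 += v * v;            // 5-tap sum of squares
                b5 += v;                // 5-tap sum
                if (i >= 1 && i <= 3) { // the middle 3 taps
                    a3 += v * v;
                    b3 += v;
                }
            }
            sumsq3[x] = a3; sum3[x] = (int16_t)b3;
            sumsq5[x] = a5; sum5[x] = (int16_t)b5;
        }
    }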

@ -28,332 +28,29 @@
#include "src/arm/asm.S"
#include "util.S"

#define SUM_STRIDE (384+16)
// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
function sgr_box3_vert_neon, export=1
stp d8, d9, [sp, #-0x30]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]

// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
//                            const int w, const int h,
//                            const enum LrEdgeFlags edges);
function sgr_box3_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #2 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
add w4, w4, #2
clz w9, w6 // bitdepth_max
dup v28.4s, w5 // strength

tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:
ldp x5, x6, [x0]
ldr x0, [x0, #16]
ldp x7, x8, [x1]
ldr x1, [x1, #16]

tst w4, #8 // LR_HAVE_BOTTOM
b.eq 1f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Sum all h+2 lines with the main loop
add w11, w11, #2
1:
mov w9, w3 // Backup of h for next loops
movi v31.4s, #9 // n

1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v21 and v24-v26 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v24.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v18.4s, v19.4s}, [x5], x7
ld1 {v25.8h}, [x6], x8
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v25.16b, v24.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v26.16b, v24.16b

3:
subs w3, w3, #1
.macro add3
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v24.8h, v24.8h, v25.8h
add v16.4s, v16.4s, v20.4s
add v17.4s, v17.4s, v21.4s
add v24.8h, v24.8h, v26.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v24.8h}, [x1], x8
.endm
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v25.16b, v26.16b
b.le 4f
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3b

4:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 5f
// !LR_HAVE_BOTTOM
// Produce two more rows, extending the already loaded rows.
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
add3

5: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b

0:
ret
.purgem add3
endfunc
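The add3 macro above is a plain three-row vertical sum over the per-row box sums; as a scalar sketch (the helper name and buffer layout are illustrative only):

    // Scalar equivalent of one add3 step: sum three vertically
    // adjacent rows of box sums into an output row.
    static void box3_vert_sum_c(int32_t *out_sumsq, int16_t *out_sum,
                                const int32_t *const sumsq[3],
                                const int16_t *const sum[3], int w)
    {
        for (int x = 0; x < w; x++) {
            out_sumsq[x] = sumsq[0][x] + sumsq[1][x] + sumsq[2][x];
            out_sum[x]   = (int16_t)(sum[0][x] + sum[1][x] + sum[2][x]);
        }
    }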

// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
//                            const int w, const int h,
//                            const enum LrEdgeFlags edges);
function sgr_box5_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #8 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride

tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:

tst w4, #8 // LR_HAVE_BOTTOM
b.eq 0f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Handle h+2 lines with the main loop
add w11, w11, #2
b 1f
0:
// !LR_HAVE_BOTTOM
sub w3, w3, #1 // Handle h-1 lines with the main loop
1:
mov w9, w3 // Backup of h for next loops

1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v25 and v26-v30 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v28.8h}, [x6], x8
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v28.16b, v26.16b
mov v22.16b, v16.16b
mov v23.16b, v17.16b
mov v29.16b, v26.16b

3:
cbz w3, 4f
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8

3:
// Start of vertical loop
subs w3, w3, #2
.macro add5
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v26.8h, v26.8h, v27.8h
add v0.4s, v20.4s, v22.4s
add v1.4s, v21.4s, v23.4s
add v2.8h, v28.8h, v29.8h
add v16.4s, v16.4s, v24.4s
add v17.4s, v17.4s, v25.4s
add v26.8h, v26.8h, v30.8h
add v16.4s, v16.4s, v0.4s
add v17.4s, v17.4s, v1.4s
add v26.8h, v26.8h, v2.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v26.8h}, [x1], x8
.endm
add5
.macro shift2
mov v16.16b, v20.16b
mov v17.16b, v21.16b
mov v26.16b, v28.16b
mov v18.16b, v22.16b
mov v19.16b, v23.16b
mov v27.16b, v29.16b
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v28.16b, v30.16b
.endm
shift2
add x0, x0, x7
add x1, x1, x8
b.le 5f
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8
b 3b

4:
// h == 1, !LR_HAVE_BOTTOM.
// Pad the last row with the only content row, and add.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
add5
b 6f

5:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 6f
// !LR_HAVE_BOTTOM
cbnz w3, 5f
// The intended three edge rows left; output the one at h-2 and
// the past-edge one at h.
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
// Pad the past-edge row from the last content row.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
// The last two rows are already padded properly here.
add5
b 6f

5:
// w3 == -1, two rows left, output one.
// Pad the last two rows from the mid one.
mov v22.16b, v20.16b
mov v23.16b, v21.16b
mov v29.16b, v28.16b
mov v24.16b, v20.16b
mov v25.16b, v21.16b
mov v30.16b, v28.16b
add5
add x0, x0, x7
add x1, x1, x8
b 6f

6: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b

0:
ret
.purgem add5
endfunc

// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
//                              const int w, const int h, const int strength,
//                              const int bitdepth_max);
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
//                              const int w, const int h, const int strength,
//                              const int bitdepth_max);
function sgr_calc_ab1_neon, export=1
clz w9, w5
add x3, x3, #2 // h += 2
movi v31.4s, #9 // n
mov x5, #455
mov x8, #SUM_STRIDE
b sgr_calc_ab_neon
endfunc

function sgr_calc_ab2_neon, export=1
clz w9, w5
add x3, x3, #3 // h += 3
asr x3, x3, #1 // h /= 2
movi v31.4s, #25 // n
mov x5, #164
mov x8, #(2*SUM_STRIDE)
endfunc
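Note on the constants loaded above: 455 and 164 are the one_by_x reciprocal factors for the two box areas, i.e. round(2^12 / n) for n = 9 (3x3) and n = 25 (5x5):

    4096 / 9  = 455.1...  -> 455
    4096 / 25 = 163.84    -> 164

so the multiply by one_by_x followed by the srshr #12 further down divides the x * BB[i] product by the box area.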

function sgr_calc_ab_neon
sub w9, w9, #24 // -bitdepth_min_8
movrel x12, X(sgr_x_by_x)
mov w13, #455 // one_by_x
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
dup v6.8h, w9 // -bitdepth_min_8
movi v19.16b, #5
@ -363,70 +60,213 @@ function sgr_calc_ab_neon
movi v23.8b, #169 // idx of last 2
movi v24.8b, #254 // idx of last 1
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
add x2, x2, #2 // w += 2
add x7, x2, #7
bic x7, x7, #7 // aligned w
sub x7, x8, x7 // increment between rows
movi v29.8h, #1, lsl #8
dup v28.4s, w4
dup v30.4s, w5 // one_by_x
sub x0, x0, #(4*(SUM_STRIDE))
sub x1, x1, #(2*(SUM_STRIDE))
mov x6, x2 // backup of w
dup v30.4s, w13 // one_by_x

sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b

ld1 {v8.4s, v9.4s}, [x5], #32
ld1 {v10.4s, v11.4s}, [x6], #32
ld1 {v12.8h}, [x7], #16
ld1 {v13.8h}, [x8], #16
ld1 {v0.4s, v1.4s}, [x0], #32
ld1 {v2.8h}, [x1], #16
1:
subs x2, x2, #8
ld1 {v0.4s, v1.4s}, [x0] // a
ld1 {v2.8h}, [x1] // b
srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)

cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v1.8b, v1.8b, v5.8b
add v1.8b, v1.8b, v25.8b
uxtl v1.8h, v1.8b // x
add v8.4s, v8.4s, v10.4s
add v9.4s, v9.4s, v11.4s

umull v3.4s, v1.4h, v2.4h // x * BB[i]
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v2.8h, v29.8h, v1.8h // 256 - x
add v12.8h, v12.8h, v13.8h

st1 {v3.4s, v4.4s}, [x0], #32
st1 {v2.8h}, [x1], #16
subs w4, w4, #8
add v0.4s, v0.4s, v8.4s
add v1.4s, v1.4s, v9.4s
add v2.8h, v2.8h, v12.8h

srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
ld1 {v8.4s, v9.4s}, [x5], #32
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
ld1 {v10.4s, v11.4s}, [x6], #32
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)

ld1 {v12.8h}, [x7], #16

cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v5.8b, v1.8b, v5.8b
ld1 {v13.8h}, [x8], #16
add v5.8b, v5.8b, v25.8b
ld1 {v0.4s, v1.4s}, [x0], #32
uxtl v5.8h, v5.8b // x

umull v3.4s, v5.4h, v2.4h // x * BB[i]
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v5.8h, v29.8h, v5.8h // 256 - x
ld1 {v2.8h}, [x1], #16

st1 {v3.4s, v4.4s}, [x2], #32
st1 {v5.8h}, [x3], #16
b.gt 1b

subs x3, x3, #1
b.le 0f
add x0, x0, x7, lsl #2
add x1, x1, x7, lsl #1
mov x2, x6
b 1b
0:
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x30
ret
endfunc
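Following the inline comments in the loop above, the per-pixel a/b update amounts to the sketch below. The helper name and the exact placement of the roundings are illustrative; the NEON code folds them into the uqshrn/uqrshrn/srshr steps, and the initial bitdepth shifts (srshl by -bitdepth_min_8) are omitted here as in the 8bpc case:

    // Scalar sketch of one a/b update (8bpc; names illustrative).
    static void calc_ab_px(int32_t a, int b, int n, int s, int one_by_x,
                           const uint8_t x_by_x[256],
                           int32_t *AA, int16_t *BB)
    {
        const int p = a * n - b * b > 0 ? a * n - b * b : 0; // imax(.., 0)
        const int z = (p * s + (1 << 19)) >> 20;             // uqshrn + uqrshrn
        const int x = x_by_x[z > 255 ? 255 : z];             // imin(z, 255)
        *AA = (x * b * one_by_x + (1 << 11)) >> 12;          // srshr #12
        *BB = (int16_t)(256 - x);
    }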

// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
function sgr_box5_vert_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]

add w4, w4, #2
clz w15, w6 // bitdepth_max
dup v28.4s, w5 // strength

ldp x5, x6, [x0]
ldp x7, x8, [x0, #16]
ldr x0, [x0, #32]
ldp x9, x10, [x1]
ldp x11, x12, [x1, #16]
ldr x1, [x1, #32]

movi v31.4s, #25 // n

sub w15, w15, #24 // -bitdepth_min_8
movrel x13, X(sgr_x_by_x)
mov w14, #164 // one_by_x
ld1 {v16.16b, v17.16b, v18.16b}, [x13]
dup v6.8h, w15 // -bitdepth_min_8
movi v19.16b, #5
movi v24.8b, #254 // idx of last 1
saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
movi v29.8h, #1, lsl #8
dup v30.4s, w14 // one_by_x

sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b

ld1 {v8.4s, v9.4s}, [x5], #32
ld1 {v10.4s, v11.4s}, [x6], #32
ld1 {v12.4s, v13.4s}, [x7], #32
ld1 {v14.4s, v15.4s}, [x8], #32
ld1 {v20.8h}, [x9], #16
ld1 {v21.8h}, [x10], #16
ld1 {v22.8h}, [x11], #16
ld1 {v23.8h}, [x12], #16
ld1 {v0.4s, v1.4s}, [x0], #32
ld1 {v2.8h}, [x1], #16

1:
add v8.4s, v8.4s, v10.4s
add v9.4s, v9.4s, v11.4s
add v12.4s, v12.4s, v14.4s
add v13.4s, v13.4s, v15.4s

add v20.8h, v20.8h, v21.8h
add v22.8h, v22.8h, v23.8h

add v0.4s, v0.4s, v8.4s
add v1.4s, v1.4s, v9.4s
add v2.8h, v2.8h, v20.8h

add v0.4s, v0.4s, v12.4s
add v1.4s, v1.4s, v13.4s
add v2.8h, v2.8h, v22.8h

subs w4, w4, #8

movi v20.8b, #55 // idx of last 5
movi v21.8b, #72 // idx of last 4
movi v22.8b, #101 // idx of last 3
movi v23.8b, #169 // idx of last 2

srshl v0.4s, v0.4s, v7.4s
srshl v1.4s, v1.4s, v7.4s
srshl v4.8h, v2.8h, v6.8h
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v4.4h, v4.4h // b * b
umull2 v4.4s, v4.8h, v4.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
ld1 {v8.4s, v9.4s}, [x5], #32
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
ld1 {v10.4s, v11.4s}, [x6], #32
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)

ld1 {v12.4s, v13.4s}, [x7], #32

cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
ld1 {v14.4s, v15.4s}, [x8], #32
add v25.8b, v25.8b, v26.8b
cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v4.8b
ld1 {v20.8h}, [x9], #16
add v5.8b, v5.8b, v19.8b
add v25.8b, v25.8b, v27.8b
ld1 {v21.8h}, [x10], #16
add v5.8b, v1.8b, v5.8b
ld1 {v22.8h}, [x11], #16
add v5.8b, v5.8b, v25.8b
ld1 {v23.8h}, [x12], #16
uxtl v5.8h, v5.8b // x

ld1 {v0.4s, v1.4s}, [x0], #32
umull v3.4s, v5.4h, v2.4h // x * BB[i]
umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v5.8h, v29.8h, v5.8h // 256 - x
ld1 {v2.8h}, [x1], #16

st1 {v3.4s, v4.4s}, [x2], #32
st1 {v5.8h}, [x3], #16
b.gt 1b

ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
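sgr_box5_vert_neon above fuses the five-row vertical reduction with the same a/b computation, just with n = 25 and one_by_x = 164. The vertical part on its own reduces to this illustrative scalar helper (names are not from this change):

    // Five-row vertical reduction feeding the a/b step.
    static void box5_vert_sum_c(int32_t *out_sumsq, int16_t *out_sum,
                                const int32_t *const sumsq[5],
                                const int16_t *const sum[5], int w)
    {
        for (int x = 0; x < w; x++) {
            int32_t a = 0;
            int b = 0;
            for (int i = 0; i < 5; i++) {
                a += sumsq[i][x];
                b += sum[i][x];
            }
            out_sumsq[x] = a;
            out_sum[x] = (int16_t)b;
        }
    }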

692
third_party/dav1d/src/arm/64/looprestoration_tmpl.S
vendored
@ -30,52 +30,224 @@
#define FILTER_OUT_STRIDE 384

.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
//                                         const pixel *src, const ptrdiff_t stride,
//                                         const int32_t *a, const int16_t *b,
//                                         const int w, const int h);
function sgr_finish_filter1_\bpc\()bpc_neon, export=1
sub x7, x3, #(4*SUM_STRIDE)
add x8, x3, #(4*SUM_STRIDE)
sub x9, x4, #(2*SUM_STRIDE)
add x10, x4, #(2*SUM_STRIDE)
mov x11, #SUM_STRIDE
mov x12, #FILTER_OUT_STRIDE
add x13, x5, #7
bic x13, x13, #7 // Aligned width
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t src_stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]

ldp x7, x8, [x3]
ldp x9, x3, [x3, #16]
ldp x10, x11, [x4]
ldp x12, x4, [x4, #16]

mov x13, #FILTER_OUT_STRIDE
cmp w6, #1
add x2, x1, x2 // src + stride
csel x2, x1, x2, le // if (h <= 1) x2 = x1
add x13, x0, x13, lsl #1

movi v30.8h, #3
movi v31.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x10], #32
ld1 {v2.8h, v3.8h}, [x11], #32
ld1 {v4.8h, v5.8h}, [x12], #32
ld1 {v6.8h, v7.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x9], #48
ld1 {v25.4s, v26.4s, v27.4s}, [x3], #48

2:
ext v8.16b, v0.16b, v1.16b, #2 // [0][1]
ext v9.16b, v2.16b, v3.16b, #2 // [1][1]
ext v10.16b, v4.16b, v5.16b, #2 // [2][1]
ext v11.16b, v0.16b, v1.16b, #4 // [0][2]
ext v12.16b, v2.16b, v3.16b, #4 // [1][2]
ext v13.16b, v4.16b, v5.16b, #4 // [2][2]

add v14.8h, v2.8h, v8.8h // [1][0] + [0][1]
add v15.8h, v9.8h, v10.8h // [1][1] + [2][1]

add v28.8h, v0.8h, v11.8h // [0][0] + [0][2]
add v14.8h, v14.8h, v12.8h // () + [1][2]
add v29.8h, v4.8h, v13.8h // [2][0] + [2][2]

ext v8.16b, v6.16b, v7.16b, #2 // [3][1]
ext v11.16b, v6.16b, v7.16b, #4 // [3][2]

add v14.8h, v14.8h, v15.8h // mid
add v15.8h, v28.8h, v29.8h // corners

add v28.8h, v4.8h, v9.8h // [2][0] + [1][1]
add v29.8h, v10.8h, v8.8h // [2][1] + [3][1]

add v2.8h, v2.8h, v12.8h // [1][0] + [1][2]
add v28.8h, v28.8h, v13.8h // () + [2][2]
add v4.8h, v6.8h, v11.8h // [3][0] + [3][2]

add v0.8h, v28.8h, v29.8h // mid
add v2.8h, v2.8h, v4.8h // corners

shl v4.8h, v14.8h, #2
mla v4.8h, v15.8h, v30.8h // * 3 -> a

shl v0.8h, v0.8h, #2
mla v0.8h, v2.8h, v30.8h // * 3 -> a

ext v8.16b, v16.16b, v17.16b, #4 // [0][1]
ext v9.16b, v17.16b, v18.16b, #4
ext v10.16b, v16.16b, v17.16b, #8 // [0][2]
ext v11.16b, v17.16b, v18.16b, #8
ext v12.16b, v19.16b, v20.16b, #4 // [1][1]
ext v13.16b, v20.16b, v21.16b, #4
add v8.4s, v8.4s, v19.4s // [0][1] + [1][0]
add v9.4s, v9.4s, v20.4s
add v16.4s, v16.4s, v10.4s // [0][0] + [0][2]
add v17.4s, v17.4s, v11.4s
ext v14.16b, v19.16b, v20.16b, #8 // [1][2]
ext v15.16b, v20.16b, v21.16b, #8
add v16.4s, v16.4s, v22.4s // () + [2][0]
add v17.4s, v17.4s, v23.4s
add v28.4s, v12.4s, v14.4s // [1][1] + [1][2]
add v29.4s, v13.4s, v15.4s
ext v10.16b, v22.16b, v23.16b, #4 // [2][1]
ext v11.16b, v23.16b, v24.16b, #4
add v8.4s, v8.4s, v28.4s // mid (incomplete)
add v9.4s, v9.4s, v29.4s

add v19.4s, v19.4s, v14.4s // [1][0] + [1][2]
add v20.4s, v20.4s, v15.4s
add v14.4s, v22.4s, v12.4s // [2][0] + [1][1]
add v15.4s, v23.4s, v13.4s

ext v12.16b, v22.16b, v23.16b, #8 // [2][2]
ext v13.16b, v23.16b, v24.16b, #8
ext v28.16b, v25.16b, v26.16b, #4 // [3][1]
ext v29.16b, v26.16b, v27.16b, #4
add v8.4s, v8.4s, v10.4s // () + [2][1] = mid
add v9.4s, v9.4s, v11.4s
add v14.4s, v14.4s, v10.4s // () + [2][1]
add v15.4s, v15.4s, v11.4s
ext v10.16b, v25.16b, v26.16b, #8 // [3][2]
ext v11.16b, v26.16b, v27.16b, #8
add v16.4s, v16.4s, v12.4s // () + [2][2] = corner
add v17.4s, v17.4s, v13.4s

add v12.4s, v12.4s, v28.4s // [2][2] + [3][1]
add v13.4s, v13.4s, v29.4s
add v25.4s, v25.4s, v10.4s // [3][0] + [3][2]
add v26.4s, v26.4s, v11.4s

add v14.4s, v14.4s, v12.4s // mid
add v15.4s, v15.4s, v13.4s
add v19.4s, v19.4s, v25.4s // corner
add v20.4s, v20.4s, v26.4s

.if \bpc == 8
sub x2, x2, x13
ld1 {v25.8b}, [x1], #8 // src
ld1 {v26.8b}, [x2], #8
.else
sub x2, x2, x13, lsl #1
ld1 {v25.8h}, [x1], #16 // src
ld1 {v26.8h}, [x2], #16
.endif
sub x12, x12, x13
sub x11, x11, x13
sub x11, x11, #4 // We read 4 extra elements from a
sub x14, x11, #4 // We read 8 extra elements from b
mov x13, x5

shl v8.4s, v8.4s, #2
shl v9.4s, v9.4s, #2
mla v8.4s, v16.4s, v31.4s // * 3 -> b
mla v9.4s, v17.4s, v31.4s

.if \bpc == 8
uxtl v25.8h, v25.8b // src
uxtl v26.8h, v26.8b
.endif

shl v14.4s, v14.4s, #2
shl v15.4s, v15.4s, #2
mla v14.4s, v19.4s, v31.4s // * 3 -> b
mla v15.4s, v20.4s, v31.4s

umlal v8.4s, v4.4h, v25.4h // b + a * src
umlal2 v9.4s, v4.8h, v25.8h
umlal v14.4s, v0.4h, v26.4h // b + a * src
umlal2 v15.4s, v0.8h, v26.8h
mov v0.16b, v1.16b
rshrn v8.4h, v8.4s, #9
rshrn2 v8.8h, v9.4s, #9
mov v2.16b, v3.16b
rshrn v14.4h, v14.4s, #9
rshrn2 v14.8h, v15.4s, #9
subs w5, w5, #8
mov v4.16b, v5.16b
st1 {v8.8h}, [x0], #16
mov v6.16b, v7.16b
st1 {v14.8h}, [x13], #16

b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
mov v22.16b, v24.16b
mov v25.16b, v27.16b
ld1 {v1.8h}, [x10], #16
ld1 {v3.8h}, [x11], #16
ld1 {v5.8h}, [x12], #16
ld1 {v7.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x8], #32
ld1 {v23.4s, v24.4s}, [x9], #32
ld1 {v26.4s, v27.4s}, [x3], #32
b 2b

3:
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
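Per the lane comments above, the 3x3-class weighting that finish_filter1 applies is 4x the cross (centre included) plus 3x the corners, on both the 16-bit sums (giving the multiplier a) and the 32-bit sums (giving the offset b), followed by the b + a*src combine with a 9-bit rounding shift. As a scalar sketch (index and stride names illustrative):

    // One output sample of finish_filter1 (B: 16-bit sums, A: 32-bit sums).
    static void finish_filter1_px(int16_t *tmp, const uint16_t *src,
                                  const int32_t *A, const int16_t *B,
                                  int stride, int i)
    {
        const int a = 4 * (B[i] + B[i-1] + B[i+1] + B[i-stride] + B[i+stride])
                    + 3 * (B[i-stride-1] + B[i-stride+1] +
                           B[i+stride-1] + B[i+stride+1]);
        const int32_t b = 4 * (A[i] + A[i-1] + A[i+1] +
                               A[i-stride] + A[i+stride])
                        + 3 * (A[i-stride-1] + A[i-stride+1] +
                               A[i+stride-1] + A[i+stride+1]);
        tmp[i] = (int16_t)((b + a * src[i] + (1 << 8)) >> 9); // rshrn #9
    }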

// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
//                                           const int32_t **a, const int16_t **b,
//                                           const int w, const int w1,
//                                           const int bitdepth_max);
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
ldp x7, x8, [x1]
ldr x1, [x1, #16]
ldp x9, x10, [x2]
ldr x2, [x2, #16]

dup v31.8h, w4
dup v30.8h, w5

movi v6.8h, #3
movi v7.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x4], #32
ld1 {v4.8h, v5.8h}, [x10], #32
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x10], #32
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x1], #48

2:
subs x5, x5, #8
ext v25.16b, v0.16b, v1.16b, #2 // -stride
ext v26.16b, v2.16b, v3.16b, #2 // 0
ext v27.16b, v4.16b, v5.16b, #2 // +stride
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
ext v29.16b, v2.16b, v3.16b, #4 // +1
ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
add v2.8h, v2.8h, v25.8h // -1, -stride
ext v25.16b, v4.16b, v5.16b, #4 // +1+stride
add v26.8h, v26.8h, v27.8h // 0, +stride
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
add v2.8h, v2.8h, v26.8h
add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
add v4.8h, v4.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v2.8h, v29.8h // +1
add v0.8h, v0.8h, v4.8h

@ -85,7 +257,7 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
ext v28.16b, v17.16b, v18.16b, #8
ext v29.16b, v19.16b, v20.16b, #4 // 0
ext v30.16b, v20.16b, v21.16b, #4
ext v4.16b, v20.16b, v21.16b, #4
mla v2.8h, v0.8h, v6.8h // * 3 -> a
add v25.4s, v25.4s, v19.4s // -stride, -1
add v26.4s, v26.4s, v20.4s
@ -96,22 +268,22 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
add v16.4s, v16.4s, v22.4s // -1+stride
add v17.4s, v17.4s, v23.4s
add v29.4s, v29.4s, v27.4s // 0, +1
add v30.4s, v30.4s, v28.4s
add v4.4s, v4.4s, v28.4s
add v25.4s, v25.4s, v29.4s
add v26.4s, v26.4s, v30.4s
add v26.4s, v26.4s, v4.4s
ext v27.16b, v22.16b, v23.16b, #4 // +stride
ext v28.16b, v23.16b, v24.16b, #4
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
ext v30.16b, v23.16b, v24.16b, #8
ext v4.16b, v23.16b, v24.16b, #8
.if \bpc == 8
ld1 {v19.8b}, [x1], #8 // src
ld1 {v19.8b}, [x0] // src
.else
ld1 {v19.8h}, [x1], #16 // src
ld1 {v19.8h}, [x0] // src
.endif
add v25.4s, v25.4s, v27.4s // +stride
add v26.4s, v26.4s, v28.4s
add v16.4s, v16.4s, v29.4s // +1+stride
add v17.4s, v17.4s, v30.4s
add v17.4s, v17.4s, v4.4s
shl v25.4s, v25.4s, #2
shl v26.4s, v26.4s, #2
mla v25.4s, v16.4s, v7.4s // * 3 -> b
@ -125,61 +297,68 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1
mov v2.16b, v3.16b
rshrn v25.4h, v25.4s, #9
rshrn2 v25.8h, v26.4s, #9
mov v4.16b, v5.16b
st1 {v25.8h}, [x0], #16

b.le 3f
subs w3, w3, #8

// weighted1
shl v19.8h, v19.8h, #4 // u
mov v4.16b, v5.16b

sub v25.8h, v25.8h, v19.8h // t1 - u
ld1 {v1.8h}, [x9], #16
ushll v26.4s, v19.4h, #7 // u << 7
ushll2 v27.4s, v19.8h, #7 // u << 7
ld1 {v3.8h}, [x10], #16
smlal v26.4s, v25.4h, v31.4h // v
smlal2 v27.4s, v25.8h, v31.8h // v
ld1 {v5.8h}, [x2], #16
.if \bpc == 8
rshrn v26.4h, v26.4s, #11
rshrn2 v26.8h, v27.4s, #11
mov v16.16b, v18.16b
sqxtun v26.8b, v26.8h
mov v19.16b, v21.16b
mov v22.16b, v24.16b
ld1 {v1.8h}, [x9], #16
ld1 {v3.8h}, [x4], #16
ld1 {v5.8h}, [x10], #16
st1 {v26.8b}, [x0], #8
.else
sqrshrun v26.4h, v26.4s, #11
sqrshrun2 v26.8h, v27.4s, #11
mov v16.16b, v18.16b
umin v26.8h, v26.8h, v30.8h
mov v19.16b, v21.16b
mov v22.16b, v24.16b
st1 {v26.8h}, [x0], #16
.endif

b.le 3f
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x3], #32
ld1 {v23.4s, v24.4s}, [x8], #32
ld1 {v20.4s, v21.4s}, [x8], #32
ld1 {v23.4s, v24.4s}, [x1], #32
b 2b

3:
subs x6, x6, #1
b.le 0f
mov x5, x13
add x0, x0, x12, lsl #1
add x1, x1, x2
add x3, x3, x11, lsl #2
add x7, x7, x11, lsl #2
add x8, x8, x11, lsl #2
add x4, x4, x14, lsl #1
add x9, x9, x14, lsl #1
add x10, x10, x14, lsl #1
b 1b
0:
ret
endfunc

// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
//                                         const pixel *src, const ptrdiff_t stride,
//                                         const int32_t *a, const int16_t *b,
//                                         const int w, const int h);
function sgr_finish_filter2_\bpc\()bpc_neon, export=1
add x7, x3, #(4*(SUM_STRIDE))
sub x3, x3, #(4*(SUM_STRIDE))
add x8, x4, #(2*(SUM_STRIDE))
sub x4, x4, #(2*(SUM_STRIDE))
mov x9, #(2*SUM_STRIDE)
// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]

ldp x3, x7, [x3]
ldp x4, x8, [x4]
mov x10, #FILTER_OUT_STRIDE
add x11, x5, #7
bic x11, x11, #7 // Aligned width
.if \bpc == 8
sub x2, x2, x11
.else
sub x2, x2, x11, lsl #1
.endif
sub x10, x10, x11
sub x9, x9, x11
sub x9, x9, #4 // We read 4 extra elements from a
sub x12, x9, #4 // We read 8 extra elements from b
mov x11, x5
cmp w6, #1
add x2, x1, x2 // src + stride
csel x2, x1, x2, le // if (h <= 1) x2 = x1
add x10, x0, x10, lsl #1
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
@ -191,7 +370,6 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
subs x5, x5, #8
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
@ -201,6 +379,9 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h

mul v8.8h, v25.8h, v4.8h // * 5
mla v8.8h, v23.8h, v6.8h // * 6

ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
@ -213,8 +394,10 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v31.8b}, [x1], #8
ld1 {v30.8b}, [x2], #8
.else
ld1 {v31.8h}, [x1], #16
ld1 {v30.8h}, [x2], #16
.endif
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
@ -223,6 +406,11 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s

mul v9.4s, v19.4s, v5.4s // * 5
mla v9.4s, v24.4s, v7.4s // * 6
mul v10.4s, v20.4s, v5.4s // * 5
mla v10.4s, v25.4s, v7.4s // * 6

add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
@ -234,16 +422,23 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1

.if \bpc == 8
uxtl v31.8h, v31.8b
uxtl v30.8h, v30.8b
.endif
umlal v16.4s, v0.4h, v31.4h // b + a * src
umlal2 v17.4s, v0.8h, v31.8h
umlal v9.4s, v8.4h, v30.4h // b + a * src
umlal2 v10.4s, v8.8h, v30.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
rshrn v9.4h, v9.4s, #8
rshrn2 v9.8h, v10.4s, #8
subs w5, w5, #8
mov v2.16b, v3.16b
st1 {v16.8h}, [x0], #16
st1 {v16.8h}, [x0], #16
st1 {v9.8h}, [x10], #16

b.le 3f
b.le 9f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
ld1 {v1.8h}, [x4], #16
@ -252,201 +447,160 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b

3:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
add x3, x3, x9, lsl #2
add x7, x7, x9, lsl #2
add x4, x4, x12, lsl #1
add x8, x8, x12, lsl #1
mov x13, x3
mov x14, x4

ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48

4:
subs x5, x5, #8
ext v23.16b, v0.16b, v1.16b, #4 // +1
ext v22.16b, v0.16b, v1.16b, #2 // 0
add v0.8h, v0.8h, v23.8h // -1, +1

ext v24.16b, v16.16b, v17.16b, #4 // 0
ext v25.16b, v17.16b, v18.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1
ext v27.16b, v17.16b, v18.16b, #8
mul v2.8h, v22.8h, v6.8h // * 6
mla v2.8h, v0.8h, v4.8h // * 5 -> a
.if \bpc == 8
ld1 {v31.8b}, [x1], #8
.else
ld1 {v31.8h}, [x1], #16
.endif
add v16.4s, v16.4s, v26.4s // -1, +1
add v17.4s, v17.4s, v27.4s
.if \bpc == 8
uxtl v31.8h, v31.8b
.endif
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v24.4s, v24.4s, v7.4s // * 6
mla v24.4s, v16.4s, v5.4s // * 5 -> b
mul v25.4s, v25.4s, v7.4s // * 6
mla v25.4s, v17.4s, v5.4s // * 5 -> b

umlal v24.4s, v2.4h, v31.4h // b + a * src
umlal2 v25.4s, v2.8h, v31.8h
mov v0.16b, v1.16b
rshrn v24.4h, v24.4s, #8
rshrn2 v24.8h, v25.4s, #8
mov v16.16b, v18.16b
st1 {v24.8h}, [x0], #16

b.le 5f
ld1 {v1.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x3], #32
b 4b

5:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
mov x3, x13 // Rewind x3/x4 to where they started
mov x4, x14
b 1b
0:
9:
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
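The 5x5-class filter above uses the weights 6 and 5 instead: full rows combine 6x the vertical neighbours with 5x the diagonals (9-bit rounding shift), while the in-between rows combine 6x the centre with 5x its horizontal neighbours (8-bit shift). Scalar sketch with illustrative names (A: 32-bit sums, B: 16-bit sums, both taken from the nearest full row):

    // finish_filter2 weighting, one sample (illustrative).
    static void finish_filter2_px(int16_t *tmp, const uint16_t *src,
                                  const int32_t *A, const int16_t *B,
                                  int stride, int i, int odd_row)
    {
        if (!odd_row) {
            const int a = 6 * (B[i-stride] + B[i+stride])
                        + 5 * (B[i-stride-1] + B[i-stride+1] +
                               B[i+stride-1] + B[i+stride+1]);
            const int32_t b = 6 * (A[i-stride] + A[i+stride])
                            + 5 * (A[i-stride-1] + A[i-stride+1] +
                                   A[i+stride-1] + A[i+stride+1]);
            tmp[i] = (int16_t)((b + a * src[i] + (1 << 8)) >> 9);
        } else {
            const int a = 6 * B[i] + 5 * (B[i-1] + B[i+1]);
            const int32_t b = 6 * A[i] + 5 * (A[i-1] + A[i+1]);
            tmp[i] = (int16_t)((b + a * src[i] + (1 << 7)) >> 8);
        }
    }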

// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
//                                    const pixel *src, const ptrdiff_t src_stride,
//                                    const int16_t *t1, const int w, const int h,
//                                    const int wt, const int bitdepth_max);
function sgr_weighted1_\bpc\()bpc_neon, export=1
.if \bpc == 16
ldr w8, [sp]
.endif
dup v31.8h, w7
cmp x6, #2
.if \bpc == 16
dup v30.8h, w8
.endif
add x9, x0, x1
add x10, x2, x3
add x11, x4, #2*FILTER_OUT_STRIDE
mov x7, #(4*FILTER_OUT_STRIDE)
lsl x1, x1, #1
lsl x3, x3, #1
add x8, x5, #7
bic x8, x8, #7 // Aligned width
.if \bpc == 8
sub x1, x1, x8
sub x3, x3, x8
.else
sub x1, x1, x8, lsl #1
sub x3, x3, x8, lsl #1
.endif
sub x7, x7, x8, lsl #1
mov x8, x5
b.lt 2f
1:
.if \bpc == 8
ld1 {v0.8b}, [x2], #8
ld1 {v4.8b}, [x10], #8
.else
ld1 {v0.8h}, [x2], #16
ld1 {v4.8h}, [x10], #16
.endif
ld1 {v1.8h}, [x4], #16
ld1 {v5.8h}, [x11], #16
subs x5, x5, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
ushll v4.8h, v4.8b, #4 // u
.else
shl v0.8h, v0.8h, #4 // u
shl v4.8h, v4.8h, #4 // u
.endif
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v5.8h, v5.8h, v4.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
ushll v6.4s, v4.4h, #7 // u << 7
ushll2 v7.4s, v4.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
smlal v6.4s, v5.4h, v31.4h // v
smlal2 v7.4s, v5.8h, v31.8h // v
.if \bpc == 8
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
rshrn v6.4h, v6.4s, #11
rshrn2 v6.8h, v7.4s, #11
sqxtun v2.8b, v2.8h
sqxtun v6.8b, v6.8h
st1 {v2.8b}, [x0], #8
st1 {v6.8b}, [x9], #8
.else
sqrshrun v2.4h, v2.4s, #11
sqrshrun2 v2.8h, v3.4s, #11
sqrshrun v6.4h, v6.4s, #11
sqrshrun2 v6.8h, v7.4s, #11
umin v2.8h, v2.8h, v30.8h
umin v6.8h, v6.8h, v30.8h
st1 {v2.8h}, [x0], #16
st1 {v6.8h}, [x9], #16
.endif
b.gt 1b
// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                           const int32_t **a,
//                                           const int16_t **b,
//                                           const int w, const int h,
//                                           const int w1,
//                                           const int bitdepth_max);
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x30]!
str d10, [sp, #0x10]
stp d14, d15, [sp, #0x20]

sub x6, x6, #2
cmp x6, #1
b.lt 0f
mov x5, x8
add x0, x0, x1
add x9, x9, x1
add x2, x2, x3
add x10, x10, x3
add x4, x4, x7
add x11, x11, x7
b.eq 2f
b 1b
dup v14.8h, w6
dup v15.8h, w7

ldp x2, x7, [x2]
ldp x3, x8, [x3]
cmp w5, #1
add x1, x0, x1 // src + stride
// if (h <= 1), set the pointer to the second row to any dummy buffer
// we can clobber (x2 in this case)
csel x1, x2, x1, le
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
movi v7.4s, #6
1:
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x2], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h

mul v8.8h, v25.8h, v4.8h // * 5
mla v8.8h, v23.8h, v6.8h // * 6

ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
ext v25.16b, v20.16b, v21.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
ext v27.16b, v17.16b, v18.16b, #8
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
ext v29.16b, v20.16b, v21.16b, #8
mul v0.8h, v0.8h, v4.8h // * 5
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v0.8b}, [x2], #8
ld1 {v31.8b}, [x0] // src
ld1 {v30.8b}, [x1]
.else
ld1 {v0.8h}, [x2], #16
ld1 {v31.8h}, [x0] // src
ld1 {v30.8h}, [x1]
.endif
ld1 {v1.8h}, [x4], #16
subs x5, x5, #8
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
add v20.4s, v20.4s, v29.4s
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s

mul v9.4s, v19.4s, v5.4s // * 5
mla v9.4s, v24.4s, v7.4s // * 6
mul v10.4s, v20.4s, v5.4s // * 5
mla v10.4s, v25.4s, v7.4s // * 6

add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v16.4s, v16.4s, v5.4s // * 5
mla v16.4s, v22.4s, v7.4s // * 6
mul v17.4s, v17.4s, v5.4s // * 5
mla v17.4s, v23.4s, v7.4s // * 6

.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
.else
shl v0.8h, v0.8h, #4 // u
uxtl v31.8h, v31.8b
uxtl v30.8h, v30.8b
.endif
sub v1.8h, v1.8h, v0.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
umlal v16.4s, v0.4h, v31.4h // b + a * src
umlal2 v17.4s, v0.8h, v31.8h
umlal v9.4s, v8.4h, v30.4h // b + a * src
umlal2 v10.4s, v8.8h, v30.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
rshrn v9.4h, v9.4s, #8
rshrn2 v9.8h, v10.4s, #8

subs w4, w4, #8

// weighted1
shl v31.8h, v31.8h, #4 // u
shl v30.8h, v30.8h, #4
mov v2.16b, v3.16b

sub v16.8h, v16.8h, v31.8h // t1 - u
sub v9.8h, v9.8h, v30.8h
ld1 {v1.8h}, [x3], #16
ushll v22.4s, v31.4h, #7 // u << 7
ushll2 v23.4s, v31.8h, #7
ushll v24.4s, v30.4h, #7
ushll2 v25.4s, v30.8h, #7
ld1 {v3.8h}, [x8], #16
smlal v22.4s, v16.4h, v14.4h // v
smlal2 v23.4s, v16.8h, v14.8h
mov v16.16b, v18.16b
smlal v24.4s, v9.4h, v14.4h
smlal2 v25.4s, v9.8h, v14.8h
mov v19.16b, v21.16b
.if \bpc == 8
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
sqxtun v2.8b, v2.8h
st1 {v2.8b}, [x0], #8
rshrn v22.4h, v22.4s, #11
rshrn2 v22.8h, v23.4s, #11
rshrn v23.4h, v24.4s, #11
rshrn2 v23.8h, v25.4s, #11
sqxtun v22.8b, v22.8h
sqxtun v23.8b, v23.8h
st1 {v22.8b}, [x0], #8
st1 {v23.8b}, [x1], #8
.else
sqrshrun v2.4h, v2.4s, #11
sqrshrun2 v2.8h, v3.4s, #11
umin v2.8h, v2.8h, v30.8h
st1 {v2.8h}, [x0], #16
sqrshrun v22.4h, v22.4s, #11
sqrshrun2 v22.8h, v23.4s, #11
sqrshrun v23.4h, v24.4s, #11
sqrshrun2 v23.8h, v25.4s, #11
umin v22.8h, v22.8h, v15.8h
umin v23.8h, v23.8h, v15.8h
st1 {v22.8h}, [x0], #16
st1 {v23.8h}, [x1], #16
.endif
b.gt 2b
0:

b.le 3f
ld1 {v17.4s, v18.4s}, [x2], #32
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b

3:
ldp d14, d15, [sp, #0x20]
ldr d10, [sp, #0x10]
ldp d8, d9, [sp], 0x30
ret
endfunc
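The "weighted1" epilogue above blends the filtered value t back into the source sample: u = src << 4, then v = (u << 7) + w1 * (t - u), and the result is (v + (1 << 10)) >> 11, clamped to the pixel range (sqxtun for 8bpc, umin against bitdepth_max otherwise). Scalar sketch (helper name illustrative):

    // One sample of the w1 blend; clamp matches the sqxtun/umin step.
    static uint16_t weighted1_px(uint16_t px, int16_t t, int w1, int bd_max)
    {
        const int u = px << 4;                      // shl/ushll #4
        const int32_t v = (u << 7) + w1 * (t - u);  // smlal
        int r = (int)((v + (1 << 10)) >> 11);       // rshrn/sqrshrun #11
        if (r < 0) r = 0;
        if (r > bd_max) r = bd_max;
        return (uint16_t)r;
    }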

@ -461,7 +615,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
.else
ldp x8, x9, [sp]
.endif
cmp x7, #2
cmp w7, #2
add x10, x0, x1
add x11, x2, x3
add x12, x4, #2*FILTER_OUT_STRIDE
@ -483,7 +637,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
sub x3, x3, x9, lsl #1
.endif
sub x8, x8, x9, lsl #1
mov x9, x6
mov w9, w6
b.lt 2f
1:
.if \bpc == 8
@ -497,7 +651,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
ld1 {v17.8h}, [x12], #16
ld1 {v2.8h}, [x5], #16
ld1 {v18.8h}, [x13], #16
subs x6, x6, #8
subs w6, w6, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
ushll v16.8h, v16.8b, #4 // u
@ -542,10 +696,10 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
.endif
b.gt 1b

subs x7, x7, #2
cmp x7, #1
subs w7, w7, #2
cmp w7, #1
b.lt 0f
mov x6, x9
mov w6, w9
add x0, x0, x1
add x10, x10, x1
add x2, x2, x3
@ -565,7 +719,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1
.endif
ld1 {v1.8h}, [x4], #16
ld1 {v2.8h}, [x5], #16
subs x6, x6, #8
subs w6, w6, #8
.if \bpc == 8
ushll v0.8h, v0.8b, #4 // u
.else

201
third_party/dav1d/src/arm/64/refmvs.S
vendored
@ -89,3 +89,204 @@ L(splat_tbl):
.hword L(splat_tbl) - 20b
.hword L(splat_tbl) - 10b
endfunc

const mv_tbls, align=4
.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
.byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
.byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
.byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

const mask_mult, align=4
.byte 1, 2, 1, 2, 0, 0, 0, 0
endconst

// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                           refmvs_block **rr, const uint8_t *ref_sign,
//                           int col_end8, int row_end8,
//                           int col_start8, int row_start8)
function save_tmvs_neon, export=1
AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp, #-16]!
mov x29, sp

movi v30.8b, #0
ld1 {v31.8b}, [x3]
adr x8, L(save_tmvs_tbl)
movrel x16, mask_mult
movrel x13, mv_tbls
ld1 {v29.8b}, [x16]
ext v31.8b, v30.8b, v31.8b, #7 // [0, ref_sign]
mov w15, #5
mov w14, #12*2
sxtw x4, w4
sxtw x6, w6
mul w1, w1, w15 // stride *= 5
sub w5, w5, w7 // h = row_end8 - row_start8
lsl w7, w7, #1 // row_start8 <<= 1
1:
mov w15, #5
and w9, w7, #30 // (y & 15) * 2
ldr x9, [x2, w9, uxtw #3] // b = rr[(y & 15) * 2]
add x9, x9, #12 // &b[... + 1]
madd x10, x4, x14, x9 // end_cand_b = &b[col_end8*2 + 1]
madd x9, x6, x14, x9 // cand_b = &b[x*2 + 1]

madd x3, x6, x15, x0 // &rp[x]

2:
ldrb w11, [x9, #10] // cand_b->bs
ld1 {v0.16b}, [x9] // cand_b->mv
add x11, x8, w11, uxtw #2
ldr h1, [x9, #8] // cand_b->ref
ldrh w12, [x11] // bw8
mov x15, x8
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
cmp x9, x10
mov v2.8b, v0.8b
b.ge 3f

ldrb w15, [x9, #10] // cand_b->bs
add x16, x9, #8
ld1 {v4.16b}, [x9] // cand_b->mv
add x15, x8, w15, uxtw #2
ld1 {v1.h}[1], [x16] // cand_b->ref
ldrh w12, [x15] // bw8
add x9, x9, w12, uxtw #1 // cand_b += bw8*2
trn1 v2.2d, v0.2d, v4.2d

3:
abs v2.8h, v2.8h // abs(mv[].xy)
tbl v1.8b, {v31.16b}, v1.8b // ref_sign[ref]
ushr v2.8h, v2.8h, #12 // abs(mv[].xy) >> 12
umull v1.8h, v1.8b, v29.8b // ref_sign[ref] * {1, 2}
cmeq v2.4s, v2.4s, #0 // abs(mv[].xy) <= 4096
xtn v2.4h, v2.4s // abs() condition to 16 bit
and v1.8b, v1.8b, v2.8b // h[0-3] contains conditions for mv[0-1]
addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0]
umov w16, v1.h[0] // Extract case for first block
umov w17, v1.h[1]
ldrh w11, [x11, #2] // Fetch jump table entry
ldrh w15, [x15, #2]
ldr q1, [x13, w16, uxtw #4] // Load permutation table based on case
ldr q5, [x13, w17, uxtw #4]
sub x11, x8, w11, uxtw // Find jump table target
sub x15, x8, w15, uxtw
tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
tbl v4.16b, {v4.16b}, v5.16b

// v1 follows on v0, with another 3 full repetitions of the pattern.
ext v1.16b, v0.16b, v0.16b, #1
ext v5.16b, v4.16b, v4.16b, #1
// v2 ends with 3 complete repetitions of the pattern.
ext v2.16b, v0.16b, v1.16b, #4
ext v6.16b, v4.16b, v5.16b, #4

blr x11
b.ge 4f // if (cand_b >= end)
mov v0.16b, v4.16b
mov v1.16b, v5.16b
mov v2.16b, v6.16b
cmp x9, x10
blr x15
b.lt 2b // if (cand_b < end)

4:
subs w5, w5, #1 // h--
add w7, w7, #2 // y += 2
add x0, x0, x1 // rp += stride
b.gt 1b

ldp x29, x30, [sp], #16
AARCH64_VALIDATE_LINK_REGISTER
ret

10:
AARCH64_VALID_JUMP_TARGET
add x16, x3, #4
st1 {v0.s}[0], [x3]
st1 {v0.b}[4], [x16]
add x3, x3, #5
ret
20:
AARCH64_VALID_JUMP_TARGET
add x16, x3, #8
st1 {v0.d}[0], [x3]
st1 {v0.h}[4], [x16]
add x3, x3, #2*5
ret
40:
AARCH64_VALID_JUMP_TARGET
st1 {v0.16b}, [x3]
str s1, [x3, #16]
add x3, x3, #4*5
ret
80:
AARCH64_VALID_JUMP_TARGET
// This writes 6 full entries plus 2 extra bytes
st1 {v0.16b, v1.16b}, [x3]
// Write the last few, overlapping with the first write.
stur q2, [x3, #(8*5-16)]
add x3, x3, #8*5
ret
160:
AARCH64_VALID_JUMP_TARGET
add x16, x3, #6*5
add x17, x3, #12*5
// This writes 6 full entries plus 2 extra bytes
st1 {v0.16b, v1.16b}, [x3]
// Write another 6 full entries, slightly overlapping with the first set
st1 {v0.16b, v1.16b}, [x16]
// Write 8 bytes (one full entry) after the first 12
st1 {v0.8b}, [x17]
// Write the last 3 entries
str q2, [x3, #(16*5-16)]
add x3, x3, #16*5
ret

L(save_tmvs_tbl):
.hword 16 * 12
.hword L(save_tmvs_tbl) - 160b
.hword 16 * 12
.hword L(save_tmvs_tbl) - 160b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 8 * 12
.hword L(save_tmvs_tbl) - 80b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 4 * 12
.hword L(save_tmvs_tbl) - 40b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 2 * 12
.hword L(save_tmvs_tbl) - 20b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
.hword 1 * 12
.hword L(save_tmvs_tbl) - 10b
endfunc
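The jump-table targets above all emit the same packed layout: each 8x8 column gets one 5-byte refmvs_temporal_block ({mv, ref}), replicated bw8 times, which is why the permuted vectors repeat the pattern and the wide stores are allowed to overlap. A scalar model of one store run (types simplified, helper name illustrative, assuming <string.h>):

    // Scalar model of one jump-table store run.
    static void store_tmv_run(uint8_t *rp, const uint8_t mv[4],
                              uint8_t ref, int bw8)
    {
        for (int i = 0; i < bw8; i++) {
            memcpy(rp, mv, 4); // motion vector, 4 bytes
            rp[4] = ref;       // reference index, 1 byte
            rp += 5;           // one packed refmvs_temporal_block
        }
    }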
|
||||
|
848
third_party/dav1d/src/arm/looprestoration.h
vendored
848
third_party/dav1d/src/arm/looprestoration.h
vendored
@ -105,6 +105,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
}
#endif

#if ARCH_ARM
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
                                const pixel (*left)[4],
                                const pixel *src, const ptrdiff_t stride,
@ -246,6 +247,853 @@ static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
                                  tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
}

#else
static void rotate(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
    int32_t *tmp32 = sumsq_ptrs[0];
    int16_t *tmp16 = sum_ptrs[0];
    for (int i = 0; i < n - 1; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i+1];
        sum_ptrs[i] = sum_ptrs[i+1];
    }
    sumsq_ptrs[n - 1] = tmp32;
    sum_ptrs[n - 1] = tmp16;
}
static void rotate5_x2(int32_t **sumsq_ptrs, int16_t **sum_ptrs) {
    int32_t *tmp32[2];
    int16_t *tmp16[2];
    for (int i = 0; i < 2; i++) {
        tmp32[i] = sumsq_ptrs[i];
        tmp16[i] = sum_ptrs[i];
    }
    for (int i = 0; i < 3; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i+2];
        sum_ptrs[i] = sum_ptrs[i+2];
    }
    for (int i = 0; i < 2; i++) {
        sumsq_ptrs[3 + i] = tmp32[i];
        sum_ptrs[3 + i] = tmp16[i];
    }
}

static void rotate_ab_3(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 3);
}

static void rotate_ab_2(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 2);
}

static void rotate_ab_4(int32_t **A_ptrs, int16_t **B_ptrs) {
    rotate(A_ptrs, B_ptrs, 4);
}
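A self-contained sketch (illustration only, not part of dav1d) of what rotate() does to the ring of row pointers: the oldest row's buffers move to the back so they can be overwritten by the next incoming row, while the remaining rows shift forward.

#include <stdint.h>
#include <stdio.h>

/* Copy of the rotation logic above, renamed for the demo. */
static void rotate_demo(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
    int32_t *tmp32 = sumsq_ptrs[0];
    int16_t *tmp16 = sum_ptrs[0];
    for (int i = 0; i < n - 1; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
        sum_ptrs[i] = sum_ptrs[i + 1];
    }
    sumsq_ptrs[n - 1] = tmp32;
    sum_ptrs[n - 1] = tmp16;
}

int main(void) {
    int32_t r0, r1, r2;
    int16_t s0, s1, s2;
    int32_t *sumsq[3] = { &r0, &r1, &r2 };
    int16_t *sum[3]   = { &s0, &s1, &s2 };
    rotate_demo(sumsq, sum, 3);
    /* The oldest row (&r0/&s0) is now last and will be overwritten next. */
    printf("%d\n", sumsq[2] == &r0 && sum[2] == &s0); /* prints 1 */
    return 0;
}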
void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum,
                                    const pixel (*left)[4],
                                    const pixel *src, const int w,
                                    const enum LrEdgeFlags edges);
void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum,
                                    const pixel (*left)[4],
                                    const pixel *src, const int w,
                                    const enum LrEdgeFlags edges);
void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3,
                                     int32_t *sumsq5, int16_t *sum5,
                                     const pixel (*left)[4],
                                     const pixel *src, const int w,
                                     const enum LrEdgeFlags edges);

void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
                              int32_t *AA, int16_t *BB,
                              const int w, const int s,
                              const int bitdepth_max);
void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
                              int32_t *AA, int16_t *BB,
                              const int w, const int s,
                              const int bitdepth_max);

void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst,
                                          int32_t **A_ptrs, int16_t **B_ptrs,
                                          const int w, const int w1
                                          HIGHBD_DECL_SUFFIX);
void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride,
                                          int32_t **A_ptrs, int16_t **B_ptrs,
                                          const int w, const int h,
                                          const int w1 HIGHBD_DECL_SUFFIX);

void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src,
                                              const ptrdiff_t src_stride,
                                              int32_t **A_ptrs,
                                              int16_t **B_ptrs,
                                              const int w, const int h);
void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src,
                                              const ptrdiff_t src_stride,
                                              int32_t **A_ptrs, int16_t **B_ptrs,
                                              const int w, const int h);
void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
                                   const pixel *src, const ptrdiff_t src_stride,
                                   const int16_t *t1, const int16_t *t2,
                                   const int w, const int h,
                                   const int16_t wt[2] HIGHBD_DECL_SUFFIX);

static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w, int s, int bitdepth_max) {
    // box3_v + calc_ab1
    dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
    rotate(sumsq, sum, 3);
}

static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w, int s, int bitdepth_max) {
    // box5_v + calc_ab2
    dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
    rotate5_x2(sumsq, sum);
}

static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum,
                             int32_t *AA, int16_t *BB,
                             const pixel (*left)[4],
                             const pixel *src, const int w,
                             const int s,
                             const enum LrEdgeFlags edges,
                             const int bitdepth_max) {
    BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges);
    sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max);
}


static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride,
                             int32_t **A_ptrs, int16_t **B_ptrs, const int w,
                             const int w1 HIGHBD_DECL_SUFFIX) {
    BF(dav1d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs,
                                         w, w1 HIGHBD_TAIL_SUFFIX);
    *dst += PXSTRIDE(stride);
    rotate_ab_3(A_ptrs, B_ptrs);
}

static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride,
                             int32_t **A_ptrs, int16_t **B_ptrs,
                             const int w, const int h, const int w1
                             HIGHBD_DECL_SUFFIX) {
    BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs,
                                         w, h, w1 HIGHBD_TAIL_SUFFIX);
    *dst += 2*PXSTRIDE(stride);
    rotate_ab_2(A_ptrs, B_ptrs);
}

static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride,
                                int32_t **A5_ptrs, int16_t **B5_ptrs,
                                int32_t **A3_ptrs, int16_t **B3_ptrs,
                                const int w, const int h,
                                const int w0, const int w1 HIGHBD_DECL_SUFFIX) {
#define FILTER_OUT_STRIDE 384
    ALIGN_STK_16(int16_t, tmp5, 2*FILTER_OUT_STRIDE,);
    ALIGN_STK_16(int16_t, tmp3, 2*FILTER_OUT_STRIDE,);

    BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride,
                                             A5_ptrs, B5_ptrs, w, h);
    BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride,
                                             A3_ptrs, B3_ptrs, w, h);
    const int16_t wt[2] = { w0, w1 };
    BF(dav1d_sgr_weighted2, neon)(*dst, stride, *dst, stride,
                                  tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX);
    *dst += h*PXSTRIDE(stride);
    rotate_ab_2(A5_ptrs, B5_ptrs);
    rotate_ab_4(A3_ptrs, B3_ptrs);
}


static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
#define BUF_STRIDE (384 + 16)
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 3 + 16,);
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
    int16_t *sum_ptrs[3], *sum_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 3 + 16,);
    int32_t *A_ptrs[3];
    int16_t *B_ptrs[3];
    for (int i = 0; i < 3; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[1];
        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[1];
        sum_ptrs[2] = sum_rows[2];

        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_1;

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;
    } else {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];

        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[2] = sumsq_rows[1];
        sum_ptrs[2] = sum_rows[1];

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_ab_3(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[2] = sum_rows[2];
    }

    do {
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);

        sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    lpf_bottom += PXSTRIDE(stride);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

output_1:
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_3(A_ptrs, B_ptrs);
    goto output_1;
}
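In scalar form, the vert_1/vert_2 padding paths above amount to a 3-tap vertical box sum with edge replication. A toy model (illustration only, not dav1d code; it ignores the squared sums and the a/b calculation, and assumes row_sums holds one horizontal sum per row with h >= 1):

#include <stdint.h>

static void box3_vert_scalar(const int32_t *row_sums, int h, int32_t *out) {
    for (int y = 0; y < h; y++) {
        const int32_t above = row_sums[y > 0     ? y - 1 : 0    ]; /* replicate top */
        const int32_t below = row_sums[y < h - 1 ? y + 1 : h - 1]; /* replicate bottom */
        out[y] = above + row_sums[y] + below;
    }
}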
static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
    int16_t *sum_ptrs[5], *sum_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A_ptrs[2];
    int16_t *B_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[1];
        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[1];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    } else {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sumsq_ptrs[3] = sumsq_rows[0];
        sumsq_ptrs[4] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];
        sum_ptrs[3] = sum_rows[0];
        sum_ptrs[4] = sum_rows[0];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[4] = sumsq_rows[1];
        sum_ptrs[4] = sum_rows[1];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A_ptrs, B_ptrs);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    }

    do {
        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                   NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                   NULL, lpf_bottom, w, edges);

output_2:
    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];
    goto output_2;

odd:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    // Output only one row
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    rotate_ab_2(A_ptrs, B_ptrs);

    goto output_1;
}

static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(int16_t, sum5_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
    int16_t *sum5_ptrs[5], *sum5_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, sum3_buf, BUF_STRIDE * 3 + 16,);
    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
    int16_t *sum3_ptrs[3], *sum3_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(int16_t, B5_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A5_ptrs[2];
    int16_t *B5_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
    ALIGN_STK_16(int16_t, B3_buf, BUF_STRIDE * 4 + 16,);
    int32_t *A3_ptrs[4];
    int16_t *B3_ptrs[4];
    for (int i = 0; i < 4; i++) {
        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq5_ptrs[0] = sumsq5_rows[0];
        sumsq5_ptrs[1] = sumsq5_rows[0];
        sumsq5_ptrs[2] = sumsq5_rows[1];
        sumsq5_ptrs[3] = sumsq5_rows[2];
        sumsq5_ptrs[4] = sumsq5_rows[3];
        sum5_ptrs[0] = sum5_rows[0];
        sum5_ptrs[1] = sum5_rows[0];
        sum5_ptrs[2] = sum5_rows[1];
        sum5_ptrs[3] = sum5_rows[2];
        sum5_ptrs[4] = sum5_rows[3];

        sumsq3_ptrs[0] = sumsq3_rows[0];
        sumsq3_ptrs[1] = sumsq3_rows[1];
        sumsq3_ptrs[2] = sumsq3_rows[2];
        sum3_ptrs[0] = sum3_rows[0];
        sum3_ptrs[1] = sum3_rows[1];
        sum3_ptrs[2] = sum3_rows[2];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
                                        sumsq5_rows[0], sum5_rows[0],
                                        NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
                                        sumsq5_rows[1], sum5_rows[1],
                                        NULL, lpf, w, edges);

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
                                        sumsq5_rows[2], sum5_rows[2],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_1;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_rows[3], sum5_rows[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A5_ptrs, B5_ptrs);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq5_ptrs[3] = sumsq5_rows[4];
        sum5_ptrs[3] = sum5_rows[4];
    } else {
        sumsq5_ptrs[0] = sumsq5_rows[0];
        sumsq5_ptrs[1] = sumsq5_rows[0];
        sumsq5_ptrs[2] = sumsq5_rows[0];
        sumsq5_ptrs[3] = sumsq5_rows[0];
        sumsq5_ptrs[4] = sumsq5_rows[0];
        sum5_ptrs[0] = sum5_rows[0];
        sum5_ptrs[1] = sum5_rows[0];
        sum5_ptrs[2] = sum5_rows[0];
        sum5_ptrs[3] = sum5_rows[0];
        sum5_ptrs[4] = sum5_rows[0];

        sumsq3_ptrs[0] = sumsq3_rows[0];
        sumsq3_ptrs[1] = sumsq3_rows[0];
        sumsq3_ptrs[2] = sumsq3_rows[0];
        sum3_ptrs[0] = sum3_rows[0];
        sum3_ptrs[1] = sum3_rows[0];
        sum3_ptrs[2] = sum3_rows[0];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
                                        sumsq5_rows[0], sum5_rows[0],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_1;

        sumsq5_ptrs[4] = sumsq5_rows[1];
        sum5_ptrs[4] = sum5_rows[1];

        sumsq3_ptrs[2] = sumsq3_rows[1];
        sum3_ptrs[2] = sum3_rows[1];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
                                        sumsq5_rows[1], sum5_rows[1],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_ab_2(A5_ptrs, B5_ptrs);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto vert_2;

        sumsq5_ptrs[3] = sumsq5_rows[2];
        sumsq5_ptrs[4] = sumsq5_rows[3];
        sum5_ptrs[3] = sum5_rows[2];
        sum5_ptrs[4] = sum5_rows[3];

        sumsq3_ptrs[2] = sumsq3_rows[2];
        sum3_ptrs[2] = sum3_rows[2];

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
                                        sumsq5_rows[2], sum5_rows[2],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_rows[3], sum5_rows[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                            w, 2, params->sgr.w0, params->sgr.w1
                            HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq5_ptrs[3] = sumsq5_rows[4];
        sum5_ptrs[3] = sum5_rows[4];
    }

    do {
        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_ptrs[3], sum5_ptrs[3],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_ab_4(A3_ptrs, B3_ptrs);

        if (--h <= 0)
            goto odd;

        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                        sumsq5_ptrs[4], sum5_ptrs[4],
                                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                           w, params->sgr.s1, BITDEPTH_MAX);
        sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                            w, 2, params->sgr.w0, params->sgr.w1
                            HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                    sumsq5_ptrs[3], sum5_ptrs[3],
                                    NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
                                    sumsq5_ptrs[4], sum5_ptrs[4],
                                    NULL, lpf_bottom, w, edges);

output_2:
    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 2, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    sum5_ptrs[3] = sum5_ptrs[2];
    sum5_ptrs[4] = sum5_ptrs[2];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    goto output_2;

odd:
    // Copy the last row as padding once
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    sum5_ptrs[4] = sum5_ptrs[3];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 2, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    sum5_ptrs[3] = sum5_ptrs[2];
    sum5_ptrs[4] = sum5_ptrs[2];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);
    // Output only one row
    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                        w, 1, params->sgr.w0, params->sgr.w1
                        HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Copy the last row as padding once
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    sum5_ptrs[4] = sum5_ptrs[3];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    rotate_ab_2(A5_ptrs, B5_ptrs);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_ab_4(A3_ptrs, B3_ptrs);

    goto output_1;
}

#endif


static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
    const unsigned flags = dav1d_get_cpu_flags();
2
third_party/dav1d/src/arm/refmvs.h
vendored
@ -28,6 +28,7 @@
#include "src/cpu.h"
#include "src/refmvs.h"

decl_save_tmvs_fn(dav1d_save_tmvs_neon);
decl_splat_mv_fn(dav1d_splat_mv_neon);

static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
@ -35,5 +36,6 @@ static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {

    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

    c->save_tmvs = dav1d_save_tmvs_neon;
    c->splat_mv = dav1d_splat_mv_neon;
}
15
third_party/dav1d/src/data.c
vendored
@ -44,7 +44,7 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
    validate_input_or_ret(buf != NULL, NULL);

    if (sz > SIZE_MAX / 2) return NULL;
    buf->ref = dav1d_ref_create(sz);
    buf->ref = dav1d_ref_create(ALLOC_DAV1DDATA, sz);
    if (!buf->ref) return NULL;
    buf->data = buf->ref->const_data;
    buf->sz = sz;
@ -65,7 +65,7 @@ int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
    validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));

    if (sz > SIZE_MAX / 2) return DAV1D_ERR(EINVAL);
    Dav1dRef *const ref = malloc(sizeof(Dav1dRef));
    Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef));
    if (!ref) return DAV1D_ERR(ENOMEM);

    buf->ref = dav1d_ref_init(ref, ptr, free_callback, cookie, 1);
@ -86,7 +86,7 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
    validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
    validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));

    Dav1dRef *const ref = malloc(sizeof(Dav1dRef));
    Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef));
    if (!ref) return DAV1D_ERR(ENOMEM);

    buf->m.user_data.ref = dav1d_ref_init(ref, user_data, free_callback, cookie, 1);
@ -95,14 +95,13 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
    return 0;
}

void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
    validate_input(dst != NULL);
    validate_input(dst->data == NULL);
    validate_input(src != NULL);
    assert(dst != NULL);
    assert(dst->data == NULL);
    assert(src != NULL);

    if (src->ref) {
        validate_input(src->data != NULL);
        assert(src->data != NULL);
        dav1d_ref_inc(src->ref);
    }
    if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
71
third_party/dav1d/src/decode.c
vendored
@ -2932,8 +2932,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
    int retval = DAV1D_ERR(ENOMEM);

    if (f->sbh > f->lf.start_of_tile_row_sz) {
        free(f->lf.start_of_tile_row);
        f->lf.start_of_tile_row = malloc(f->sbh * sizeof(uint8_t));
        dav1d_free(f->lf.start_of_tile_row);
        f->lf.start_of_tile_row = dav1d_malloc(ALLOC_TILE, f->sbh * sizeof(uint8_t));
        if (!f->lf.start_of_tile_row) {
            f->lf.start_of_tile_row_sz = 0;
            goto error;
@ -2950,24 +2950,24 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
    const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
    if (n_ts != f->n_ts) {
        if (c->n_fc > 1) {
            freep(&f->frame_thread.tile_start_off);
            dav1d_free(f->frame_thread.tile_start_off);
            f->frame_thread.tile_start_off =
                malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts);
                dav1d_malloc(ALLOC_TILE, sizeof(*f->frame_thread.tile_start_off) * n_ts);
            if (!f->frame_thread.tile_start_off) {
                f->n_ts = 0;
                goto error;
            }
        }
        dav1d_free_aligned(f->ts);
        f->ts = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32);
        f->ts = dav1d_alloc_aligned(ALLOC_TILE, sizeof(*f->ts) * n_ts, 32);
        if (!f->ts) goto error;
        f->n_ts = n_ts;
    }

    const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1));
    if (a_sz != f->a_sz) {
        freep(&f->a);
        f->a = malloc(sizeof(*f->a) * a_sz);
        dav1d_free(f->a);
        f->a = dav1d_malloc(ALLOC_TILE, sizeof(*f->a) * a_sz);
        if (!f->a) {
            f->a_sz = 0;
            goto error;
@ -2993,9 +2993,10 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh;
    if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) {
        free(f->tile_thread.lowest_pixel_mem);
        dav1d_free(f->tile_thread.lowest_pixel_mem);
        f->tile_thread.lowest_pixel_mem =
            malloc(lowest_pixel_mem_sz * sizeof(*f->tile_thread.lowest_pixel_mem));
            dav1d_malloc(ALLOC_TILE, lowest_pixel_mem_sz *
                         sizeof(*f->tile_thread.lowest_pixel_mem));
        if (!f->tile_thread.lowest_pixel_mem) {
            f->tile_thread.lowest_pixel_mem_sz = 0;
            goto error;
@ -3016,9 +3017,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
    if (cf_sz != f->frame_thread.cf_sz) {
        dav1d_freep_aligned(&f->frame_thread.cf);
        dav1d_free_aligned(f->frame_thread.cf);
        f->frame_thread.cf =
            dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 64);
            dav1d_alloc_aligned(ALLOC_COEF, (size_t)cf_sz * 128 * 128 / 2, 64);
        if (!f->frame_thread.cf) {
            f->frame_thread.cf_sz = 0;
            goto error;
@ -3029,9 +3030,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    if (f->frame_hdr->allow_screen_content_tools) {
        if (num_sb128 != f->frame_thread.pal_sz) {
            dav1d_freep_aligned(&f->frame_thread.pal);
            dav1d_free_aligned(f->frame_thread.pal);
            f->frame_thread.pal =
                dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) *
                dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) *
                                    num_sb128 * 16 * 16, 64);
            if (!f->frame_thread.pal) {
                f->frame_thread.pal_sz = 0;
@ -3042,9 +3043,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    const int pal_idx_sz = num_sb128 * size_mul[1];
    if (pal_idx_sz != f->frame_thread.pal_idx_sz) {
        dav1d_freep_aligned(&f->frame_thread.pal_idx);
        dav1d_free_aligned(f->frame_thread.pal_idx);
        f->frame_thread.pal_idx =
            dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) *
            dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
                                pal_idx_sz * 128 * 128 / 4, 64);
        if (!f->frame_thread.pal_idx) {
            f->frame_thread.pal_idx_sz = 0;
@ -3072,7 +3073,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
        size_t alloc_sz = 64;
        alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh << need_cdef_lpf_copy;
        alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh << need_cdef_lpf_copy;
        uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32);
        uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(ALLOC_CDEF, alloc_sz, 32);
        if (!ptr) {
            f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0;
            goto error;
@ -3132,7 +3133,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
        size_t alloc_sz = 128;
        alloc_sz += (size_t)llabs(y_stride) * num_lines;
        alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2;
        uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(alloc_sz, 64);
        uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(ALLOC_LR, alloc_sz, 64);
        if (!ptr) {
            f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0;
            goto error;
@ -3158,23 +3159,23 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    // update allocation for loopfilter masks
    if (num_sb128 != f->lf.mask_sz) {
        freep(&f->lf.mask);
        freep(&f->lf.level);
        f->lf.mask = malloc(sizeof(*f->lf.mask) * num_sb128);
        dav1d_free(f->lf.mask);
        dav1d_free(f->lf.level);
        f->lf.mask = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.mask) * num_sb128);
        // over-allocate by 3 bytes since some of the SIMD implementations
        // index this from the level type and can thus over-read by up to 3
        f->lf.level = malloc(sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
        f->lf.level = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
        if (!f->lf.mask || !f->lf.level) {
            f->lf.mask_sz = 0;
            goto error;
        }
        if (c->n_fc > 1) {
            freep(&f->frame_thread.b);
            freep(&f->frame_thread.cbi);
            f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) *
                                       num_sb128 * 32 * 32);
            f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) *
                                         num_sb128 * 32 * 32);
            dav1d_free(f->frame_thread.b);
            dav1d_free(f->frame_thread.cbi);
            f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) *
                                             num_sb128 * 32 * 32);
            f->frame_thread.cbi = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
                                               num_sb128 * 32 * 32);
            if (!f->frame_thread.b || !f->frame_thread.cbi) {
                f->lf.mask_sz = 0;
                goto error;
@ -3186,8 +3187,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
    f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;
    const int lr_mask_sz = f->sr_sb128w * f->sb128h;
    if (lr_mask_sz != f->lf.lr_mask_sz) {
        freep(&f->lf.lr_mask);
        f->lf.lr_mask = malloc(sizeof(*f->lf.lr_mask) * lr_mask_sz);
        dav1d_free(f->lf.lr_mask);
        f->lf.lr_mask = dav1d_malloc(ALLOC_LR, sizeof(*f->lf.lr_mask) * lr_mask_sz);
        if (!f->lf.lr_mask) {
            f->lf.lr_mask_sz = 0;
            goto error;
@ -3207,9 +3208,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    const int ipred_edge_sz = f->sbh * f->sb128w << hbd;
    if (ipred_edge_sz != f->ipred_edge_sz) {
        dav1d_freep_aligned(&f->ipred_edge[0]);
        dav1d_free_aligned(f->ipred_edge[0]);
        uint8_t *ptr = f->ipred_edge[0] =
            dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 64);
            dav1d_alloc_aligned(ALLOC_IPRED, ipred_edge_sz * 128 * 3, 64);
        if (!ptr) {
            f->ipred_edge_sz = 0;
            goto error;
@ -3221,8 +3222,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {

    const int re_sz = f->sb128h * f->frame_hdr->tiling.cols;
    if (re_sz != f->lf.re_sz) {
        freep(&f->lf.tx_lpf_right_edge[0]);
        f->lf.tx_lpf_right_edge[0] = malloc(re_sz * 32 * 2);
        dav1d_free(f->lf.tx_lpf_right_edge[0]);
        f->lf.tx_lpf_right_edge[0] = dav1d_malloc(ALLOC_LF, re_sz * 32 * 2);
        if (!f->lf.tx_lpf_right_edge[0]) {
            f->lf.re_sz = 0;
            goto error;
@ -3656,9 +3657,9 @@ int dav1d_submit_frame(Dav1dContext *const c) {

    // FIXME qsort so tiles are in order (for frame threading)
    if (f->n_tile_data_alloc < c->n_tile_data) {
        freep(&f->tile);
        dav1d_free(f->tile);
        assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile));
        f->tile = malloc(c->n_tile_data * sizeof(*f->tile));
        f->tile = dav1d_malloc(ALLOC_TILE, c->n_tile_data * sizeof(*f->tile));
        if (!f->tile) {
            f->n_tile_data_alloc = f->n_tile_data = 0;
            res = DAV1D_ERR(ENOMEM);
71
third_party/dav1d/src/lib.c
vendored
@ -63,6 +63,12 @@ COLD const char *dav1d_version(void) {
    return DAV1D_VERSION;
}

COLD unsigned dav1d_version_api(void) {
    return (DAV1D_API_VERSION_MAJOR << 16) |
           (DAV1D_API_VERSION_MINOR <<  8) |
           (DAV1D_API_VERSION_PATCH <<  0);
}
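Callers can unpack the returned value with shifts mirroring the packing above. A short usage sketch (assumes the program links against this libdav1d; with the API version bumped to 7.0.0 in this update it would print "dav1d API 7.0.0"):

#include <stdio.h>
#include <dav1d/dav1d.h>

int main(void) {
    const unsigned v = dav1d_version_api();
    printf("dav1d API %u.%u.%u\n",
           (v >> 16) & 0xff,  /* major */
           (v >>  8) & 0xff,  /* minor */
            v        & 0xff); /* patch */
    return 0;
}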

COLD void dav1d_default_settings(Dav1dSettings *const s) {
    s->n_threads = 0;
    s->max_frame_delay = 0;
@ -155,7 +161,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {

    pthread_attr_setstacksize(&thread_attr, stack_size);

    Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 64);
    Dav1dContext *const c = *c_out = dav1d_alloc_aligned(ALLOC_COMMON_CTX, sizeof(*c), 64);
    if (!c) goto error;
    memset(c, 0, sizeof(*c));

@ -172,12 +178,12 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {

    dav1d_data_props_set_defaults(&c->cached_error_props);

    if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
        dav1d_mem_pool_init(&c->frame_hdr_pool) ||
        dav1d_mem_pool_init(&c->segmap_pool) ||
        dav1d_mem_pool_init(&c->refmvs_pool) ||
        dav1d_mem_pool_init(&c->pic_ctx_pool) ||
        dav1d_mem_pool_init(&c->cdf_pool))
    if (dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->seq_hdr_pool) ||
        dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->frame_hdr_pool) ||
        dav1d_mem_pool_init(ALLOC_SEGMAP, &c->segmap_pool) ||
        dav1d_mem_pool_init(ALLOC_REFMVS, &c->refmvs_pool) ||
        dav1d_mem_pool_init(ALLOC_PIC_CTX, &c->pic_ctx_pool) ||
        dav1d_mem_pool_init(ALLOC_CDF, &c->cdf_pool))
    {
        goto error;
    }
@ -186,7 +192,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
        c->allocator.release_picture_callback == dav1d_default_picture_release)
    {
        if (c->allocator.cookie) goto error;
        if (dav1d_mem_pool_init(&c->picture_pool)) goto error;
        if (dav1d_mem_pool_init(ALLOC_PIC, &c->picture_pool)) goto error;
        c->allocator.cookie = c->picture_pool;
    } else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc ||
               c->allocator.release_picture_callback == dav1d_default_picture_release)
@ -210,11 +216,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {

    get_num_threads(c, s, &c->n_tc, &c->n_fc);

    c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32);
    c->fc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->fc) * c->n_fc, 32);
    if (!c->fc) goto error;
    memset(c->fc, 0, sizeof(*c->fc) * c->n_fc);

    c->tc = dav1d_alloc_aligned(sizeof(*c->tc) * c->n_tc, 64);
    c->tc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->tc) * c->n_tc, 64);
    if (!c->tc) goto error;
    memset(c->tc, 0, sizeof(*c->tc) * c->n_tc);
    if (c->n_tc > 1) {
@ -235,9 +241,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
    }

    if (c->n_fc > 1) {
        const size_t out_delayed_sz = sizeof(*c->frame_thread.out_delayed) * c->n_fc;
        c->frame_thread.out_delayed =
            calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
            dav1d_malloc(ALLOC_THREAD_CTX, out_delayed_sz);
        if (!c->frame_thread.out_delayed) goto error;
        memset(c->frame_thread.out_delayed, 0, out_delayed_sz);
    }
    for (unsigned n = 0; n < c->n_fc; n++) {
        Dav1dFrameContext *const f = &c->fc[n];
@ -592,6 +600,9 @@ void dav1d_flush(Dav1dContext *const c) {

COLD void dav1d_close(Dav1dContext **const c_out) {
    validate_input(c_out != NULL);
#if TRACK_HEAP_ALLOCATIONS
    dav1d_log_alloc_stats(*c_out);
#endif
    close_internal(c_out, 1);
}

@ -628,31 +639,31 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {

        // clean-up threading stuff
        if (c->n_fc > 1) {
            freep(&f->tile_thread.lowest_pixel_mem);
            freep(&f->frame_thread.b);
            dav1d_freep_aligned(&f->frame_thread.pal_idx);
            dav1d_freep_aligned(&f->frame_thread.cf);
            freep(&f->frame_thread.tile_start_off);
            dav1d_freep_aligned(&f->frame_thread.pal);
            freep(&f->frame_thread.cbi);
            dav1d_free(f->tile_thread.lowest_pixel_mem);
            dav1d_free(f->frame_thread.b);
            dav1d_free_aligned(f->frame_thread.pal_idx);
            dav1d_free_aligned(f->frame_thread.cf);
            dav1d_free(f->frame_thread.tile_start_off);
            dav1d_free_aligned(f->frame_thread.pal);
            dav1d_free(f->frame_thread.cbi);
        }
        if (c->n_tc > 1) {
            pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
            pthread_cond_destroy(&f->task_thread.cond);
            pthread_mutex_destroy(&f->task_thread.lock);
        }
        freep(&f->frame_thread.frame_progress);
        freep(&f->task_thread.tasks);
        freep(&f->task_thread.tile_tasks[0]);
        dav1d_free(f->frame_thread.frame_progress);
        dav1d_free(f->task_thread.tasks);
        dav1d_free(f->task_thread.tile_tasks[0]);
        dav1d_free_aligned(f->ts);
        dav1d_free_aligned(f->ipred_edge[0]);
        free(f->a);
        free(f->tile);
        free(f->lf.mask);
        free(f->lf.lr_mask);
        free(f->lf.level);
        free(f->lf.tx_lpf_right_edge[0]);
        free(f->lf.start_of_tile_row);
        dav1d_free(f->a);
        dav1d_free(f->tile);
        dav1d_free(f->lf.mask);
        dav1d_free(f->lf.level);
        dav1d_free(f->lf.lr_mask);
        dav1d_free(f->lf.tx_lpf_right_edge[0]);
        dav1d_free(f->lf.start_of_tile_row);
        dav1d_refmvs_clear(&f->rf);
        dav1d_free_aligned(f->lf.cdef_line_buf);
        dav1d_free_aligned(f->lf.lr_line_buf);
@ -662,11 +673,11 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
        for (unsigned n = 0; n < c->n_fc; n++)
            if (c->frame_thread.out_delayed[n].p.frame_hdr)
                dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
        free(c->frame_thread.out_delayed);
        dav1d_free(c->frame_thread.out_delayed);
    }
    for (int n = 0; n < c->n_tile_data; n++)
        dav1d_data_unref_internal(&c->tile[n].data);
    free(c->tile);
    dav1d_free(c->tile);
    for (int n = 0; n < 8; n++) {
        dav1d_cdf_thread_unref(&c->cdf[n]);
        if (c->refs[n].p.p.frame_hdr)
2
third_party/dav1d/src/log.c
vendored
@ -44,7 +44,7 @@ COLD void dav1d_log_default_callback(void *const cookie,
}

COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) {
    validate_input(c != NULL);
    assert(c != NULL);

    if (!c->logger.callback)
        return;
217
third_party/dav1d/src/mem.c
vendored
@ -31,9 +31,208 @@

#include "src/internal.h"

#if TRACK_HEAP_ALLOCATIONS
#include <stdio.h>

#include "src/log.h"

#define DEFAULT_ALIGN 16

typedef struct {
    size_t sz;
    unsigned align;
    enum AllocationType type;
} Dav1dAllocationData;

typedef struct {
    size_t curr_sz;
    size_t peak_sz;
    unsigned num_allocs;
    unsigned num_reuses;
} AllocStats;

static AllocStats tracked_allocs[N_ALLOC_TYPES];
static size_t curr_total_sz;
static size_t peak_total_sz;
static pthread_mutex_t track_alloc_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *track_alloc(const enum AllocationType type, char *ptr,
                         const size_t sz, const size_t align)
{
    assert(align >= sizeof(Dav1dAllocationData));
    if (ptr) {
        ptr += align;
        Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1];
        AllocStats *const s = &tracked_allocs[type];

        d->sz = sz;
        d->align = (unsigned)align;
        d->type = type;

        pthread_mutex_lock(&track_alloc_mutex);
        s->num_allocs++;
        s->curr_sz += sz;
        if (s->curr_sz > s->peak_sz)
            s->peak_sz = s->curr_sz;

        curr_total_sz += sz;
        if (curr_total_sz > peak_total_sz)
            peak_total_sz = curr_total_sz;
        pthread_mutex_unlock(&track_alloc_mutex);
    }
    return ptr;
}

static void *track_free(char *const ptr) {
    const Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1];
    const size_t sz = d->sz;

    pthread_mutex_lock(&track_alloc_mutex);
    tracked_allocs[d->type].curr_sz -= sz;
    curr_total_sz -= sz;
    pthread_mutex_unlock(&track_alloc_mutex);

    return ptr - d->align;
}
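track_alloc() and track_free() rely on over-allocating by the alignment and stashing a Dav1dAllocationData header in the padding just below the pointer handed back to the caller. A standalone sketch of the same trick (wrap_malloc/wrap_free, Header, and PAD are illustrative names, not dav1d API):

#include <stdlib.h>

typedef struct { size_t sz; } Header;
#define PAD 16 /* must be >= sizeof(Header); plays the role of DEFAULT_ALIGN */

static void *wrap_malloc(const size_t sz) {
    char *const raw = malloc(sz + PAD);
    if (!raw) return NULL;
    char *const user = raw + PAD;
    ((Header *)user)[-1].sz = sz; /* header sits just below the user pointer */
    return user;
}

static void wrap_free(void *const user) {
    if (user) free((char *)user - PAD); /* recover the raw allocation */
}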

static void dav1d_track_reuse(const enum AllocationType type) {
    pthread_mutex_lock(&track_alloc_mutex);
    tracked_allocs[type].num_reuses++;
    pthread_mutex_unlock(&track_alloc_mutex);
}

void *dav1d_malloc(const enum AllocationType type, const size_t sz) {
    void *const ptr = malloc(sz + DEFAULT_ALIGN);
    return track_alloc(type, ptr, sz, DEFAULT_ALIGN);
}

void *dav1d_alloc_aligned(const enum AllocationType type,
                          const size_t sz, const size_t align)
{
    assert(!(align & (align - 1)));
    void *ptr;
#ifdef _WIN32
    ptr = _aligned_malloc(sz + align, align);
#elif defined(HAVE_POSIX_MEMALIGN)
    if (posix_memalign(&ptr, align, sz + align)) return NULL;
#else
    ptr = memalign(align, sz + align);
#endif

    return track_alloc(type, ptr, sz, align);
}

void *dav1d_realloc(const enum AllocationType type,
                    void *ptr, const size_t sz)
{
    if (!ptr)
        return dav1d_malloc(type, sz);
    ptr = realloc((char*)ptr - DEFAULT_ALIGN, sz + DEFAULT_ALIGN);
    if (ptr)
        ptr = track_free((char*)ptr + DEFAULT_ALIGN);
    return track_alloc(type, ptr, sz, DEFAULT_ALIGN);
}

void dav1d_free(void *ptr) {
    if (ptr)
        free(track_free(ptr));
}

void dav1d_free_aligned(void *ptr) {
    if (ptr) {
        ptr = track_free(ptr);
#ifdef _WIN32
        _aligned_free(ptr);
#else
        free(ptr);
#endif
    }
}

static COLD int cmp_stats(const void *const a, const void *const b) {
    const size_t a_sz = ((const AllocStats*)a)->peak_sz;
    const size_t b_sz = ((const AllocStats*)b)->peak_sz;
    return a_sz < b_sz ? -1 : a_sz > b_sz;
}

/* Insert spaces as thousands separators for better readability */
static COLD int format_tsep(char *const s, const size_t n, const size_t value) {
    if (value < 1000)
        return snprintf(s, n, "%u", (unsigned)value);

    const int len = format_tsep(s, n, value / 1000);
    assert((size_t)len < n);
    return len + snprintf(s + len, n - len, " %03u", (unsigned)(value % 1000));
}
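For example, assuming a sufficiently large buffer, a value of 1234567 comes back as "1 234 567", with each recursion level emitting one 3-digit group:

char buf[32];
const int len = format_tsep(buf, sizeof(buf), 1234567);
/* buf now holds "1 234 567"; len is 9 */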

COLD void dav1d_log_alloc_stats(Dav1dContext *const c) {
    static const char *const type_names[N_ALLOC_TYPES] = {
        [ALLOC_BLOCK     ] = "Block data",
        [ALLOC_CDEF      ] = "CDEF line buffers",
        [ALLOC_CDF       ] = "CDF contexts",
        [ALLOC_COEF      ] = "Coefficient data",
        [ALLOC_COMMON_CTX] = "Common context data",
        [ALLOC_DAV1DDATA ] = "Dav1dData",
        [ALLOC_IPRED     ] = "Intra pred edges",
        [ALLOC_LF        ] = "Loopfilter data",
        [ALLOC_LR        ] = "Looprestoration data",
        [ALLOC_OBU_HDR   ] = "OBU headers",
        [ALLOC_OBU_META  ] = "OBU metadata",
        [ALLOC_PAL       ] = "Palette data",
        [ALLOC_PIC       ] = "Picture buffers",
        [ALLOC_PIC_CTX   ] = "Picture context data",
        [ALLOC_REFMVS    ] = "Reference mv data",
        [ALLOC_SEGMAP    ] = "Segmentation maps",
        [ALLOC_THREAD_CTX] = "Thread context data",
        [ALLOC_TILE      ] = "Tile data",
    };

    struct {
        AllocStats stats;
        enum AllocationType type;
    } data[N_ALLOC_TYPES];
    unsigned total_allocs = 0;
    unsigned total_reuses = 0;

    pthread_mutex_lock(&track_alloc_mutex);
    for (int i = 0; i < N_ALLOC_TYPES; i++) {
        AllocStats *const s = &data[i].stats;
        *s = tracked_allocs[i];
        data[i].type = i;
        total_allocs += s->num_allocs;
        total_reuses += s->num_reuses;
    }
    size_t total_sz = peak_total_sz;
    pthread_mutex_unlock(&track_alloc_mutex);

    /* Sort types by memory usage */
    qsort(&data, N_ALLOC_TYPES, sizeof(*data), cmp_stats);

    const double inv_total_share = 100.0 / total_sz;
    char total_sz_buf[32];
    const int sz_len = 4 + format_tsep(total_sz_buf, sizeof(total_sz_buf), total_sz);

    dav1d_log(c, "\n Type                  Allocs  Reuses  Share  Peak size\n"
                 "---------------------------------------------------------------------\n");
    for (int i = N_ALLOC_TYPES - 1; i >= 0; i--) {
        const AllocStats *const s = &data[i].stats;
        if (s->num_allocs) {
            const double share = s->peak_sz * inv_total_share;
            char sz_buf[32];
            format_tsep(sz_buf, sizeof(sz_buf), s->peak_sz);
            dav1d_log(c, " %-20s%10u%10u%8.1f%%%*s\n", type_names[data[i].type],
                      s->num_allocs, s->num_reuses, share, sz_len, sz_buf);
        }
    }
    dav1d_log(c, "---------------------------------------------------------------------\n"
                 "%31u%10u %s\n",
              total_allocs, total_reuses, total_sz_buf);
}
#endif /* TRACK_HEAP_ALLOCATIONS */

static COLD void mem_pool_destroy(Dav1dMemPool *const pool) {
    pthread_mutex_destroy(&pool->lock);
    free(pool);
    dav1d_free(pool);
}

void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) {
@ -66,10 +265,14 @@ Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t si
            dav1d_free_aligned(data);
            goto alloc;
        }
#if TRACK_HEAP_ALLOCATIONS
        dav1d_track_reuse(pool->type);
#endif
    } else {
        pthread_mutex_unlock(&pool->lock);
alloc:
        data = dav1d_alloc_aligned(size + sizeof(Dav1dMemPoolBuffer), 64);
        data = dav1d_alloc_aligned(pool->type,
                                   size + sizeof(Dav1dMemPoolBuffer), 64);
        if (!data) {
            pthread_mutex_lock(&pool->lock);
            const int ref_cnt = --pool->ref_cnt;
@ -84,13 +287,19 @@ alloc:
    return buf;
}

COLD int dav1d_mem_pool_init(Dav1dMemPool **const ppool) {
    Dav1dMemPool *const pool = malloc(sizeof(Dav1dMemPool));
COLD int dav1d_mem_pool_init(const enum AllocationType type,
                             Dav1dMemPool **const ppool)
{
    Dav1dMemPool *const pool = dav1d_malloc(ALLOC_COMMON_CTX,
                                            sizeof(Dav1dMemPool));
    if (pool) {
        if (!pthread_mutex_init(&pool->lock, NULL)) {
            pool->buf = NULL;
            pool->ref_cnt = 1;
            pool->end = 0;
#if TRACK_HEAP_ALLOCATIONS
            pool->type = type;
#endif
            *ppool = pool;
            return 0;
        }
86
third_party/dav1d/src/mem.h
vendored
86
third_party/dav1d/src/mem.h
vendored
@ -28,16 +28,42 @@
|
||||
#ifndef DAV1D_SRC_MEM_H
|
||||
#define DAV1D_SRC_MEM_H
|
||||
|
||||
#define TRACK_HEAP_ALLOCATIONS 0
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
|
||||
#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN)
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
#include "dav1d/dav1d.h"
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
#include "src/thread.h"
|
||||
|
||||
enum AllocationType {
|
||||
ALLOC_BLOCK,
|
||||
ALLOC_CDEF,
|
||||
ALLOC_CDF,
|
||||
ALLOC_COEF,
|
||||
ALLOC_COMMON_CTX,
|
||||
ALLOC_DAV1DDATA,
|
||||
ALLOC_IPRED,
|
||||
ALLOC_LF,
|
||||
ALLOC_LR,
|
||||
ALLOC_OBU_HDR,
|
||||
ALLOC_OBU_META,
|
||||
ALLOC_PAL,
|
||||
ALLOC_PIC,
|
||||
ALLOC_PIC_CTX,
|
||||
ALLOC_REFMVS,
|
||||
ALLOC_SEGMAP,
|
||||
ALLOC_THREAD_CTX,
|
||||
ALLOC_TILE,
|
||||
N_ALLOC_TYPES,
|
||||
};
|
||||
|
||||
typedef struct Dav1dMemPoolBuffer {
|
||||
void *data;
|
||||
struct Dav1dMemPoolBuffer *next;
|
||||
@ -48,43 +74,59 @@ typedef struct Dav1dMemPool {
    Dav1dMemPoolBuffer *buf;
    int ref_cnt;
    int end;
#if TRACK_HEAP_ALLOCATIONS
    enum AllocationType type;
#endif
} Dav1dMemPool;

void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size);
int dav1d_mem_pool_init(Dav1dMemPool **pool);
void dav1d_mem_pool_end(Dav1dMemPool *pool);

#if TRACK_HEAP_ALLOCATIONS
void *dav1d_malloc(enum AllocationType type, size_t sz);
void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz);
void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align);
void dav1d_free(void *ptr);
void dav1d_free_aligned(void *ptr);
void dav1d_log_alloc_stats(Dav1dContext *c);
#else
#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool)
#define dav1d_malloc(type, sz) malloc(sz)
#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz)
#define dav1d_free(ptr) free(ptr)

/*
 * Allocate align-byte aligned memory. The return value can be released
 * by calling the dav1d_free_aligned() function.
 */
static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) {
    assert(!(align & (align - 1)));
#ifdef HAVE_POSIX_MEMALIGN
#ifdef _WIN32
    return _aligned_malloc(sz, align);
#elif defined(HAVE_POSIX_MEMALIGN)
    void *ptr;
    if (posix_memalign(&ptr, align, sz)) return NULL;
    return ptr;
#elif defined(HAVE_ALIGNED_MALLOC)
    return _aligned_malloc(sz, align);
#elif defined(HAVE_MEMALIGN)
    return memalign(align, sz);
#else
#error Missing aligned alloc implementation
    return memalign(align, sz);
#endif
}
#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align)

static inline void dav1d_free_aligned(void* ptr) {
#ifdef HAVE_POSIX_MEMALIGN
    free(ptr);
#elif defined(HAVE_ALIGNED_MALLOC)
static inline void dav1d_free_aligned(void *ptr) {
#ifdef _WIN32
    _aligned_free(ptr);
#elif defined(HAVE_MEMALIGN)
#else
    free(ptr);
#endif
}

static inline void dav1d_freep_aligned(void* ptr) {
#endif /* TRACK_HEAP_ALLOCATIONS */

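A short sketch of why the two helpers must stay in sync (the buffer use is illustrative only): whichever branch of the #ifdef ladder performed the allocation, dav1d_free_aligned() has to take the matching branch, which is why both now test _WIN32 first:

    void *buf = dav1d_alloc_aligned(ALLOC_LR, 4096, 64); /* _aligned_malloc() under _WIN32 */
    if (buf) dav1d_free_aligned(buf);                    /* _aligned_free() under _WIN32, free() on the POSIX branches */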
void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size);
int dav1d_mem_pool_init(enum AllocationType type, Dav1dMemPool **pool);
void dav1d_mem_pool_end(Dav1dMemPool *pool);

static inline void dav1d_freep_aligned(void *ptr) {
    void **mem = (void **) ptr;
    if (*mem) {
        dav1d_free_aligned(*mem);
@ -92,12 +134,4 @@ static inline void dav1d_freep_aligned(void* ptr) {
    }
}

static inline void freep(void *ptr) {
    void **mem = (void **) ptr;
    if (*mem) {
        free(*mem);
        *mem = NULL;
    }
}

#endif /* DAV1D_SRC_MEM_H */
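With TRACK_HEAP_ALLOCATIONS left at its default of 0, the type tags compile away entirely; a hedged illustration of what the fallback macros above do:

    void *p = dav1d_malloc(ALLOC_OBU_META, 64);   /* expands to malloc(64) */
    p = dav1d_realloc(ALLOC_OBU_META, p, 128);    /* expands to realloc(p, 128) */
    dav1d_free(p);                                /* expands to free(p) */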
27
third_party/dav1d/src/obu.c
vendored
@ -304,7 +304,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
{
    validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
    validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL));
    validate_input_or_ret(sz > 0, DAV1D_ERR(EINVAL));
    validate_input_or_ret(sz > 0 && sz <= SIZE_MAX / 2, DAV1D_ERR(EINVAL));

    GetBits gb;
    dav1d_init_get_bits(&gb, ptr, sz);
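A caller-side sketch of the tightened entry point (buffer names are placeholders; the size bound comes from the new check above):

    Dav1dSequenceHeader seq;
    const int err = dav1d_parse_sequence_header(&seq, obu_data, obu_size);
    if (err) /* DAV1D_ERR(EINVAL) for NULL pointers, zero size, or size > SIZE_MAX / 2 */
        return err;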
@ -609,8 +609,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
            if (!hdr->frame_ref_short_signaling)
                hdr->refidx[i] = dav1d_get_bits(gb, 3);
            if (seqhdr->frame_id_numbers_present) {
                const int delta_ref_frame_id_minus_1 = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits);
                const int ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) & ((1 << seqhdr->frame_id_n_bits) - 1);
                const unsigned delta_ref_frame_id = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits) + 1;
                const unsigned ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id) & ((1 << seqhdr->frame_id_n_bits) - 1);
                Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr;
                if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error;
            }
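A worked example of the wrap-around recovery above, assuming frame_id_n_bits = 8 and a coded 2-bit delta of 2 (so delta_ref_frame_id = 3): with hdr->frame_id = 1, ref_frame_id = (1 + 256 - 3) & 255 = 254, i.e. the subtraction correctly wraps modulo 2^8 instead of going negative.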
@ -705,7 +705,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
                goto error;
            hdr->tiling.n_bytes = dav1d_get_bits(gb, 2) + 1;
        } else {
            hdr->tiling.n_bytes = hdr->tiling.update = 0;
            hdr->tiling.n_bytes = 0;
            hdr->tiling.update = 0;
        }
#if DEBUG_FRAME_HDR
        printf("HDR: post-tiling: off=%td\n",
@ -739,7 +740,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
        hdr->quant.qm_y = dav1d_get_bits(gb, 4);
        hdr->quant.qm_u = dav1d_get_bits(gb, 4);
        hdr->quant.qm_v =
            seqhdr->separate_uv_delta_q ? (int)dav1d_get_bits(gb, 4) :
            seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 4) :
                                          hdr->quant.qm_u;
    }
#if DEBUG_FRAME_HDR
@ -1366,7 +1367,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
            if (!c->frame_hdr) goto error;
            if (c->n_tile_data_alloc < c->n_tile_data + 1) {
                if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
                struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile));
                struct Dav1dTileGroup *tile = dav1d_realloc(ALLOC_TILE, c->tile,
                                                            (c->n_tile_data + 1) * sizeof(*c->tile));
                if (!tile) goto error;
                c->tile = tile;
                memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
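The guard-then-grow pair above is the pattern this patch applies to every growable array; a generic sketch (array and type names hypothetical):

    if ((n + 1) > INT_MAX / (int)sizeof(*arr)) goto error;  /* (n + 1) * sizeof(*arr) must not overflow */
    elem_t *grown = dav1d_realloc(ALLOC_TILE, arr, (n + 1) * sizeof(*arr));
    if (!grown) goto error;
    arr = grown;  /* only overwrite on success so the old block isn't leaked */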
@ -1406,7 +1408,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {

        switch (meta_type) {
        case OBU_META_HDR_CLL: {
            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
            Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META,
                                             sizeof(Dav1dContentLightLevel));
            if (!ref) return DAV1D_ERR(ENOMEM);
            Dav1dContentLightLevel *const content_light = ref->data;

@ -1434,7 +1437,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
            break;
        }
        case OBU_META_HDR_MDCV: {
            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
            Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META,
                                             sizeof(Dav1dMasteringDisplay));
            if (!ref) return DAV1D_ERR(ENOMEM);
            Dav1dMasteringDisplay *const mastering_display = ref->data;

@ -1503,7 +1507,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
            }

            if ((c->n_itut_t35 + 1) > INT_MAX / (int)sizeof(*c->itut_t35)) goto error;
            struct Dav1dITUTT35 *itut_t35 = realloc(c->itut_t35, (c->n_itut_t35 + 1) * sizeof(*c->itut_t35));
            struct Dav1dITUTT35 *itut_t35 = dav1d_realloc(ALLOC_OBU_META, c->itut_t35,
                                                          (c->n_itut_t35 + 1) * sizeof(*c->itut_t35));
            if (!itut_t35) goto error;
            c->itut_t35 = itut_t35;
            memset(c->itut_t35 + c->n_itut_t35, 0, sizeof(*c->itut_t35));
@ -1511,7 +1516,7 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
            struct itut_t35_ctx_context *itut_t35_ctx;
            if (!c->n_itut_t35) {
                assert(!c->itut_t35_ref);
                itut_t35_ctx = malloc(sizeof(struct itut_t35_ctx_context));
                itut_t35_ctx = dav1d_malloc(ALLOC_OBU_META, sizeof(struct itut_t35_ctx_context));
                if (!itut_t35_ctx) goto error;
                c->itut_t35_ref = dav1d_ref_init(&itut_t35_ctx->ref, c->itut_t35,
                                                 dav1d_picture_free_itut_t35, itut_t35_ctx, 0);
@ -1524,7 +1529,7 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
            itut_t35_ctx->n_itut_t35 = c->n_itut_t35 + 1;

            Dav1dITUTT35 *const itut_t35_metadata = &c->itut_t35[c->n_itut_t35];
            itut_t35_metadata->payload = malloc(payload_size);
            itut_t35_metadata->payload = dav1d_malloc(ALLOC_OBU_META, payload_size);
            if (!itut_t35_metadata->payload) goto error;

            itut_t35_metadata->country_code = country_code;
22
third_party/dav1d/src/picture.c
vendored
@ -106,9 +106,9 @@ void dav1d_picture_free_itut_t35(const uint8_t *const data, void *const user_dat
    struct itut_t35_ctx_context *itut_t35_ctx = user_data;

    for (size_t i = 0; i < itut_t35_ctx->n_itut_t35; i++)
        free(itut_t35_ctx->itut_t35[i].payload);
    free(itut_t35_ctx->itut_t35);
    free(itut_t35_ctx);
        dav1d_free(itut_t35_ctx->itut_t35[i].payload);
    dav1d_free(itut_t35_ctx->itut_t35);
    dav1d_free(itut_t35_ctx);
}

static int picture_alloc_with_edges(Dav1dContext *const c,
@ -249,12 +249,12 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con
}

void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
    validate_input(dst != NULL);
    validate_input(dst->data[0] == NULL);
    validate_input(src != NULL);
    assert(dst != NULL);
    assert(dst->data[0] == NULL);
    assert(src != NULL);

    if (src->ref) {
        validate_input(src->data[0] != NULL);
        assert(src->data[0] != NULL);
        dav1d_ref_inc(src->ref);
    }
    if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
@ -267,12 +267,12 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
}

void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
    validate_input(dst != NULL);
    validate_input(dst->data[0] == NULL);
    validate_input(src != NULL);
    assert(dst != NULL);
    assert(dst->data[0] == NULL);
    assert(src != NULL);

    if (src->ref)
        validate_input(src->data[0] != NULL);
        assert(src->data[0] != NULL);

    *dst = *src;
    memset(src, 0, sizeof(*src));
6
third_party/dav1d/src/ref.c
vendored
@ -34,10 +34,10 @@ static void default_free_callback(const uint8_t *const data, void *const user_da
    dav1d_free_aligned(user_data);
}

Dav1dRef *dav1d_ref_create(size_t size) {
Dav1dRef *dav1d_ref_create(const enum AllocationType type, size_t size) {
    size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);

    uint8_t *const data = dav1d_alloc_aligned(size + sizeof(Dav1dRef), 64);
    uint8_t *const data = dav1d_alloc_aligned(type, size + sizeof(Dav1dRef), 64);
    if (!data) return NULL;

    Dav1dRef *const res = (Dav1dRef*)(data + size);
@ -81,6 +81,6 @@ void dav1d_ref_dec(Dav1dRef **const pref) {
    if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
        const int free_ref = ref->free_ref;
        ref->free_callback(ref->const_data, ref->user_data);
        if (free_ref) free(ref);
        if (free_ref) dav1d_free(ref);
    }
}
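A worked instance of the size round-up in dav1d_ref_create(): with 8-byte pointers and size = 13, (13 + 7) & ~7 = 16, so the Dav1dRef header placed at data + size starts on a pointer-aligned boundary.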
6
third_party/dav1d/src/ref.h
vendored
@ -45,7 +45,11 @@ struct Dav1dRef {
    void *user_data;
};

Dav1dRef *dav1d_ref_create(size_t size);
#if !TRACK_HEAP_ALLOCATIONS
#define dav1d_ref_create(type, size) dav1d_ref_create(size)
#endif

Dav1dRef *dav1d_ref_create(enum AllocationType type, size_t size);
Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size);
void dav1d_ref_dec(Dav1dRef **ref);
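A hedged illustration of the call rewriting: with TRACK_HEAP_ALLOCATIONS at 0 the macro strips the tag, so every two-argument call site compiles against the untagged single-argument function:

    Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META, 64); /* preprocesses to dav1d_ref_create(64) */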
4
third_party/dav1d/src/refmvs.c
vendored
@ -817,7 +817,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
    if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
        if (rf->r) dav1d_freep_aligned(&rf->r);
        const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
        rf->r = dav1d_alloc_aligned(sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
        rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
        if (!rf->r) return DAV1D_ERR(ENOMEM);
        rf->r_stride = r_stride;
    }
@ -825,7 +825,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
    const ptrdiff_t rp_stride = r_stride >> 1;
    if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
        if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
        rf->rp_proj = dav1d_alloc_aligned(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
        rf->rp_proj = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
        if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
        rf->rp_stride = rp_stride;
    }
1
third_party/dav1d/src/thread.h
vendored
@ -33,6 +33,7 @@
#include <limits.h>
#include <windows.h>

#define PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT
#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT

typedef struct {
8
third_party/dav1d/src/thread_task.c
vendored
@ -224,7 +224,7 @@ static int create_filter_sbrow(Dav1dFrameContext *const f,
    int num_tasks = f->sbh * (1 + uses_2pass);
    if (num_tasks > f->task_thread.num_tasks) {
        const size_t size = sizeof(Dav1dTask) * num_tasks;
        tasks = realloc(f->task_thread.tasks, size);
        tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tasks, size);
        if (!tasks) return -1;
        memset(tasks, 0, size);
        f->task_thread.tasks = tasks;
@ -237,8 +237,8 @@ static int create_filter_sbrow(Dav1dFrameContext *const f,
    } else {
        const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
        if (prog_sz > f->frame_thread.prog_sz) {
            atomic_uint *const prog = realloc(f->frame_thread.frame_progress,
                                              2 * prog_sz * sizeof(*prog));
            atomic_uint *const prog = dav1d_realloc(ALLOC_COMMON_CTX, f->frame_thread.frame_progress,
                                                    2 * prog_sz * sizeof(*prog));
            if (!prog) return -1;
            f->frame_thread.frame_progress = prog;
            f->frame_thread.copy_lpf_progress = prog + prog_sz;
@ -275,7 +275,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
    int alloc_num_tasks = num_tasks * (1 + uses_2pass);
    if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
        const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
        tasks = realloc(f->task_thread.tile_tasks[0], size);
        tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tile_tasks[0], size);
        if (!tasks) return -1;
        memset(tasks, 0, size);
        f->task_thread.tile_tasks[0] = tasks;
224
third_party/dav1d/src/x86/refmvs.asm
vendored
@ -47,6 +47,10 @@ SECTION_RODATA 64
%endmacro

%if ARCH_X86_64
mv_proj:       dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
               dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092
               dw 1024, 963, 910, 862, 819, 780, 744, 712
               dw 682, 655, 630, 606, 585, 564, 546, 528
splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
               db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7
               db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
@ -61,6 +65,7 @@ cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3
save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128:     times 16 db 128
pq_8192:    dq 8192

save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
                       SAVE_TMVS_TABLE 4, 8, ssse3
@ -329,6 +334,225 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    RET

%if ARCH_X86_64
INIT_XMM sse4
; refmvs_frame *rf, int tile_row_idx,
; int col_start8, int col_end8, int row_start8, int row_end8
cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
                                    stride, rp_proj, roff, troff, \
                                    xendi, xstarti, iw8, ih8, dst
    xor r14d, r14d
    cmp dword [rfq+212], 1 ; n_tile_threads
    mov ih8d, [rfq+20]     ; rf->ih8
    mov iw8d, [rfq+16]     ; rf->iw8
    mov xstartd, xstartd
    mov xendd, xendd
    cmove tridxd, r14d
    lea xstartid, [xstartq-8]
    lea xendid, [xendq+8]
    mov strideq, [rfq+184]
    mov rp_projq, [rfq+176]
    cmp ih8d, yendd
    mov [rsp+0x30], strideq
    cmovs yendd, ih8d
    test xstartid, xstartid
    cmovs xstartid, r14d
    cmp iw8d, xendid
    cmovs xendid, iw8d
    mov troffq, strideq
    shl troffq, 4
    imul troffq, tridxq
    mov dstd, ystartd
    and dstd, 15
    imul dstq, strideq
    add dstq, troffq       ; (16 * tridx + (ystart & 15)) * stride
    lea dstq, [dstq*5]
    add dstq, rp_projq
    lea troffq, [troffq*5] ; 16 * tridx * stride * 5
    lea r13d, [xendq*5]
    lea r12, [strideq*5]
    DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
                _, troff, xendi, xstarti, stride5, _, dst
    lea w5d, [xstartq*5]
    add r7, troffq         ; rp_proj + tile_row_offset
    mov hd, yendd
    mov [rsp+0x28], r7
    add dstq, r13
    sub w5q, r13
    sub hd, ystartd
.init_xloop_start:
    mov x5q, w5q
    test w5b, 1
    jz .init_2blk
    mov dword [dstq+x5q], 0x80008000
    add x5q, 5
    jz .init_next_row
.init_2blk:
    mov dword [dstq+x5q+0], 0x80008000
    mov dword [dstq+x5q+5], 0x80008000
    add x5q, 10
    jl .init_2blk
.init_next_row:
    add dstq, stride5q
    dec hd
    jg .init_xloop_start
    DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
                _, _, xendi, xstarti, stride5, _, n
    mov r13d, [rfq+152]    ; rf->n_mfmvs
    test r13d, r13d
    jz .ret
    mov [rsp+0x0c], r13d
    mov strideq, [rsp+0x30]
    movddup m3, [pq_8192]
    mov r9d, ystartd
    mov [rsp+0x38], yendd
    mov [rsp+0x20], xstartid
    xor nd, nd
    xor n7d, n7d
    imul r9, strideq       ; ystart * stride
    mov [rsp+0x48], rfq
    mov [rsp+0x18], stride5q
    lea r7, [r9*5]
    mov [rsp+0x24], ystartd
    mov [rsp+0x00], r7
.nloop:
    DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
                ref, rp_ref, xendi, xstarti, _, _, n
    mov rfq, [rsp+0x48]
    mov refd, [rfq+56+nq*4]      ; ref2cur
    cmp refd, 0x80000000
    je .next_n
    mov [rsp+0x40], refd
    mov offq, [rsp+0x00]         ; ystart * stride * 5
    movzx refd, byte [rfq+53+nq] ; rf->mfmv_ref[n]
    lea refsignq, [refq-4]
    mov rp_refq, [rfq+168]
    movq m2, refsignq
    add offq, [rp_refq+refq*8]   ; r = rp_ref[ref] + row_offset
    mov [rsp+0x14], nd
    mov yd, ystartd
.yloop:
    mov r11d, [rsp+0x24]   ; ystart
    mov r12d, [rsp+0x38]   ; yend
    mov r14d, yd
    and r14d, ~7           ; y_sb_align
    cmp r11d, r14d
    cmovs r11d, r14d       ; imax(y_sb_align, ystart)
    mov [rsp+0x44], r11d   ; y_proj_start
    add r14d, 8
    cmp r12d, r14d
    cmovs r14d, r12d       ; imin(y_sb_align + 8, yend)
    mov [rsp+0x3c], r14d   ; y_proj_end
    DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
                ref, x, xendi, mvx, mvy, rb, ref2ref
    mov xd, [rsp+0x20]     ; xstarti
.xloop:
    lea rbd, [xq*5]
    add rbq, srcq
    movsx refd, byte [rbq+4]
    test refd, refd
    jz .next_x_bad_ref
    mov rfq, [rsp+0x48]
    lea r14d, [16+n7q+refq]
    mov ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1]
    test ref2refd, ref2refd
    jz .next_x_bad_ref
    lea fracq, [mv_proj]
    movzx fracd, word [fracq+ref2refq*2]
    mov mvd, [rbq]
    imul fracd, [rsp+0x40] ; ref2cur
    pmovsxwq m0, [rbq]
    movd m1, fracd
    punpcklqdq m1, m1
    pmuldq m0, m1          ; mv * frac
    pshufd m1, m0, q3311
    paddd m0, m3
    paddd m0, m1
    psrad m0, 14           ; offset = (xy + (xy >> 31) + 8192) >> 14
    pabsd m1, m0
    packssdw m0, m0
    psrld m1, 6
    packuswb m1, m1
    pxor m0, m2            ; offset ^ ref_sign
    psignd m1, m0          ; apply_sign(abs(offset) >> 6, offset ^ refsign)
    movq mvxq, m1
    lea mvyd, [mvxq+yq]    ; ypos
    sar mvxq, 32
    DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
                ref, x, xendi, mvx, ypos, rb, ref2ref
    cmp yposd, [rsp+0x44]  ; y_proj_start
    jl .next_x_bad_pos_y
    cmp yposd, [rsp+0x3c]  ; y_proj_end
    jge .next_x_bad_pos_y
    and yposd, 15
    add mvxq, xq           ; xpos
    imul yposq, [rsp+0x30] ; pos = (ypos & 15) * stride
    DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
                ref, x, xendi, xpos, pos, rb, ref2ref
    mov dstq, [rsp+0x28]   ; dst = rp_proj + tile_row_offset
    add posq, xposq        ; pos += xpos
    lea posq, [posq*5]
    add dstq, posq         ; dst += pos5
    jmp .write_loop_entry
.write_loop:
    add rbq, 5
    cmp refb, byte [rbq+4]
    jne .xloop
    cmp mvd, [rbq]
    jne .xloop
    add dstq, 5
    inc xposd
.write_loop_entry:
    mov r12d, xd
    and r12d, ~7
    lea r5d, [r12-8]
    cmp r5d, xstartd
    cmovs r5d, xstartd     ; x_proj_start
    cmp xposd, r5d
    jl .next_xpos
    add r12d, 16
    cmp xendd, r12d
    cmovs r12d, xendd      ; x_proj_end
    cmp xposd, r12d
    jge .next_xpos
    mov [dstq+0], mvd
    mov byte [dstq+4], ref2refb
.next_xpos:
    inc xd
    cmp xd, xendid
    jl .write_loop
.next_y:
    DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
    add srcq, [rsp+0x18]   ; stride5
    inc yd
    cmp yd, [rsp+0x38]     ; yend
    jne .yloop
    mov nd, [rsp+0x14]
    mov ystartd, [rsp+0x24]
.next_n:
    add n7d, 7
    inc nd
    cmp nd, [rsp+0x0c]     ; n_mfmvs
    jne .nloop
.ret:
    RET
.next_x:
    DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
    add rbq, 5
    cmp refb, byte [rbq+4]
    jne .xloop
    cmp mvd, [rbq]
    jne .xloop
.next_x_bad_pos_y:
    inc xd
    cmp xd, xendid
    jl .next_x
    jmp .next_y
.next_x_bad_ref:
    inc xd
    cmp xd, xendid
    jl .xloop
    jmp .next_y

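Before the projection passes, the .init_* loops above clear the active rows of rp_proj with the 0x80008000 invalid-MV pattern. A hedged C equivalent of that initialization (local names are illustrative; the field layout comes from the checkasm test further down):

    for (int y = 0; y < yend - ystart; y++)
        for (int x = xstart; x < xend; x++)
            rp_proj[y * stride + x].mv.n = 0x80008000u; /* INVALID_MV sentinel */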
INIT_YMM avx2
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
5
third_party/dav1d/src/x86/refmvs.h
vendored
@ -28,6 +28,8 @@
#include "src/cpu.h"
#include "src/refmvs.h"

decl_load_tmvs_fn(dav1d_load_tmvs_sse4);

decl_save_tmvs_fn(dav1d_save_tmvs_ssse3);
decl_save_tmvs_fn(dav1d_save_tmvs_avx2);
decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl);
@ -47,7 +49,10 @@ static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {

    c->save_tmvs = dav1d_save_tmvs_ssse3;

    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if ARCH_X86_64
    c->load_tmvs = dav1d_load_tmvs_sse4;

    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

    c->save_tmvs = dav1d_save_tmvs_avx2;
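Read as a whole, the init function is a tiered ladder where each missing CPU flag returns early; a condensed sketch under the assumption that the SSSE3 gate sits just above this hunk (the flag names are real dav1d CPU flags):

    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;  /* keep C fallbacks */
    c->save_tmvs = dav1d_save_tmvs_ssse3;
    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if ARCH_X86_64
    c->load_tmvs = dav1d_load_tmvs_sse4;              /* 64-bit only */
    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
    c->save_tmvs = dav1d_save_tmvs_avx2;
#endif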
185
third_party/dav1d/tests/checkasm/refmvs.c
vendored
@ -39,6 +39,190 @@ static inline int gen_mv(const int total_bits, int spel_bits) {
    return rnd() & 1 ? -bits : bits;
}

#define ARRAY_SIZE(n) (sizeof(n)/sizeof(*(n)))

static inline int get_min_mv_val(const int idx) {
    if (idx <= 9) return idx;
    else if (idx <= 18) return (idx - 9) * 10;
    else if (idx <= 27) return (idx - 18) * 100;
    else if (idx <= 36) return (idx - 27) * 1000;
    else return (idx - 36) * 10000;
}
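A worked mapping for the bucket function above: indices 0-9 return themselves, so get_min_mv_val(12) = (12 - 9) * 10 = 30 and get_min_mv_val(13) = 40, which gives the 30-39 magnitude bucket its lower and upper bounds.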
static inline void gen_tmv(refmvs_temporal_block *const rb, const int *ref2ref) {
    rb->ref = rnd() % 7;
    if (!rb->ref) return;
    static const int x_prob[] = {
        26447556, 6800591, 3708783, 2198592, 1635940, 1145901, 1052602, 1261759,
        1099739, 755108, 6075404, 4355916, 3254908, 2897157, 2273676, 2154432,
        1937436, 1694818, 1466863, 10203087, 5241546, 3328819, 2187483, 1458997,
        1030842, 806863, 587219, 525024, 1858953, 422368, 114626, 16992
    };
    static const int y_prob[] = {
        33845001, 7591218, 6425971, 4115838, 4032161, 2515962, 2614601, 2343656,
        2898897, 1397254, 10125350, 5124449, 3232914, 2185499, 1608775, 1342585,
        980208, 795714, 649665, 3369250, 1298716, 486002, 279588, 235990,
        110318, 89372, 66895, 46980, 153322, 32960, 4500, 389
    };
    const int prob = rnd() % 100000000;
    int acc = 0;
    for (unsigned i = 0; i < ARRAY_SIZE(x_prob); i++) {
        acc += x_prob[i];
        if (prob < acc) {
            const int min = get_min_mv_val(i);
            const int max = get_min_mv_val(i + 1);
            const int val = min + rnd() % (max - min);
            rb->mv.x = iclip(val * ref2ref[rb->ref], -(1 << 15), (1 << 15) - 1);
            break;
        }
    }
    acc = 0;
    for (unsigned i = 0; i < ARRAY_SIZE(y_prob); i++) {
        acc += y_prob[i];
        if (prob < acc) {
            const int min = get_min_mv_val(i);
            const int max = get_min_mv_val(i + 1);
            const int val = min + rnd() % (max - min);
            rb->mv.y = iclip(val * ref2ref[rb->ref], -(1 << 15), (1 << 15) - 1);
            break;
        }
    }
}
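The two loops above are cumulative-weight draws over empirical histograms. A minimal generic sketch of the same technique (pick_bucket() is a hypothetical helper; rnd() is checkasm's PRNG):

    static int pick_bucket(const int *const weights, const int n, const int total) {
        const int p = rnd() % total;   /* uniform draw in [0, total) */
        int acc = 0;
        for (int i = 0; i < n; i++)
            if (p < (acc += weights[i])) return i;
        return n - 1;                  /* only reached if the weights sum to less than total */
    }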
static inline int get_ref2cur(void) {
    const int prob = rnd() % 100;
    static const uint8_t ref2cur[11] = { 35, 55, 67, 73, 78, 83, 84, 87, 90, 93, 100 };
    for (int i = 0; i < 11; i++)
        if (prob < ref2cur[i])
            return rnd() & 1 ? -(i + 1) : i + 1;
    return 0;
}
static inline int get_seqlen(void) {
    int len = 0, max_len;
    const int prob = rnd() % 100000;
    // =1 =2  =3  =4 <8  =8 <16 =16  <32 =32 <48 =48  <64  =64  >64 eq240
    //  5 17 1.5  16  5  10   5   7    4   3 1.5   2    1    2   20    15 chimera blocks
    // 25 38 2.5  19 3.5 5.5  2 1.87 .86  .4 .18  .2 .067 .165 .478  .28 chimera sequences

    if (prob < 25000) len = 1;            // =1    5%
    else if (prob < 63000) len = 2;       // =2   17%
    else if (prob < 65500) len = 3;       // =3  1.5%
    else if (prob < 84500) len = 4;       // =4   16%
    else if (prob < 88000) max_len = 7;   // <8    5% (43.5% tot <8)
    else if (prob < 93500) len = 8;       // =8   10%
    else if (prob < 95500) max_len = 15;  // <16   5%
    else if (prob < 97370) len = 16;      // =16   7%
    else if (prob < 98230) max_len = 31;  // <32   4%
    else if (prob < 98630) len = 32;      // =32   3%
    else if (prob < 98810) max_len = 47;  // <48 1.5%
    else if (prob < 99010) len = 48;      // =48   2%
    else if (prob < 99077) max_len = 63;  // <64   1%
    else if (prob < 99242) len = 64;      // =64   2%
    else if (prob < 99720) max_len = 239; // <240  5%
    else len = 240;                       // =240 15%

    if (!len) len = 1 + rnd() % max_len;
    return len;
}
static inline void init_rp_ref(refmvs_frame const *const rf,
                               const int col_start8, const int col_end8,
                               const int row_start8, const int row_end8)
{
    const int col_start8i = imax(col_start8 - 8, 0);
    const int col_end8i = imin(col_end8 + 8, rf->iw8);
    for (int n = 0; n < rf->n_mfmvs; n++) {
        refmvs_temporal_block *rp_ref = rf->rp_ref[rf->mfmv_ref[n]];
        for (int i = row_start8; i < imin(row_end8, rf->ih8); i++) {
            for (int j = col_start8i; j < col_end8i;) {
                refmvs_temporal_block rb;
                gen_tmv(&rb, rf->mfmv_ref2ref[n]);
                for (int k = get_seqlen(); k && j < col_end8i; k--, j++)
                    rp_ref[i * rf->iw8 + j] = rb;
            }
        }
    }
}
static void check_load_tmvs(const Dav1dRefmvsDSPContext *const c) {
    refmvs_temporal_block *rp_ref[7] = {0};
    refmvs_temporal_block c_rp_proj[240 * 63];
    refmvs_temporal_block a_rp_proj[240 * 63];
    refmvs_frame rf = {
        .rp_ref = rp_ref,
        .rp_stride = 240, .iw8 = 240, .ih8 = 63,
        .n_mfmvs = 3
    };
    const size_t rp_ref_sz = rf.ih8 * rf.rp_stride * sizeof(refmvs_temporal_block);

    declare_func(void, const refmvs_frame *rf, int tile_row_idx,
                 int col_start8, int col_end8, int row_start8, int row_end8);

    if (check_func(c->load_tmvs, "load_tmvs")) {
        const int row_start8 = (rnd() & 3) << 4;
        const int row_end8 = row_start8 + 16;
        const int col_start8 = rnd() & 31;
        const int col_end8 = rf.iw8 - (rnd() & 31);

        for (int n = 0; n < rf.n_mfmvs; n++) {
            rf.mfmv_ref[n] = rnd() % 7;
            rf.mfmv_ref2cur[n] = get_ref2cur();
            for (int r = 0; r < 7; r++)
                rf.mfmv_ref2ref[n][r] = rnd() & 31;
        }
        for (int n = 0; n < rf.n_mfmvs; n++) {
            refmvs_temporal_block **p_rp_ref = &rp_ref[rf.mfmv_ref[n]];
            if (!*p_rp_ref)
                *p_rp_ref = malloc(rp_ref_sz);
        }
        init_rp_ref(&rf, 0, rf.iw8, row_start8, row_end8);
        for (int i = 0; i < rf.iw8 * rf.ih8; i++) {
            c_rp_proj[i].mv.n = a_rp_proj[i].mv.n = 0xdeadbeef;
            c_rp_proj[i].ref = a_rp_proj[i].ref = 0xdd;
        }

        rf.n_tile_threads = 1;

        rf.rp_proj = c_rp_proj;
        call_ref(&rf, 0, col_start8, col_end8, row_start8, row_end8);
        rf.rp_proj = a_rp_proj;
        call_new(&rf, 0, col_start8, col_end8, row_start8, row_end8);

        for (int i = 0; i < rf.ih8; i++)
            for (int j = 0; j < rf.iw8; j++)
                if (c_rp_proj[i * rf.iw8 + j].mv.n != a_rp_proj[i * rf.iw8 + j].mv.n ||
                    (c_rp_proj[i * rf.iw8 + j].ref != a_rp_proj[i * rf.iw8 + j].ref &&
                     c_rp_proj[i * rf.iw8 + j].mv.n != INVALID_MV))
                {
                    if (fail()) {
                        fprintf(stderr, "[%d][%d] c_rp.mv.x = 0x%x a_rp.mv.x = 0x%x\n",
                                i, j, c_rp_proj[i * rf.iw8 + j].mv.x, a_rp_proj[i * rf.iw8 + j].mv.x);
                        fprintf(stderr, "[%d][%d] c_rp.mv.y = 0x%x a_rp.mv.y = 0x%x\n",
                                i, j, c_rp_proj[i * rf.iw8 + j].mv.y, a_rp_proj[i * rf.iw8 + j].mv.y);
                        fprintf(stderr, "[%d][%d] c_rp.ref = %u a_rp.ref = %u\n",
                                i, j, c_rp_proj[i * rf.iw8 + j].ref, a_rp_proj[i * rf.iw8 + j].ref);
                    }
                }

        if (checkasm_bench_func()) {
            for (int n = 0; n < rf.n_mfmvs; n++) {
                rf.mfmv_ref2cur[n] = 1;
                for (int r = 0; r < 7; r++)
                    rf.mfmv_ref2ref[n][r] = 1;
            }
            bench_new(&rf, 0, 0, rf.iw8, row_start8, row_end8);
        }

        for (int n = 0; n < rf.n_mfmvs; n++) {
            free(rp_ref[rf.mfmv_ref[n]]);
            rp_ref[rf.mfmv_ref[n]] = NULL;
        }
    }

    report("load_tmvs");
}

static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) {
    refmvs_block *rr[31];
    refmvs_block r[31 * 256];
@ -162,6 +346,7 @@ void checkasm_check_refmvs(void) {
    Dav1dRefmvsDSPContext c;
    dav1d_refmvs_dsp_init(&c);

    check_load_tmvs(&c);
    check_save_tmvs(&c);
    check_splat_mv(&c);
}