diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml
index f39c2a421084..5ae45920e7bc 100644
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 8b419c16bf1e37bc98044089da58f06824462cb9 (2023-06-02T00:00:12.000+02:00).
+  release: 616bfd1506a8a75c6a358e578cbec9ca11931502 (2023-07-01T11:36:39.000+03:00).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 8b419c16bf1e37bc98044089da58f06824462cb9
+  revision: 616bfd1506a8a75c6a358e578cbec9ca11931502
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h
index 6cca1b247e2f..d241c603d473 100644
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "8b419c16bf1e37bc98044089da58f06824462cb9"
+#define DAV1D_VERSION "616bfd1506a8a75c6a358e578cbec9ca11931502"
diff --git a/media/libdav1d/version.h b/media/libdav1d/version.h
index 176d3ac220b9..e5c01f1d007c 100644
--- a/media/libdav1d/version.h
+++ b/media/libdav1d/version.h
@@ -27,8 +27,8 @@
 #ifndef DAV1D_VERSION_H
 #define DAV1D_VERSION_H
 
-#define DAV1D_API_VERSION_MAJOR 6
-#define DAV1D_API_VERSION_MINOR 9
+#define DAV1D_API_VERSION_MAJOR 7
+#define DAV1D_API_VERSION_MINOR 0
 #define DAV1D_API_VERSION_PATCH 0
 
 #endif /* DAV1D_VERSION_H */
diff --git a/third_party/dav1d/include/common/validate.h b/third_party/dav1d/include/common/validate.h
index 3096f3db8ee1..3aaed5bb929a 100644
--- a/third_party/dav1d/include/common/validate.h
+++ b/third_party/dav1d/include/common/validate.h
@@ -32,24 +32,26 @@
 #include <stdlib.h>
 
 #if defined(NDEBUG)
-#define debug_abort()
+#define debug_print(...) do {} while (0)
+#define debug_abort() do {} while (0)
 #else
+#define debug_print(...) fprintf(stderr, __VA_ARGS__)
 #define debug_abort abort
 #endif
 
 #define validate_input_or_ret_with_msg(x, r, ...) \
     if (!(x)) { \
-        fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
-                #x, __func__); \
-        fprintf(stderr, __VA_ARGS__); \
+        debug_print("Input validation check \'%s\' failed in %s!\n", \
+                    #x, __func__); \
+        debug_print(__VA_ARGS__); \
         debug_abort(); \
         return r; \
     }
 
 #define validate_input_or_ret(x, r) \
     if (!(x)) { \
-        fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
-                #x, __func__); \
+        debug_print("Input validation check \'%s\' failed in %s!\n", \
+                    #x, __func__); \
         debug_abort(); \
         return r; \
     }
diff --git a/third_party/dav1d/include/dav1d/dav1d.h b/third_party/dav1d/include/dav1d/dav1d.h
index ced7108d403c..e8f070577917 100644
--- a/third_party/dav1d/include/dav1d/dav1d.h
+++ b/third_party/dav1d/include/dav1d/dav1d.h
@@ -103,6 +103,15 @@ typedef struct Dav1dSettings {
  */
 DAV1D_API const char *dav1d_version(void);
 
+/**
+ * Get library API version.
+ *
+ * @return A value in the format 0x00XXYYZZ, where XX is the major version,
+ *         YY the minor version, and ZZ the patch version.
+ * @see DAV1D_API_MAJOR, DAV1D_API_MINOR, DAV1D_API_PATCH
+ */
+DAV1D_API unsigned dav1d_version_api(void);
+
 /**
  * Initialize settings to default values.
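The new dav1d_version_api() pairs with the DAV1D_API_MAJOR/MINOR/PATCH helpers added to version.h.in later in this patch. A minimal caller-side sketch of the intended use (illustrative only, not part of the patch; assumes the installed <dav1d/dav1d.h> and <dav1d/version.h> headers):

#include <stdio.h>
#include <dav1d/dav1d.h>
#include <dav1d/version.h>

int main(void) {
    /* Runtime API version of the loaded library, packed as 0x00XXYYZZ. */
    const unsigned v = dav1d_version_api();
    printf("dav1d API %u.%u.%u (built against headers %d.%d.%d)\n",
           DAV1D_API_MAJOR(v), DAV1D_API_MINOR(v), DAV1D_API_PATCH(v),
           DAV1D_API_VERSION_MAJOR, DAV1D_API_VERSION_MINOR,
           DAV1D_API_VERSION_PATCH);
    /* The major-version bump in this update (6.9.0 -> 7.0.0) signals an ABI
     * break, e.g. the Dav1dSequenceHeader/Dav1dFrameHeader field-type
     * changes further down in this patch. */
    return DAV1D_API_MAJOR(v) == DAV1D_API_VERSION_MAJOR ? 0 : 1;
}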
* diff --git a/third_party/dav1d/include/dav1d/headers.h b/third_party/dav1d/include/dav1d/headers.h index f152709f46ef..b9037f23de63 100644 --- a/third_party/dav1d/include/dav1d/headers.h +++ b/third_party/dav1d/include/dav1d/headers.h @@ -182,8 +182,8 @@ enum Dav1dChromaSamplePosition { }; typedef struct Dav1dContentLightLevel { - int max_content_light_level; - int max_frame_average_light_level; + uint16_t max_content_light_level; + uint16_t max_frame_average_light_level; } Dav1dContentLightLevel; typedef struct Dav1dMasteringDisplay { @@ -210,7 +210,7 @@ typedef struct Dav1dSequenceHeader { * 1 for 8-10 bits/component 4:4:4; 2 for 4:2:2 at any bits/component, * or 12 bits/component at any chroma subsampling. */ - int profile; + uint8_t profile; /** * Maximum dimensions for this stream. In non-scalable streams, these * are often the actual dimensions of the stream, although that is not @@ -229,60 +229,60 @@ typedef struct Dav1dSequenceHeader { * (twelve_bit) to distinguish between 10 and 12 bits/component. To get * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2. */ - int hbd; + uint8_t hbd; /** * Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of * MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma). */ - int color_range; + uint8_t color_range; - int num_operating_points; + uint8_t num_operating_points; struct Dav1dSequenceHeaderOperatingPoint { - int major_level, minor_level; - int initial_display_delay; - int idc; - int tier; - int decoder_model_param_present; - int display_model_param_present; + uint8_t major_level, minor_level; + uint8_t initial_display_delay; + uint16_t idc; + uint8_t tier; + uint8_t decoder_model_param_present; + uint8_t display_model_param_present; } operating_points[DAV1D_MAX_OPERATING_POINTS]; - int still_picture; - int reduced_still_picture_header; - int timing_info_present; - int num_units_in_tick; - int time_scale; - int equal_picture_interval; - unsigned num_ticks_per_picture; - int decoder_model_info_present; - int encoder_decoder_buffer_delay_length; - int num_units_in_decoding_tick; - int buffer_removal_delay_length; - int frame_presentation_delay_length; - int display_model_info_present; - int width_n_bits, height_n_bits; - int frame_id_numbers_present; - int delta_frame_id_n_bits; - int frame_id_n_bits; - int sb128; - int filter_intra; - int intra_edge_filter; - int inter_intra; - int masked_compound; - int warped_motion; - int dual_filter; - int order_hint; - int jnt_comp; - int ref_frame_mvs; + uint8_t still_picture; + uint8_t reduced_still_picture_header; + uint8_t timing_info_present; + uint32_t num_units_in_tick; + uint32_t time_scale; + uint8_t equal_picture_interval; + uint32_t num_ticks_per_picture; + uint8_t decoder_model_info_present; + uint8_t encoder_decoder_buffer_delay_length; + uint32_t num_units_in_decoding_tick; + uint8_t buffer_removal_delay_length; + uint8_t frame_presentation_delay_length; + uint8_t display_model_info_present; + uint8_t width_n_bits, height_n_bits; + uint8_t frame_id_numbers_present; + uint8_t delta_frame_id_n_bits; + uint8_t frame_id_n_bits; + uint8_t sb128; + uint8_t filter_intra; + uint8_t intra_edge_filter; + uint8_t inter_intra; + uint8_t masked_compound; + uint8_t warped_motion; + uint8_t dual_filter; + uint8_t order_hint; + uint8_t jnt_comp; + uint8_t ref_frame_mvs; enum Dav1dAdaptiveBoolean screen_content_tools; enum Dav1dAdaptiveBoolean force_integer_mv; - int order_hint_n_bits; - int super_res; - int cdef; - int restoration; - int ss_hor, ss_ver, 
monochrome; - int color_description_present; - int separate_uv_delta_q; - int film_grain_present; + uint8_t order_hint_n_bits; + uint8_t super_res; + uint8_t cdef; + uint8_t restoration; + uint8_t ss_hor, ss_ver, monochrome; + uint8_t color_description_present; + uint8_t separate_uv_delta_q; + uint8_t film_grain_present; // Dav1dSequenceHeaders of the same sequence are required to be // bit-identical until this offset. See 7.5 "Ordering of OBUs": @@ -291,29 +291,29 @@ typedef struct Dav1dSequenceHeader { // sequence header appears except for the contents of // operating_parameters_info. struct Dav1dSequenceHeaderOperatingParameterInfo { - int decoder_buffer_delay; - int encoder_buffer_delay; - int low_delay_mode; + uint32_t decoder_buffer_delay; + uint32_t encoder_buffer_delay; + uint8_t low_delay_mode; } operating_parameter_info[DAV1D_MAX_OPERATING_POINTS]; } Dav1dSequenceHeader; typedef struct Dav1dSegmentationData { - int delta_q; - int delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v; - int ref; - int skip; - int globalmv; + int16_t delta_q; + int8_t delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v; + int8_t ref; + uint8_t skip; + uint8_t globalmv; } Dav1dSegmentationData; typedef struct Dav1dSegmentationDataSet { Dav1dSegmentationData d[DAV1D_MAX_SEGMENTS]; - int preskip; - int last_active_segid; + uint8_t preskip; + int8_t last_active_segid; } Dav1dSegmentationDataSet; typedef struct Dav1dLoopfilterModeRefDeltas { - int mode_delta[2 /* is_zeromv */]; - int ref_delta[DAV1D_TOTAL_REFS_PER_FRAME]; + int8_t mode_delta[2 /* is_zeromv */]; + int8_t ref_delta[DAV1D_TOTAL_REFS_PER_FRAME]; } Dav1dLoopfilterModeRefDeltas; typedef struct Dav1dFilmGrainData { @@ -339,100 +339,101 @@ typedef struct Dav1dFilmGrainData { typedef struct Dav1dFrameHeader { struct { Dav1dFilmGrainData data; - int present, update; + uint8_t present, update; } film_grain; ///< film grain parameters enum Dav1dFrameType frame_type; ///< type of the picture int width[2 /* { coded_width, superresolution_upscaled_width } */], height; - int frame_offset; ///< frame number - int temporal_id; ///< temporal id of the frame for SVC - int spatial_id; ///< spatial id of the frame for SVC + uint8_t frame_offset; ///< frame number + uint8_t temporal_id; ///< temporal id of the frame for SVC + uint8_t spatial_id; ///< spatial id of the frame for SVC - int show_existing_frame; - int existing_frame_idx; - int frame_id; - int frame_presentation_delay; - int show_frame; - int showable_frame; - int error_resilient_mode; - int disable_cdf_update; - int allow_screen_content_tools; - int force_integer_mv; - int frame_size_override; - int primary_ref_frame; - int buffer_removal_time_present; + uint8_t show_existing_frame; + uint8_t existing_frame_idx; + uint32_t frame_id; + uint32_t frame_presentation_delay; + uint8_t show_frame; + uint8_t showable_frame; + uint8_t error_resilient_mode; + uint8_t disable_cdf_update; + uint8_t allow_screen_content_tools; + uint8_t force_integer_mv; + uint8_t frame_size_override; + uint8_t primary_ref_frame; + uint8_t buffer_removal_time_present; struct Dav1dFrameHeaderOperatingPoint { - int buffer_removal_time; + uint32_t buffer_removal_time; } operating_points[DAV1D_MAX_OPERATING_POINTS]; - int refresh_frame_flags; + uint8_t refresh_frame_flags; int render_width, render_height; struct { - int width_scale_denominator; - int enabled; + uint8_t width_scale_denominator; + uint8_t enabled; } super_res; - int have_render_size; - int allow_intrabc; - int frame_ref_short_signaling; - int 
refidx[DAV1D_REFS_PER_FRAME]; - int hp; + uint8_t have_render_size; + uint8_t allow_intrabc; + uint8_t frame_ref_short_signaling; + int8_t refidx[DAV1D_REFS_PER_FRAME]; + uint8_t hp; enum Dav1dFilterMode subpel_filter_mode; - int switchable_motion_mode; - int use_ref_frame_mvs; - int refresh_context; + uint8_t switchable_motion_mode; + uint8_t use_ref_frame_mvs; + uint8_t refresh_context; struct { - int uniform; - unsigned n_bytes; - int min_log2_cols, max_log2_cols, log2_cols, cols; - int min_log2_rows, max_log2_rows, log2_rows, rows; + uint8_t uniform; + uint8_t n_bytes; + uint8_t min_log2_cols, max_log2_cols, log2_cols, cols; + uint8_t min_log2_rows, max_log2_rows, log2_rows, rows; uint16_t col_start_sb[DAV1D_MAX_TILE_COLS + 1]; uint16_t row_start_sb[DAV1D_MAX_TILE_ROWS + 1]; - int update; + uint16_t update; } tiling; struct { - int yac; - int ydc_delta; - int udc_delta, uac_delta, vdc_delta, vac_delta; - int qm, qm_y, qm_u, qm_v; + uint8_t yac; + int8_t ydc_delta; + int8_t udc_delta, uac_delta, vdc_delta, vac_delta; + uint8_t qm, qm_y, qm_u, qm_v; } quant; struct { - int enabled, update_map, temporal, update_data; + uint8_t enabled, update_map, temporal, update_data; Dav1dSegmentationDataSet seg_data; - int lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS]; + uint8_t lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS]; } segmentation; struct { struct { - int present; - int res_log2; + uint8_t present; + uint8_t res_log2; } q; struct { - int present; - int res_log2; - int multi; + uint8_t present; + uint8_t res_log2; + uint8_t multi; } lf; } delta; - int all_lossless; + uint8_t all_lossless; struct { - int level_y[2 /* dir */]; - int level_u, level_v; - int mode_ref_delta_enabled; - int mode_ref_delta_update; + uint8_t level_y[2 /* dir */]; + uint8_t level_u, level_v; + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; Dav1dLoopfilterModeRefDeltas mode_ref_deltas; - int sharpness; + uint8_t sharpness; } loopfilter; struct { - int damping; - int n_bits; - int y_strength[DAV1D_MAX_CDEF_STRENGTHS]; - int uv_strength[DAV1D_MAX_CDEF_STRENGTHS]; + uint8_t damping; + uint8_t n_bits; + uint8_t y_strength[DAV1D_MAX_CDEF_STRENGTHS]; + uint8_t uv_strength[DAV1D_MAX_CDEF_STRENGTHS]; } cdef; struct { enum Dav1dRestorationType type[3 /* plane */]; - int unit_size[2 /* y, uv */]; + uint8_t unit_size[2 /* y, uv */]; } restoration; enum Dav1dTxfmMode txfm_mode; - int switchable_comp_refs; - int skip_mode_allowed, skip_mode_enabled, skip_mode_refs[2]; - int warp_motion; - int reduced_txtp_set; + uint8_t switchable_comp_refs; + uint8_t skip_mode_allowed, skip_mode_enabled; + int8_t skip_mode_refs[2]; + uint8_t warp_motion; + uint8_t reduced_txtp_set; Dav1dWarpedMotionParams gmv[DAV1D_REFS_PER_FRAME]; } Dav1dFrameHeader; diff --git a/third_party/dav1d/include/dav1d/picture.h b/third_party/dav1d/include/dav1d/picture.h index c566ceabe01d..cc291a4abb95 100644 --- a/third_party/dav1d/include/dav1d/picture.h +++ b/third_party/dav1d/include/dav1d/picture.h @@ -91,7 +91,7 @@ typedef struct Dav1dPicture { */ size_t n_itut_t35; - uintptr_t reserved[3]; ///< reserved for future use + uintptr_t reserved[4]; ///< reserved for future use struct Dav1dRef *frame_hdr_ref; ///< Dav1dFrameHeader allocation origin struct Dav1dRef *seq_hdr_ref; ///< Dav1dSequenceHeader allocation origin diff --git a/third_party/dav1d/include/dav1d/version.h.in b/third_party/dav1d/include/dav1d/version.h.in index 086428c1d49f..4fa420ded31e 100644 --- a/third_party/dav1d/include/dav1d/version.h.in +++ 
b/third_party/dav1d/include/dav1d/version.h.in
@@ -35,6 +35,14 @@ extern "C" {
 #define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
 #define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@
 
+/**
+ * Extract version components from the value returned by
+ * dav1d_version_int()
+ */
+#define DAV1D_API_MAJOR(v) (((v) >> 16) & 0xFF)
+#define DAV1D_API_MINOR(v) (((v) >> 8) & 0xFF)
+#define DAV1D_API_PATCH(v) (((v) >> 0) & 0xFF)
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build
index bd29c985722f..1104aef9b98c 100644
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -30,7 +30,7 @@ project('dav1d', ['c'],
                       'b_ndebug=if-release'],
     meson_version: '>= 0.49.0')
 
-dav1d_soname_version = '6.9.0'
+dav1d_soname_version = '7.0.0'
 dav1d_api_version_array = dav1d_soname_version.split('.')
 dav1d_api_version_major = dav1d_api_version_array[0]
 dav1d_api_version_minor = dav1d_api_version_array[1]
@@ -149,6 +149,10 @@ else
         endif
         cdata.set('HAVE_CLOCK_GETTIME', 1)
     endif
+
+    if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
+        cdata.set('HAVE_POSIX_MEMALIGN', 1)
+    endif
 endif
 
 # check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
@@ -226,14 +230,6 @@ else
     getopt_dependency = []
 endif
 
-if cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args)
-    cdata.set('HAVE_ALIGNED_MALLOC', 1)
-elif cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
-    cdata.set('HAVE_POSIX_MEMALIGN', 1)
-elif cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
-    cdata.set('HAVE_MEMALIGN', 1)
-endif
-
 if (host_machine.cpu_family() == 'aarch64' or
     host_machine.cpu_family().startswith('arm') or
     host_machine.cpu() == 'ppc64le')
diff --git a/third_party/dav1d/src/arm/32/refmvs.S b/third_party/dav1d/src/arm/32/refmvs.S
index e16c5448d087..7f31db11ebcf 100644
--- a/third_party/dav1d/src/arm/32/refmvs.S
+++ b/third_party/dav1d/src/arm/32/refmvs.S
@@ -95,3 +95,209 @@ L(splat_tbl):
         bgt             1b
         pop             {r4, pc}
 endfunc
+
+const mv_tbls, align=4
+        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
+        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
+endconst
+
+const mask_mult, align=4
+        .byte           1, 2, 1, 2, 0, 0, 0, 0
+endconst
+
+// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
+//                           refmvs_block **rr, const uint8_t *ref_sign,
+//                           int col_end8, int row_end8,
+//                           int col_start8, int row_start8)
+function save_tmvs_neon, export=1
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+
+        vmov.i8         d30, #0
+        vld1.8          {d31}, [r3]
+        adr             r8,  L(save_tmvs_tbl)
+        movrel_local    lr,  mask_mult
+        movrel_local    r12, mv_tbls
+        vld1.8          {d29}, [lr]
+        vext.8          d31, d30, d31, #7      // [0, ref_sign]
+        mov             r3,  #5
+        mul             r1,  r1,  r3           // stride *= 5
+        sub             r5,  r5,  r7           // h = row_end8 - row_start8
+        lsl             r7,  r7,  #1           // row_start8 <<= 1
+1:
+        mov             r3,  #5
+        mov             r11, #12*2
+        and             r9,  r7,  #30          // (y & 15) * 2
+        ldr             r9,  [r2, r9, lsl #2]  // b = rr[(y & 15) * 2]
+        add             r9,  r9,  #12          // &b[...
+ 1] + mla r10, r4, r11, r9 // end_cand_b = &b[col_end8*2 + 1] + mla r9, r6, r11, r9 // cand_b = &b[x*2 + 1] + + mla r3, r6, r3, r0 // &rp[x] + + push {r2,r4,r6} + +2: + ldrb r11, [r9, #10] // cand_b->bs + add lr, r9, #8 + vld1.8 {d0, d1}, [r9] // cand_b->mv + add r11, r8, r11, lsl #3 + vld1.16 {d2[]}, [lr] // cand_b->ref + ldrh lr, [r11] // bw8 + mov r2, r8 + add r9, r9, lr, lsl #1 // cand_b += bw8*2 + cmp r9, r10 + vmov d4, d0 + bge 3f + + ldrb r2, [r9, #10] // cand_b->bs + add lr, r9, #8 + vld1.8 {d6, d7}, [r9] // cand_b->mv + add r2, r8, r2, lsl #3 + vld1.16 {d2[1]}, [lr] // cand_b->ref + ldrh lr, [r2] // bw8 + add r9, r9, lr, lsl #1 // cand_b += bw8*2 + vmov d5, d6 + +3: + vabs.s16 q2, q2 // abs(mv[].xy) + vtbl.8 d2, {d31}, d2 // ref_sign[ref] + vshr.u16 q2, q2, #12 // abs(mv[].xy) >> 12 + vmull.u8 q1, d2, d29 // ref_sign[ref] * {1, 2} + vceq.i32 q2, q2, #0 // abs(mv[].xy) <= 4096 + vmovn.i32 d4, q2 // abs() condition to 16 bit + vand d2, d2, d4 // h[0-3] contains conditions for mv[0-1] + vpadd.i16 d2, d2, d2 // Combine condition for [1] and [0] + vmov.u16 r4, d2[0] // Extract case for first block + vmov.u16 r6, d2[1] + ldr r11, [r11, #4] // Fetch jump table entry + ldr r2, [r2, #4] + add r4, r12, r4, lsl #4 + add r6, r12, r6, lsl #4 + vld1.8 {d2, d3}, [r4] // Load permutation table base on case + vld1.8 {d4, d5}, [r6] + add r11, r8, r11 // Find jump table target + add r2, r8, r2 + vtbl.8 d16, {d0, d1}, d2 // Permute cand_b to output refmvs_temporal_block + vtbl.8 d17, {d0, d1}, d3 + vtbl.8 d18, {d6, d7}, d4 + vtbl.8 d19, {d6, d7}, d5 + vmov q0, q8 + + // q1 follows on q0 (q8), with another 3 full repetitions of the pattern. + vext.8 q1, q8, q8, #1 + vext.8 q10, q9, q9, #1 + // q2 ends with 3 complete repetitions of the pattern. + vext.8 q2, q8, q1, #4 + vext.8 q11, q9, q10, #4 + + blx r11 + bge 4f // if (cand_b >= end) + vmov q0, q9 + vmov q1, q10 + vmov q2, q11 + cmp r9, r10 + blx r2 + blt 2b // if (cand_b < end) + +4: + pop {r2,r4,r6} + + subs r5, r5, #1 // h-- + add r7, r7, #2 // y += 2 + add r0, r0, r1 // rp += stride + bgt 1b + + pop {r4-r11,pc} + + .align 2 +L(save_tmvs_tbl): + .word 16 * 12 + .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 16 * 12 + .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 8 * 12 + .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 8 * 12 + .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 8 * 12 + .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 8 * 12 + .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 4 * 12 + .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 4 * 12 + .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 4 * 12 + .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 4 * 12 + .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 2 * 12 + .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 2 * 12 + .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 2 * 12 + .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 2 * 12 + .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 2 * 12 + .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + +10: + add r4, r3, #4 + vst1.32 {d0[0]}, [r3] + vst1.8 {d0[4]}, [r4] + add r3, 
r3, #5 + bx lr +20: + add r4, r3, #8 + vst1.8 {d0}, [r3] + vst1.16 {d1[0]}, [r4] + add r3, r3, #2*5 + bx lr +40: + add r4, r3, #16 + vst1.8 {q0}, [r3] + vst1.32 {d2[0]}, [r4] + add r3, r3, #4*5 + bx lr +80: + add r4, r3, #(8*5-16) + // This writes 6 full entries plus 2 extra bytes + vst1.8 {q0, q1}, [r3] + // Write the last few, overlapping with the first write. + vst1.8 {q2}, [r4] + add r3, r3, #8*5 + bx lr +160: + add r4, r3, #6*5 + add r6, r3, #12*5 + // This writes 6 full entries plus 2 extra bytes + vst1.8 {q0, q1}, [r3] + // Write another 6 full entries, slightly overlapping with the first set + vst1.8 {q0, q1}, [r4] + add r4, r3, #(16*5-16) + // Write 8 bytes (one full entry) after the first 12 + vst1.8 {d0}, [r6] + // Write the last 3 entries + vst1.8 {q2}, [r4] + add r3, r3, #16*5 + bx lr +endfunc diff --git a/third_party/dav1d/src/arm/64/looprestoration.S b/third_party/dav1d/src/arm/64/looprestoration.S index a598b72b0395..f8dc0df4d82e 100644 --- a/third_party/dav1d/src/arm/64/looprestoration.S +++ b/third_party/dav1d/src/arm/64/looprestoration.S @@ -965,371 +965,338 @@ function wiener_filter5_hv_8bpc_neon ret endfunc -#define SUM_STRIDE (384+16) - #include "looprestoration_tmpl.S" -// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, -// const pixel (*left)[4], -// const pixel *src, const ptrdiff_t stride, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box3_h_8bpc_neon, export=1 - add w5, w5, #2 // w += 2 +// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box3_row_h_8bpc_neon, export=1 + add w4, w4, #2 // w += 2 - // Set up pointers for reading/writing alternate rows - add x10, x0, #(4*SUM_STRIDE) // sumsq - add x11, x1, #(2*SUM_STRIDE) // sum - add x12, x3, x4 // src - lsl x4, x4, #1 - mov x9, #(2*2*SUM_STRIDE) // double sum stride - - // Subtract the aligned width from the output stride. - add w13, w5, #7 - bic w13, w13, #7 - sub x9, x9, w13, uxtw #1 - - // Store the width for the vertical loop - mov w8, w5 - - // Subtract the number of pixels read from the input from the stride - add w13, w13, #8 - sub x4, x4, w13, uxtw - - // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f - // LR_HAVE_LEFT + tst w5, #1 // LR_HAVE_LEFT + b.eq 1f cbnz x2, 0f - // left == NULL + + // LR_HAVE_LEFT && left == NULL sub x3, x3, #2 - sub x12, x12, #2 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. - // For this case we don't read the left 2 pixels from the src pointer, - // but shift it as if we had done that. - add x4, x4, #2 + ld1 {v0.16b}, [x3], #16 + b 2f - -1: // Loop vertically - ld1 {v0.16b}, [x3], #16 - ld1 {v4.16b}, [x12], #16 - - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x2, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v1.s}[3], [x2], #4 - // Move x3/x12 back to account for the last 2 bytes we loaded earlier, + ld1 {v0.16b}, [x3], #16 + ld1 {v1.s}[3], [x2] + // Move x3 back to account for the last 2 bytes we loaded earlier, // which we'll shift out. sub x3, x3, #2 - sub x12, x12, #2 - ld1 {v5.s}[3], [x2], #4 ext v0.16b, v1.16b, v0.16b, #14 - ext v4.16b, v5.16b, v4.16b, #14 b 2f -0: + +1: + ld1 {v0.16b}, [x3], #16 // !LR_HAVE_LEFT, fill v1 with the leftmost byte // and shift v0 to have 2x the first byte at the front. 
dup v1.16b, v0.b[0] - dup v5.16b, v4.b[0] // Move x3 back to account for the last 2 bytes we loaded before, // which we shifted out. sub x3, x3, #2 - sub x12, x12, #2 ext v0.16b, v1.16b, v0.16b, #14 - ext v4.16b, v5.16b, v4.16b, #14 2: umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. - sub w13, w5, #(2 + 16 - 2 + 1) + sub w13, w4, #(2 + 16 - 2 + 1) ldr b30, [x3, w13, sxtw] - ldr b31, [x12, w13, sxtw] - // Fill v30/v31 with the right padding pixel + // Fill v30 with the right padding pixel dup v30.16b, v30.b[0] - dup v31.16b, v31.b[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #10 + cmp w4, #10 b.ge 4f // If w >= 10, all used input pixels are valid // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called // again; it's not strictly needed in those cases (we pad enough here), // but keeping the code as simple as possible. - // Insert padding in v0/4.b[w] onwards + // Insert padding in v0.b[w] onwards movrel x13, right_ext_mask - sub x13, x13, w5, uxtw + sub x13, x13, w4, uxtw ld1 {v29.16b}, [x13] bit v0.16b, v30.16b, v29.16b - bit v4.16b, v31.16b, v29.16b // Update the precalculated squares umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b 4: // Loop horizontally ext v16.16b, v0.16b, v0.16b, #1 ext v17.16b, v0.16b, v0.16b, #2 - ext v18.16b, v4.16b, v4.16b, #1 - ext v19.16b, v4.16b, v4.16b, #2 uaddl v3.8h, v0.8b, v16.8b - uaddw v3.8h, v3.8h, v17.8b - uaddl v7.8h, v4.8b, v18.8b - uaddw v7.8h, v7.8h, v19.8b - ext v20.16b, v1.16b, v2.16b, #2 + uaddw v3.8h, v3.8h, v17.8b + ext v21.16b, v1.16b, v2.16b, #4 - ext v22.16b, v5.16b, v6.16b, #2 - ext v23.16b, v5.16b, v6.16b, #4 uaddl v26.4s, v1.4h, v20.4h uaddl2 v27.4s, v1.8h, v20.8h uaddw v26.4s, v26.4s, v21.4h uaddw2 v27.4s, v27.4s, v21.8h - uaddl v28.4s, v5.4h, v22.4h - uaddl2 v29.4s, v5.8h, v22.8h - uaddw v28.4s, v28.4s, v23.4h - uaddw2 v29.4s, v29.4s, v23.8h - - subs w5, w5, #8 + subs w4, w4, #8 st1 {v3.8h}, [x1], #16 - st1 {v7.8h}, [x11], #16 st1 {v26.4s,v27.4s}, [x0], #32 - st1 {v28.4s,v29.4s}, [x10], #32 b.le 9f - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT ld1 {v3.8b}, [x3], #8 - ld1 {v7.8b}, [x12], #8 mov v1.16b, v2.16b - mov v5.16b, v6.16b ext v0.16b, v0.16b, v3.16b, #8 - ext v4.16b, v4.16b, v7.16b, #8 umull v2.8h, v3.8b, v3.8b - umull v6.8h, v7.8b, v7.8b b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 
9: - subs w6, w6, #2 - b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x9, lsl #1 - add x10, x10, x9, lsl #1 - add x1, x1, x9 - add x11, x11, x9 - add x3, x3, x4 - add x12, x12, x4 - mov w5, w8 - b 1b -0: ret endfunc -// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, -// const pixel (*left)[4], -// const pixel *src, const ptrdiff_t stride, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box5_h_8bpc_neon, export=1 - add w5, w5, #2 // w += 2 +// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box5_row_h_8bpc_neon, export=1 + add w4, w4, #2 // w += 2 - // Set up pointers for reading/writing alternate rows - add x10, x0, #(4*SUM_STRIDE) // sumsq - add x11, x1, #(2*SUM_STRIDE) // sum - add x12, x3, x4 // src - lsl x4, x4, #1 - mov x9, #(2*2*SUM_STRIDE) // double sum stride - - // Subtract the aligned width from the output stride. - add w13, w5, #7 - bic w13, w13, #7 - sub x9, x9, w13, uxtw #1 - add w13, w13, #8 - sub x4, x4, w13, uxtw - - // Store the width for the vertical loop - mov w8, w5 - - // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f - // LR_HAVE_LEFT + tst w5, #1 // LR_HAVE_LEFT + b.eq 1f cbnz x2, 0f - // left == NULL + + // LR_HAVE_LEFT && left == NULL sub x3, x3, #3 - sub x12, x12, #3 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. - // For this case we don't read the left 3 pixels from the src pointer, - // but shift it as if we had done that. - add x4, x4, #3 + ld1 {v0.16b}, [x3], #16 + b 2f -1: // Loop vertically - ld1 {v0.16b}, [x3], #16 - ld1 {v4.16b}, [x12], #16 - - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x2, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v1.s}[3], [x2], #4 - // Move x3/x12 back to account for the last 3 bytes we loaded earlier, + ld1 {v0.16b}, [x3], #16 + ld1 {v1.s}[3], [x2], #4 + // Move x3 back to account for the last 3 bytes we loaded earlier, // which we'll shift out. sub x3, x3, #3 - sub x12, x12, #3 - ld1 {v5.s}[3], [x2], #4 ext v0.16b, v1.16b, v0.16b, #13 - ext v4.16b, v5.16b, v4.16b, #13 b 2f -0: + +1: + ld1 {v0.16b}, [x3], #16 // !LR_HAVE_LEFT, fill v1 with the leftmost byte // and shift v0 to have 3x the first byte at the front. dup v1.16b, v0.b[0] - dup v5.16b, v4.b[0] // Move x3 back to account for the last 3 bytes we loaded before, // which we shifted out. sub x3, x3, #3 - sub x12, x12, #3 ext v0.16b, v1.16b, v0.16b, #13 - ext v4.16b, v5.16b, v4.16b, #13 2: umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. - sub w13, w5, #(2 + 16 - 3 + 1) + sub w13, w4, #(2 + 16 - 3 + 1) ldr b30, [x3, w13, sxtw] - ldr b31, [x12, w13, sxtw] - // Fill v30/v31 with the right padding pixel + // Fill v30 with the right padding pixel dup v30.16b, v30.b[0] - dup v31.16b, v31.b[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #11 + cmp w4, #11 b.ge 4f // If w >= 11, all used input pixels are valid // 1 <= w < 11, w+1 pixels valid in v0. 
For w=9 or w=10, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. - // Insert padding in v0/4.b[w+1] onwards; fuse the +1 into the + // Insert padding in v0.b[w+1] onwards; fuse the +1 into the // buffer pointer. movrel x13, right_ext_mask, -1 - sub x13, x13, w5, uxtw + sub x13, x13, w4, uxtw ld1 {v29.16b}, [x13] bit v0.16b, v30.16b, v29.16b - bit v4.16b, v31.16b, v29.16b // Update the precalculated squares umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b 4: // Loop horizontally ext v16.16b, v0.16b, v0.16b, #1 ext v17.16b, v0.16b, v0.16b, #2 ext v18.16b, v0.16b, v0.16b, #3 ext v19.16b, v0.16b, v0.16b, #4 - ext v20.16b, v4.16b, v4.16b, #1 - ext v21.16b, v4.16b, v4.16b, #2 - ext v22.16b, v4.16b, v4.16b, #3 - ext v23.16b, v4.16b, v4.16b, #4 uaddl v3.8h, v0.8b, v16.8b uaddl v24.8h, v17.8b, v18.8b - uaddl v7.8h, v4.8b, v20.8b uaddw v3.8h, v3.8h, v19.8b - uaddl v25.8h, v21.8b, v22.8b - uaddw v7.8h, v7.8h, v23.8b add v3.8h, v3.8h, v24.8h - add v7.8h, v7.8h, v25.8h ext v16.16b, v1.16b, v2.16b, #2 ext v17.16b, v1.16b, v2.16b, #4 ext v18.16b, v1.16b, v2.16b, #6 ext v19.16b, v1.16b, v2.16b, #8 - ext v20.16b, v5.16b, v6.16b, #2 - ext v21.16b, v5.16b, v6.16b, #4 - ext v22.16b, v5.16b, v6.16b, #6 - ext v23.16b, v5.16b, v6.16b, #8 uaddl v26.4s, v1.4h, v16.4h uaddl2 v27.4s, v1.8h, v16.8h uaddl v16.4s, v17.4h, v18.4h uaddl2 v17.4s, v17.8h, v18.8h - uaddl v28.4s, v5.4h, v20.4h - uaddl2 v29.4s, v5.8h, v20.8h uaddw v26.4s, v26.4s, v19.4h uaddw2 v27.4s, v27.4s, v19.8h - uaddl v20.4s, v21.4h, v22.4h - uaddl2 v21.4s, v21.8h, v22.8h - uaddw v28.4s, v28.4s, v23.4h - uaddw2 v29.4s, v29.4s, v23.8h add v26.4s, v26.4s, v16.4s add v27.4s, v27.4s, v17.4s - add v28.4s, v28.4s, v20.4s - add v29.4s, v29.4s, v21.4s - subs w5, w5, #8 + subs w4, w4, #8 st1 {v3.8h}, [x1], #16 - st1 {v7.8h}, [x11], #16 st1 {v26.4s,v27.4s}, [x0], #32 - st1 {v28.4s,v29.4s}, [x10], #32 b.le 9f - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT ld1 {v3.8b}, [x3], #8 - ld1 {v7.8b}, [x12], #8 mov v1.16b, v2.16b - mov v5.16b, v6.16b ext v0.16b, v0.16b, v3.16b, #8 - ext v4.16b, v4.16b, v7.16b, #8 umull v2.8h, v3.8b, v3.8b - umull v6.8h, v7.8b, v7.8b + b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 9: - subs w6, w6, #2 - b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x9, lsl #1 - add x10, x10, x9, lsl #1 - add x1, x1, x9 - add x11, x11, x9 - add x3, x3, x4 - add x12, x12, x4 - mov w5, w8 - b 1b + ret +endfunc + +// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3, +// int32_t *sumsq5, int16_t *sum5, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box35_row_h_8bpc_neon, export=1 + add w6, w6, #2 // w += 2 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + cbnz x4, 0f + + // LR_HAVE_LEFT && left == NULL + sub x5, x5, #3 + ld1 {v0.16b}, [x5], #16 + b 2f + 0: + // LR_HAVE_LEFT, left != NULL + ld1 {v0.16b}, [x5], #16 + ld1 {v1.s}[3], [x4], #4 + // Move x3 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub x5, x5, #3 + ext v0.16b, v1.16b, v0.16b, #13 + b 2f + +1: + ld1 {v0.16b}, [x5], #16 + // !LR_HAVE_LEFT, fill v1 with the leftmost byte + // and shift v0 to have 3x the first byte at the front. 
+ dup v1.16b, v0.b[0] + // Move x3 back to account for the last 3 bytes we loaded before, + // which we shifted out. + sub x5, x5, #3 + ext v0.16b, v1.16b, v0.16b, #13 + +2: + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w13, w6, #(2 + 16 - 3 + 1) + ldr b30, [x5, w13, sxtw] + // Fill v30 with the right padding pixel + dup v30.16b, v30.b[0] +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w6, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0.b[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel x13, right_ext_mask, -1 + sub x13, x13, w6, uxtw + ld1 {v29.16b}, [x13] + + bit v0.16b, v30.16b, v29.16b + + // Update the precalculated squares + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + +4: // Loop horizontally + ext v16.16b, v0.16b, v0.16b, #1 + ext v17.16b, v0.16b, v0.16b, #2 + ext v19.16b, v0.16b, v0.16b, #4 + ext v18.16b, v0.16b, v0.16b, #3 + uaddl v3.8h, v16.8b, v17.8b + uaddl v24.8h, v0.8b, v19.8b + uaddw v3.8h, v3.8h, v18.8b + + ext v16.16b, v1.16b, v2.16b, #2 + ext v17.16b, v1.16b, v2.16b, #4 + ext v19.16b, v1.16b, v2.16b, #8 + ext v18.16b, v1.16b, v2.16b, #6 + + st1 {v3.8h}, [x1], #16 + add v3.8h, v3.8h, v24.8h + + uaddl v26.4s, v16.4h, v17.4h + uaddl2 v27.4s, v16.8h, v17.8h + uaddl v16.4s, v1.4h, v19.4h + uaddl2 v17.4s, v1.8h, v19.8h + uaddw v26.4s, v26.4s, v18.4h + uaddw2 v27.4s, v27.4s, v18.8h + + st1 {v26.4s,v27.4s}, [x0], #32 + add v26.4s, v26.4s, v16.4s + add v27.4s, v27.4s, v17.4s + + subs w6, w6, #8 + + st1 {v3.8h}, [x3], #16 + st1 {v26.4s,v27.4s}, [x2], #32 + + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8b}, [x5], #8 + mov v1.16b, v2.16b + ext v0.16b, v0.16b, v3.16b, #8 + umull v2.8h, v3.8b, v3.8b + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: ret endfunc diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S index 8954e604cf55..3b76b1ee2af8 100644 --- a/third_party/dav1d/src/arm/64/looprestoration16.S +++ b/third_party/dav1d/src/arm/64/looprestoration16.S @@ -1070,349 +1070,318 @@ function wiener_filter5_hv_16bpc_neon ret endfunc -#define SUM_STRIDE (384+16) - #include "looprestoration_tmpl.S" -// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, -// const pixel (*left)[4], -// const pixel *src, const ptrdiff_t stride, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box3_h_16bpc_neon, export=1 - add w5, w5, #2 // w += 2 +// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box3_row_h_16bpc_neon, export=1 + add w4, w4, #2 // w += 2 - // Set up pointers for reading/writing alternate rows - add x10, x0, #(4*SUM_STRIDE) // sumsq - add x11, x1, #(2*SUM_STRIDE) // sum - add x12, x3, x4 // src - lsl x4, x4, #1 - mov x9, #(2*2*SUM_STRIDE) // double sum stride - - // Subtract the aligned width from the output stride. 
- add w13, w5, #7 - bic w13, w13, #7 - sub x9, x9, w13, uxtw #1 - - // Store the width for the vertical loop - mov w8, w5 - - // Subtract the number of pixels read from the input from the stride - add w13, w13, #8 - sub x4, x4, w13, uxtw #1 - - // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f - // LR_HAVE_LEFT + tst w5, #1 // LR_HAVE_LEFT + b.eq 1f cbnz x2, 0f - // left == NULL + + // LR_HAVE_LEFT && left == NULL sub x3, x3, #4 - sub x12, x12, #4 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. - // For this case we don't read the left 2 pixels from the src pointer, - // but shift it as if we had done that. - add x4, x4, #4 + ld1 {v0.8h, v1.8h}, [x3], #32 + b 2f - -1: // Loop vertically - ld1 {v0.8h, v1.8h}, [x3], #32 - ld1 {v16.8h, v17.8h}, [x12], #32 - - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x2, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v2.d}[1], [x2], #8 - // Move x3/x12 back to account for the last 2 pixels we loaded earlier, + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v2.d}[1], [x2] + // Move x3 back to account for the last 2 pixels we loaded earlier, // which we'll shift out. sub x3, x3, #4 - sub x12, x12, #4 - ld1 {v18.d}[1], [x2], #8 - ext v1.16b, v0.16b, v1.16b, #12 - ext v0.16b, v2.16b, v0.16b, #12 - ext v17.16b, v16.16b, v17.16b, #12 - ext v16.16b, v18.16b, v16.16b, #12 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 b 2f -0: + +1: + ld1 {v0.8h, v1.8h}, [x3], #32 // !LR_HAVE_LEFT, fill v2 with the leftmost pixel // and shift v0/v1 to have 2x the first pixel at the front. - dup v2.8h, v0.h[0] - dup v18.8h, v16.h[0] + dup v2.8h, v0.h[0] // Move x3 back to account for the last 2 pixels we loaded before, // which we shifted out. sub x3, x3, #4 - sub x12, x12, #4 - ext v1.16b, v0.16b, v1.16b, #12 - ext v0.16b, v2.16b, v0.16b, #12 - ext v17.16b, v16.16b, v17.16b, #12 - ext v16.16b, v18.16b, v16.16b, #12 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 2: - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. - sub w13, w5, #(2 + 16 - 2 + 1) + sub w13, w4, #(2 + 16 - 2 + 1) ldr h30, [x3, w13, sxtw #1] - ldr h31, [x12, w13, sxtw #1] - // Fill v30/v31 with the right padding pixel + // Fill v30 with the right padding pixel dup v30.8h, v30.h[0] - dup v31.8h, v31.h[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #10 + cmp w4, #10 b.ge 4f // If w >= 10, all used input pixels are valid - // 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called + // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called // again; it's not strictly needed in those cases (we pad enough here), // but keeping the code as simple as possible. 
- // Insert padding in v0/1.h[w] onwards + // Insert padding in v0.b[w] onwards movrel x13, right_ext_mask - sub x13, x13, w5, uxtw #1 + sub x13, x13, w4, uxtw #1 ld1 {v28.16b, v29.16b}, [x13] bit v0.16b, v30.16b, v28.16b bit v1.16b, v30.16b, v29.16b - bit v16.16b, v31.16b, v28.16b - bit v17.16b, v31.16b, v29.16b 4: // Loop horizontally ext v26.16b, v0.16b, v1.16b, #2 - ext v28.16b, v16.16b, v17.16b, #2 ext v27.16b, v0.16b, v1.16b, #4 - ext v29.16b, v16.16b, v17.16b, #4 add v6.8h, v0.8h, v26.8h umull v22.4s, v0.4h, v0.4h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h - add v7.8h, v16.8h, v28.8h - umull v24.4s, v16.4h, v16.4h - umlal v24.4s, v28.4h, v28.4h - umlal v24.4s, v29.4h, v29.4h add v6.8h, v6.8h, v27.8h umull2 v23.4s, v0.8h, v0.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h - add v7.8h, v7.8h, v29.8h - umull2 v25.4s, v16.8h, v16.8h - umlal2 v25.4s, v28.8h, v28.8h - umlal2 v25.4s, v29.8h, v29.8h - subs w5, w5, #8 + subs w4, w4, #8 st1 {v6.8h}, [x1], #16 - st1 {v7.8h}, [x11], #16 st1 {v22.4s,v23.4s}, [x0], #32 - st1 {v24.4s,v25.4s}, [x10], #32 b.le 9f - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT mov v0.16b, v1.16b - mov v16.16b, v17.16b ld1 {v1.8h}, [x3], #16 - ld1 {v17.8h}, [x12], #16 b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 9: - subs w6, w6, #2 - b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x9, lsl #1 - add x10, x10, x9, lsl #1 - add x1, x1, x9 - add x11, x11, x9 - add x3, x3, x4 - add x12, x12, x4 - mov w5, w8 - b 1b -0: ret endfunc -// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, -// const pixel (*left)[4], -// const pixel *src, const ptrdiff_t stride, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box5_h_16bpc_neon, export=1 - add w5, w5, #2 // w += 2 +// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box5_row_h_16bpc_neon, export=1 + add w4, w4, #2 // w += 2 - // Set up pointers for reading/writing alternate rows - add x10, x0, #(4*SUM_STRIDE) // sumsq - add x11, x1, #(2*SUM_STRIDE) // sum - add x12, x3, x4 // src - lsl x4, x4, #1 - mov x9, #(2*2*SUM_STRIDE) // double sum stride - - // Subtract the aligned width from the output stride. - add w13, w5, #7 - bic w13, w13, #7 - sub x9, x9, w13, uxtw #1 - add w13, w13, #8 - sub x4, x4, w13, uxtw #1 - - // Store the width for the vertical loop - mov w8, w5 - - // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f - // LR_HAVE_LEFT + tst w5, #1 // LR_HAVE_LEFT + b.eq 1f cbnz x2, 0f - // left == NULL + + // LR_HAVE_LEFT && left == NULL sub x3, x3, #6 - sub x12, x12, #6 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. - // For this case we don't read the left 3 pixels from the src pointer, - // but shift it as if we had done that. - add x4, x4, #6 + ld1 {v0.8h, v1.8h}, [x3], #32 + b 2f -1: // Loop vertically - ld1 {v0.8h, v1.8h}, [x3], #32 - ld1 {v16.8h, v17.8h}, [x12], #32 - - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x2, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v2.d}[1], [x2], #8 - // Move x3/x12 back to account for the last 3 pixels we loaded earlier, + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v2.d}[1], [x2], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, // which we'll shift out. 
sub x3, x3, #6 - sub x12, x12, #6 - ld1 {v18.d}[1], [x2], #8 ext v1.16b, v0.16b, v1.16b, #10 ext v0.16b, v2.16b, v0.16b, #10 - ext v17.16b, v16.16b, v17.16b, #10 - ext v16.16b, v18.16b, v16.16b, #10 b 2f -0: + +1: + ld1 {v0.8h, v1.8h}, [x3], #32 // !LR_HAVE_LEFT, fill v2 with the leftmost pixel // and shift v0/v1 to have 3x the first pixel at the front. dup v2.8h, v0.h[0] - dup v18.8h, v16.h[0] // Move x3 back to account for the last 3 pixels we loaded before, // which we shifted out. sub x3, x3, #6 - sub x12, x12, #6 ext v1.16b, v0.16b, v1.16b, #10 ext v0.16b, v2.16b, v0.16b, #10 - ext v17.16b, v16.16b, v17.16b, #10 - ext v16.16b, v18.16b, v16.16b, #10 2: - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. - sub w13, w5, #(2 + 16 - 3 + 1) + sub w13, w4, #(2 + 16 - 3 + 1) ldr h30, [x3, w13, sxtw #1] - ldr h31, [x12, w13, sxtw #1] - // Fill v30/v31 with the right padding pixel + // Fill v30 with the right padding pixel dup v30.8h, v30.h[0] - dup v31.8h, v31.h[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #11 + cmp w4, #11 b.ge 4f // If w >= 11, all used input pixels are valid - // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10, + // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. - // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the + // Insert padding in v0.b[w+1] onwards; fuse the +1 into the // buffer pointer. - movrel x13, right_ext_mask, -2 - sub x13, x13, w5, uxtw #1 + movrel x13, right_ext_mask, -1 + sub x13, x13, w4, uxtw #1 ld1 {v28.16b, v29.16b}, [x13] bit v0.16b, v30.16b, v28.16b bit v1.16b, v30.16b, v29.16b - bit v16.16b, v31.16b, v28.16b - bit v17.16b, v31.16b, v29.16b 4: // Loop horizontally ext v26.16b, v0.16b, v1.16b, #2 - ext v28.16b, v16.16b, v17.16b, #2 ext v27.16b, v0.16b, v1.16b, #4 - ext v29.16b, v16.16b, v17.16b, #4 add v6.8h, v0.8h, v26.8h umull v22.4s, v0.4h, v0.4h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h - add v7.8h, v16.8h, v28.8h - umull v24.4s, v16.4h, v16.4h - umlal v24.4s, v28.4h, v28.4h - umlal v24.4s, v29.4h, v29.4h add v6.8h, v6.8h, v27.8h umull2 v23.4s, v0.8h, v0.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h - add v7.8h, v7.8h, v29.8h - umull2 v25.4s, v16.8h, v16.8h - umlal2 v25.4s, v28.8h, v28.8h - umlal2 v25.4s, v29.8h, v29.8h ext v26.16b, v0.16b, v1.16b, #6 - ext v28.16b, v16.16b, v17.16b, #6 ext v27.16b, v0.16b, v1.16b, #8 - ext v29.16b, v16.16b, v17.16b, #8 add v6.8h, v6.8h, v26.8h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h - add v7.8h, v7.8h, v28.8h - umlal v24.4s, v28.4h, v28.4h - umlal v24.4s, v29.4h, v29.4h add v6.8h, v6.8h, v27.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h - add v7.8h, v7.8h, v29.8h - umlal2 v25.4s, v28.8h, v28.8h - umlal2 v25.4s, v29.8h, v29.8h - subs w5, w5, #8 + subs w4, w4, #8 st1 {v6.8h}, [x1], #16 - st1 {v7.8h}, [x11], #16 st1 {v22.4s,v23.4s}, [x0], #32 - st1 {v24.4s,v25.4s}, [x10], #32 b.le 9f - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT mov v0.16b, v1.16b - mov v16.16b, v17.16b - ld1 {v1.8h}, [x3], #16 - ld1 {v17.8h}, [x12], #16 + ld1 {v1.8h}, [x3], #16 b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 
9: - subs w6, w6, #2 - b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x9, lsl #1 - add x10, x10, x9, lsl #1 - add x1, x1, x9 - add x11, x11, x9 - add x3, x3, x4 - add x12, x12, x4 - mov w5, w8 - b 1b + ret +endfunc + +// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3, +// int32_t *sumsq5, int16_t *sum5, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box35_row_h_16bpc_neon, export=1 + add w6, w6, #2 // w += 2 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + cbnz x4, 0f + + // LR_HAVE_LEFT && left == NULL + sub x5, x5, #6 + ld1 {v0.8h, v1.8h}, [x5], #32 + b 2f + 0: + // LR_HAVE_LEFT, left != NULL + ld1 {v0.8h, v1.8h}, [x5], #32 + ld1 {v2.d}[1], [x4], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x5, x5, #6 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + b 2f + +1: + ld1 {v0.8h, v1.8h}, [x5], #32 + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 3x the first pixel at the front. + dup v2.8h, v0.h[0] + // Move x5 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x5, x5, #6 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + +2: + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub w13, w6, #(2 + 16 - 3 + 1) + ldr h30, [x5, w13, sxtw #1] + // Fill v30 with the right padding pixel + dup v30.8h, v30.h[0] +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w6, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0.b[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel x13, right_ext_mask, -1 + sub x13, x13, w6, uxtw #1 + ld1 {v28.16b, v29.16b}, [x13] + + bit v0.16b, v30.16b, v28.16b + bit v1.16b, v30.16b, v29.16b + +4: // Loop horizontally + ext v16.16b, v0.16b, v1.16b, #2 + ext v17.16b, v0.16b, v1.16b, #4 + ext v19.16b, v0.16b, v1.16b, #8 + ext v18.16b, v0.16b, v1.16b, #6 + + add v20.8h, v16.8h, v17.8h + add v21.8h, v0.8h, v19.8h + add v20.8h, v20.8h, v18.8h + + umull v22.4s, v16.4h, v16.4h + umlal v22.4s, v17.4h, v17.4h + umlal v22.4s, v18.4h, v18.4h + + umull2 v23.4s, v16.8h, v16.8h + umlal2 v23.4s, v17.8h, v17.8h + umlal2 v23.4s, v18.8h, v18.8h + + add v21.8h, v21.8h, v20.8h + st1 {v20.8h}, [x1], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + + umlal v22.4s, v0.4h, v0.4h + umlal v22.4s, v19.4h, v19.4h + + umlal2 v23.4s, v0.8h, v0.8h + umlal2 v23.4s, v19.8h, v19.8h + + subs w6, w6, #8 + + st1 {v21.8h}, [x3], #16 + st1 {v22.4s,v23.4s}, [x2], #32 + + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + ld1 {v1.8h}, [x5], #16 + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. 
+ +9: ret endfunc diff --git a/third_party/dav1d/src/arm/64/looprestoration_common.S b/third_party/dav1d/src/arm/64/looprestoration_common.S index 200eb63189ef..745f6c20f491 100644 --- a/third_party/dav1d/src/arm/64/looprestoration_common.S +++ b/third_party/dav1d/src/arm/64/looprestoration_common.S @@ -28,332 +28,29 @@ #include "src/arm/asm.S" #include "util.S" -#define SUM_STRIDE (384+16) +// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum, +// int32_t *AA, int16_t *BB, +// const int w, const int s, +// const int bitdepth_max); +function sgr_box3_vert_neon, export=1 + stp d8, d9, [sp, #-0x30]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] -// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box3_v_neon, export=1 - add w10, w3, #2 // Number of output rows to move back - mov w11, w3 // Number of input rows to move back - add w2, w2, #2 // Actual summed width - mov x7, #(4*SUM_STRIDE) // sumsq stride - mov x8, #(2*SUM_STRIDE) // sum stride - sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride - sub x1, x1, #(2*SUM_STRIDE) // sum -= stride + add w4, w4, #2 + clz w9, w6 // bitdepth_max + dup v28.4s, w5 // strength - tst w4, #4 // LR_HAVE_TOP - b.eq 0f - // If have top, read from row -2. - sub x5, x0, #(4*SUM_STRIDE) - sub x6, x1, #(2*SUM_STRIDE) - add w11, w11, #2 - b 1f -0: - // !LR_HAVE_TOP - // If we don't have top, read from row 0 even if - // we start writing to row -1. - add x5, x0, #(4*SUM_STRIDE) - add x6, x1, #(2*SUM_STRIDE) -1: + ldp x5, x6, [x0] + ldr x0, [x0, #16] + ldp x7, x8, [x1] + ldr x1, [x1, #16] - tst w4, #8 // LR_HAVE_BOTTOM - b.eq 1f - // LR_HAVE_BOTTOM - add w3, w3, #2 // Sum all h+2 lines with the main loop - add w11, w11, #2 -1: - mov w9, w3 // Backup of h for next loops + movi v31.4s, #9 // n -1: - // Start of horizontal loop; start one vertical filter slice. - // Start loading rows into v16-v21 and v24-v26 taking top - // padding into consideration. - tst w4, #4 // LR_HAVE_TOP - ld1 {v16.4s, v17.4s}, [x5], x7 - ld1 {v24.8h}, [x6], x8 - b.eq 2f - // LR_HAVE_TOP - ld1 {v18.4s, v19.4s}, [x5], x7 - ld1 {v25.8h}, [x6], x8 - ld1 {v20.4s, v21.4s}, [x5], x7 - ld1 {v26.8h}, [x6], x8 - b 3f -2: // !LR_HAVE_TOP - mov v18.16b, v16.16b - mov v19.16b, v17.16b - mov v25.16b, v24.16b - mov v20.16b, v16.16b - mov v21.16b, v17.16b - mov v26.16b, v24.16b - -3: - subs w3, w3, #1 -.macro add3 - add v16.4s, v16.4s, v18.4s - add v17.4s, v17.4s, v19.4s - add v24.8h, v24.8h, v25.8h - add v16.4s, v16.4s, v20.4s - add v17.4s, v17.4s, v21.4s - add v24.8h, v24.8h, v26.8h - st1 {v16.4s, v17.4s}, [x0], x7 - st1 {v24.8h}, [x1], x8 -.endm - add3 - mov v16.16b, v18.16b - mov v17.16b, v19.16b - mov v24.16b, v25.16b - mov v18.16b, v20.16b - mov v19.16b, v21.16b - mov v25.16b, v26.16b - b.le 4f - ld1 {v20.4s, v21.4s}, [x5], x7 - ld1 {v26.8h}, [x6], x8 - b 3b - -4: - tst w4, #8 // LR_HAVE_BOTTOM - b.ne 5f - // !LR_HAVE_BOTTOM - // Produce two more rows, extending the already loaded rows. - add3 - mov v16.16b, v18.16b - mov v17.16b, v19.16b - mov v24.16b, v25.16b - add3 - -5: // End of one vertical slice. - subs w2, w2, #8 - b.le 0f - // Move pointers back up to the top and loop horizontally. 
- // Input pointers - msub x5, x7, x11, x5 - msub x6, x8, x11, x6 - // Output pointers - msub x0, x7, x10, x0 - msub x1, x8, x10, x1 - add x0, x0, #32 - add x1, x1, #16 - add x5, x5, #32 - add x6, x6, #16 - mov w3, w9 - b 1b - -0: - ret -.purgem add3 -endfunc - -// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box5_v_neon, export=1 - add w10, w3, #2 // Number of output rows to move back - mov w11, w3 // Number of input rows to move back - add w2, w2, #8 // Actual summed width - mov x7, #(4*SUM_STRIDE) // sumsq stride - mov x8, #(2*SUM_STRIDE) // sum stride - sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride - sub x1, x1, #(2*SUM_STRIDE) // sum -= stride - - tst w4, #4 // LR_HAVE_TOP - b.eq 0f - // If have top, read from row -2. - sub x5, x0, #(4*SUM_STRIDE) - sub x6, x1, #(2*SUM_STRIDE) - add w11, w11, #2 - b 1f -0: - // !LR_HAVE_TOP - // If we don't have top, read from row 0 even if - // we start writing to row -1. - add x5, x0, #(4*SUM_STRIDE) - add x6, x1, #(2*SUM_STRIDE) -1: - - tst w4, #8 // LR_HAVE_BOTTOM - b.eq 0f - // LR_HAVE_BOTTOM - add w3, w3, #2 // Handle h+2 lines with the main loop - add w11, w11, #2 - b 1f -0: - // !LR_HAVE_BOTTOM - sub w3, w3, #1 // Handle h-1 lines with the main loop -1: - mov w9, w3 // Backup of h for next loops - -1: - // Start of horizontal loop; start one vertical filter slice. - // Start loading rows into v16-v25 and v26-v30 taking top - // padding into consideration. - tst w4, #4 // LR_HAVE_TOP - ld1 {v16.4s, v17.4s}, [x5], x7 - ld1 {v26.8h}, [x6], x8 - b.eq 2f - // LR_HAVE_TOP - ld1 {v20.4s, v21.4s}, [x5], x7 - ld1 {v28.8h}, [x6], x8 - mov v18.16b, v16.16b - mov v19.16b, v17.16b - mov v27.16b, v26.16b - ld1 {v22.4s, v23.4s}, [x5], x7 - ld1 {v29.8h}, [x6], x8 - b 3f -2: // !LR_HAVE_TOP - mov v18.16b, v16.16b - mov v19.16b, v17.16b - mov v27.16b, v26.16b - mov v20.16b, v16.16b - mov v21.16b, v17.16b - mov v28.16b, v26.16b - mov v22.16b, v16.16b - mov v23.16b, v17.16b - mov v29.16b, v26.16b - -3: - cbz w3, 4f - ld1 {v24.4s, v25.4s}, [x5], x7 - ld1 {v30.8h}, [x6], x8 - -3: - // Start of vertical loop - subs w3, w3, #2 -.macro add5 - add v16.4s, v16.4s, v18.4s - add v17.4s, v17.4s, v19.4s - add v26.8h, v26.8h, v27.8h - add v0.4s, v20.4s, v22.4s - add v1.4s, v21.4s, v23.4s - add v2.8h, v28.8h, v29.8h - add v16.4s, v16.4s, v24.4s - add v17.4s, v17.4s, v25.4s - add v26.8h, v26.8h, v30.8h - add v16.4s, v16.4s, v0.4s - add v17.4s, v17.4s, v1.4s - add v26.8h, v26.8h, v2.8h - st1 {v16.4s, v17.4s}, [x0], x7 - st1 {v26.8h}, [x1], x8 -.endm - add5 -.macro shift2 - mov v16.16b, v20.16b - mov v17.16b, v21.16b - mov v26.16b, v28.16b - mov v18.16b, v22.16b - mov v19.16b, v23.16b - mov v27.16b, v29.16b - mov v20.16b, v24.16b - mov v21.16b, v25.16b - mov v28.16b, v30.16b -.endm - shift2 - add x0, x0, x7 - add x1, x1, x8 - b.le 5f - ld1 {v22.4s, v23.4s}, [x5], x7 - ld1 {v29.8h}, [x6], x8 - ld1 {v24.4s, v25.4s}, [x5], x7 - ld1 {v30.8h}, [x6], x8 - b 3b - -4: - // h == 1, !LR_HAVE_BOTTOM. - // Pad the last row with the only content row, and add. - mov v24.16b, v22.16b - mov v25.16b, v23.16b - mov v30.16b, v29.16b - add5 - shift2 - add x0, x0, x7 - add x1, x1, x8 - add5 - b 6f - -5: - tst w4, #8 // LR_HAVE_BOTTOM - b.ne 6f - // !LR_HAVE_BOTTOM - cbnz w3, 5f - // The intended three edge rows left; output the one at h-2 and - // the past edge one at h. - ld1 {v22.4s, v23.4s}, [x5], x7 - ld1 {v29.8h}, [x6], x8 - // Pad the past-edge row from the last content row. 
- mov v24.16b, v22.16b - mov v25.16b, v23.16b - mov v30.16b, v29.16b - add5 - shift2 - add x0, x0, x7 - add x1, x1, x8 - // The last two rows are already padded properly here. - add5 - b 6f - -5: - // w3 == -1, two rows left, output one. - // Pad the last two rows from the mid one. - mov v22.16b, v20.16b - mov v23.16b, v21.16b - mov v29.16b, v28.16b - mov v24.16b, v20.16b - mov v25.16b, v21.16b - mov v30.16b, v28.16b - add5 - add x0, x0, x7 - add x1, x1, x8 - b 6f - -6: // End of one vertical slice. - subs w2, w2, #8 - b.le 0f - // Move pointers back up to the top and loop horizontally. - // Input pointers - msub x5, x7, x11, x5 - msub x6, x8, x11, x6 - // Output pointers - msub x0, x7, x10, x0 - msub x1, x8, x10, x1 - add x0, x0, #32 - add x1, x1, #16 - add x5, x5, #32 - add x6, x6, #16 - mov w3, w9 - b 1b - -0: - ret -.purgem add5 -endfunc - -// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength, -// const int bitdepth_max); -// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength, -// const int bitdepth_max); -function sgr_calc_ab1_neon, export=1 - clz w9, w5 - add x3, x3, #2 // h += 2 - movi v31.4s, #9 // n - mov x5, #455 - mov x8, #SUM_STRIDE - b sgr_calc_ab_neon -endfunc - -function sgr_calc_ab2_neon, export=1 - clz w9, w5 - add x3, x3, #3 // h += 3 - asr x3, x3, #1 // h /= 2 - movi v31.4s, #25 // n - mov x5, #164 - mov x8, #(2*SUM_STRIDE) -endfunc - -function sgr_calc_ab_neon sub w9, w9, #24 // -bitdepth_min_8 movrel x12, X(sgr_x_by_x) + mov w13, #455 // one_by_x ld1 {v16.16b, v17.16b, v18.16b}, [x12] dup v6.8h, w9 // -bitdepth_min_8 movi v19.16b, #5 @@ -363,70 +60,213 @@ function sgr_calc_ab_neon movi v23.8b, #169 // idx of last 2 movi v24.8b, #254 // idx of last 1 saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8 - add x2, x2, #2 // w += 2 - add x7, x2, #7 - bic x7, x7, #7 // aligned w - sub x7, x8, x7 // increment between rows movi v29.8h, #1, lsl #8 - dup v28.4s, w4 - dup v30.4s, w5 // one_by_x - sub x0, x0, #(4*(SUM_STRIDE)) - sub x1, x1, #(2*(SUM_STRIDE)) - mov x6, x2 // backup of w + dup v30.4s, w13 // one_by_x + sub v16.16b, v16.16b, v19.16b sub v17.16b, v17.16b, v19.16b sub v18.16b, v18.16b, v19.16b + + ld1 {v8.4s, v9.4s}, [x5], #32 + ld1 {v10.4s, v11.4s}, [x6], #32 + ld1 {v12.8h}, [x7], #16 + ld1 {v13.8h}, [x8], #16 + ld1 {v0.4s, v1.4s}, [x0], #32 + ld1 {v2.8h}, [x1], #16 1: - subs x2, x2, #8 - ld1 {v0.4s, v1.4s}, [x0] // a - ld1 {v2.8h}, [x1] // b - srshl v0.4s, v0.4s, v7.4s - srshl v1.4s, v1.4s, v7.4s - srshl v4.8h, v2.8h, v6.8h - mul v0.4s, v0.4s, v31.4s // a * n - mul v1.4s, v1.4s, v31.4s // a * n - umull v3.4s, v4.4h, v4.4h // b * b - umull2 v4.4s, v4.8h, v4.8h // b * b - uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) - uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) - mul v0.4s, v0.4s, v28.4s // p * s - mul v1.4s, v1.4s, v28.4s // p * s - uqshrn v0.4h, v0.4s, #16 - uqshrn2 v0.8h, v1.4s, #16 - uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) - cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 - cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 - tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b - cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 - cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 - add v25.8b, v25.8b, v26.8b - cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 - add v27.8b, v27.8b, v4.8b - add v5.8b, v5.8b, v19.8b - add v25.8b, v25.8b, v27.8b - add v1.8b, v1.8b, v5.8b - add v1.8b, v1.8b, v25.8b - uxtl v1.8h, v1.8b // x + add 
v8.4s, v8.4s, v10.4s + add v9.4s, v9.4s, v11.4s - umull v3.4s, v1.4h, v2.4h // x * BB[i] - umull2 v4.4s, v1.8h, v2.8h // x * BB[i] - mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x - mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x - srshr v3.4s, v3.4s, #12 // AA[i] - srshr v4.4s, v4.4s, #12 // AA[i] - sub v2.8h, v29.8h, v1.8h // 256 - x + add v12.8h, v12.8h, v13.8h - st1 {v3.4s, v4.4s}, [x0], #32 - st1 {v2.8h}, [x1], #16 + subs w4, w4, #8 + add v0.4s, v0.4s, v8.4s + add v1.4s, v1.4s, v9.4s + add v2.8h, v2.8h, v12.8h + + srshl v0.4s, v0.4s, v7.4s + srshl v1.4s, v1.4s, v7.4s + srshl v4.8h, v2.8h, v6.8h + mul v0.4s, v0.4s, v31.4s // a * n + mul v1.4s, v1.4s, v31.4s // a * n + umull v3.4s, v4.4h, v4.4h // b * b + umull2 v4.4s, v4.8h, v4.8h // b * b + uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) + uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) + mul v0.4s, v0.4s, v28.4s // p * s + mul v1.4s, v1.4s, v28.4s // p * s + ld1 {v8.4s, v9.4s}, [x5], #32 + uqshrn v0.4h, v0.4s, #16 + uqshrn2 v0.8h, v1.4s, #16 + ld1 {v10.4s, v11.4s}, [x6], #32 + uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) + + ld1 {v12.8h}, [x7], #16 + + cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 + cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 + tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b + cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 + cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 + add v25.8b, v25.8b, v26.8b + cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 + add v27.8b, v27.8b, v4.8b + add v5.8b, v5.8b, v19.8b + add v25.8b, v25.8b, v27.8b + add v5.8b, v1.8b, v5.8b + ld1 {v13.8h}, [x8], #16 + add v5.8b, v5.8b, v25.8b + ld1 {v0.4s, v1.4s}, [x0], #32 + uxtl v5.8h, v5.8b // x + + umull v3.4s, v5.4h, v2.4h // x * BB[i] + umull2 v4.4s, v5.8h, v2.8h // x * BB[i] + mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x + srshr v3.4s, v3.4s, #12 // AA[i] + srshr v4.4s, v4.4s, #12 // AA[i] + sub v5.8h, v29.8h, v5.8h // 256 - x + ld1 {v2.8h}, [x1], #16 + + st1 {v3.4s, v4.4s}, [x2], #32 + st1 {v5.8h}, [x3], #16 b.gt 1b - subs x3, x3, #1 - b.le 0f - add x0, x0, x7, lsl #2 - add x1, x1, x7, lsl #1 - mov x2, x6 - b 1b -0: + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x30 + ret +endfunc + +// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum, +// int32_t *AA, int16_t *BB, +// const int w, const int s, +// const int bitdepth_max); +function sgr_box5_vert_neon, export=1 + stp d8, d9, [sp, #-0x40]! 
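+        // Note: d8-d15 are callee-saved under AAPCS64 (low 64 bits), so back
+        // them up here; they are restored in the epilogue before ret.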
+ stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + add w4, w4, #2 + clz w15, w6 // bitdepth_max + dup v28.4s, w5 // strength + + ldp x5, x6, [x0] + ldp x7, x8, [x0, #16] + ldr x0, [x0, #32] + ldp x9, x10, [x1] + ldp x11, x12, [x1, #16] + ldr x1, [x1, #32] + + movi v31.4s, #25 // n + + sub w15, w15, #24 // -bitdepth_min_8 + movrel x13, X(sgr_x_by_x) + mov w14, #164 // one_by_x + ld1 {v16.16b, v17.16b, v18.16b}, [x13] + dup v6.8h, w15 // -bitdepth_min_8 + movi v19.16b, #5 + movi v24.8b, #254 // idx of last 1 + saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8 + movi v29.8h, #1, lsl #8 + dup v30.4s, w14 // one_by_x + + sub v16.16b, v16.16b, v19.16b + sub v17.16b, v17.16b, v19.16b + sub v18.16b, v18.16b, v19.16b + + ld1 {v8.4s, v9.4s}, [x5], #32 + ld1 {v10.4s, v11.4s}, [x6], #32 + ld1 {v12.4s, v13.4s}, [x7], #32 + ld1 {v14.4s, v15.4s}, [x8], #32 + ld1 {v20.8h}, [x9], #16 + ld1 {v21.8h}, [x10], #16 + ld1 {v22.8h}, [x11], #16 + ld1 {v23.8h}, [x12], #16 + ld1 {v0.4s, v1.4s}, [x0], #32 + ld1 {v2.8h}, [x1], #16 + +1: + add v8.4s, v8.4s, v10.4s + add v9.4s, v9.4s, v11.4s + add v12.4s, v12.4s, v14.4s + add v13.4s, v13.4s, v15.4s + + add v20.8h, v20.8h, v21.8h + add v22.8h, v22.8h, v23.8h + + add v0.4s, v0.4s, v8.4s + add v1.4s, v1.4s, v9.4s + add v2.8h, v2.8h, v20.8h + + add v0.4s, v0.4s, v12.4s + add v1.4s, v1.4s, v13.4s + add v2.8h, v2.8h, v22.8h + + subs w4, w4, #8 + + movi v20.8b, #55 // idx of last 5 + movi v21.8b, #72 // idx of last 4 + movi v22.8b, #101 // idx of last 3 + movi v23.8b, #169 // idx of last 2 + + srshl v0.4s, v0.4s, v7.4s + srshl v1.4s, v1.4s, v7.4s + srshl v4.8h, v2.8h, v6.8h + mul v0.4s, v0.4s, v31.4s // a * n + mul v1.4s, v1.4s, v31.4s // a * n + umull v3.4s, v4.4h, v4.4h // b * b + umull2 v4.4s, v4.8h, v4.8h // b * b + uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) + uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) + mul v0.4s, v0.4s, v28.4s // p * s + mul v1.4s, v1.4s, v28.4s // p * s + ld1 {v8.4s, v9.4s}, [x5], #32 + uqshrn v0.4h, v0.4s, #16 + uqshrn2 v0.8h, v1.4s, #16 + ld1 {v10.4s, v11.4s}, [x6], #32 + uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) + + ld1 {v12.4s, v13.4s}, [x7], #32 + + cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 + cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 + tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b + cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 + cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 + ld1 {v14.4s, v15.4s}, [x8], #32 + add v25.8b, v25.8b, v26.8b + cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 + add v27.8b, v27.8b, v4.8b + ld1 {v20.8h}, [x9], #16 + add v5.8b, v5.8b, v19.8b + add v25.8b, v25.8b, v27.8b + ld1 {v21.8h}, [x10], #16 + add v5.8b, v1.8b, v5.8b + ld1 {v22.8h}, [x11], #16 + add v5.8b, v5.8b, v25.8b + ld1 {v23.8h}, [x12], #16 + uxtl v5.8h, v5.8b // x + + ld1 {v0.4s, v1.4s}, [x0], #32 + umull v3.4s, v5.4h, v2.4h // x * BB[i] + umull2 v4.4s, v5.8h, v2.8h // x * BB[i] + mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x + srshr v3.4s, v3.4s, #12 // AA[i] + srshr v4.4s, v4.4s, #12 // AA[i] + sub v5.8h, v29.8h, v5.8h // 256 - x + ld1 {v2.8h}, [x1], #16 + + st1 {v3.4s, v4.4s}, [x2], #32 + st1 {v5.8h}, [x3], #16 + b.gt 1b + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 ret endfunc diff --git a/third_party/dav1d/src/arm/64/looprestoration_tmpl.S b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S index 7cdfd6f3f77e..1373f9ace3b8 
100644 --- a/third_party/dav1d/src/arm/64/looprestoration_tmpl.S +++ b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S @@ -30,52 +30,224 @@ #define FILTER_OUT_STRIDE 384 .macro sgr_funcs bpc -// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp, -// const pixel *src, const ptrdiff_t stride, -// const int32_t *a, const int16_t *b, -// const int w, const int h); -function sgr_finish_filter1_\bpc\()bpc_neon, export=1 - sub x7, x3, #(4*SUM_STRIDE) - add x8, x3, #(4*SUM_STRIDE) - sub x9, x4, #(2*SUM_STRIDE) - add x10, x4, #(2*SUM_STRIDE) - mov x11, #SUM_STRIDE - mov x12, #FILTER_OUT_STRIDE - add x13, x5, #7 - bic x13, x13, #7 // Aligned width +// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp, +// const pixel *src, +// const ptrdiff_t src_stride, +// const int32_t **a, +// const int16_t **b, +// const int w, const int h); +function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + ldp x7, x8, [x3] + ldp x9, x3, [x3, #16] + ldp x10, x11, [x4] + ldp x12, x4, [x4, #16] + + mov x13, #FILTER_OUT_STRIDE + cmp w6, #1 + add x2, x1, x2 // src + stride + csel x2, x1, x2, le // if (h <= 1) x2 = x1 + add x13, x0, x13, lsl #1 + + movi v30.8h, #3 + movi v31.4s, #3 +1: + ld1 {v0.8h, v1.8h}, [x10], #32 + ld1 {v2.8h, v3.8h}, [x11], #32 + ld1 {v4.8h, v5.8h}, [x12], #32 + ld1 {v6.8h, v7.8h}, [x4], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48 + ld1 {v22.4s, v23.4s, v24.4s}, [x9], #48 + ld1 {v25.4s, v26.4s, v27.4s}, [x3], #48 + +2: + ext v8.16b, v0.16b, v1.16b, #2 // [0][1] + ext v9.16b, v2.16b, v3.16b, #2 // [1][1] + ext v10.16b, v4.16b, v5.16b, #2 // [2][1] + ext v11.16b, v0.16b, v1.16b, #4 // [0][2] + ext v12.16b, v2.16b, v3.16b, #4 // [1][2] + ext v13.16b, v4.16b, v5.16b, #4 // [2][2] + + add v14.8h, v2.8h, v8.8h // [1][0] + [0][1] + add v15.8h, v9.8h, v10.8h // [1][1] + [2][1] + + add v28.8h, v0.8h, v11.8h // [0][0] + [0][2] + add v14.8h, v14.8h, v12.8h // () + [1][2] + add v29.8h, v4.8h, v13.8h // [2][0] + [2][2] + + ext v8.16b, v6.16b, v7.16b, #2 // [3][1] + ext v11.16b, v6.16b, v7.16b, #4 // [3][2] + + add v14.8h, v14.8h, v15.8h // mid + add v15.8h, v28.8h, v29.8h // corners + + add v28.8h, v4.8h, v9.8h // [2][0] + [1][1] + add v29.8h, v10.8h, v8.8h // [2][1] + [3][1] + + add v2.8h, v2.8h, v12.8h // [1][0] + [1][2] + add v28.8h, v28.8h, v13.8h // () + [2][2] + add v4.8h, v6.8h, v11.8h // [3][0] + [3][2] + + add v0.8h, v28.8h, v29.8h // mid + add v2.8h, v2.8h, v4.8h // corners + + shl v4.8h, v14.8h, #2 + mla v4.8h, v15.8h, v30.8h // * 3 -> a + + shl v0.8h, v0.8h, #2 + mla v0.8h, v2.8h, v30.8h // * 3 -> a + + ext v8.16b, v16.16b, v17.16b, #4 // [0][1] + ext v9.16b, v17.16b, v18.16b, #4 + ext v10.16b, v16.16b, v17.16b, #8 // [0][2] + ext v11.16b, v17.16b, v18.16b, #8 + ext v12.16b, v19.16b, v20.16b, #4 // [1][1] + ext v13.16b, v20.16b, v21.16b, #4 + add v8.4s, v8.4s, v19.4s // [0][1] + [1][0] + add v9.4s, v9.4s, v20.4s + add v16.4s, v16.4s, v10.4s // [0][0] + [0][2] + add v17.4s, v17.4s, v11.4s + ext v14.16b, v19.16b, v20.16b, #8 // [1][2] + ext v15.16b, v20.16b, v21.16b, #8 + add v16.4s, v16.4s, v22.4s // () + [2][0] + add v17.4s, v17.4s, v23.4s + add v28.4s, v12.4s, v14.4s // [1][1] + [1][2] + add v29.4s, v13.4s, v15.4s + ext v10.16b, v22.16b, v23.16b, #4 // [2][1] + ext v11.16b, v23.16b, v24.16b, #4 + add v8.4s, v8.4s, v28.4s // mid (incomplete) + add v9.4s, v9.4s, v29.4s + + add v19.4s, v19.4s, v14.4s // [1][0] + [1][2] + add 
v20.4s, v20.4s, v15.4s + add v14.4s, v22.4s, v12.4s // [2][0] + [1][1] + add v15.4s, v23.4s, v13.4s + + ext v12.16b, v22.16b, v23.16b, #8 // [2][2] + ext v13.16b, v23.16b, v24.16b, #8 + ext v28.16b, v25.16b, v26.16b, #4 // [3][1] + ext v29.16b, v26.16b, v27.16b, #4 + add v8.4s, v8.4s, v10.4s // () + [2][1] = mid + add v9.4s, v9.4s, v11.4s + add v14.4s, v14.4s, v10.4s // () + [2][1] + add v15.4s, v15.4s, v11.4s + ext v10.16b, v25.16b, v26.16b, #8 // [3][2] + ext v11.16b, v26.16b, v27.16b, #8 + add v16.4s, v16.4s, v12.4s // () + [2][2] = corner + add v17.4s, v17.4s, v13.4s + + add v12.4s, v12.4s, v28.4s // [2][2] + [3][1] + add v13.4s, v13.4s, v29.4s + add v25.4s, v25.4s, v10.4s // [3][0] + [3][2] + add v26.4s, v26.4s, v11.4s + + add v14.4s, v14.4s, v12.4s // mid + add v15.4s, v15.4s, v13.4s + add v19.4s, v19.4s, v25.4s // corner + add v20.4s, v20.4s, v26.4s + .if \bpc == 8 - sub x2, x2, x13 + ld1 {v25.8b}, [x1], #8 // src + ld1 {v26.8b}, [x2], #8 .else - sub x2, x2, x13, lsl #1 + ld1 {v25.8h}, [x1], #16 // src + ld1 {v26.8h}, [x2], #16 .endif - sub x12, x12, x13 - sub x11, x11, x13 - sub x11, x11, #4 // We read 4 extra elements from a - sub x14, x11, #4 // We read 8 extra elements from b - mov x13, x5 + + shl v8.4s, v8.4s, #2 + shl v9.4s, v9.4s, #2 + mla v8.4s, v16.4s, v31.4s // * 3 -> b + mla v9.4s, v17.4s, v31.4s + +.if \bpc == 8 + uxtl v25.8h, v25.8b // src + uxtl v26.8h, v26.8b +.endif + + shl v14.4s, v14.4s, #2 + shl v15.4s, v15.4s, #2 + mla v14.4s, v19.4s, v31.4s // * 3 -> b + mla v15.4s, v20.4s, v31.4s + + umlal v8.4s, v4.4h, v25.4h // b + a * src + umlal2 v9.4s, v4.8h, v25.8h + umlal v14.4s, v0.4h, v26.4h // b + a * src + umlal2 v15.4s, v0.8h, v26.8h + mov v0.16b, v1.16b + rshrn v8.4h, v8.4s, #9 + rshrn2 v8.8h, v9.4s, #9 + mov v2.16b, v3.16b + rshrn v14.4h, v14.4s, #9 + rshrn2 v14.8h, v15.4s, #9 + subs w5, w5, #8 + mov v4.16b, v5.16b + st1 {v8.8h}, [x0], #16 + mov v6.16b, v7.16b + st1 {v14.8h}, [x13], #16 + + b.le 3f + mov v16.16b, v18.16b + mov v19.16b, v21.16b + mov v22.16b, v24.16b + mov v25.16b, v27.16b + ld1 {v1.8h}, [x10], #16 + ld1 {v3.8h}, [x11], #16 + ld1 {v5.8h}, [x12], #16 + ld1 {v7.8h}, [x4], #16 + ld1 {v17.4s, v18.4s}, [x7], #32 + ld1 {v20.4s, v21.4s}, [x8], #32 + ld1 {v23.4s, v24.4s}, [x9], #32 + ld1 {v26.4s, v27.4s}, [x3], #32 + b 2b + +3: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret +endfunc + +// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst, +// const int32_t **a, const int16_t **b, +// const int w, const int w1, +// const int bitdepth_max); +function sgr_finish_weighted1_\bpc\()bpc_neon, export=1 + ldp x7, x8, [x1] + ldr x1, [x1, #16] + ldp x9, x10, [x2] + ldr x2, [x2, #16] + + dup v31.8h, w4 + dup v30.8h, w5 + movi v6.8h, #3 movi v7.4s, #3 1: - ld1 {v0.8h, v1.8h}, [x9], #32 - ld1 {v2.8h, v3.8h}, [x4], #32 - ld1 {v4.8h, v5.8h}, [x10], #32 + ld1 {v0.8h, v1.8h}, [x9], #32 + ld1 {v2.8h, v3.8h}, [x10], #32 + ld1 {v4.8h, v5.8h}, [x2], #32 ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 - ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48 - ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48 + ld1 {v22.4s, v23.4s, v24.4s}, [x1], #48 2: - subs x5, x5, #8 ext v25.16b, v0.16b, v1.16b, #2 // -stride ext v26.16b, v2.16b, v3.16b, #2 // 0 ext v27.16b, v4.16b, v5.16b, #2 // +stride ext v28.16b, v0.16b, v1.16b, #4 // +1-stride ext v29.16b, v2.16b, v3.16b, #4 // +1 - ext v30.16b, v4.16b, v5.16b, #4 // +1+stride add v2.8h, v2.8h, v25.8h // -1, -stride + ext v25.16b, v4.16b, v5.16b, #4 // 
+1+stride add v26.8h, v26.8h, v27.8h // 0, +stride add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride add v2.8h, v2.8h, v26.8h - add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride + add v4.8h, v4.8h, v25.8h // -1+stride, +1+stride add v2.8h, v2.8h, v29.8h // +1 add v0.8h, v0.8h, v4.8h @@ -85,7 +257,7 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1 ext v27.16b, v16.16b, v17.16b, #8 // +1-stride ext v28.16b, v17.16b, v18.16b, #8 ext v29.16b, v19.16b, v20.16b, #4 // 0 - ext v30.16b, v20.16b, v21.16b, #4 + ext v4.16b, v20.16b, v21.16b, #4 mla v2.8h, v0.8h, v6.8h // * 3 -> a add v25.4s, v25.4s, v19.4s // -stride, -1 add v26.4s, v26.4s, v20.4s @@ -96,22 +268,22 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1 add v16.4s, v16.4s, v22.4s // -1+stride add v17.4s, v17.4s, v23.4s add v29.4s, v29.4s, v27.4s // 0, +1 - add v30.4s, v30.4s, v28.4s + add v4.4s, v4.4s, v28.4s add v25.4s, v25.4s, v29.4s - add v26.4s, v26.4s, v30.4s + add v26.4s, v26.4s, v4.4s ext v27.16b, v22.16b, v23.16b, #4 // +stride ext v28.16b, v23.16b, v24.16b, #4 ext v29.16b, v22.16b, v23.16b, #8 // +1+stride - ext v30.16b, v23.16b, v24.16b, #8 + ext v4.16b, v23.16b, v24.16b, #8 .if \bpc == 8 - ld1 {v19.8b}, [x1], #8 // src + ld1 {v19.8b}, [x0] // src .else - ld1 {v19.8h}, [x1], #16 // src + ld1 {v19.8h}, [x0] // src .endif add v25.4s, v25.4s, v27.4s // +stride add v26.4s, v26.4s, v28.4s add v16.4s, v16.4s, v29.4s // +1+stride - add v17.4s, v17.4s, v30.4s + add v17.4s, v17.4s, v4.4s shl v25.4s, v25.4s, #2 shl v26.4s, v26.4s, #2 mla v25.4s, v16.4s, v7.4s // * 3 -> b @@ -125,61 +297,68 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1 mov v2.16b, v3.16b rshrn v25.4h, v25.4s, #9 rshrn2 v25.8h, v26.4s, #9 - mov v4.16b, v5.16b - st1 {v25.8h}, [x0], #16 - b.le 3f + subs w3, w3, #8 + + // weighted1 + shl v19.8h, v19.8h, #4 // u + mov v4.16b, v5.16b + + sub v25.8h, v25.8h, v19.8h // t1 - u + ld1 {v1.8h}, [x9], #16 + ushll v26.4s, v19.4h, #7 // u << 7 + ushll2 v27.4s, v19.8h, #7 // u << 7 + ld1 {v3.8h}, [x10], #16 + smlal v26.4s, v25.4h, v31.4h // v + smlal2 v27.4s, v25.8h, v31.8h // v + ld1 {v5.8h}, [x2], #16 +.if \bpc == 8 + rshrn v26.4h, v26.4s, #11 + rshrn2 v26.8h, v27.4s, #11 mov v16.16b, v18.16b + sqxtun v26.8b, v26.8h mov v19.16b, v21.16b mov v22.16b, v24.16b - ld1 {v1.8h}, [x9], #16 - ld1 {v3.8h}, [x4], #16 - ld1 {v5.8h}, [x10], #16 + st1 {v26.8b}, [x0], #8 +.else + sqrshrun v26.4h, v26.4s, #11 + sqrshrun2 v26.8h, v27.4s, #11 + mov v16.16b, v18.16b + umin v26.8h, v26.8h, v30.8h + mov v19.16b, v21.16b + mov v22.16b, v24.16b + st1 {v26.8h}, [x0], #16 +.endif + + b.le 3f ld1 {v17.4s, v18.4s}, [x7], #32 - ld1 {v20.4s, v21.4s}, [x3], #32 - ld1 {v23.4s, v24.4s}, [x8], #32 + ld1 {v20.4s, v21.4s}, [x8], #32 + ld1 {v23.4s, v24.4s}, [x1], #32 b 2b 3: - subs x6, x6, #1 - b.le 0f - mov x5, x13 - add x0, x0, x12, lsl #1 - add x1, x1, x2 - add x3, x3, x11, lsl #2 - add x7, x7, x11, lsl #2 - add x8, x8, x11, lsl #2 - add x4, x4, x14, lsl #1 - add x9, x9, x14, lsl #1 - add x10, x10, x14, lsl #1 - b 1b -0: ret endfunc -// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp, -// const pixel *src, const ptrdiff_t stride, -// const int32_t *a, const int16_t *b, -// const int w, const int h); -function sgr_finish_filter2_\bpc\()bpc_neon, export=1 - add x7, x3, #(4*(SUM_STRIDE)) - sub x3, x3, #(4*(SUM_STRIDE)) - add x8, x4, #(2*(SUM_STRIDE)) - sub x4, x4, #(2*(SUM_STRIDE)) - mov x9, #(2*SUM_STRIDE) +// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp, +// const pixel *src, +// const ptrdiff_t stride, +// const int32_t **a, 
+// const int16_t **b, +// const int w, const int h); +function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + ldp x3, x7, [x3] + ldp x4, x8, [x4] mov x10, #FILTER_OUT_STRIDE - add x11, x5, #7 - bic x11, x11, #7 // Aligned width -.if \bpc == 8 - sub x2, x2, x11 -.else - sub x2, x2, x11, lsl #1 -.endif - sub x10, x10, x11 - sub x9, x9, x11 - sub x9, x9, #4 // We read 4 extra elements from a - sub x12, x9, #4 // We read 8 extra elements from b - mov x11, x5 + cmp w6, #1 + add x2, x1, x2 // src + stride + csel x2, x1, x2, le // if (h <= 1) x2 = x1 + add x10, x0, x10, lsl #1 movi v4.8h, #5 movi v5.4s, #5 movi v6.8h, #6 @@ -191,7 +370,6 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48 2: - subs x5, x5, #8 ext v24.16b, v0.16b, v1.16b, #4 // +1-stride ext v25.16b, v2.16b, v3.16b, #4 // +1+stride ext v22.16b, v0.16b, v1.16b, #2 // -stride @@ -201,6 +379,9 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 add v2.8h, v22.8h, v23.8h // -stride, +stride add v0.8h, v0.8h, v25.8h + mul v8.8h, v25.8h, v4.8h // * 5 + mla v8.8h, v23.8h, v6.8h // * 6 + ext v22.16b, v16.16b, v17.16b, #4 // -stride ext v23.16b, v17.16b, v18.16b, #4 ext v24.16b, v19.16b, v20.16b, #4 // +stride @@ -213,8 +394,10 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 mla v0.8h, v2.8h, v6.8h // * 6 .if \bpc == 8 ld1 {v31.8b}, [x1], #8 + ld1 {v30.8b}, [x2], #8 .else ld1 {v31.8h}, [x1], #16 + ld1 {v30.8h}, [x2], #16 .endif add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride add v17.4s, v17.4s, v27.4s @@ -223,6 +406,11 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 add v16.4s, v16.4s, v19.4s add v17.4s, v17.4s, v20.4s + mul v9.4s, v19.4s, v5.4s // * 5 + mla v9.4s, v24.4s, v7.4s // * 6 + mul v10.4s, v20.4s, v5.4s // * 5 + mla v10.4s, v25.4s, v7.4s // * 6 + add v22.4s, v22.4s, v24.4s // -stride, +stride add v23.4s, v23.4s, v25.4s // This is, surprisingly, faster than other variants where the @@ -234,16 +422,23 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 .if \bpc == 8 uxtl v31.8h, v31.8b + uxtl v30.8h, v30.8b .endif umlal v16.4s, v0.4h, v31.4h // b + a * src umlal2 v17.4s, v0.8h, v31.8h + umlal v9.4s, v8.4h, v30.4h // b + a * src + umlal2 v10.4s, v8.8h, v30.8h mov v0.16b, v1.16b rshrn v16.4h, v16.4s, #9 rshrn2 v16.8h, v17.4s, #9 + rshrn v9.4h, v9.4s, #8 + rshrn2 v9.8h, v10.4s, #8 + subs w5, w5, #8 mov v2.16b, v3.16b - st1 {v16.8h}, [x0], #16 + st1 {v16.8h}, [x0], #16 + st1 {v9.8h}, [x10], #16 - b.le 3f + b.le 9f mov v16.16b, v18.16b mov v19.16b, v21.16b ld1 {v1.8h}, [x4], #16 @@ -252,201 +447,160 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 ld1 {v20.4s, v21.4s}, [x7], #32 b 2b -3: - subs x6, x6, #1 - b.le 0f - mov x5, x11 - add x0, x0, x10, lsl #1 - add x1, x1, x2 - add x3, x3, x9, lsl #2 - add x7, x7, x9, lsl #2 - add x4, x4, x12, lsl #1 - add x8, x8, x12, lsl #1 - mov x13, x3 - mov x14, x4 - - ld1 {v0.8h, v1.8h}, [x4], #32 - ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 - -4: - subs x5, x5, #8 - ext v23.16b, v0.16b, v1.16b, #4 // +1 - ext v22.16b, v0.16b, v1.16b, #2 // 0 - add v0.8h, v0.8h, v23.8h // -1, +1 - - ext v24.16b, v16.16b, v17.16b, #4 // 0 - ext v25.16b, v17.16b, v18.16b, #4 - ext v26.16b, v16.16b, v17.16b, #8 // +1 - ext v27.16b, v17.16b, v18.16b, #8 - mul v2.8h, v22.8h, v6.8h // * 6 - mla v2.8h, v0.8h, v4.8h // * 5 -> a -.if \bpc == 8 - ld1 {v31.8b}, [x1], #8 -.else - ld1 {v31.8h}, [x1], #16 -.endif - add v16.4s, v16.4s, 
v26.4s // -1, +1 - add v17.4s, v17.4s, v27.4s -.if \bpc == 8 - uxtl v31.8h, v31.8b -.endif - // This is, surprisingly, faster than other variants where the - // mul+mla pairs are further apart, on Cortex A53. - mul v24.4s, v24.4s, v7.4s // * 6 - mla v24.4s, v16.4s, v5.4s // * 5 -> b - mul v25.4s, v25.4s, v7.4s // * 6 - mla v25.4s, v17.4s, v5.4s // * 5 -> b - - umlal v24.4s, v2.4h, v31.4h // b + a * src - umlal2 v25.4s, v2.8h, v31.8h - mov v0.16b, v1.16b - rshrn v24.4h, v24.4s, #8 - rshrn2 v24.8h, v25.4s, #8 - mov v16.16b, v18.16b - st1 {v24.8h}, [x0], #16 - - b.le 5f - ld1 {v1.8h}, [x4], #16 - ld1 {v17.4s, v18.4s}, [x3], #32 - b 4b - -5: - subs x6, x6, #1 - b.le 0f - mov x5, x11 - add x0, x0, x10, lsl #1 - add x1, x1, x2 - mov x3, x13 // Rewind x3/x4 to where they started - mov x4, x14 - b 1b -0: +9: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 ret endfunc -// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride, -// const pixel *src, const ptrdiff_t src_stride, -// const int16_t *t1, const int w, const int h, -// const int wt, const int bitdepth_max); -function sgr_weighted1_\bpc\()bpc_neon, export=1 -.if \bpc == 16 - ldr w8, [sp] -.endif - dup v31.8h, w7 - cmp x6, #2 -.if \bpc == 16 - dup v30.8h, w8 -.endif - add x9, x0, x1 - add x10, x2, x3 - add x11, x4, #2*FILTER_OUT_STRIDE - mov x7, #(4*FILTER_OUT_STRIDE) - lsl x1, x1, #1 - lsl x3, x3, #1 - add x8, x5, #7 - bic x8, x8, #7 // Aligned width -.if \bpc == 8 - sub x1, x1, x8 - sub x3, x3, x8 -.else - sub x1, x1, x8, lsl #1 - sub x3, x3, x8, lsl #1 -.endif - sub x7, x7, x8, lsl #1 - mov x8, x5 - b.lt 2f -1: -.if \bpc == 8 - ld1 {v0.8b}, [x2], #8 - ld1 {v4.8b}, [x10], #8 -.else - ld1 {v0.8h}, [x2], #16 - ld1 {v4.8h}, [x10], #16 -.endif - ld1 {v1.8h}, [x4], #16 - ld1 {v5.8h}, [x11], #16 - subs x5, x5, #8 -.if \bpc == 8 - ushll v0.8h, v0.8b, #4 // u - ushll v4.8h, v4.8b, #4 // u -.else - shl v0.8h, v0.8h, #4 // u - shl v4.8h, v4.8h, #4 // u -.endif - sub v1.8h, v1.8h, v0.8h // t1 - u - sub v5.8h, v5.8h, v4.8h // t1 - u - ushll v2.4s, v0.4h, #7 // u << 7 - ushll2 v3.4s, v0.8h, #7 // u << 7 - ushll v6.4s, v4.4h, #7 // u << 7 - ushll2 v7.4s, v4.8h, #7 // u << 7 - smlal v2.4s, v1.4h, v31.4h // v - smlal2 v3.4s, v1.8h, v31.8h // v - smlal v6.4s, v5.4h, v31.4h // v - smlal2 v7.4s, v5.8h, v31.8h // v -.if \bpc == 8 - rshrn v2.4h, v2.4s, #11 - rshrn2 v2.8h, v3.4s, #11 - rshrn v6.4h, v6.4s, #11 - rshrn2 v6.8h, v7.4s, #11 - sqxtun v2.8b, v2.8h - sqxtun v6.8b, v6.8h - st1 {v2.8b}, [x0], #8 - st1 {v6.8b}, [x9], #8 -.else - sqrshrun v2.4h, v2.4s, #11 - sqrshrun2 v2.8h, v3.4s, #11 - sqrshrun v6.4h, v6.4s, #11 - sqrshrun2 v6.8h, v7.4s, #11 - umin v2.8h, v2.8h, v30.8h - umin v6.8h, v6.8h, v30.8h - st1 {v2.8h}, [x0], #16 - st1 {v6.8h}, [x9], #16 -.endif - b.gt 1b +// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride, +// const int32_t **a, +// const int16_t **b, +// const int w, const int h, +// const int w1, +// const int bitdepth_max); +function sgr_finish_weighted2_\bpc\()bpc_neon, export=1 + stp d8, d9, [sp, #-0x30]! 
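+        // Back up the callee-saved SIMD registers clobbered below (d8-d10 and
+        // d14-d15); they are restored in the epilogue before ret.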
+ str d10, [sp, #0x10] + stp d14, d15, [sp, #0x20] - sub x6, x6, #2 - cmp x6, #1 - b.lt 0f - mov x5, x8 - add x0, x0, x1 - add x9, x9, x1 - add x2, x2, x3 - add x10, x10, x3 - add x4, x4, x7 - add x11, x11, x7 - b.eq 2f - b 1b + dup v14.8h, w6 + dup v15.8h, w7 + + ldp x2, x7, [x2] + ldp x3, x8, [x3] + cmp w5, #1 + add x1, x0, x1 // src + stride + // if (h <= 1), set the pointer to the second row to any dummy buffer + // we can clobber (x2 in this case) + csel x1, x2, x1, le + movi v4.8h, #5 + movi v5.4s, #5 + movi v6.8h, #6 + movi v7.4s, #6 +1: + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v2.8h, v3.8h}, [x8], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x2], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48 2: + ext v24.16b, v0.16b, v1.16b, #4 // +1-stride + ext v25.16b, v2.16b, v3.16b, #4 // +1+stride + ext v22.16b, v0.16b, v1.16b, #2 // -stride + ext v23.16b, v2.16b, v3.16b, #2 // +stride + add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride + add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride + add v2.8h, v22.8h, v23.8h // -stride, +stride + add v0.8h, v0.8h, v25.8h + + mul v8.8h, v25.8h, v4.8h // * 5 + mla v8.8h, v23.8h, v6.8h // * 6 + + ext v22.16b, v16.16b, v17.16b, #4 // -stride + ext v23.16b, v17.16b, v18.16b, #4 + ext v24.16b, v19.16b, v20.16b, #4 // +stride + ext v25.16b, v20.16b, v21.16b, #4 + ext v26.16b, v16.16b, v17.16b, #8 // +1-stride + ext v27.16b, v17.16b, v18.16b, #8 + ext v28.16b, v19.16b, v20.16b, #8 // +1+stride + ext v29.16b, v20.16b, v21.16b, #8 + mul v0.8h, v0.8h, v4.8h // * 5 + mla v0.8h, v2.8h, v6.8h // * 6 .if \bpc == 8 - ld1 {v0.8b}, [x2], #8 + ld1 {v31.8b}, [x0] + ld1 {v30.8b}, [x1] .else - ld1 {v0.8h}, [x2], #16 + ld1 {v31.8h}, [x0] + ld1 {v30.8h}, [x1] .endif - ld1 {v1.8h}, [x4], #16 - subs x5, x5, #8 + add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride + add v17.4s, v17.4s, v27.4s + add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride + add v20.4s, v20.4s, v29.4s + add v16.4s, v16.4s, v19.4s + add v17.4s, v17.4s, v20.4s + + mul v9.4s, v19.4s, v5.4s // * 5 + mla v9.4s, v24.4s, v7.4s // * 6 + mul v10.4s, v20.4s, v5.4s // * 5 + mla v10.4s, v25.4s, v7.4s // * 6 + + add v22.4s, v22.4s, v24.4s // -stride, +stride + add v23.4s, v23.4s, v25.4s + // This is, surprisingly, faster than other variants where the + // mul+mla pairs are further apart, on Cortex A53. 
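+        // -> b for the row written to dst: 5 * (the four diagonal taps summed
+        // into v16/v17) + 6 * (the above/below taps summed into v22/v23).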
+ mul v16.4s, v16.4s, v5.4s // * 5 + mla v16.4s, v22.4s, v7.4s // * 6 + mul v17.4s, v17.4s, v5.4s // * 5 + mla v17.4s, v23.4s, v7.4s // * 6 + .if \bpc == 8 - ushll v0.8h, v0.8b, #4 // u -.else - shl v0.8h, v0.8h, #4 // u + uxtl v31.8h, v31.8b + uxtl v30.8h, v30.8b .endif - sub v1.8h, v1.8h, v0.8h // t1 - u - ushll v2.4s, v0.4h, #7 // u << 7 - ushll2 v3.4s, v0.8h, #7 // u << 7 - smlal v2.4s, v1.4h, v31.4h // v - smlal2 v3.4s, v1.8h, v31.8h // v + umlal v16.4s, v0.4h, v31.4h // b + a * src + umlal2 v17.4s, v0.8h, v31.8h + umlal v9.4s, v8.4h, v30.4h // b + a * src + umlal2 v10.4s, v8.8h, v30.8h + mov v0.16b, v1.16b + rshrn v16.4h, v16.4s, #9 + rshrn2 v16.8h, v17.4s, #9 + rshrn v9.4h, v9.4s, #8 + rshrn2 v9.8h, v10.4s, #8 + + subs w4, w4, #8 + + // weighted1 + shl v31.8h, v31.8h, #4 // u + shl v30.8h, v30.8h, #4 + mov v2.16b, v3.16b + + sub v16.8h, v16.8h, v31.8h // t1 - u + sub v9.8h, v9.8h, v30.8h + ld1 {v1.8h}, [x3], #16 + ushll v22.4s, v31.4h, #7 // u << 7 + ushll2 v23.4s, v31.8h, #7 + ushll v24.4s, v30.4h, #7 + ushll2 v25.4s, v30.8h, #7 + ld1 {v3.8h}, [x8], #16 + smlal v22.4s, v16.4h, v14.4h // v + smlal2 v23.4s, v16.8h, v14.8h + mov v16.16b, v18.16b + smlal v24.4s, v9.4h, v14.4h + smlal2 v25.4s, v9.8h, v14.8h + mov v19.16b, v21.16b .if \bpc == 8 - rshrn v2.4h, v2.4s, #11 - rshrn2 v2.8h, v3.4s, #11 - sqxtun v2.8b, v2.8h - st1 {v2.8b}, [x0], #8 + rshrn v22.4h, v22.4s, #11 + rshrn2 v22.8h, v23.4s, #11 + rshrn v23.4h, v24.4s, #11 + rshrn2 v23.8h, v25.4s, #11 + sqxtun v22.8b, v22.8h + sqxtun v23.8b, v23.8h + st1 {v22.8b}, [x0], #8 + st1 {v23.8b}, [x1], #8 .else - sqrshrun v2.4h, v2.4s, #11 - sqrshrun2 v2.8h, v3.4s, #11 - umin v2.8h, v2.8h, v30.8h - st1 {v2.8h}, [x0], #16 + sqrshrun v22.4h, v22.4s, #11 + sqrshrun2 v22.8h, v23.4s, #11 + sqrshrun v23.4h, v24.4s, #11 + sqrshrun2 v23.8h, v25.4s, #11 + umin v22.8h, v22.8h, v15.8h + umin v23.8h, v23.8h, v15.8h + st1 {v22.8h}, [x0], #16 + st1 {v23.8h}, [x1], #16 .endif - b.gt 2b -0: + + b.le 3f + ld1 {v17.4s, v18.4s}, [x2], #32 + ld1 {v20.4s, v21.4s}, [x7], #32 + b 2b + +3: + ldp d14, d15, [sp, #0x20] + ldr d10, [sp, #0x10] + ldp d8, d9, [sp], 0x30 ret endfunc @@ -461,7 +615,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 .else ldp x8, x9, [sp] .endif - cmp x7, #2 + cmp w7, #2 add x10, x0, x1 add x11, x2, x3 add x12, x4, #2*FILTER_OUT_STRIDE @@ -483,7 +637,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 sub x3, x3, x9, lsl #1 .endif sub x8, x8, x9, lsl #1 - mov x9, x6 + mov w9, w6 b.lt 2f 1: .if \bpc == 8 @@ -497,7 +651,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 ld1 {v17.8h}, [x12], #16 ld1 {v2.8h}, [x5], #16 ld1 {v18.8h}, [x13], #16 - subs x6, x6, #8 + subs w6, w6, #8 .if \bpc == 8 ushll v0.8h, v0.8b, #4 // u ushll v16.8h, v16.8b, #4 // u @@ -542,10 +696,10 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 .endif b.gt 1b - subs x7, x7, #2 - cmp x7, #1 + subs w7, w7, #2 + cmp w7, #1 b.lt 0f - mov x6, x9 + mov w6, w9 add x0, x0, x1 add x10, x10, x1 add x2, x2, x3 @@ -565,7 +719,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 .endif ld1 {v1.8h}, [x4], #16 ld1 {v2.8h}, [x5], #16 - subs x6, x6, #8 + subs w6, w6, #8 .if \bpc == 8 ushll v0.8h, v0.8b, #4 // u .else diff --git a/third_party/dav1d/src/arm/64/refmvs.S b/third_party/dav1d/src/arm/64/refmvs.S index becd4c08f6a0..b567e194404a 100644 --- a/third_party/dav1d/src/arm/64/refmvs.S +++ b/third_party/dav1d/src/arm/64/refmvs.S @@ -89,3 +89,204 @@ L(splat_tbl): .hword L(splat_tbl) - 20b .hword L(splat_tbl) - 10b endfunc + +const mv_tbls, align=4 + .byte 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 + .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0 + .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 + .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 +endconst + +const mask_mult, align=4 + .byte 1, 2, 1, 2, 0, 0, 0, 0 +endconst + +// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride, +// refmvs_block **rr, const uint8_t *ref_sign, +// int col_end8, int row_end8, +// int col_start8, int row_start8) +function save_tmvs_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-16]! + mov x29, sp + + movi v30.8b, #0 + ld1 {v31.8b}, [x3] + adr x8, L(save_tmvs_tbl) + movrel x16, mask_mult + movrel x13, mv_tbls + ld1 {v29.8b}, [x16] + ext v31.8b, v30.8b, v31.8b, #7 // [0, ref_sign] + mov w15, #5 + mov w14, #12*2 + sxtw x4, w4 + sxtw x6, w6 + mul w1, w1, w15 // stride *= 5 + sub w5, w5, w7 // h = row_end8 - row_start8 + lsl w7, w7, #1 // row_start8 <<= 1 +1: + mov w15, #5 + and w9, w7, #30 // (y & 15) * 2 + ldr x9, [x2, w9, uxtw #3] // b = rr[(y & 15) * 2] + add x9, x9, #12 // &b[... + 1] + madd x10, x4, x14, x9 // end_cand_b = &b[col_end8*2 + 1] + madd x9, x6, x14, x9 // cand_b = &b[x*2 + 1] + + madd x3, x6, x15, x0 // &rp[x] + +2: + ldrb w11, [x9, #10] // cand_b->bs + ld1 {v0.16b}, [x9] // cand_b->mv + add x11, x8, w11, uxtw #2 + ldr h1, [x9, #8] // cand_b->ref + ldrh w12, [x11] // bw8 + mov x15, x8 + add x9, x9, w12, uxtw #1 // cand_b += bw8*2 + cmp x9, x10 + mov v2.8b, v0.8b + b.ge 3f + + ldrb w15, [x9, #10] // cand_b->bs + add x16, x9, #8 + ld1 {v4.16b}, [x9] // cand_b->mv + add x15, x8, w15, uxtw #2 + ld1 {v1.h}[1], [x16] // cand_b->ref + ldrh w12, [x15] // bw8 + add x9, x9, w12, uxtw #1 // cand_b += bw8*2 + trn1 v2.2d, v0.2d, v4.2d + +3: + abs v2.8h, v2.8h // abs(mv[].xy) + tbl v1.8b, {v31.16b}, v1.8b // ref_sign[ref] + ushr v2.8h, v2.8h, #12 // abs(mv[].xy) >> 12 + umull v1.8h, v1.8b, v29.8b // ref_sign[ref] * {1, 2} + cmeq v2.4s, v2.4s, #0 // abs(mv[].xy) <= 4096 + xtn v2.4h, v2.4s // abs() condition to 16 bit + and v1.8b, v1.8b, v2.8b // h[0-3] contains conditions for mv[0-1] + addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0] + umov w16, v1.h[0] // Extract case for first block + umov w17, v1.h[1] + ldrh w11, [x11, #2] // Fetch jump table entry + ldrh w15, [x15, #2] + ldr q1, [x13, w16, uxtw #4] // Load permutation table base on case + ldr q5, [x13, w17, uxtw #4] + sub x11, x8, w11, uxtw // Find jump table target + sub x15, x8, w15, uxtw + tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block + tbl v4.16b, {v4.16b}, v5.16b + + // v1 follows on v0, with another 3 full repetitions of the pattern. + ext v1.16b, v0.16b, v0.16b, #1 + ext v5.16b, v4.16b, v4.16b, #1 + // v2 ends with 3 complete repetitions of the pattern. 
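+        // (Each refmvs_temporal_block is 5 bytes: 4 bytes of mv plus 1 byte of
+        // ref, hence the multiples of 5 in the store sizes below.)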
+ ext v2.16b, v0.16b, v1.16b, #4 + ext v6.16b, v4.16b, v5.16b, #4 + + blr x11 + b.ge 4f // if (cand_b >= end) + mov v0.16b, v4.16b + mov v1.16b, v5.16b + mov v2.16b, v6.16b + cmp x9, x10 + blr x15 + b.lt 2b // if (cand_b < end) + +4: + subs w5, w5, #1 // h-- + add w7, w7, #2 // y += 2 + add x0, x0, x1 // rp += stride + b.gt 1b + + ldp x29, x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +10: + AARCH64_VALID_JUMP_TARGET + add x16, x3, #4 + st1 {v0.s}[0], [x3] + st1 {v0.b}[4], [x16] + add x3, x3, #5 + ret +20: + AARCH64_VALID_JUMP_TARGET + add x16, x3, #8 + st1 {v0.d}[0], [x3] + st1 {v0.h}[4], [x16] + add x3, x3, #2*5 + ret +40: + AARCH64_VALID_JUMP_TARGET + st1 {v0.16b}, [x3] + str s1, [x3, #16] + add x3, x3, #4*5 + ret +80: + AARCH64_VALID_JUMP_TARGET + // This writes 6 full entries plus 2 extra bytes + st1 {v0.16b, v1.16b}, [x3] + // Write the last few, overlapping with the first write. + stur q2, [x3, #(8*5-16)] + add x3, x3, #8*5 + ret +160: + AARCH64_VALID_JUMP_TARGET + add x16, x3, #6*5 + add x17, x3, #12*5 + // This writes 6 full entries plus 2 extra bytes + st1 {v0.16b, v1.16b}, [x3] + // Write another 6 full entries, slightly overlapping with the first set + st1 {v0.16b, v1.16b}, [x16] + // Write 8 bytes (one full entry) after the first 12 + st1 {v0.8b}, [x17] + // Write the last 3 entries + str q2, [x3, #(16*5-16)] + add x3, x3, #16*5 + ret + +L(save_tmvs_tbl): + .hword 16 * 12 + .hword L(save_tmvs_tbl) - 160b + .hword 16 * 12 + .hword L(save_tmvs_tbl) - 160b + .hword 8 * 12 + .hword L(save_tmvs_tbl) - 80b + .hword 8 * 12 + .hword L(save_tmvs_tbl) - 80b + .hword 8 * 12 + .hword L(save_tmvs_tbl) - 80b + .hword 8 * 12 + .hword L(save_tmvs_tbl) - 80b + .hword 4 * 12 + .hword L(save_tmvs_tbl) - 40b + .hword 4 * 12 + .hword L(save_tmvs_tbl) - 40b + .hword 4 * 12 + .hword L(save_tmvs_tbl) - 40b + .hword 4 * 12 + .hword L(save_tmvs_tbl) - 40b + .hword 2 * 12 + .hword L(save_tmvs_tbl) - 20b + .hword 2 * 12 + .hword L(save_tmvs_tbl) - 20b + .hword 2 * 12 + .hword L(save_tmvs_tbl) - 20b + .hword 2 * 12 + .hword L(save_tmvs_tbl) - 20b + .hword 2 * 12 + .hword L(save_tmvs_tbl) - 20b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b +endfunc diff --git a/third_party/dav1d/src/arm/looprestoration.h b/third_party/dav1d/src/arm/looprestoration.h index 7993dbff6839..1ac6d5fb5e0a 100644 --- a/third_party/dav1d/src/arm/looprestoration.h +++ b/third_party/dav1d/src/arm/looprestoration.h @@ -105,6 +105,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride, } #endif +#if ARCH_ARM void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum, const pixel (*left)[4], const pixel *src, const ptrdiff_t stride, @@ -246,6 +247,853 @@ static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride, tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX); } +#else +static void rotate(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) { + int32_t *tmp32 = sumsq_ptrs[0]; + int16_t *tmp16 = sum_ptrs[0]; + for (int i = 0; i < n - 1; i++) { + sumsq_ptrs[i] = sumsq_ptrs[i+1]; + sum_ptrs[i] = sum_ptrs[i+1]; + } + sumsq_ptrs[n - 1] = tmp32; + sum_ptrs[n - 1] = tmp16; +} +static void rotate5_x2(int32_t **sumsq_ptrs, int16_t **sum_ptrs) { + int32_t *tmp32[2]; + int16_t *tmp16[2]; + for (int i = 0; i < 2; i++) { + 
tmp32[i] = sumsq_ptrs[i]; + tmp16[i] = sum_ptrs[i]; + } + for (int i = 0; i < 3; i++) { + sumsq_ptrs[i] = sumsq_ptrs[i+2]; + sum_ptrs[i] = sum_ptrs[i+2]; + } + for (int i = 0; i < 2; i++) { + sumsq_ptrs[3 + i] = tmp32[i]; + sum_ptrs[3 + i] = tmp16[i]; + } +} + +static void rotate_ab_3(int32_t **A_ptrs, int16_t **B_ptrs) { + rotate(A_ptrs, B_ptrs, 3); +} + +static void rotate_ab_2(int32_t **A_ptrs, int16_t **B_ptrs) { + rotate(A_ptrs, B_ptrs, 2); +} + +static void rotate_ab_4(int32_t **A_ptrs, int16_t **B_ptrs) { + rotate(A_ptrs, B_ptrs, 4); +} + +void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum, + const pixel (*left)[4], + const pixel *src, const int w, + const enum LrEdgeFlags edges); +void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum, + const pixel (*left)[4], + const pixel *src, const int w, + const enum LrEdgeFlags edges); +void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3, + int32_t *sumsq5, int16_t *sum5, + const pixel (*left)[4], + const pixel *src, const int w, + const enum LrEdgeFlags edges); + +void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum, + int32_t *AA, int16_t *BB, + const int w, const int s, + const int bitdepth_max); +void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum, + int32_t *AA, int16_t *BB, + const int w, const int s, + const int bitdepth_max); + +void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst, + int32_t **A_ptrs, int16_t **B_ptrs, + const int w, const int w1 + HIGHBD_DECL_SUFFIX); +void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride, + int32_t **A_ptrs, int16_t **B_ptrs, + const int w, const int h, + const int w1 HIGHBD_DECL_SUFFIX); + +void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src, + const ptrdiff_t src_stride, + int32_t **A_ptrs, + int16_t **B_ptrs, + const int w, const int h); +void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src, + const ptrdiff_t src_stride, + int32_t **A_ptrs, int16_t **B_ptrs, + const int w, const int h); +void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *t1, const int16_t *t2, + const int w, const int h, + const int16_t wt[2] HIGHBD_DECL_SUFFIX); + +static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum, + int32_t *sumsq_out, int16_t *sum_out, + const int w, int s, int bitdepth_max) { + // box3_v + calc_ab1 + dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max); + rotate(sumsq, sum, 3); +} + +static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum, + int32_t *sumsq_out, int16_t *sum_out, + const int w, int s, int bitdepth_max) { + // box5_v + calc_ab2 + dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max); + rotate5_x2(sumsq, sum); +} + +static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum, + int32_t *AA, int16_t *BB, + const pixel (*left)[4], + const pixel *src, const int w, + const int s, + const enum LrEdgeFlags edges, + const int bitdepth_max) { + BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges); + sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max); +} + + +static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride, + int32_t **A_ptrs, int16_t **B_ptrs, const int w, + const int w1 HIGHBD_DECL_SUFFIX) { + BF(dav1d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs, + w, w1 HIGHBD_TAIL_SUFFIX); + *dst += PXSTRIDE(stride); + rotate_ab_3(A_ptrs, B_ptrs); +} + +static void sgr_finish2_neon(pixel 
**dst, const ptrdiff_t stride, + int32_t **A_ptrs, int16_t **B_ptrs, + const int w, const int h, const int w1 + HIGHBD_DECL_SUFFIX) { + BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs, + w, h, w1 HIGHBD_TAIL_SUFFIX); + *dst += 2*PXSTRIDE(stride); + rotate_ab_2(A_ptrs, B_ptrs); +} + +static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride, + int32_t **A5_ptrs, int16_t **B5_ptrs, + int32_t **A3_ptrs, int16_t **B3_ptrs, + const int w, const int h, + const int w0, const int w1 HIGHBD_DECL_SUFFIX) { +#define FILTER_OUT_STRIDE 384 + ALIGN_STK_16(int16_t, tmp5, 2*FILTER_OUT_STRIDE,); + ALIGN_STK_16(int16_t, tmp3, 2*FILTER_OUT_STRIDE,); + + BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride, + A5_ptrs, B5_ptrs, w, h); + BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride, + A3_ptrs, B3_ptrs, w, h); + const int16_t wt[2] = { w0, w1 }; + BF(dav1d_sgr_weighted2, neon)(*dst, stride, *dst, stride, + tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX); + *dst += h*PXSTRIDE(stride); + rotate_ab_2(A5_ptrs, B5_ptrs); + rotate_ab_4(A3_ptrs, B3_ptrs); +} + + +static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ +#define BUF_STRIDE (384 + 16) + ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,); + ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 3 + 16,); + int32_t *sumsq_ptrs[3], *sumsq_rows[3]; + int16_t *sum_ptrs[3], *sum_rows[3]; + for (int i = 0; i < 3; i++) { + sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE]; + sum_rows[i] = &sum_buf[i * BUF_STRIDE]; + } + + ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,); + ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 3 + 16,); + int32_t *A_ptrs[3]; + int16_t *B_ptrs[3]; + for (int i = 0; i < 3; i++) { + A_ptrs[i] = &A_buf[i * BUF_STRIDE]; + B_ptrs[i] = &B_buf[i * BUF_STRIDE]; + } + const pixel *src = dst; + const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); + + if (edges & LR_HAVE_TOP) { + sumsq_ptrs[0] = sumsq_rows[0]; + sumsq_ptrs[1] = sumsq_rows[1]; + sumsq_ptrs[2] = sumsq_rows[2]; + sum_ptrs[0] = sum_rows[0]; + sum_ptrs[1] = sum_rows[1]; + sum_ptrs[2] = sum_rows[2]; + + BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0], + NULL, lpf, w, edges); + lpf += PXSTRIDE(stride); + BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1], + NULL, lpf, w, edges); + + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); + left++; + src += PXSTRIDE(stride); + rotate_ab_3(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_1; + + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); + left++; + src += PXSTRIDE(stride); + rotate_ab_3(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_2; + } else { + sumsq_ptrs[0] = sumsq_rows[0]; + sumsq_ptrs[1] = sumsq_rows[0]; + sumsq_ptrs[2] = sumsq_rows[0]; + sum_ptrs[0] = sum_rows[0]; + sum_ptrs[1] = sum_rows[0]; + sum_ptrs[2] = sum_rows[0]; + + BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_3(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_1; + + sumsq_ptrs[2] = sumsq_rows[1]; + sum_ptrs[2] = sum_rows[1]; + + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); + 
left++; + src += PXSTRIDE(stride); + rotate_ab_3(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_2; + + sumsq_ptrs[2] = sumsq_rows[2]; + sum_ptrs[2] = sum_rows[2]; + } + + do { + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); + left++; + src += PXSTRIDE(stride); + + sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, + w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); + } while (--h > 0); + + if (!(edges & LR_HAVE_BOTTOM)) + goto vert_2; + + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); + lpf_bottom += PXSTRIDE(stride); + + sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, + w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); + + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); + + sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, + w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); + return; + +vert_2: + sumsq_ptrs[2] = sumsq_ptrs[1]; + sum_ptrs[2] = sum_ptrs[1]; + sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + w, params->sgr.s1, BITDEPTH_MAX); + + sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, + w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); + +output_1: + sumsq_ptrs[2] = sumsq_ptrs[1]; + sum_ptrs[2] = sum_ptrs[1]; + sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + w, params->sgr.s1, BITDEPTH_MAX); + + sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, + w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); + return; + +vert_1: + sumsq_ptrs[2] = sumsq_ptrs[1]; + sum_ptrs[2] = sum_ptrs[1]; + sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_3(A_ptrs, B_ptrs); + goto output_1; +} + +static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,); + ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 5 + 16,); + int32_t *sumsq_ptrs[5], *sumsq_rows[5]; + int16_t *sum_ptrs[5], *sum_rows[5]; + for (int i = 0; i < 5; i++) { + sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE]; + sum_rows[i] = &sum_buf[i * BUF_STRIDE]; + } + + ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,); + ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 2 + 16,); + int32_t *A_ptrs[2]; + int16_t *B_ptrs[2]; + for (int i = 0; i < 2; i++) { + A_ptrs[i] = &A_buf[i * BUF_STRIDE]; + B_ptrs[i] = &B_buf[i * BUF_STRIDE]; + } + const pixel *src = dst; + const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); + + if (edges & LR_HAVE_TOP) { + sumsq_ptrs[0] = sumsq_rows[0]; + sumsq_ptrs[1] = sumsq_rows[0]; + sumsq_ptrs[2] = sumsq_rows[1]; + sumsq_ptrs[3] = sumsq_rows[2]; + sumsq_ptrs[4] = sumsq_rows[3]; + sum_ptrs[0] = sum_rows[0]; + sum_ptrs[1] = sum_rows[0]; + sum_ptrs[2] = sum_rows[1]; + sum_ptrs[3] = sum_rows[2]; + sum_ptrs[4] = sum_rows[3]; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0], + NULL, lpf, w, edges); + lpf += PXSTRIDE(stride); + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1], + NULL, lpf, w, edges); + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + if (--h <= 0) + goto vert_1; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, 
BITDEPTH_MAX); + rotate_ab_2(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_2; + + // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set + // one of them to point at the previously unused rows[4]. + sumsq_ptrs[3] = sumsq_rows[4]; + sum_ptrs[3] = sum_rows[4]; + } else { + sumsq_ptrs[0] = sumsq_rows[0]; + sumsq_ptrs[1] = sumsq_rows[0]; + sumsq_ptrs[2] = sumsq_rows[0]; + sumsq_ptrs[3] = sumsq_rows[0]; + sumsq_ptrs[4] = sumsq_rows[0]; + sum_ptrs[0] = sum_rows[0]; + sum_ptrs[1] = sum_rows[0]; + sum_ptrs[2] = sum_rows[0]; + sum_ptrs[3] = sum_rows[0]; + sum_ptrs[4] = sum_rows[0]; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + if (--h <= 0) + goto vert_1; + + sumsq_ptrs[4] = sumsq_rows[1]; + sum_ptrs[4] = sum_rows[1]; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_2; + + sumsq_ptrs[3] = sumsq_rows[2]; + sumsq_ptrs[4] = sumsq_rows[3]; + sum_ptrs[3] = sum_rows[2]; + sum_ptrs[4] = sum_rows[3]; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + if (--h <= 0) + goto odd; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, + w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); + + if (--h <= 0) + goto vert_2; + + // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set + // one of them to point at the previously unused rows[4]. 
+ sumsq_ptrs[3] = sumsq_rows[4]; + sum_ptrs[3] = sum_rows[4]; + } + + do { + BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + if (--h <= 0) + goto odd; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, + w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); + } while (--h > 0); + + if (!(edges & LR_HAVE_BOTTOM)) + goto vert_2; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3], + NULL, lpf_bottom, w, edges); + lpf_bottom += PXSTRIDE(stride); + BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4], + NULL, lpf_bottom, w, edges); + +output_2: + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, + w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); + return; + +vert_2: + // Duplicate the last row twice more + sumsq_ptrs[3] = sumsq_ptrs[2]; + sumsq_ptrs[4] = sumsq_ptrs[2]; + sum_ptrs[3] = sum_ptrs[2]; + sum_ptrs[4] = sum_ptrs[2]; + goto output_2; + +odd: + // Copy the last row as padding once + sumsq_ptrs[4] = sumsq_ptrs[3]; + sum_ptrs[4] = sum_ptrs[3]; + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, + w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); + +output_1: + // Duplicate the last row twice more + sumsq_ptrs[3] = sumsq_ptrs[2]; + sumsq_ptrs[4] = sumsq_ptrs[2]; + sum_ptrs[3] = sum_ptrs[2]; + sum_ptrs[4] = sum_ptrs[2]; + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + // Output only one row + sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, + w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX); + return; + +vert_1: + // Copy the last row as padding once + sumsq_ptrs[4] = sumsq_ptrs[3]; + sum_ptrs[4] = sum_ptrs[3]; + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A_ptrs, B_ptrs); + + goto output_1; +} + +static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,); + ALIGN_STK_16(int16_t, sum5_buf, BUF_STRIDE * 5 + 16,); + int32_t *sumsq5_ptrs[5], *sumsq5_rows[5]; + int16_t *sum5_ptrs[5], *sum5_rows[5]; + for (int i = 0; i < 5; i++) { + sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE]; + sum5_rows[i] = &sum5_buf[i * BUF_STRIDE]; + } + ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,); + ALIGN_STK_16(int16_t, sum3_buf, BUF_STRIDE * 3 + 16,); + int32_t *sumsq3_ptrs[3], *sumsq3_rows[3]; + int16_t *sum3_ptrs[3], *sum3_rows[3]; + for (int i = 0; i < 3; i++) { + sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE]; + sum3_rows[i] = &sum3_buf[i * BUF_STRIDE]; + } + + ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,); + ALIGN_STK_16(int16_t, B5_buf, BUF_STRIDE * 2 + 16,); + int32_t *A5_ptrs[2]; + int16_t *B5_ptrs[2]; + for (int i = 0; i < 2; i++) { + A5_ptrs[i] = &A5_buf[i * BUF_STRIDE]; + B5_ptrs[i] = &B5_buf[i * BUF_STRIDE]; + } + ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,); + ALIGN_STK_16(int16_t, B3_buf, BUF_STRIDE * 4 + 16,); + int32_t *A3_ptrs[4]; + int16_t *B3_ptrs[4]; + for 
(int i = 0; i < 4; i++) { + A3_ptrs[i] = &A3_buf[i * BUF_STRIDE]; + B3_ptrs[i] = &B3_buf[i * BUF_STRIDE]; + } + const pixel *src = dst; + const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); + + if (edges & LR_HAVE_TOP) { + sumsq5_ptrs[0] = sumsq5_rows[0]; + sumsq5_ptrs[1] = sumsq5_rows[0]; + sumsq5_ptrs[2] = sumsq5_rows[1]; + sumsq5_ptrs[3] = sumsq5_rows[2]; + sumsq5_ptrs[4] = sumsq5_rows[3]; + sum5_ptrs[0] = sum5_rows[0]; + sum5_ptrs[1] = sum5_rows[0]; + sum5_ptrs[2] = sum5_rows[1]; + sum5_ptrs[3] = sum5_rows[2]; + sum5_ptrs[4] = sum5_rows[3]; + + sumsq3_ptrs[0] = sumsq3_rows[0]; + sumsq3_ptrs[1] = sumsq3_rows[1]; + sumsq3_ptrs[2] = sumsq3_rows[2]; + sum3_ptrs[0] = sum3_rows[0]; + sum3_ptrs[1] = sum3_rows[1]; + sum3_ptrs[2] = sum3_rows[2]; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0], + sumsq5_rows[0], sum5_rows[0], + NULL, lpf, w, edges); + lpf += PXSTRIDE(stride); + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1], + sumsq5_rows[1], sum5_rows[1], + NULL, lpf, w, edges); + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2], + sumsq5_rows[2], sum5_rows[2], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto vert_1; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_rows[3], sum5_rows[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A5_ptrs, B5_ptrs); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto vert_2; + + // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set + // one of them to point at the previously unused rows[4]. 
+ sumsq5_ptrs[3] = sumsq5_rows[4]; + sum5_ptrs[3] = sum5_rows[4]; + } else { + sumsq5_ptrs[0] = sumsq5_rows[0]; + sumsq5_ptrs[1] = sumsq5_rows[0]; + sumsq5_ptrs[2] = sumsq5_rows[0]; + sumsq5_ptrs[3] = sumsq5_rows[0]; + sumsq5_ptrs[4] = sumsq5_rows[0]; + sum5_ptrs[0] = sum5_rows[0]; + sum5_ptrs[1] = sum5_rows[0]; + sum5_ptrs[2] = sum5_rows[0]; + sum5_ptrs[3] = sum5_rows[0]; + sum5_ptrs[4] = sum5_rows[0]; + + sumsq3_ptrs[0] = sumsq3_rows[0]; + sumsq3_ptrs[1] = sumsq3_rows[0]; + sumsq3_ptrs[2] = sumsq3_rows[0]; + sum3_ptrs[0] = sum3_rows[0]; + sum3_ptrs[1] = sum3_rows[0]; + sum3_ptrs[2] = sum3_rows[0]; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0], + sumsq5_rows[0], sum5_rows[0], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto vert_1; + + sumsq5_ptrs[4] = sumsq5_rows[1]; + sum5_ptrs[4] = sum5_rows[1]; + + sumsq3_ptrs[2] = sumsq3_rows[1]; + sum3_ptrs[2] = sum3_rows[1]; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1], + sumsq5_rows[1], sum5_rows[1], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A5_ptrs, B5_ptrs); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto vert_2; + + sumsq5_ptrs[3] = sumsq5_rows[2]; + sumsq5_ptrs[4] = sumsq5_rows[3]; + sum5_ptrs[3] = sum5_rows[2]; + sum5_ptrs[4] = sum5_rows[3]; + + sumsq3_ptrs[2] = sumsq3_rows[2]; + sum3_ptrs[2] = sum3_rows[2]; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2], + sumsq5_rows[2], sum5_rows[2], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto odd; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_rows[3], sum5_rows[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, + w, 2, params->sgr.w0, params->sgr.w1 + HIGHBD_TAIL_SUFFIX); + + if (--h <= 0) + goto vert_2; + + // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set + // one of them to point at the previously unused rows[4]. 
+ sumsq5_ptrs[3] = sumsq5_rows[4]; + sum5_ptrs[3] = sum5_rows[4]; + } + + do { + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_ptrs[3], sum5_ptrs[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto odd; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_ptrs[4], sum5_ptrs[4], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, + w, 2, params->sgr.w0, params->sgr.w1 + HIGHBD_TAIL_SUFFIX); + } while (--h > 0); + + if (!(edges & LR_HAVE_BOTTOM)) + goto vert_2; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_ptrs[3], sum5_ptrs[3], + NULL, lpf_bottom, w, edges); + lpf_bottom += PXSTRIDE(stride); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_ptrs[4], sum5_ptrs[4], + NULL, lpf_bottom, w, edges); + +output_2: + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, + w, 2, params->sgr.w0, params->sgr.w1 + HIGHBD_TAIL_SUFFIX); + return; + +vert_2: + // Duplicate the last row twice more + sumsq5_ptrs[3] = sumsq5_ptrs[2]; + sumsq5_ptrs[4] = sumsq5_ptrs[2]; + sum5_ptrs[3] = sum5_ptrs[2]; + sum5_ptrs[4] = sum5_ptrs[2]; + + sumsq3_ptrs[2] = sumsq3_ptrs[1]; + sum3_ptrs[2] = sum3_ptrs[1]; + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + sumsq3_ptrs[2] = sumsq3_ptrs[1]; + sum3_ptrs[2] = sum3_ptrs[1]; + + goto output_2; + +odd: + // Copy the last row as padding once + sumsq5_ptrs[4] = sumsq5_ptrs[3]; + sum5_ptrs[4] = sum5_ptrs[3]; + + sumsq3_ptrs[2] = sumsq3_ptrs[1]; + sum3_ptrs[2] = sum3_ptrs[1]; + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, + w, 2, params->sgr.w0, params->sgr.w1 + HIGHBD_TAIL_SUFFIX); + +output_1: + // Duplicate the last row twice more + sumsq5_ptrs[3] = sumsq5_ptrs[2]; + sumsq5_ptrs[4] = sumsq5_ptrs[2]; + sum5_ptrs[3] = sum5_ptrs[2]; + sum5_ptrs[4] = sum5_ptrs[2]; + + sumsq3_ptrs[2] = sumsq3_ptrs[1]; + sum3_ptrs[2] = sum3_ptrs[1]; + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + // Output only one row + sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, + w, 1, params->sgr.w0, params->sgr.w1 + HIGHBD_TAIL_SUFFIX); + return; + +vert_1: + // Copy the last row as padding once + sumsq5_ptrs[4] = sumsq5_ptrs[3]; + sum5_ptrs[4] = 
sum5_ptrs[3]; + + sumsq3_ptrs[2] = sumsq3_ptrs[1]; + sum3_ptrs[2] = sum3_ptrs[1]; + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A5_ptrs, B5_ptrs); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + goto output_1; +} + +#endif + + static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) { const unsigned flags = dav1d_get_cpu_flags(); diff --git a/third_party/dav1d/src/arm/refmvs.h b/third_party/dav1d/src/arm/refmvs.h index 4c96fc509525..1c2dc704cf4f 100644 --- a/third_party/dav1d/src/arm/refmvs.h +++ b/third_party/dav1d/src/arm/refmvs.h @@ -28,6 +28,7 @@ #include "src/cpu.h" #include "src/refmvs.h" +decl_save_tmvs_fn(dav1d_save_tmvs_neon); decl_splat_mv_fn(dav1d_splat_mv_neon); static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { @@ -35,5 +36,6 @@ static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + c->save_tmvs = dav1d_save_tmvs_neon; c->splat_mv = dav1d_splat_mv_neon; } diff --git a/third_party/dav1d/src/data.c b/third_party/dav1d/src/data.c index fe2e4f42c090..bbbe02e8d17b 100644 --- a/third_party/dav1d/src/data.c +++ b/third_party/dav1d/src/data.c @@ -44,7 +44,7 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) { validate_input_or_ret(buf != NULL, NULL); if (sz > SIZE_MAX / 2) return NULL; - buf->ref = dav1d_ref_create(sz); + buf->ref = dav1d_ref_create(ALLOC_DAV1DDATA, sz); if (!buf->ref) return NULL; buf->data = buf->ref->const_data; buf->sz = sz; @@ -65,7 +65,7 @@ int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr, validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL)); if (sz > SIZE_MAX / 2) return DAV1D_ERR(EINVAL); - Dav1dRef *const ref = malloc(sizeof(Dav1dRef)); + Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef)); if (!ref) return DAV1D_ERR(ENOMEM); buf->ref = dav1d_ref_init(ref, ptr, free_callback, cookie, 1); @@ -86,7 +86,7 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *const buf, validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL)); validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL)); - Dav1dRef *const ref = malloc(sizeof(Dav1dRef)); + Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef)); if (!ref) return DAV1D_ERR(ENOMEM); buf->m.user_data.ref = dav1d_ref_init(ref, user_data, free_callback, cookie, 1); @@ -95,14 +95,13 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *const buf, return 0; } - void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) { - validate_input(dst != NULL); - validate_input(dst->data == NULL); - validate_input(src != NULL); + assert(dst != NULL); + assert(dst->data == NULL); + assert(src != NULL); if (src->ref) { - validate_input(src->data != NULL); + assert(src->data != NULL); dav1d_ref_inc(src->ref); } if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref); diff --git a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c index 1e298f527acc..4dfc4a1e5798 100644 --- a/third_party/dav1d/src/decode.c +++ b/third_party/dav1d/src/decode.c @@ -2932,8 +2932,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { int retval = DAV1D_ERR(ENOMEM); if (f->sbh > f->lf.start_of_tile_row_sz) { - free(f->lf.start_of_tile_row); - f->lf.start_of_tile_row = malloc(f->sbh * sizeof(uint8_t)); + 
dav1d_free(f->lf.start_of_tile_row); + f->lf.start_of_tile_row = dav1d_malloc(ALLOC_TILE, f->sbh * sizeof(uint8_t)); if (!f->lf.start_of_tile_row) { f->lf.start_of_tile_row_sz = 0; goto error; @@ -2950,24 +2950,24 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; if (n_ts != f->n_ts) { if (c->n_fc > 1) { - freep(&f->frame_thread.tile_start_off); + dav1d_free(f->frame_thread.tile_start_off); f->frame_thread.tile_start_off = - malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts); + dav1d_malloc(ALLOC_TILE, sizeof(*f->frame_thread.tile_start_off) * n_ts); if (!f->frame_thread.tile_start_off) { f->n_ts = 0; goto error; } } dav1d_free_aligned(f->ts); - f->ts = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32); + f->ts = dav1d_alloc_aligned(ALLOC_TILE, sizeof(*f->ts) * n_ts, 32); if (!f->ts) goto error; f->n_ts = n_ts; } const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1)); if (a_sz != f->a_sz) { - freep(&f->a); - f->a = malloc(sizeof(*f->a) * a_sz); + dav1d_free(f->a); + f->a = dav1d_malloc(ALLOC_TILE, sizeof(*f->a) * a_sz); if (!f->a) { f->a_sz = 0; goto error; @@ -2993,9 +2993,10 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh; if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) { - free(f->tile_thread.lowest_pixel_mem); + dav1d_free(f->tile_thread.lowest_pixel_mem); f->tile_thread.lowest_pixel_mem = - malloc(lowest_pixel_mem_sz * sizeof(*f->tile_thread.lowest_pixel_mem)); + dav1d_malloc(ALLOC_TILE, lowest_pixel_mem_sz * + sizeof(*f->tile_thread.lowest_pixel_mem)); if (!f->tile_thread.lowest_pixel_mem) { f->tile_thread.lowest_pixel_mem_sz = 0; goto error; @@ -3016,9 +3017,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const int cf_sz = (num_sb128 * size_mul[0]) << hbd; if (cf_sz != f->frame_thread.cf_sz) { - dav1d_freep_aligned(&f->frame_thread.cf); + dav1d_free_aligned(f->frame_thread.cf); f->frame_thread.cf = - dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 64); + dav1d_alloc_aligned(ALLOC_COEF, (size_t)cf_sz * 128 * 128 / 2, 64); if (!f->frame_thread.cf) { f->frame_thread.cf_sz = 0; goto error; @@ -3029,9 +3030,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { if (f->frame_hdr->allow_screen_content_tools) { if (num_sb128 != f->frame_thread.pal_sz) { - dav1d_freep_aligned(&f->frame_thread.pal); + dav1d_free_aligned(f->frame_thread.pal); f->frame_thread.pal = - dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) * + dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) * num_sb128 * 16 * 16, 64); if (!f->frame_thread.pal) { f->frame_thread.pal_sz = 0; @@ -3042,9 +3043,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const int pal_idx_sz = num_sb128 * size_mul[1]; if (pal_idx_sz != f->frame_thread.pal_idx_sz) { - dav1d_freep_aligned(&f->frame_thread.pal_idx); + dav1d_free_aligned(f->frame_thread.pal_idx); f->frame_thread.pal_idx = - dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) * + dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) * pal_idx_sz * 128 * 128 / 4, 64); if (!f->frame_thread.pal_idx) { f->frame_thread.pal_idx_sz = 0; @@ -3072,7 +3073,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { size_t alloc_sz = 64; alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh << need_cdef_lpf_copy; alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh << need_cdef_lpf_copy; - uint8_t *ptr = f->lf.cdef_line_buf = 
dav1d_alloc_aligned(alloc_sz, 32); + uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(ALLOC_CDEF, alloc_sz, 32); if (!ptr) { f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0; goto error; @@ -3132,7 +3133,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { size_t alloc_sz = 128; alloc_sz += (size_t)llabs(y_stride) * num_lines; alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2; - uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(alloc_sz, 64); + uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(ALLOC_LR, alloc_sz, 64); if (!ptr) { f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0; goto error; @@ -3158,23 +3159,23 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { // update allocation for loopfilter masks if (num_sb128 != f->lf.mask_sz) { - freep(&f->lf.mask); - freep(&f->lf.level); - f->lf.mask = malloc(sizeof(*f->lf.mask) * num_sb128); + dav1d_free(f->lf.mask); + dav1d_free(f->lf.level); + f->lf.mask = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.mask) * num_sb128); // over-allocate by 3 bytes since some of the SIMD implementations // index this from the level type and can thus over-read by up to 3 - f->lf.level = malloc(sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3); + f->lf.level = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3); if (!f->lf.mask || !f->lf.level) { f->lf.mask_sz = 0; goto error; } if (c->n_fc > 1) { - freep(&f->frame_thread.b); - freep(&f->frame_thread.cbi); - f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) * - num_sb128 * 32 * 32); - f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) * - num_sb128 * 32 * 32); + dav1d_free(f->frame_thread.b); + dav1d_free(f->frame_thread.cbi); + f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) * + num_sb128 * 32 * 32); + f->frame_thread.cbi = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) * + num_sb128 * 32 * 32); if (!f->frame_thread.b || !f->frame_thread.cbi) { f->lf.mask_sz = 0; goto error; @@ -3186,8 +3187,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7; const int lr_mask_sz = f->sr_sb128w * f->sb128h; if (lr_mask_sz != f->lf.lr_mask_sz) { - freep(&f->lf.lr_mask); - f->lf.lr_mask = malloc(sizeof(*f->lf.lr_mask) * lr_mask_sz); + dav1d_free(f->lf.lr_mask); + f->lf.lr_mask = dav1d_malloc(ALLOC_LR, sizeof(*f->lf.lr_mask) * lr_mask_sz); if (!f->lf.lr_mask) { f->lf.lr_mask_sz = 0; goto error; @@ -3207,9 +3208,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const int ipred_edge_sz = f->sbh * f->sb128w << hbd; if (ipred_edge_sz != f->ipred_edge_sz) { - dav1d_freep_aligned(&f->ipred_edge[0]); + dav1d_free_aligned(f->ipred_edge[0]); uint8_t *ptr = f->ipred_edge[0] = - dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 64); + dav1d_alloc_aligned(ALLOC_IPRED, ipred_edge_sz * 128 * 3, 64); if (!ptr) { f->ipred_edge_sz = 0; goto error; @@ -3221,8 +3222,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const int re_sz = f->sb128h * f->frame_hdr->tiling.cols; if (re_sz != f->lf.re_sz) { - freep(&f->lf.tx_lpf_right_edge[0]); - f->lf.tx_lpf_right_edge[0] = malloc(re_sz * 32 * 2); + dav1d_free(f->lf.tx_lpf_right_edge[0]); + f->lf.tx_lpf_right_edge[0] = dav1d_malloc(ALLOC_LF, re_sz * 32 * 2); if (!f->lf.tx_lpf_right_edge[0]) { f->lf.re_sz = 0; goto error; @@ -3656,9 +3657,9 @@ int dav1d_submit_frame(Dav1dContext *const c) { // FIXME qsort so tiles are in order (for frame threading) if (f->n_tile_data_alloc < c->n_tile_data) { - freep(&f->tile); + 
dav1d_free(f->tile); assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile)); - f->tile = malloc(c->n_tile_data * sizeof(*f->tile)); + f->tile = dav1d_malloc(ALLOC_TILE, c->n_tile_data * sizeof(*f->tile)); if (!f->tile) { f->n_tile_data_alloc = f->n_tile_data = 0; res = DAV1D_ERR(ENOMEM); diff --git a/third_party/dav1d/src/lib.c b/third_party/dav1d/src/lib.c index 4403620adda4..5f7dd1715515 100644 --- a/third_party/dav1d/src/lib.c +++ b/third_party/dav1d/src/lib.c @@ -63,6 +63,12 @@ COLD const char *dav1d_version(void) { return DAV1D_VERSION; } +COLD unsigned dav1d_version_api(void) { + return (DAV1D_API_VERSION_MAJOR << 16) | + (DAV1D_API_VERSION_MINOR << 8) | + (DAV1D_API_VERSION_PATCH << 0); +} + COLD void dav1d_default_settings(Dav1dSettings *const s) { s->n_threads = 0; s->max_frame_delay = 0; @@ -155,7 +161,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { pthread_attr_setstacksize(&thread_attr, stack_size); - Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 64); + Dav1dContext *const c = *c_out = dav1d_alloc_aligned(ALLOC_COMMON_CTX, sizeof(*c), 64); if (!c) goto error; memset(c, 0, sizeof(*c)); @@ -172,12 +178,12 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { dav1d_data_props_set_defaults(&c->cached_error_props); - if (dav1d_mem_pool_init(&c->seq_hdr_pool) || - dav1d_mem_pool_init(&c->frame_hdr_pool) || - dav1d_mem_pool_init(&c->segmap_pool) || - dav1d_mem_pool_init(&c->refmvs_pool) || - dav1d_mem_pool_init(&c->pic_ctx_pool) || - dav1d_mem_pool_init(&c->cdf_pool)) + if (dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->seq_hdr_pool) || + dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->frame_hdr_pool) || + dav1d_mem_pool_init(ALLOC_SEGMAP, &c->segmap_pool) || + dav1d_mem_pool_init(ALLOC_REFMVS, &c->refmvs_pool) || + dav1d_mem_pool_init(ALLOC_PIC_CTX, &c->pic_ctx_pool) || + dav1d_mem_pool_init(ALLOC_CDF, &c->cdf_pool)) { goto error; } @@ -186,7 +192,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { c->allocator.release_picture_callback == dav1d_default_picture_release) { if (c->allocator.cookie) goto error; - if (dav1d_mem_pool_init(&c->picture_pool)) goto error; + if (dav1d_mem_pool_init(ALLOC_PIC, &c->picture_pool)) goto error; c->allocator.cookie = c->picture_pool; } else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc || c->allocator.release_picture_callback == dav1d_default_picture_release) @@ -210,11 +216,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { get_num_threads(c, s, &c->n_tc, &c->n_fc); - c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32); + c->fc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->fc) * c->n_fc, 32); if (!c->fc) goto error; memset(c->fc, 0, sizeof(*c->fc) * c->n_fc); - c->tc = dav1d_alloc_aligned(sizeof(*c->tc) * c->n_tc, 64); + c->tc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->tc) * c->n_tc, 64); if (!c->tc) goto error; memset(c->tc, 0, sizeof(*c->tc) * c->n_tc); if (c->n_tc > 1) { @@ -235,9 +241,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { } if (c->n_fc > 1) { + const size_t out_delayed_sz = sizeof(*c->frame_thread.out_delayed) * c->n_fc; c->frame_thread.out_delayed = - calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed)); + dav1d_malloc(ALLOC_THREAD_CTX, out_delayed_sz); if (!c->frame_thread.out_delayed) goto error; + memset(c->frame_thread.out_delayed, 0, out_delayed_sz); } for (unsigned n = 0; n < c->n_fc; n++) { Dav1dFrameContext 
*const f = &c->fc[n]; @@ -592,6 +600,9 @@ void dav1d_flush(Dav1dContext *const c) { COLD void dav1d_close(Dav1dContext **const c_out) { validate_input(c_out != NULL); +#if TRACK_HEAP_ALLOCATIONS + dav1d_log_alloc_stats(*c_out); +#endif close_internal(c_out, 1); } @@ -628,31 +639,31 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { // clean-up threading stuff if (c->n_fc > 1) { - freep(&f->tile_thread.lowest_pixel_mem); - freep(&f->frame_thread.b); - dav1d_freep_aligned(&f->frame_thread.pal_idx); - dav1d_freep_aligned(&f->frame_thread.cf); - freep(&f->frame_thread.tile_start_off); - dav1d_freep_aligned(&f->frame_thread.pal); - freep(&f->frame_thread.cbi); + dav1d_free(f->tile_thread.lowest_pixel_mem); + dav1d_free(f->frame_thread.b); + dav1d_free_aligned(f->frame_thread.pal_idx); + dav1d_free_aligned(f->frame_thread.cf); + dav1d_free(f->frame_thread.tile_start_off); + dav1d_free_aligned(f->frame_thread.pal); + dav1d_free(f->frame_thread.cbi); } if (c->n_tc > 1) { pthread_mutex_destroy(&f->task_thread.pending_tasks.lock); pthread_cond_destroy(&f->task_thread.cond); pthread_mutex_destroy(&f->task_thread.lock); } - freep(&f->frame_thread.frame_progress); - freep(&f->task_thread.tasks); - freep(&f->task_thread.tile_tasks[0]); + dav1d_free(f->frame_thread.frame_progress); + dav1d_free(f->task_thread.tasks); + dav1d_free(f->task_thread.tile_tasks[0]); dav1d_free_aligned(f->ts); dav1d_free_aligned(f->ipred_edge[0]); - free(f->a); - free(f->tile); - free(f->lf.mask); - free(f->lf.lr_mask); - free(f->lf.level); - free(f->lf.tx_lpf_right_edge[0]); - free(f->lf.start_of_tile_row); + dav1d_free(f->a); + dav1d_free(f->tile); + dav1d_free(f->lf.mask); + dav1d_free(f->lf.level); + dav1d_free(f->lf.lr_mask); + dav1d_free(f->lf.tx_lpf_right_edge[0]); + dav1d_free(f->lf.start_of_tile_row); dav1d_refmvs_clear(&f->rf); dav1d_free_aligned(f->lf.cdef_line_buf); dav1d_free_aligned(f->lf.lr_line_buf); @@ -662,11 +673,11 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { for (unsigned n = 0; n < c->n_fc; n++) if (c->frame_thread.out_delayed[n].p.frame_hdr) dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]); - free(c->frame_thread.out_delayed); + dav1d_free(c->frame_thread.out_delayed); } for (int n = 0; n < c->n_tile_data; n++) dav1d_data_unref_internal(&c->tile[n].data); - free(c->tile); + dav1d_free(c->tile); for (int n = 0; n < 8; n++) { dav1d_cdf_thread_unref(&c->cdf[n]); if (c->refs[n].p.p.frame_hdr) diff --git a/third_party/dav1d/src/log.c b/third_party/dav1d/src/log.c index de6776a617e6..a08f6eb68d48 100644 --- a/third_party/dav1d/src/log.c +++ b/third_party/dav1d/src/log.c @@ -44,7 +44,7 @@ COLD void dav1d_log_default_callback(void *const cookie, } COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) 
{ - validate_input(c != NULL); + assert(c != NULL); if (!c->logger.callback) return; diff --git a/third_party/dav1d/src/mem.c b/third_party/dav1d/src/mem.c index 558bc01caeaf..1888af361813 100644 --- a/third_party/dav1d/src/mem.c +++ b/third_party/dav1d/src/mem.c @@ -31,9 +31,208 @@ #include "src/internal.h" +#if TRACK_HEAP_ALLOCATIONS +#include + +#include "src/log.h" + +#define DEFAULT_ALIGN 16 + +typedef struct { + size_t sz; + unsigned align; + enum AllocationType type; +} Dav1dAllocationData; + +typedef struct { + size_t curr_sz; + size_t peak_sz; + unsigned num_allocs; + unsigned num_reuses; +} AllocStats; + +static AllocStats tracked_allocs[N_ALLOC_TYPES]; +static size_t curr_total_sz; +static size_t peak_total_sz; +static pthread_mutex_t track_alloc_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void *track_alloc(const enum AllocationType type, char *ptr, + const size_t sz, const size_t align) +{ + assert(align >= sizeof(Dav1dAllocationData)); + if (ptr) { + ptr += align; + Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1]; + AllocStats *const s = &tracked_allocs[type]; + + d->sz = sz; + d->align = (unsigned)align; + d->type = type; + + pthread_mutex_lock(&track_alloc_mutex); + s->num_allocs++; + s->curr_sz += sz; + if (s->curr_sz > s->peak_sz) + s->peak_sz = s->curr_sz; + + curr_total_sz += sz; + if (curr_total_sz > peak_total_sz) + peak_total_sz = curr_total_sz; + pthread_mutex_unlock(&track_alloc_mutex); + } + return ptr; +} + +static void *track_free(char *const ptr) { + const Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1]; + const size_t sz = d->sz; + + pthread_mutex_lock(&track_alloc_mutex); + tracked_allocs[d->type].curr_sz -= sz; + curr_total_sz -= sz; + pthread_mutex_unlock(&track_alloc_mutex); + + return ptr - d->align; +} + +static void dav1d_track_reuse(const enum AllocationType type) { + pthread_mutex_lock(&track_alloc_mutex); + tracked_allocs[type].num_reuses++; + pthread_mutex_unlock(&track_alloc_mutex); +} + +void *dav1d_malloc(const enum AllocationType type, const size_t sz) { + void *const ptr = malloc(sz + DEFAULT_ALIGN); + return track_alloc(type, ptr, sz, DEFAULT_ALIGN); +} + +void *dav1d_alloc_aligned(const enum AllocationType type, + const size_t sz, const size_t align) +{ + assert(!(align & (align - 1))); + void *ptr; +#ifdef _WIN32 + ptr = _aligned_malloc(sz + align, align); +#elif defined(HAVE_POSIX_MEMALIGN) + if (posix_memalign(&ptr, align, sz + align)) return NULL; +#else + ptr = memalign(align, sz + align); +#endif + + return track_alloc(type, ptr, sz, align); +} + +void *dav1d_realloc(const enum AllocationType type, + void *ptr, const size_t sz) +{ + if (!ptr) + return dav1d_malloc(type, sz); + ptr = realloc((char*)ptr - DEFAULT_ALIGN, sz + DEFAULT_ALIGN); + if (ptr) + ptr = track_free((char*)ptr + DEFAULT_ALIGN); + return track_alloc(type, ptr, sz, DEFAULT_ALIGN); +} + +void dav1d_free(void *ptr) { + if (ptr) + free(track_free(ptr)); +} + +void dav1d_free_aligned(void *ptr) { + if (ptr) { + ptr = track_free(ptr); +#ifdef _WIN32 + _aligned_free(ptr); +#else + free(ptr); +#endif + } +} + +static COLD int cmp_stats(const void *const a, const void *const b) { + const size_t a_sz = ((const AllocStats*)a)->peak_sz; + const size_t b_sz = ((const AllocStats*)b)->peak_sz; + return a_sz < b_sz ? 
-1 : a_sz > b_sz; +} + +/* Insert spaces as thousands separators for better readability */ +static COLD int format_tsep(char *const s, const size_t n, const size_t value) { + if (value < 1000) + return snprintf(s, n, "%u", (unsigned)value); + + const int len = format_tsep(s, n, value / 1000); + assert((size_t)len < n); + return len + snprintf(s + len, n - len, " %03u", (unsigned)(value % 1000)); +} + +COLD void dav1d_log_alloc_stats(Dav1dContext *const c) { + static const char *const type_names[N_ALLOC_TYPES] = { + [ALLOC_BLOCK ] = "Block data", + [ALLOC_CDEF ] = "CDEF line buffers", + [ALLOC_CDF ] = "CDF contexts", + [ALLOC_COEF ] = "Coefficient data", + [ALLOC_COMMON_CTX] = "Common context data", + [ALLOC_DAV1DDATA ] = "Dav1dData", + [ALLOC_IPRED ] = "Intra pred edges", + [ALLOC_LF ] = "Loopfilter data", + [ALLOC_LR ] = "Looprestoration data", + [ALLOC_OBU_HDR ] = "OBU headers", + [ALLOC_OBU_META ] = "OBU metadata", + [ALLOC_PAL ] = "Palette data", + [ALLOC_PIC ] = "Picture buffers", + [ALLOC_PIC_CTX ] = "Picture context data", + [ALLOC_REFMVS ] = "Reference mv data", + [ALLOC_SEGMAP ] = "Segmentation maps", + [ALLOC_THREAD_CTX] = "Thread context data", + [ALLOC_TILE ] = "Tile data", + }; + + struct { + AllocStats stats; + enum AllocationType type; + } data[N_ALLOC_TYPES]; + unsigned total_allocs = 0; + unsigned total_reuses = 0; + + pthread_mutex_lock(&track_alloc_mutex); + for (int i = 0; i < N_ALLOC_TYPES; i++) { + AllocStats *const s = &data[i].stats; + *s = tracked_allocs[i]; + data[i].type = i; + total_allocs += s->num_allocs; + total_reuses += s->num_reuses; + } + size_t total_sz = peak_total_sz; + pthread_mutex_unlock(&track_alloc_mutex); + + /* Sort types by memory usage */ + qsort(&data, N_ALLOC_TYPES, sizeof(*data), cmp_stats); + + const double inv_total_share = 100.0 / total_sz; + char total_sz_buf[32]; + const int sz_len = 4 + format_tsep(total_sz_buf, sizeof(total_sz_buf), total_sz); + + dav1d_log(c, "\n Type Allocs Reuses Share Peak size\n" + "---------------------------------------------------------------------\n"); + for (int i = N_ALLOC_TYPES - 1; i >= 0; i--) { + const AllocStats *const s = &data[i].stats; + if (s->num_allocs) { + const double share = s->peak_sz * inv_total_share; + char sz_buf[32]; + format_tsep(sz_buf, sizeof(sz_buf), s->peak_sz); + dav1d_log(c, " %-20s%10u%10u%8.1f%%%*s\n", type_names[data[i].type], + s->num_allocs, s->num_reuses, share, sz_len, sz_buf); + } + } + dav1d_log(c, "---------------------------------------------------------------------\n" + "%31u%10u %s\n", + total_allocs, total_reuses, total_sz_buf); +} +#endif /* TRACK_HEAP_ALLOCATIONS */ + static COLD void mem_pool_destroy(Dav1dMemPool *const pool) { pthread_mutex_destroy(&pool->lock); - free(pool); + dav1d_free(pool); } void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) { @@ -66,10 +265,14 @@ Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t si dav1d_free_aligned(data); goto alloc; } +#if TRACK_HEAP_ALLOCATIONS + dav1d_track_reuse(pool->type); +#endif } else { pthread_mutex_unlock(&pool->lock); alloc: - data = dav1d_alloc_aligned(size + sizeof(Dav1dMemPoolBuffer), 64); + data = dav1d_alloc_aligned(pool->type, + size + sizeof(Dav1dMemPoolBuffer), 64); if (!data) { pthread_mutex_lock(&pool->lock); const int ref_cnt = --pool->ref_cnt; @@ -84,13 +287,19 @@ alloc: return buf; } -COLD int dav1d_mem_pool_init(Dav1dMemPool **const ppool) { - Dav1dMemPool *const pool = malloc(sizeof(Dav1dMemPool)); +COLD int dav1d_mem_pool_init(const 
enum AllocationType type, + Dav1dMemPool **const ppool) +{ + Dav1dMemPool *const pool = dav1d_malloc(ALLOC_COMMON_CTX, + sizeof(Dav1dMemPool)); if (pool) { if (!pthread_mutex_init(&pool->lock, NULL)) { pool->buf = NULL; pool->ref_cnt = 1; pool->end = 0; +#if TRACK_HEAP_ALLOCATIONS + pool->type = type; +#endif *ppool = pool; return 0; } diff --git a/third_party/dav1d/src/mem.h b/third_party/dav1d/src/mem.h index 41ae47a2fd37..0a8c18d709b8 100644 --- a/third_party/dav1d/src/mem.h +++ b/third_party/dav1d/src/mem.h @@ -28,16 +28,42 @@ #ifndef DAV1D_SRC_MEM_H #define DAV1D_SRC_MEM_H +#define TRACK_HEAP_ALLOCATIONS 0 + #include -#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN) +#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN) #include #endif +#include "dav1d/dav1d.h" + #include "common/attributes.h" #include "src/thread.h" +enum AllocationType { + ALLOC_BLOCK, + ALLOC_CDEF, + ALLOC_CDF, + ALLOC_COEF, + ALLOC_COMMON_CTX, + ALLOC_DAV1DDATA, + ALLOC_IPRED, + ALLOC_LF, + ALLOC_LR, + ALLOC_OBU_HDR, + ALLOC_OBU_META, + ALLOC_PAL, + ALLOC_PIC, + ALLOC_PIC_CTX, + ALLOC_REFMVS, + ALLOC_SEGMAP, + ALLOC_THREAD_CTX, + ALLOC_TILE, + N_ALLOC_TYPES, +}; + typedef struct Dav1dMemPoolBuffer { void *data; struct Dav1dMemPoolBuffer *next; @@ -48,43 +74,59 @@ typedef struct Dav1dMemPool { Dav1dMemPoolBuffer *buf; int ref_cnt; int end; +#if TRACK_HEAP_ALLOCATIONS + enum AllocationType type; +#endif } Dav1dMemPool; -void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf); -Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size); -int dav1d_mem_pool_init(Dav1dMemPool **pool); -void dav1d_mem_pool_end(Dav1dMemPool *pool); + +#if TRACK_HEAP_ALLOCATIONS +void *dav1d_malloc(enum AllocationType type, size_t sz); +void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz); +void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align); +void dav1d_free(void *ptr); +void dav1d_free_aligned(void *ptr); +void dav1d_log_alloc_stats(Dav1dContext *c); +#else +#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool) +#define dav1d_malloc(type, sz) malloc(sz) +#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz) +#define dav1d_free(ptr) free(ptr) /* * Allocate align-byte aligned memory. The return value can be released * by calling the dav1d_free_aligned() function. 
*/ -static inline void *dav1d_alloc_aligned(size_t sz, size_t align) { +static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) { assert(!(align & (align - 1))); -#ifdef HAVE_POSIX_MEMALIGN +#ifdef _WIN32 + return _aligned_malloc(sz, align); +#elif defined(HAVE_POSIX_MEMALIGN) void *ptr; if (posix_memalign(&ptr, align, sz)) return NULL; return ptr; -#elif defined(HAVE_ALIGNED_MALLOC) - return _aligned_malloc(sz, align); -#elif defined(HAVE_MEMALIGN) - return memalign(align, sz); #else -#error Missing aligned alloc implementation + return memalign(align, sz); #endif } +#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align) -static inline void dav1d_free_aligned(void* ptr) { -#ifdef HAVE_POSIX_MEMALIGN - free(ptr); -#elif defined(HAVE_ALIGNED_MALLOC) +static inline void dav1d_free_aligned(void *ptr) { +#ifdef _WIN32 _aligned_free(ptr); -#elif defined(HAVE_MEMALIGN) +#else free(ptr); #endif } -static inline void dav1d_freep_aligned(void* ptr) { +#endif /* TRACK_HEAP_ALLOCATIONS */ + +void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf); +Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size); +int dav1d_mem_pool_init(enum AllocationType type, Dav1dMemPool **pool); +void dav1d_mem_pool_end(Dav1dMemPool *pool); + +static inline void dav1d_freep_aligned(void *ptr) { void **mem = (void **) ptr; if (*mem) { dav1d_free_aligned(*mem); @@ -92,12 +134,4 @@ static inline void dav1d_freep_aligned(void* ptr) { } } -static inline void freep(void *ptr) { - void **mem = (void **) ptr; - if (*mem) { - free(*mem); - *mem = NULL; - } -} - #endif /* DAV1D_SRC_MEM_H */ diff --git a/third_party/dav1d/src/obu.c b/third_party/dav1d/src/obu.c index 8f94872b2ac3..78d652b4c5b6 100644 --- a/third_party/dav1d/src/obu.c +++ b/third_party/dav1d/src/obu.c @@ -304,7 +304,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out, { validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL)); validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL)); - validate_input_or_ret(sz > 0, DAV1D_ERR(EINVAL)); + validate_input_or_ret(sz > 0 && sz <= SIZE_MAX / 2, DAV1D_ERR(EINVAL)); GetBits gb; dav1d_init_get_bits(&gb, ptr, sz); @@ -609,8 +609,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { if (!hdr->frame_ref_short_signaling) hdr->refidx[i] = dav1d_get_bits(gb, 3); if (seqhdr->frame_id_numbers_present) { - const int delta_ref_frame_id_minus_1 = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits); - const int ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) & ((1 << seqhdr->frame_id_n_bits) - 1); + const unsigned delta_ref_frame_id = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits) + 1; + const unsigned ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id) & ((1 << seqhdr->frame_id_n_bits) - 1); Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr; if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error; } @@ -705,7 +705,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { goto error; hdr->tiling.n_bytes = dav1d_get_bits(gb, 2) + 1; } else { - hdr->tiling.n_bytes = hdr->tiling.update = 0; + hdr->tiling.n_bytes = 0; + hdr->tiling.update = 0; } #if DEBUG_FRAME_HDR printf("HDR: post-tiling: off=%td\n", @@ -739,7 +740,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { hdr->quant.qm_y = dav1d_get_bits(gb, 4); hdr->quant.qm_u = dav1d_get_bits(gb, 4); hdr->quant.qm_v = - 
seqhdr->separate_uv_delta_q ? (int)dav1d_get_bits(gb, 4) : + seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 4) : hdr->quant.qm_u; } #if DEBUG_FRAME_HDR @@ -1366,7 +1367,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) { if (!c->frame_hdr) goto error; if (c->n_tile_data_alloc < c->n_tile_data + 1) { if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error; - struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile)); + struct Dav1dTileGroup *tile = dav1d_realloc(ALLOC_TILE, c->tile, + (c->n_tile_data + 1) * sizeof(*c->tile)); if (!tile) goto error; c->tile = tile; memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile)); @@ -1406,7 +1408,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) { switch (meta_type) { case OBU_META_HDR_CLL: { - Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel)); + Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META, + sizeof(Dav1dContentLightLevel)); if (!ref) return DAV1D_ERR(ENOMEM); Dav1dContentLightLevel *const content_light = ref->data; @@ -1434,7 +1437,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) { break; } case OBU_META_HDR_MDCV: { - Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay)); + Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META, + sizeof(Dav1dMasteringDisplay)); if (!ref) return DAV1D_ERR(ENOMEM); Dav1dMasteringDisplay *const mastering_display = ref->data; @@ -1503,7 +1507,8 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) { } if ((c->n_itut_t35 + 1) > INT_MAX / (int)sizeof(*c->itut_t35)) goto error; - struct Dav1dITUTT35 *itut_t35 = realloc(c->itut_t35, (c->n_itut_t35 + 1) * sizeof(*c->itut_t35)); + struct Dav1dITUTT35 *itut_t35 = dav1d_realloc(ALLOC_OBU_META, c->itut_t35, + (c->n_itut_t35 + 1) * sizeof(*c->itut_t35)); if (!itut_t35) goto error; c->itut_t35 = itut_t35; memset(c->itut_t35 + c->n_itut_t35, 0, sizeof(*c->itut_t35)); @@ -1511,7 +1516,7 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) { struct itut_t35_ctx_context *itut_t35_ctx; if (!c->n_itut_t35) { assert(!c->itut_t35_ref); - itut_t35_ctx = malloc(sizeof(struct itut_t35_ctx_context)); + itut_t35_ctx = dav1d_malloc(ALLOC_OBU_META, sizeof(struct itut_t35_ctx_context)); if (!itut_t35_ctx) goto error; c->itut_t35_ref = dav1d_ref_init(&itut_t35_ctx->ref, c->itut_t35, dav1d_picture_free_itut_t35, itut_t35_ctx, 0); @@ -1524,7 +1529,7 @@ ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) { itut_t35_ctx->n_itut_t35 = c->n_itut_t35 + 1; Dav1dITUTT35 *const itut_t35_metadata = &c->itut_t35[c->n_itut_t35]; - itut_t35_metadata->payload = malloc(payload_size); + itut_t35_metadata->payload = dav1d_malloc(ALLOC_OBU_META, payload_size); if (!itut_t35_metadata->payload) goto error; itut_t35_metadata->country_code = country_code; diff --git a/third_party/dav1d/src/picture.c b/third_party/dav1d/src/picture.c index f83a4c29a52b..f22f05f0ca5f 100644 --- a/third_party/dav1d/src/picture.c +++ b/third_party/dav1d/src/picture.c @@ -106,9 +106,9 @@ void dav1d_picture_free_itut_t35(const uint8_t *const data, void *const user_dat struct itut_t35_ctx_context *itut_t35_ctx = user_data; for (size_t i = 0; i < itut_t35_ctx->n_itut_t35; i++) - free(itut_t35_ctx->itut_t35[i].payload); - free(itut_t35_ctx->itut_t35); - free(itut_t35_ctx); + dav1d_free(itut_t35_ctx->itut_t35[i].payload); + dav1d_free(itut_t35_ctx->itut_t35); + dav1d_free(itut_t35_ctx); } static int picture_alloc_with_edges(Dav1dContext *const 
c, @@ -249,12 +249,12 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con } void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) { - validate_input(dst != NULL); - validate_input(dst->data[0] == NULL); - validate_input(src != NULL); + assert(dst != NULL); + assert(dst->data[0] == NULL); + assert(src != NULL); if (src->ref) { - validate_input(src->data[0] != NULL); + assert(src->data[0] != NULL); dav1d_ref_inc(src->ref); } if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref); @@ -267,12 +267,12 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) { } void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) { - validate_input(dst != NULL); - validate_input(dst->data[0] == NULL); - validate_input(src != NULL); + assert(dst != NULL); + assert(dst->data[0] == NULL); + assert(src != NULL); if (src->ref) - validate_input(src->data[0] != NULL); + assert(src->data[0] != NULL); *dst = *src; memset(src, 0, sizeof(*src)); diff --git a/third_party/dav1d/src/ref.c b/third_party/dav1d/src/ref.c index 6996ec7d3e96..5a4d3a245798 100644 --- a/third_party/dav1d/src/ref.c +++ b/third_party/dav1d/src/ref.c @@ -34,10 +34,10 @@ static void default_free_callback(const uint8_t *const data, void *const user_da dav1d_free_aligned(user_data); } -Dav1dRef *dav1d_ref_create(size_t size) { +Dav1dRef *dav1d_ref_create(const enum AllocationType type, size_t size) { size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1); - uint8_t *const data = dav1d_alloc_aligned(size + sizeof(Dav1dRef), 64); + uint8_t *const data = dav1d_alloc_aligned(type, size + sizeof(Dav1dRef), 64); if (!data) return NULL; Dav1dRef *const res = (Dav1dRef*)(data + size); @@ -81,6 +81,6 @@ void dav1d_ref_dec(Dav1dRef **const pref) { if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) { const int free_ref = ref->free_ref; ref->free_callback(ref->const_data, ref->user_data); - if (free_ref) free(ref); + if (free_ref) dav1d_free(ref); } } diff --git a/third_party/dav1d/src/ref.h b/third_party/dav1d/src/ref.h index 57463e05b761..f1c96eb9141d 100644 --- a/third_party/dav1d/src/ref.h +++ b/third_party/dav1d/src/ref.h @@ -45,7 +45,11 @@ struct Dav1dRef { void *user_data; }; -Dav1dRef *dav1d_ref_create(size_t size); +#if !TRACK_HEAP_ALLOCATIONS +#define dav1d_ref_create(type, size) dav1d_ref_create(size) +#endif + +Dav1dRef *dav1d_ref_create(enum AllocationType type, size_t size); Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size); void dav1d_ref_dec(Dav1dRef **ref); diff --git a/third_party/dav1d/src/refmvs.c b/third_party/dav1d/src/refmvs.c index 5398d396d162..0b5ccd304398 100644 --- a/third_party/dav1d/src/refmvs.c +++ b/third_party/dav1d/src/refmvs.c @@ -817,7 +817,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf, if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) { if (rf->r) dav1d_freep_aligned(&rf->r); const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1; - rf->r = dav1d_alloc_aligned(sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64); + rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64); if (!rf->r) return DAV1D_ERR(ENOMEM); rf->r_stride = r_stride; } @@ -825,7 +825,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf, const ptrdiff_t rp_stride = r_stride >> 1; if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) { if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj); - rf->rp_proj = 
dav1d_alloc_aligned(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64); + rf->rp_proj = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64); if (!rf->rp_proj) return DAV1D_ERR(ENOMEM); rf->rp_stride = rp_stride; } diff --git a/third_party/dav1d/src/thread.h b/third_party/dav1d/src/thread.h index b091e4f26dca..c44de736c3a7 100644 --- a/third_party/dav1d/src/thread.h +++ b/third_party/dav1d/src/thread.h @@ -33,6 +33,7 @@ #include #include +#define PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT typedef struct { diff --git a/third_party/dav1d/src/thread_task.c b/third_party/dav1d/src/thread_task.c index 49a8c123fd45..31794efee2e8 100644 --- a/third_party/dav1d/src/thread_task.c +++ b/third_party/dav1d/src/thread_task.c @@ -224,7 +224,7 @@ static int create_filter_sbrow(Dav1dFrameContext *const f, int num_tasks = f->sbh * (1 + uses_2pass); if (num_tasks > f->task_thread.num_tasks) { const size_t size = sizeof(Dav1dTask) * num_tasks; - tasks = realloc(f->task_thread.tasks, size); + tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tasks, size); if (!tasks) return -1; memset(tasks, 0, size); f->task_thread.tasks = tasks; @@ -237,8 +237,8 @@ static int create_filter_sbrow(Dav1dFrameContext *const f, } else { const int prog_sz = ((f->sbh + 31) & ~31) >> 5; if (prog_sz > f->frame_thread.prog_sz) { - atomic_uint *const prog = realloc(f->frame_thread.frame_progress, - 2 * prog_sz * sizeof(*prog)); + atomic_uint *const prog = dav1d_realloc(ALLOC_COMMON_CTX, f->frame_thread.frame_progress, + 2 * prog_sz * sizeof(*prog)); if (!prog) return -1; f->frame_thread.frame_progress = prog; f->frame_thread.copy_lpf_progress = prog + prog_sz; @@ -275,7 +275,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass, int alloc_num_tasks = num_tasks * (1 + uses_2pass); if (alloc_num_tasks > f->task_thread.num_tile_tasks) { const size_t size = sizeof(Dav1dTask) * alloc_num_tasks; - tasks = realloc(f->task_thread.tile_tasks[0], size); + tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tile_tasks[0], size); if (!tasks) return -1; memset(tasks, 0, size); f->task_thread.tile_tasks[0] = tasks; diff --git a/third_party/dav1d/src/x86/refmvs.asm b/third_party/dav1d/src/x86/refmvs.asm index 06f555db117b..d95861fa1754 100644 --- a/third_party/dav1d/src/x86/refmvs.asm +++ b/third_party/dav1d/src/x86/refmvs.asm @@ -47,6 +47,10 @@ SECTION_RODATA 64 %endmacro %if ARCH_X86_64 +mv_proj: dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340 + dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092 + dw 1024, 963, 910, 862, 819, 780, 744, 712 + dw 682, 655, 630, 606, 585, 564, 546, 528 splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 @@ -61,6 +65,7 @@ cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3 save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00 save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00 pb_128: times 16 db 128 +pq_8192: dq 8192 save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3 SAVE_TMVS_TABLE 4, 8, ssse3 @@ -329,6 +334,225 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 RET %if ARCH_X86_64 +INIT_XMM sse4 +; refmvs_frame *rf, int tile_row_idx, +; int col_start8, int col_end8, int row_start8, int row_end8 +cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \ + stride, rp_proj, roff, troff, \ + xendi, xstarti, iw8, ih8, dst + xor 
r14d, r14d + cmp dword [rfq+212], 1 ; n_tile_threads + mov ih8d, [rfq+20] ; rf->ih8 + mov iw8d, [rfq+16] ; rf->iw8 + mov xstartd, xstartd + mov xendd, xendd + cmove tridxd, r14d + lea xstartid, [xstartq-8] + lea xendid, [xendq+8] + mov strideq, [rfq+184] + mov rp_projq, [rfq+176] + cmp ih8d, yendd + mov [rsp+0x30], strideq + cmovs yendd, ih8d + test xstartid, xstartid + cmovs xstartid, r14d + cmp iw8d, xendid + cmovs xendid, iw8d + mov troffq, strideq + shl troffq, 4 + imul troffq, tridxq + mov dstd, ystartd + and dstd, 15 + imul dstq, strideq + add dstq, troffq ; (16 * tridx + (ystart & 15)) * stride + lea dstq, [dstq*5] + add dstq, rp_projq + lea troffq, [troffq*5] ; 16 * tridx * stride * 5 + lea r13d, [xendq*5] + lea r12, [strideq*5] + DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \ + _, troff, xendi, xstarti, stride5, _, dst + lea w5d, [xstartq*5] + add r7, troffq ; rp_proj + tile_row_offset + mov hd, yendd + mov [rsp+0x28], r7 + add dstq, r13 + sub w5q, r13 + sub hd, ystartd +.init_xloop_start: + mov x5q, w5q + test w5b, 1 + jz .init_2blk + mov dword [dstq+x5q], 0x80008000 + add x5q, 5 + jz .init_next_row +.init_2blk: + mov dword [dstq+x5q+0], 0x80008000 + mov dword [dstq+x5q+5], 0x80008000 + add x5q, 10 + jl .init_2blk +.init_next_row: + add dstq, stride5q + dec hd + jg .init_xloop_start + DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \ + _, _, xendi, xstarti, stride5, _, n + mov r13d, [rfq+152] ; rf->n_mfmvs + test r13d, r13d + jz .ret + mov [rsp+0x0c], r13d + mov strideq, [rsp+0x30] + movddup m3, [pq_8192] + mov r9d, ystartd + mov [rsp+0x38], yendd + mov [rsp+0x20], xstartid + xor nd, nd + xor n7d, n7d + imul r9, strideq ; ystart * stride + mov [rsp+0x48], rfq + mov [rsp+0x18], stride5q + lea r7, [r9*5] + mov [rsp+0x24], ystartd + mov [rsp+0x00], r7 +.nloop: + DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \ + ref, rp_ref, xendi, xstarti, _, _, n + mov rfq, [rsp+0x48] + mov refd, [rfq+56+nq*4] ; ref2cur + cmp refd, 0x80000000 + je .next_n + mov [rsp+0x40], refd + mov offq, [rsp+0x00] ; ystart * stride * 5 + movzx refd, byte [rfq+53+nq] ; rf->mfmv_ref[n] + lea refsignq, [refq-4] + mov rp_refq, [rfq+168] + movq m2, refsignq + add offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset + mov [rsp+0x14], nd + mov yd, ystartd +.yloop: + mov r11d, [rsp+0x24] ; ystart + mov r12d, [rsp+0x38] ; yend + mov r14d, yd + and r14d, ~7 ; y_sb_align + cmp r11d, r14d + cmovs r11d, r14d ; imax(y_sb_align, ystart) + mov [rsp+0x44], r11d ; y_proj_start + add r14d, 8 + cmp r12d, r14d + cmovs r14d, r12d ; imin(y_sb_align + 8, yend) + mov [rsp+0x3c], r14d ; y_proj_end + DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \ + ref, x, xendi, mvx, mvy, rb, ref2ref + mov xd, [rsp+0x20] ; xstarti +.xloop: + lea rbd, [xq*5] + add rbq, srcq + movsx refd, byte [rbq+4] + test refd, refd + jz .next_x_bad_ref + mov rfq, [rsp+0x48] + lea r14d, [16+n7q+refq] + mov ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1] + test ref2refd, ref2refd + jz .next_x_bad_ref + lea fracq, [mv_proj] + movzx fracd, word [fracq+ref2refq*2] + mov mvd, [rbq] + imul fracd, [rsp+0x40] ; ref2cur + pmovsxwq m0, [rbq] + movd m1, fracd + punpcklqdq m1, m1 + pmuldq m0, m1 ; mv * frac + pshufd m1, m0, q3311 + paddd m0, m3 + paddd m0, m1 + psrad m0, 14 ; offset = (xy + (xy >> 31) + 8192) >> 14 + pabsd m1, m0 + packssdw m0, m0 + psrld m1, 6 + packuswb m1, m1 + pxor m0, m2 ; offset ^ ref_sign + psignd m1, m0 ; apply_sign(abs(offset) >> 6, offset ^ refsign) + movq mvxq, m1 + lea mvyd, [mvxq+yq] ; ypos + sar mvxq, 32 
+ DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \ + ref, x, xendi, mvx, ypos, rb, ref2ref + cmp yposd, [rsp+0x44] ; y_proj_start + jl .next_x_bad_pos_y + cmp yposd, [rsp+0x3c] ; y_proj_end + jge .next_x_bad_pos_y + and yposd, 15 + add mvxq, xq ; xpos + imul yposq, [rsp+0x30] ; pos = (ypos & 15) * stride + DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \ + ref, x, xendi, xpos, pos, rb, ref2ref + mov dstq, [rsp+0x28] ; dst = rp_proj + tile_row_offset + add posq, xposq ; pos += xpos + lea posq, [posq*5] + add dstq, posq ; dst += pos5 + jmp .write_loop_entry +.write_loop: + add rbq, 5 + cmp refb, byte [rbq+4] + jne .xloop + cmp mvd, [rbq] + jne .xloop + add dstq, 5 + inc xposd +.write_loop_entry: + mov r12d, xd + and r12d, ~7 + lea r5d, [r12-8] + cmp r5d, xstartd + cmovs r5d, xstartd ; x_proj_start + cmp xposd, r5d + jl .next_xpos + add r12d, 16 + cmp xendd, r12d + cmovs r12d, xendd ; x_proj_end + cmp xposd, r12d + jge .next_xpos + mov [dstq+0], mvd + mov byte [dstq+4], ref2refb +.next_xpos: + inc xd + cmp xd, xendid + jl .write_loop +.next_y: + DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n + add srcq, [rsp+0x18] ; stride5 + inc yd + cmp yd, [rsp+0x38] ; yend + jne .yloop + mov nd, [rsp+0x14] + mov ystartd, [rsp+0x24] +.next_n: + add n7d, 7 + inc nd + cmp nd, [rsp+0x0c] ; n_mfmvs + jne .nloop +.ret: + RET +.next_x: + DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _ + add rbq, 5 + cmp refb, byte [rbq+4] + jne .xloop + cmp mvd, [rbq] + jne .xloop +.next_x_bad_pos_y: + inc xd + cmp xd, xendid + jl .next_x + jmp .next_y +.next_x_bad_ref: + inc xd + cmp xd, xendid + jl .xloop + jmp .next_y + INIT_YMM avx2 ; refmvs_temporal_block *rp, ptrdiff_t stride, ; refmvs_block **rr, uint8_t *ref_sign, diff --git a/third_party/dav1d/src/x86/refmvs.h b/third_party/dav1d/src/x86/refmvs.h index 9dafa78b1338..c9978561ecb5 100644 --- a/third_party/dav1d/src/x86/refmvs.h +++ b/third_party/dav1d/src/x86/refmvs.h @@ -28,6 +28,8 @@ #include "src/cpu.h" #include "src/refmvs.h" +decl_load_tmvs_fn(dav1d_load_tmvs_sse4); + decl_save_tmvs_fn(dav1d_save_tmvs_ssse3); decl_save_tmvs_fn(dav1d_save_tmvs_avx2); decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl); @@ -47,7 +49,10 @@ static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { c->save_tmvs = dav1d_save_tmvs_ssse3; + if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; #if ARCH_X86_64 + c->load_tmvs = dav1d_load_tmvs_sse4; + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; c->save_tmvs = dav1d_save_tmvs_avx2; diff --git a/third_party/dav1d/tests/checkasm/refmvs.c b/third_party/dav1d/tests/checkasm/refmvs.c index f21c81f85a48..59c1560ae9e9 100644 --- a/third_party/dav1d/tests/checkasm/refmvs.c +++ b/third_party/dav1d/tests/checkasm/refmvs.c @@ -39,6 +39,190 @@ static inline int gen_mv(const int total_bits, int spel_bits) { return rnd() & 1 ? 
-bits : bits; } +#define ARRAY_SIZE(n) (sizeof(n)/sizeof(*(n))) + +static inline int get_min_mv_val(const int idx) { + if (idx <= 9) return idx; + else if (idx <= 18) return (idx - 9) * 10; + else if (idx <= 27) return (idx - 18) * 100; + else if (idx <= 36) return (idx - 27) * 1000; + else return (idx - 36) * 10000; +} + +static inline void gen_tmv(refmvs_temporal_block *const rb, const int *ref2ref) { + rb->ref = rnd() % 7; + if (!rb->ref) return; + static const int x_prob[] = { + 26447556, 6800591, 3708783, 2198592, 1635940, 1145901, 1052602, 1261759, + 1099739, 755108, 6075404, 4355916, 3254908, 2897157, 2273676, 2154432, + 1937436, 1694818, 1466863, 10203087, 5241546, 3328819, 2187483, 1458997, + 1030842, 806863, 587219, 525024, 1858953, 422368, 114626, 16992 + }; + static const int y_prob[] = { + 33845001, 7591218, 6425971, 4115838, 4032161, 2515962, 2614601, 2343656, + 2898897, 1397254, 10125350, 5124449, 3232914, 2185499, 1608775, 1342585, + 980208, 795714, 649665, 3369250, 1298716, 486002, 279588, 235990, + 110318, 89372, 66895, 46980, 153322, 32960, 4500, 389 + }; + const int prob = rnd() % 100000000; + int acc = 0; + for (unsigned i = 0; i < ARRAY_SIZE(x_prob); i++) { + acc += x_prob[i]; + if (prob < acc) { + const int min = get_min_mv_val(i); + const int max = get_min_mv_val(i + 1); + const int val = min + rnd() % (max - min); + rb->mv.x = iclip(val * ref2ref[rb->ref], -(1 << 15), (1 << 15) - 1); + break; + } + } + acc = 0; + for (unsigned i = 0; i < ARRAY_SIZE(y_prob); i++) { + acc += y_prob[i]; + if (prob < acc) { + const int min = get_min_mv_val(i); + const int max = get_min_mv_val(i + 1); + const int val = min + rnd() % (max - min); + rb->mv.y = iclip(val * ref2ref[rb->ref], -(1 << 15), (1 << 15) - 1); + break; + } + } +} + +static inline int get_ref2cur(void) { + const int prob = rnd() % 100; + static const uint8_t ref2cur[11] = { 35, 55, 67, 73, 78, 83, 84, 87, 90, 93, 100 }; + for (int i = 0; i < 11; i++) + if (prob < ref2cur[i]) + return rnd() & 1 ? 
-(i + 1) : i + 1; + return 0; +} + +static inline int get_seqlen(void) { + int len = 0, max_len; + const int prob = rnd() % 100000; + // =1 =2 =3 =4 <8 =8 <16 =16 <32 =32 <48 =48 <64 =64 >64 eq240 + // 5 17 1.5 16 5 10 5 7 4 3 1.5 2 1 2 20 15 chimera blocks + // 25 38 2.5 19 3.5 5.5 2 1.87 .86 .4 .18 .2 .067 .165 .478 .28 chimera sequences + + if (prob < 25000) len = 1; // =1 5% + else if (prob < 63000) len = 2; // =2 17% + else if (prob < 65500) len = 3; // =3 1.5% + else if (prob < 84500) len = 4; // =4 16% + else if (prob < 88000) max_len = 7; // <8 5% (43.5% tot <8) + else if (prob < 93500) len = 8; // =8 10% + else if (prob < 95500) max_len = 15; // <16 5% + else if (prob < 97370) len = 16; // =16 7% + else if (prob < 98230) max_len = 31; // <32 4% + else if (prob < 98630) len = 32; // =32 3% + else if (prob < 98810) max_len = 47; // <48 1.5% + else if (prob < 99010) len = 48; // =48 2% + else if (prob < 99077) max_len = 63; // <64 1% + else if (prob < 99242) len = 64; // =64 2% + else if (prob < 99720) max_len = 239; // <240 5% + else len = 240; // =240 15% + + if (!len) len = 1 + rnd() % max_len; + return len; +} + +static inline void init_rp_ref(refmvs_frame const *const rf, + const int col_start8, const int col_end8, + const int row_start8, const int row_end8) +{ + const int col_start8i = imax(col_start8 - 8, 0); + const int col_end8i = imin(col_end8 + 8, rf->iw8); + for (int n = 0; n < rf->n_mfmvs; n++) { + refmvs_temporal_block *rp_ref = rf->rp_ref[rf->mfmv_ref[n]]; + for (int i = row_start8; i < imin(row_end8, rf->ih8); i++) { + for (int j = col_start8i; j < col_end8i;) { + refmvs_temporal_block rb; + gen_tmv(&rb, rf->mfmv_ref2ref[n]); + for (int k = get_seqlen(); k && j < col_end8i; k--, j++) + rp_ref[i * rf->iw8 + j] = rb; + } + } + } +} + +static void check_load_tmvs(const Dav1dRefmvsDSPContext *const c) { + refmvs_temporal_block *rp_ref[7] = {0}; + refmvs_temporal_block c_rp_proj[240 * 63]; + refmvs_temporal_block a_rp_proj[240 * 63]; + refmvs_frame rf = { + .rp_ref = rp_ref, + .rp_stride = 240, .iw8 = 240, .ih8 = 63, + .n_mfmvs = 3 + }; + const size_t rp_ref_sz = rf.ih8 * rf.rp_stride * sizeof(refmvs_temporal_block); + + declare_func(void, const refmvs_frame *rf, int tile_row_idx, + int col_start8, int col_end8, int row_start8, int row_end8); + + if (check_func(c->load_tmvs, "load_tmvs")) { + const int row_start8 = (rnd() & 3) << 4; + const int row_end8 = row_start8 + 16; + const int col_start8 = rnd() & 31; + const int col_end8 = rf.iw8 - (rnd() & 31); + + for (int n = 0; n < rf.n_mfmvs; n++) { + rf.mfmv_ref[n] = rnd() % 7; + rf.mfmv_ref2cur[n] = get_ref2cur(); + for (int r = 0; r < 7; r++) + rf.mfmv_ref2ref[n][r] = rnd() & 31; + } + for (int n = 0; n < rf.n_mfmvs; n++) { + refmvs_temporal_block **p_rp_ref = &rp_ref[rf.mfmv_ref[n]]; + if (!*p_rp_ref) + *p_rp_ref = malloc(rp_ref_sz); + } + init_rp_ref(&rf, 0, rf.iw8, row_start8, row_end8); + for (int i = 0; i < rf.iw8 * rf.ih8; i++) { + c_rp_proj[i].mv.n = a_rp_proj[i].mv.n = 0xdeadbeef; + c_rp_proj[i].ref = a_rp_proj[i].ref = 0xdd; + } + + rf.n_tile_threads = 1; + + rf.rp_proj = c_rp_proj; + call_ref(&rf, 0, col_start8, col_end8, row_start8, row_end8); + rf.rp_proj = a_rp_proj; + call_new(&rf, 0, col_start8, col_end8, row_start8, row_end8); + + for (int i = 0; i < rf.ih8; i++) + for (int j = 0; j < rf.iw8; j++) + if (c_rp_proj[i * rf.iw8 + j].mv.n != a_rp_proj[i * rf.iw8 + j].mv.n || + (c_rp_proj[i * rf.iw8 + j].ref != a_rp_proj[i * rf.iw8 + j].ref && + c_rp_proj[i * rf.iw8 + j].mv.n != INVALID_MV)) + { + if (fail()) { + 
fprintf(stderr, "[%d][%d] c_rp.mv.x = 0x%x a_rp.mv.x = 0x%x\n", + i, j, c_rp_proj[i * rf.iw8 + j].mv.x, a_rp_proj[i * rf.iw8 + j].mv.x); + fprintf(stderr, "[%d][%d] c_rp.mv.y = 0x%x a_rp.mv.y = 0x%x\n", + i, j, c_rp_proj[i * rf.iw8 + j].mv.y, a_rp_proj[i * rf.iw8 + j].mv.y); + fprintf(stderr, "[%d][%d] c_rp.ref = %u a_rp.ref = %u\n", + i, j, c_rp_proj[i * rf.iw8 + j].ref, a_rp_proj[i * rf.iw8 + j].ref); + } + } + + if (checkasm_bench_func()) { + for (int n = 0; n < rf.n_mfmvs; n++) { + rf.mfmv_ref2cur[n] = 1; + for (int r = 0; r < 7; r++) + rf.mfmv_ref2ref[n][r] = 1; + } + bench_new(&rf, 0, 0, rf.iw8, row_start8, row_end8); + } + + for (int n = 0; n < rf.n_mfmvs; n++) { + free(rp_ref[rf.mfmv_ref[n]]); + rp_ref[rf.mfmv_ref[n]] = NULL; + } + } + + report("load_tmvs"); +} + static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) { refmvs_block *rr[31]; refmvs_block r[31 * 256]; @@ -162,6 +346,7 @@ void checkasm_check_refmvs(void) { Dav1dRefmvsDSPContext c; dav1d_refmvs_dsp_init(&c); + check_load_tmvs(&c); check_save_tmvs(&c); check_splat_mv(&c); }
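
The looprestoration hunks above lean on one idea: the NEON SGR filters never copy intermediate rows, they address a small pool of row buffers through pointer arrays, rotate those arrays as the filter walks down the image, and fake edge padding by aliasing several pointers to one physical row (see the "ptrs are rotated by 2" comments). A minimal standalone sketch of that pattern, with illustrative names only (this is not dav1d code):

/* Minimal sketch (not dav1d code) of the rolling row-buffer scheme the new
 * NEON SGR filters use: a fixed pool of physical rows is addressed through a
 * pointer array, advancing one image row just rotates the pointers, and edge
 * padding is several pointers aliasing the same physical row. */
#include <stdio.h>
#include <string.h>

#define ROWS 5
#define W    8

static void rotate_rows(int *ptrs[ROWS], const int n) {
    /* rotate left by n: ptrs[i] <- old ptrs[(i + n) % ROWS] */
    int *tmp[ROWS];
    memcpy(tmp, ptrs, sizeof(tmp));
    for (int i = 0; i < ROWS; i++)
        ptrs[i] = tmp[(i + n) % ROWS];
}

int main(void) {
    static int storage[ROWS][W];
    int *rows[ROWS];

    /* no LR_HAVE_TOP: every pointer starts out aliasing row 0 (top padding) */
    for (int i = 0; i < ROWS; i++)
        rows[i] = storage[0];

    /* as real rows arrive, the tail entries are pointed at fresh storage,
     * then the whole array is rotated after each vertical pass */
    rows[3] = storage[1];
    rows[4] = storage[2];
    rotate_rows(rows, 2);

    for (int i = 0; i < ROWS; i++)
        for (int j = 0; j < ROWS; j++)
            if (rows[i] == storage[j])
                printf("rows[%d] -> storage[%d]\n", i, j);
    return 0;
}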
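
The TRACK_HEAP_ALLOCATIONS path added to src/mem.c works by over-allocating each request and stashing a small header (Dav1dAllocationData) immediately below the pointer handed to the caller, so dav1d_free() can recover the size and type for the statistics. A minimal standalone sketch of that over-allocate-and-hide-a-header pattern; all names here are made up for the example and it is deliberately simpler than the patch (no per-type table, no mutex):

/* Illustrative sketch only -- mirrors the idea of track_alloc()/track_free(),
 * not their implementation: allocate sz + HDR_ALIGN bytes, store bookkeeping
 * just below the returned pointer, and walk back to the raw pointer on free. */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

#define HDR_ALIGN 16  /* plays the role of DEFAULT_ALIGN in the patch */

typedef struct {
    size_t sz;        /* user-visible size, for the statistics */
    unsigned align;   /* how far the user pointer sits above the raw one */
} ExampleHdr;

static size_t g_curr, g_peak;  /* global counters, like tracked_allocs[] */

static void *example_malloc(const size_t sz) {
    char *const raw = malloc(sz + HDR_ALIGN);
    if (!raw) return NULL;
    char *const user = raw + HDR_ALIGN;
    ExampleHdr *const h = &((ExampleHdr *)user)[-1]; /* header below user ptr */
    h->sz = sz;
    h->align = HDR_ALIGN;
    g_curr += sz;
    if (g_curr > g_peak) g_peak = g_curr;
    return user;
}

static void example_free(void *const ptr) {
    if (!ptr) return;
    const ExampleHdr *const h = &((ExampleHdr *)ptr)[-1];
    g_curr -= h->sz;
    free((char *)ptr - h->align);  /* undo the offset applied at alloc time */
}

int main(void) {
    void *const a = example_malloc(1000);
    void *const b = example_malloc(500);
    example_free(a);
    example_free(b);
    printf("peak tracked size: %zu bytes\n", g_peak);
    return 0;
}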
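
The new SSE4 load_tmvs path embeds the projection constants (the mv_proj table above, roughly 16384/d for d = 1..31) and its comments spell out the fixed-point rounding, "offset = (xy + (xy >> 31) + 8192) >> 14", followed by apply_sign(abs(offset) >> 6, offset ^ ref_sign). A rough C rendering of that arithmetic follows; the function name and the omission of the ref_sign flip are mine, only the table and the rounding expression come from the patch:

/* Rough C rendering of the temporal-MV projection described in the asm
 * comments: frac = ref2cur * mv_proj[ref2ref], offset = round(mv * frac / 2^14),
 * position delta = sign(offset) * (|offset| >> 6).  Not the dav1d API. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

static const uint16_t mv_proj[32] = {
       0, 16384, 8192, 5461, 4096, 3276, 2730, 2340,
    2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092,
    1024,   963,  910,  862,  819,  780,  744,  712,
     682,   655,  630,  606,  585,  564,  546,  528,
};

/* mv: motion vector component; ref2cur: temporal distance to the current
 * frame; ref2ref: temporal distance the stored vector spans (1..31). */
static int project_mv(const int mv, const int ref2cur, const int ref2ref) {
    const int frac = ref2cur * mv_proj[ref2ref];
    const int64_t xy = (int64_t)mv * frac;
    /* symmetric rounding of xy / 16384, matching the asm's arithmetic shift */
    const int offset = (int)((xy + (xy >> 63) + 8192) >> 14);
    /* block-position delta in 8x8 units */
    const int apos = abs(offset) >> 6;
    return offset < 0 ? -apos : apos;
}

int main(void) {
    printf("projected delta: %d\n", project_mv(1234, 2, 3));
    return 0;
}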