From b50386f0ecd1c60c88e61118ea51b0ea907038bc Mon Sep 17 00:00:00 2001 From: Updatebot Date: Tue, 11 Jun 2024 18:00:44 +0000 Subject: [PATCH] Bug 1901600 - Update dav1d to 92f592ed104ba92ad35c781ee93f354525eef503 r=chunmin Differential Revision: https://phabricator.services.mozilla.com/D213129 --- media/libdav1d/moz.yaml | 4 +- media/libdav1d/vcs_version.h | 2 +- third_party/dav1d/NEWS | 17 +- third_party/dav1d/meson.build | 2 +- third_party/dav1d/src/arm/64/mc.S | 335 +- third_party/dav1d/src/arm/64/mc_dotprod.S | 789 +++-- third_party/dav1d/src/arm/64/msac.S | 285 +- third_party/dav1d/src/arm/mc.h | 15 +- third_party/dav1d/src/cpu.h | 3 + third_party/dav1d/src/decode.c | 2 +- third_party/dav1d/src/ext/x86/x86inc.asm | 25 +- third_party/dav1d/src/lib.c | 3 +- third_party/dav1d/src/loopfilter_tmpl.c | 4 + third_party/dav1d/src/meson.build | 39 +- third_party/dav1d/src/msac.h | 2 +- third_party/dav1d/src/ppc/cpu.c | 4 + third_party/dav1d/src/ppc/cpu.h | 1 + third_party/dav1d/src/ppc/dav1d_types.h | 4 + third_party/dav1d/src/ppc/loopfilter.h | 47 + third_party/dav1d/src/ppc/loopfilter_tmpl.c | 1704 ++++++++++ third_party/dav1d/src/recon_tmpl.c | 4 +- third_party/dav1d/src/refmvs.c | 73 +- third_party/dav1d/src/refmvs.h | 8 +- third_party/dav1d/src/riscv/64/cpu.S | 44 + third_party/dav1d/src/riscv/cpu.c | 4 +- third_party/dav1d/src/x86/mc.h | 20 - third_party/dav1d/src/x86/mc16_avx512.asm | 3142 +++++++++++++------ third_party/dav1d/src/x86/mc_sse.asm | 2937 ++++++++++------- third_party/dav1d/src/x86/refmvs.asm | 47 +- third_party/dav1d/tests/dav1d_argon.bash | 2 +- 30 files changed, 6789 insertions(+), 2779 deletions(-) create mode 100644 third_party/dav1d/src/ppc/loopfilter.h create mode 100644 third_party/dav1d/src/ppc/loopfilter_tmpl.c create mode 100644 third_party/dav1d/src/riscv/64/cpu.S diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml index b22d5ee296f9..b6100a637878 100644 --- a/media/libdav1d/moz.yaml +++ b/media/libdav1d/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: 5b5399911dd24703de641d65eda5b7f1e845d060 (2024-04-15T13:19:42.000+02:00). + release: 92f592ed104ba92ad35c781ee93f354525eef503 (2024-06-05T23:22:36.000+02:00). 
# Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: 5b5399911dd24703de641d65eda5b7f1e845d060 + revision: 92f592ed104ba92ad35c781ee93f354525eef503 # The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h index ca9e8952c1f6..d3ee0709ae88 100644 --- a/media/libdav1d/vcs_version.h +++ b/media/libdav1d/vcs_version.h @@ -1,2 +1,2 @@ /* auto-generated, do not edit */ -#define DAV1D_VERSION "5b5399911dd24703de641d65eda5b7f1e845d060" +#define DAV1D_VERSION "92f592ed104ba92ad35c781ee93f354525eef503" diff --git a/third_party/dav1d/NEWS b/third_party/dav1d/NEWS index 88b1eea00e91..16825ff87e4b 100644 --- a/third_party/dav1d/NEWS +++ b/third_party/dav1d/NEWS @@ -1,3 +1,18 @@ +Changes for 1.4.2 'Road Runner': +-------------------------------- + +1.4.2 is a small release of dav1d, improving notably ARM, AVX-512 and PowerPC + - AVX2 optimizations for 8-tap and new variants for 6-tap + - AVX-512 optimizations for 8-tap and new variants for 6-tap + - Improve entropy decoding on ARM64 + - New ARM64 optimizations for convolutions based on DotProd extension + - New ARM64 optimizations for convolutions based on i8mm extension + - New ARM64 optimizations for subpel and prep filters for i8mm + - Misc improvements on existing ARM64 optimizations, notably for put/prep + - New PowerPC9 optimizations for loopfilter + - Support for macOS kperf API for benchmarking + + Changes for 1.4.1 'Road Runner': -------------------------------- @@ -246,7 +261,7 @@ Changes for 0.6.0 'Gyrfalcon': - New SSSE3 optimizations for film grain - New AVX2 optimizations for msac_adapt16 - Fix rare mismatches against the reference decoder, notably because of clipping - - Improvements on ARM64 on msac, cdef and looprestoration optimizations + - Improvements on ARM64 on msac, cdef, mc_blend_v and looprestoration optimizations - Improvements on AVX2 optimizations for cdef_filter - Improvements in the C version for itxfm, cdef_filter diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build index a2637ed79778..f5010ac4855e 100644 --- a/third_party/dav1d/meson.build +++ b/third_party/dav1d/meson.build @@ -23,7 +23,7 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
project('dav1d', ['c'], - version: '1.4.1', + version: '1.4.2', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S index 5b493be82df0..736b2bb4e699 100644 --- a/third_party/dav1d/src/arm/64/mc.S +++ b/third_party/dav1d/src/arm/64/mc.S @@ -840,100 +840,108 @@ endfunc function put_neon, export=1 adr x9, L(put_tbl) ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + sub x9, x9, x8 br x9 -2: +20: AARCH64_VALID_JUMP_TARGET - ld1 {v0.h}[0], [x2], x3 - ld1 {v1.h}[0], [x2], x3 - subs w5, w5, #2 - st1 {v0.h}[0], [x0], x1 - st1 {v1.h}[0], [x0], x1 +2: + ldrh w9, [x2] + ldrh w10, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + strh w9, [x0] + strh w10, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 2b ret -4: +40: AARCH64_VALID_JUMP_TARGET - ld1 {v0.s}[0], [x2], x3 - ld1 {v1.s}[0], [x2], x3 - subs w5, w5, #2 - st1 {v0.s}[0], [x0], x1 - st1 {v1.s}[0], [x0], x1 +4: + ldr w9, [x2] + ldr w10, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + str w9, [x0] + str w10, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET - ld1 {v0.8b}, [x2], x3 - ld1 {v1.8b}, [x2], x3 - subs w5, w5, #2 - st1 {v0.8b}, [x0], x1 - st1 {v1.8b}, [x0], x1 +8: + ldr x9, [x2] + ldr x10, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + str x9, [x0] + str x10, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET - add x8, x0, x1 - lsl x1, x1, #1 - add x9, x2, x3 - lsl x3, x3, #1 16: - ld1 {v0.16b}, [x2], x3 - ld1 {v1.16b}, [x9], x3 - subs w5, w5, #2 - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x8], x1 + ldr q0, [x2] + ldr q1, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + str q0, [x0] + str q1, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 16b ret -32: +320: AARCH64_VALID_JUMP_TARGET - ldp x6, x7, [x2] - ldp x8, x9, [x2, #16] - stp x6, x7, [x0] - subs w5, w5, #1 - stp x8, x9, [x0, #16] - add x2, x2, x3 - add x0, x0, x1 +32: + ldp q0, q1, [x2] + add x2, x2, x3 + stp q0, q1, [x0] + add x0, x0, x1 + ldp q2, q3, [x2] + add x2, x2, x3 + stp q2, q3, [x0] + subs w5, w5, #2 + add x0, x0, x1 b.gt 32b ret -64: +640: AARCH64_VALID_JUMP_TARGET - ldp x6, x7, [x2] - ldp x8, x9, [x2, #16] - stp x6, x7, [x0] - ldp x10, x11, [x2, #32] - stp x8, x9, [x0, #16] - subs w5, w5, #1 - ldp x12, x13, [x2, #48] - stp x10, x11, [x0, #32] - stp x12, x13, [x0, #48] - add x2, x2, x3 - add x0, x0, x1 +64: + ldp q0, q1, [x2] + stp q0, q1, [x0] + ldp q2, q3, [x2, #32] + add x2, x2, x3 + stp q2, q3, [x0, #32] + subs w5, w5, #1 + add x0, x0, x1 b.gt 64b ret -128: +1280: AARCH64_VALID_JUMP_TARGET - ldp q0, q1, [x2] - ldp q2, q3, [x2, #32] - stp q0, q1, [x0] - ldp q4, q5, [x2, #64] - stp q2, q3, [x0, #32] - ldp q6, q7, [x2, #96] - subs w5, w5, #1 - stp q4, q5, [x0, #64] - stp q6, q7, [x0, #96] - add x2, x2, x3 - add x0, x0, x1 +128: + ldp q0, q1, [x2] + stp q0, q1, [x0] + ldp q2, q3, [x2, #32] + stp q2, q3, [x0, #32] + ldp q4, q5, [x2, #64] + stp q4, q5, [x0, #64] + ldp q6, q7, [x2, #96] + add x2, x2, x3 + stp q6, q7, [x0, #96] + subs w5, w5, #1 + add x0, x0, x1 b.gt 128b ret L(put_tbl): - .hword L(put_tbl) - 128b - .hword L(put_tbl) - 64b - .hword L(put_tbl) - 32b - .hword L(put_tbl) - 160b - .hword L(put_tbl) - 8b - .hword L(put_tbl) - 4b - .hword L(put_tbl) - 2b + .hword L(put_tbl) - 1280b + .hword L(put_tbl) - 640b + .hword L(put_tbl) - 320b + .hword L(put_tbl) - 160b + .hword L(put_tbl) - 80b + .hword L(put_tbl) - 40b + .hword L(put_tbl) - 20b endfunc @@ -942,119 +950,146 @@ endfunc function prep_neon, export=1 
adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + movi v24.16b, #16 + sub x9, x9, x8 br x9 -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld1 {v0.s}[0], [x1], x2 + ld1 {v0.s}[1], [x1], x2 ld1 {v1.s}[0], [x1], x2 - subs w4, w4, #2 + ld1 {v1.s}[1], [x1], x2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 - st1 {v0.4h, v1.4h}, [x0], #16 + subs w4, w4, #4 + stp q0, q1, [x0], #32 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET - ld1 {v0.8b}, [x1], x2 - ld1 {v1.8b}, [x1], x2 - subs w4, w4, #2 +8: + ldr d0, [x1] + ldr d1, [x1, x2] + add x1, x1, x2, lsl #1 + ldr d2, [x1] + ldr d3, [x1, x2] + add x1, x1, x2, lsl #1 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 - st1 {v0.8h, v1.8h}, [x0], #32 + umull v2.8h, v2.8b, v24.8b + umull v3.8h, v3.8b, v24.8b + subs w4, w4, #4 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + add x0, x0, #64 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET - add x9, x1, x2 - lsl x2, x2, #1 16: - ld1 {v0.16b}, [x1], x2 - ld1 {v1.16b}, [x9], x2 - subs w4, w4, #2 - ushll v4.8h, v0.8b, #4 - ushll2 v5.8h, v0.16b, #4 - ushll v6.8h, v1.8b, #4 - ushll2 v7.8h, v1.16b, #4 - st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + ldr q1, [x1] + ldr q3, [x1, x2] + add x1, x1, x2, lsl #1 + ushll v0.8h, v1.8b, #4 + ushll2 v1.8h, v1.16b, #4 + ldr q5, [x1] + ldr q7, [x1, x2] + add x1, x1, x2, lsl #1 + umull v2.8h, v3.8b, v24.8b + umull2 v3.8h, v3.16b, v24.16b + ushll v4.8h, v5.8b, #4 + ushll2 v5.8h, v5.16b, #4 + umull v6.8h, v7.8b, v24.8b + umull2 v7.8h, v7.16b, v24.16b + subs w4, w4, #4 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET - add x8, x0, w3, uxtw 32: - ld1 {v0.16b, v1.16b}, [x1], x2 - subs w4, w4, #2 - ushll v4.8h, v0.8b, #4 - ushll2 v5.8h, v0.16b, #4 - ld1 {v2.16b, v3.16b}, [x1], x2 - ushll v6.8h, v1.8b, #4 - ushll2 v7.8h, v1.16b, #4 - ushll v16.8h, v2.8b, #4 - st1 {v4.8h, v5.8h}, [x0], x7 - ushll2 v17.8h, v2.16b, #4 - st1 {v6.8h, v7.8h}, [x8], x7 - ushll v18.8h, v3.8b, #4 - st1 {v16.8h, v17.8h}, [x0], x7 - ushll2 v19.8h, v3.16b, #4 - st1 {v18.8h, v19.8h}, [x8], x7 + ldp q4, q5, [x1] + add x1, x1, x2 + ldp q6, q7, [x1] + add x1, x1, x2 + ushll v0.8h, v4.8b, #4 + ushll2 v1.8h, v4.16b, #4 + umull v2.8h, v5.8b, v24.8b + umull2 v3.8h, v5.16b, v24.16b + ushll v4.8h, v6.8b, #4 + ushll2 v5.8h, v6.16b, #4 + umull v6.8h, v7.8b, v24.8b + umull2 v7.8h, v7.16b, v24.16b + subs w4, w4, #2 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET - add x8, x0, #32 - mov x6, #64 64: - ldp q0, q1, [x1] - subs w4, w4, #1 - ushll v4.8h, v0.8b, #4 - ushll2 v5.8h, v0.16b, #4 - ldp q2, q3, [x1, #32] - ushll v6.8h, v1.8b, #4 - ushll2 v7.8h, v1.16b, #4 - add x1, x1, x2 - ushll v16.8h, v2.8b, #4 - st1 {v4.8h, v5.8h}, [x0], x6 - ushll2 v17.8h, v2.16b, #4 - ushll v18.8h, v3.8b, #4 - st1 {v6.8h, v7.8h}, [x8], x6 - ushll2 v19.8h, v3.16b, #4 - st1 {v16.8h, v17.8h}, [x0], x6 - st1 {v18.8h, v19.8h}, [x8], x6 + ldp q4, q5, [x1] + ldp q6, q7, [x1, #32] + add x1, x1, x2 + ushll v0.8h, v4.8b, #4 + ushll2 v1.8h, v4.16b, #4 + umull v2.8h, v5.8b, v24.8b + umull2 v3.8h, v5.16b, v24.16b + ushll v4.8h, v6.8b, #4 + ushll2 v5.8h, v6.16b, #4 + umull v6.8h, v7.8b, v24.8b + umull2 v7.8h, v7.16b, v24.16b + subs w4, w4, #1 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 b.gt 64b ret 1280: AARCH64_VALID_JUMP_TARGET - add x8, x0, #64 - mov x6, #128 128: - ldp q0, q1, [x1] - ldp q2, 
q3, [x1, #32] - ushll v16.8h, v0.8b, #4 - ushll2 v17.8h, v0.16b, #4 - ushll v18.8h, v1.8b, #4 - ushll2 v19.8h, v1.16b, #4 - ushll v20.8h, v2.8b, #4 - ushll2 v21.8h, v2.16b, #4 - ldp q4, q5, [x1, #64] - st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6 - ushll v22.8h, v3.8b, #4 - ushll2 v23.8h, v3.16b, #4 - ushll v24.8h, v4.8b, #4 - ushll2 v25.8h, v4.16b, #4 - ushll v26.8h, v5.8b, #4 - ushll2 v27.8h, v5.16b, #4 - ldp q6, q7, [x1, #96] - st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6 - ushll v28.8h, v6.8b, #4 - ushll2 v29.8h, v6.16b, #4 - ushll v30.8h, v7.8b, #4 - ushll2 v31.8h, v7.16b, #4 - subs w4, w4, #1 - add x1, x1, x2 - st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6 - st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6 + ldp q28, q29, [x1] + ldp q30, q31, [x1, #32] + ushll v16.8h, v28.8b, #4 + ushll2 v17.8h, v28.16b, #4 + umull v18.8h, v29.8b, v24.8b + umull2 v19.8h, v29.16b, v24.16b + ushll v20.8h, v30.8b, #4 + ushll2 v21.8h, v30.16b, #4 + umull v22.8h, v31.8b, v24.8b + umull2 v23.8h, v31.16b, v24.16b + ldp q28, q29, [x1, #64] + ldp q30, q31, [x1, #96] + add x1, x1, x2 + stp q16, q17, [x0] + stp q18, q19, [x0, #32] + stp q20, q21, [x0, #64] + stp q22, q23, [x0, #96] + ushll v16.8h, v28.8b, #4 + ushll2 v17.8h, v28.16b, #4 + umull v18.8h, v29.8b, v24.8b + umull2 v19.8h, v29.16b, v24.16b + ushll v20.8h, v30.8b, #4 + ushll2 v21.8h, v30.16b, #4 + umull v22.8h, v31.8b, v24.8b + umull2 v23.8h, v31.16b, v24.16b + subs w4, w4, #1 + stp q16, q17, [x0, #128] + stp q18, q19, [x0, #160] + stp q20, q21, [x0, #192] + stp q22, q23, [x0, #224] + add x0, x0, #256 b.gt 128b ret @@ -1063,8 +1098,8 @@ L(prep_tbl): .hword L(prep_tbl) - 640b .hword L(prep_tbl) - 320b .hword L(prep_tbl) - 160b - .hword L(prep_tbl) - 8b - .hword L(prep_tbl) - 4b + .hword L(prep_tbl) - 80b + .hword L(prep_tbl) - 40b endfunc diff --git a/third_party/dav1d/src/arm/64/mc_dotprod.S b/third_party/dav1d/src/arm/64/mc_dotprod.S index fcf04ee4d0f4..a3eef59f36f2 100644 --- a/third_party/dav1d/src/arm/64/mc_dotprod.S +++ b/third_party/dav1d/src/arm/64/mc_dotprod.S @@ -52,15 +52,12 @@ L(hv_tbl_neon_dotprod): // Shuffle indices to permute horizontal samples in preparation for input to // SDOT instructions. The 8-tap horizontal convolution uses sample indices in the -// interval of [-3, 4] relative to the current sample position. We load samples -// from index value -4 to keep loads word aligned, so the shuffle bytes are -// translated by 1 to handle this. +// interval of [-3, 4] relative to the current sample position. .align 4 L(h_tbl_neon_dotprod): - .byte 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7 - .byte 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11 - .byte 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15 - .byte 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19 + .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 // Vertical convolutions are also using SDOT instructions, where a 128-bit // register contains a transposed 4x4 matrix of values. 
Subsequent iterations of @@ -86,7 +83,7 @@ function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN endfunc .endm -.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd +.macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd make_8tap_fn \type, sharp, SHARP1, SHARP1, \isa make_8tap_fn \type, sharp_smooth, SHARP1, SMOOTH1, \isa make_8tap_fn \type, sharp_regular, SHARP1, REGULAR1, \isa @@ -112,23 +109,27 @@ function \type\()_8tap_\isa, align=FUNC_ALIGN .align JUMP_ALIGN L(\type\()_8tap_v_\isa): madd \my, \my, w11, w10 -.ifc \type, prep - mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding -.endif - sub \src, \src, \s_strd ldr q6, L(v_tbl_neon_dotprod) -.ifc \type, prep + sub \src, \src, \s_strd +.ifc \isa, neon_dotprod + .ifc \type, prep + mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 + .else + movi v4.4s, #32, lsl 8 // FILTER_WEIGHT * 128, bias for SDOT + .endif .endif ubfx w11, \my, #7, #7 and \my, \my, #0x7F ldr q28, L(v_tbl_neon_dotprod) + 16 cmp \h, #4 csel \my, \my, w11, le - sub \src, \src, \s_strd, lsl #1 // src - src_stride * 3 - ldr q29, L(v_tbl_neon_dotprod) + 32 + sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3 add \xmy, x12, \xmy, lsl #3 // subpel V filter address + ldr q29, L(v_tbl_neon_dotprod) + 32 +.ifc \isa, neon_dotprod movi v5.16b, #128 +.endif ldr d7, [\xmy] cmp \w, #8 b.eq 80f @@ -180,7 +181,7 @@ L(\type\()_8tap_v_\isa): zip2 v20.8h, v18.8h, v24.8h zip1 v23.8h, v21.8h, v27.8h zip2 v26.8h, v21.8h, v27.8h - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v5.16b sub v19.16b, v19.16b, v5.16b sub v22.16b, v22.16b, v5.16b @@ -190,41 +191,43 @@ L(\type\()_8tap_v_\isa): sub v20.16b, v20.16b, v5.16b sub v23.16b, v23.16b, v5.16b sub v26.16b, v26.16b, v5.16b - +.endif .align LOOP_ALIGN 16: - ldr q27, [\lsrc] - add \lsrc, \lsrc, \s_strd -.ifc \type, prep +.ifc \isa, neon_i8mm + ld1 {v18.16b}, [\lsrc], \s_strd + movi v0.4s, #0 + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 + mov v21.16b, v18.16b + mov v24.16b, v18.16b + mov v27.16b, v18.16b +.else // neon_dotprod + ld1 {v27.16b}, [\lsrc], \s_strd mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.endif sub v18.16b, v27.16b, v5.16b sub v21.16b, v27.16b, v5.16b sub v24.16b, v27.16b, v5.16b sub v27.16b, v27.16b, v5.16b - - sdot v0.4s, v16.16b, v7.4b[0] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v2.4s, v22.16b, v7.4b[0] - sdot v3.4s, v25.16b, v7.4b[0] +.endif + \dot v0.4s, v16.16b, v7.4b[0] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v3.4s, v25.16b, v7.4b[0] tbl v16.16b, {v16.16b, v17.16b}, v6.16b tbl v19.16b, {v19.16b, v20.16b}, v6.16b tbl v22.16b, {v22.16b, v23.16b}, v6.16b tbl v25.16b, {v25.16b, v26.16b}, v6.16b - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v20.16b, v7.4b[1] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v20.16b, v7.4b[1] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v26.16b, v7.4b[1] tbl v17.16b, {v17.16b, v18.16b}, v28.16b tbl v20.16b, {v20.16b, v21.16b}, v29.16b @@ -235,46 +238,56 @@ L(\type\()_8tap_v_\isa): uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif st1 {v0.8h, v1.8h}, [\ldst], 
\d_strd -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun2 v0.16b, v2.8h, #6 st1 {v0.16b}, [\ldst], \d_strd .endif b.gt 16b -.ifc \type, prep +.ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 +.else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 .endif - sdot v0.4s, v16.16b, v7.4b[0] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v2.4s, v22.16b, v7.4b[0] - sdot v3.4s, v25.16b, v7.4b[0] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v3.4s, v25.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v20.16b, v7.4b[1] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v20.16b, v7.4b[1] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v26.16b, v7.4b[1] subs \w, \w, #16 uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif stp q0, q1, [\ldst] add \dst, \dst, #32 -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun2 v0.16b, v2.8h, #6 str q0, [\ldst] @@ -310,64 +323,73 @@ L(\type\()_8tap_v_\isa): zip2 v19.8h, v0.8h, v2.8h zip1 v17.8h, v18.8h, v24.8h zip2 v20.8h, v18.8h, v24.8h - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v5.16b sub v19.16b, v19.16b, v5.16b sub v17.16b, v17.16b, v5.16b sub v20.16b, v20.16b, v5.16b +.endif .ifc \type, put b.eq 82f .endif - .align LOOP_ALIGN 8: +.ifc \isa, neon_i8mm + ldr d18, [\src] + movi v0.4s, #0 + movi v1.4s, #0 + ldr d24, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + movi v2.4s, #0 + movi v3.4s, #0 + mov v21.8b, v18.8b + mov v27.8b, v24.8b +.else // neon_dotprod ldr d21, [\src] ldr d27, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 -.ifc \type, prep mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.endif sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b sub v24.16b, v27.16b, v5.16b sub v27.16b, v27.16b, v5.16b - +.endif tbl v22.16b, {v16.16b, v17.16b}, v6.16b tbl v25.16b, {v19.16b, v20.16b}, v6.16b tbl v23.16b, {v17.16b, v18.16b}, v28.16b tbl v26.16b, {v20.16b, v21.16b}, v29.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] tbl v16.16b, {v22.16b, v23.16b}, v6.16b tbl v19.16b, {v25.16b, v26.16b}, v6.16b tbl v17.16b, {v23.16b, v24.16b}, v28.16b tbl v20.16b, {v26.16b, v27.16b}, v29.16b - sdot v2.4s, v22.16b, v7.4b[0] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v25.16b, v7.4b[0] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v25.16b, v7.4b[0] + \dot v3.4s, v26.16b, v7.4b[1] subs \h, \h, #2 uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif stp q0, q1, [\dst], #32 -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun v1.8b, v2.8h, #6 str d0, [\dst] @@ -379,43 +401,50 @@ L(\type\()_8tap_v_\isa): 
.ifc \type, put .align JUMP_ALIGN 82: - ldr d21, [\src] - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.else +.endif +.ifc \isa, neon_i8mm + ldr d18, [\src] + movi v0.4s, #0 + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 + mov v21.8b, v18.8b +.else // neon_dotprod ldr d21, [\src] mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.endif sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b - +.endif tbl v22.16b, {v16.16b, v17.16b}, v6.16b tbl v25.16b, {v19.16b, v20.16b}, v6.16b tbl v23.16b, {v17.16b, v18.16b}, v28.16b tbl v26.16b, {v20.16b, v21.16b}, v29.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] - sdot v2.4s, v22.16b, v7.4b[0] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v25.16b, v7.4b[0] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v25.16b, v7.4b[0] + \dot v3.4s, v26.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif stp q0, q1, [\dst] -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun v1.8b, v2.8h, #6 str d0, [\dst] @@ -451,43 +480,47 @@ L(\type\()_8tap_v_\isa): zip1 v16.8h, v0.8h, v2.8h zip1 v17.8h, v18.8h, v24.8h - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v5.16b sub v17.16b, v17.16b, v5.16b +.endif .ifc \type, put b.eq 42f .endif - .align LOOP_ALIGN 4: ldr s18, [\src] ldr s21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 -.ifc \type, prep +.ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 +.else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 -.endif sub v18.16b, v18.16b, v5.16b sub v21.16b, v21.16b, v5.16b - +.endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] tbl v16.16b, {v19.16b, v20.16b}, v6.16b tbl v17.16b, {v20.16b, v21.16b}, v28.16b - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep subs \h, \h, #2 + .ifc \isa, neon_i8mm + rshrn v0.4h, v0.4s, #2 + rshrn2 v0.8h, v1.4s, #2 + .else shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 + .endif str q0, [\dst], #16 .else uzp1 v0.8h, v0.8h, v1.8h @@ -504,29 +537,33 @@ L(\type\()_8tap_v_\isa): .ifc \type, put .align JUMP_ALIGN 42: +.endif ldr s18, [\src] - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 -.else - ldr s18, [\src] +.ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 +.else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b -.endif sub v18.16b, v18.16b, v5.16b - +.endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep + .ifc \isa, neon_i8mm + rshrn v0.4h, v0.4s, #2 + rshrn2 v0.8h, v1.4s, 
#2 + .else shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 + .endif str q0, [\dst] - ret .else uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 @@ -534,8 +571,10 @@ L(\type\()_8tap_v_\isa): lsr x9, x8, #32 str w8, [\dst] str w9, [\dst, \d_strd] +.endif ret +.ifc \type, put .align JUMP_ALIGN 20: // V - 2xN ldr h16, [\src] @@ -560,10 +599,10 @@ L(\type\()_8tap_v_\isa): zip1 v16.4h, v0.4h, v2.4h zip1 v17.4h, v18.4h, v24.4h - + .ifc \isa, neon_dotprod sub v16.8b, v16.8b, v5.8b sub v17.8b, v17.8b, v5.8b - + .endif b.eq 22f .align LOOP_ALIGN @@ -571,24 +610,26 @@ L(\type\()_8tap_v_\isa): ldr h18, [\src] ldr h21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - + .ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 + .else // put + mov v0.16b, v4.16b + mov v1.16b, v4.16b sub v18.8b, v18.8b, v5.8b sub v21.8b, v21.8b, v5.8b - + .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] tbl v16.16b, {v19.16b, v20.16b}, v6.16b tbl v17.16b, {v20.16b, v21.16b}, v28.16b - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 @@ -604,20 +645,22 @@ L(\type\()_8tap_v_\isa): .align JUMP_ALIGN 22: ldr h18, [\src] - - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - + .ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 + .else // put + mov v0.16b, v4.16b + mov v1.16b, v4.16b sub v18.8b, v18.8b, v5.8b - + .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 @@ -634,9 +677,11 @@ L(\type\()_8tap_h_hv_\isa): madd \mx, \mx, w11, w9 madd w14, \my, w11, w10 // for HV ldr q28, L(h_tbl_neon_dotprod) +.ifc \isa, neon_dotprod mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding - sub \src, \src, #4 // src - 4 - dup v27.4s, w13 + dup v27.4s, w13 // put H overrides this +.endif + sub \src, \src, #3 // src - 3 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7F ubfx w11, w14, #7, #7 // for HV @@ -644,13 +689,15 @@ L(\type\()_8tap_h_hv_\isa): cmp \w, #4 csel \mx, \mx, w9, le add \xmx, x12, \xmx, lsl #3 // subpel H filter address +.ifc \isa, neon_dotprod movi v24.16b, #128 +.endif cbz \my, L(\type\()_8tap_h_\isa) // HV cases cmp \h, #4 csel w14, w14, w11, le - sub \src, \src, \s_strd, lsl #1 // src - src_stride * 2 - 4 + sub \src, \src, \s_strd, lsl #1 // src - s_strd * 2 - 3 add \xmy, x12, x14, lsl #3 // subpel V filter address mov x15, x30 ldr d7, [\xmy] @@ -662,7 +709,7 @@ L(\type\()_8tap_h_hv_\isa): b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 // HV 8-tap cases - sub \src, \src, \s_strd // src - src_stride * 3 - 4 + sub \src, \src, \s_strd // src - s_strd * 3 - 3 cmp \w, #4 b.eq 40f .ifc \type, put @@ -677,27 +724,42 @@ L(\type\()_8tap_h_hv_\isa): .ifc \type, prep add \wd_strd, \w, \w .endif - .align LOOP_ALIGN 81: mov \lsrc, \src mov \ldst, \dst mov w8, \h - +.ifc \isa, neon_i8mm bl L(\type\()_hv_filter8_\isa) - mov v16.16b, v22.16b + srshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v17.16b, 
v22.16b + srshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v18.16b, v22.16b + srshr v18.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v19.16b, v22.16b + srshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v20.16b, v22.16b + srshr v20.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v21.16b, v22.16b + srshr v21.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - + srshr v22.8h, v22.8h, #2 +.else + bl L(\type\()_hv_filter8_\isa) + sshr v16.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v17.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v18.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v19.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v20.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v21.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v22.8h, v22.8h, #2 +.endif .align LOOP_ALIGN 8: ldr q23, [\lsrc] @@ -706,33 +768,40 @@ L(\type\()_8tap_h_hv_\isa): smull v0.4s, v16.4h, v7.h[0] smull2 v1.4s, v16.8h, v7.h[0] mov v16.16b, v17.16b - +.ifc \isa, neon_i8mm + movi v5.4s, #0 + movi v6.4s, #0 + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b +.else // neon_dotprod sub v23.16b, v23.16b, v24.16b - mov v5.16b, v27.16b mov v6.16b, v27.16b - +.endif smlal v0.4s, v17.4h, v7.h[1] smlal2 v1.4s, v17.8h, v7.h[1] +.ifc \isa, neon_i8mm + tbl v4.16b, {v23.16b}, v30.16b + mov v17.16b, v18.16b +.else // neon_dotprod mov v17.16b, v18.16b - tbl v2.16b, {v23.16b}, v28.16b tbl v3.16b, {v23.16b}, v29.16b tbl v4.16b, {v23.16b}, v30.16b - +.endif smlal v0.4s, v18.4h, v7.h[2] smlal2 v1.4s, v18.8h, v7.h[2] mov v18.16b, v19.16b - sdot v5.4s, v2.16b, v26.4b[0] - sdot v6.4s, v3.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] + \dot v6.4s, v3.16b, v26.4b[0] smlal v0.4s, v19.4h, v7.h[3] smlal2 v1.4s, v19.8h, v7.h[3] mov v19.16b, v20.16b - sdot v5.4s, v3.16b, v26.4b[1] - sdot v6.4s, v4.16b, v26.4b[1] + \dot v5.4s, v3.16b, v26.4b[1] + \dot v6.4s, v4.16b, v26.4b[1] smlal v0.4s, v20.4h, v7.h[4] smlal2 v1.4s, v20.8h, v7.h[4] @@ -744,27 +813,42 @@ L(\type\()_8tap_h_hv_\isa): uzp1 v23.8h, v5.8h, v6.8h .endif mov v21.16b, v22.16b - smlal v0.4s, v22.4h, v7.h[6] smlal2 v1.4s, v22.8h, v7.h[6] +.ifc \isa, neon_i8mm + subs w8, w8, #1 +.endif .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v22.8h, v23.8h, #2 + .else sshr v22.8h, v23.8h, #2 + .endif smlal v0.4s, v22.4h, v7.h[7] smlal2 v1.4s, v22.8h, v7.h[7] rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 +.else // put + .ifc \isa, neon_i8mm + rshrn v22.4h, v5.4s, #2 + rshrn2 v22.8h, v6.4s, #2 + .else + shrn v22.4h, v5.4s, #2 + shrn2 v22.8h, v6.4s, #2 + .endif + smlal v0.4s, v22.4h, v7.h[7] + smlal2 v1.4s, v22.8h, v7.h[7] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 +.endif +.ifc \isa, neon_dotprod subs w8, w8, #1 +.endif +.ifc \type, prep st1 {v0.8h}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #16 .else - shrn v22.4h, v5.4s, #2 - shrn2 v22.8h, v6.4s, #2 - smlal v0.4s, v22.4h, v7.h[7] - smlal2 v1.4s, v22.8h, v7.h[7] - tbl v0.16b, {v0.16b, v1.16b}, v25.16b - subs w8, w8, #1 - sqrshrun v0.8b, v0.8h, #2 st1 {v0.8b}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #8 @@ -780,18 +864,19 @@ L(\type\()_8tap_h_hv_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, 
v22.16b + shrn v20.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v21.16b, v22.16b + shrn v21.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) + shrn v22.4h, v22.4s, #2 .align LOOP_ALIGN 4: @@ -801,35 +886,40 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b - +.endif smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b +.ifc \isa, neon_i8mm + movi v5.4s, #0 +.else mov v5.16b, v27.16b - +.endif mov v18.16b, v19.16b mov v19.16b, v20.16b smlal v0.4s, v20.4h, v7.h[4] smlal v0.4s, v21.4h, v7.h[5] - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b -.ifc \type, put - subs \h, \h, #1 -.endif smlal v0.4s, v22.4h, v7.h[6] +.ifc \isa, neon_i8mm + rshrn v22.4h, v5.4s, #2 +.else shrn v22.4h, v5.4s, #2 - +.endif smlal v0.4s, v22.4h, v7.h[7] .ifc \type, prep rshrn v0.4h, v0.4s, #6 str d0, [\dst], #8 subs \h, \h, #1 .else - tbl v0.16b, {v0.16b, v1.16b}, v25.16b + subs \h, \h, #1 + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] add \dst, \dst, \d_strd @@ -844,18 +934,19 @@ L(\type\()_8tap_h_hv_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v21.16b, v22.16b + shrn v21.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) + shrn v22.4h, v22.4s, #2 .align LOOP_ALIGN 2: @@ -865,29 +956,37 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b + .ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b - + .endif smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b + .ifc \isa, neon_i8mm + movi v5.4s, #0 + .else mov v5.16b, v27.16b - + .endif mov v18.16b, v19.16b mov v19.16b, v20.16b smlal v0.4s, v20.4h, v7.h[4] smlal v0.4s, v21.4h, v7.h[5] - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b - subs \h, \h, #1 smlal v0.4s, v22.4h, v7.h[6] + .ifc \isa, neon_i8mm + rshrn v22.4h, v5.4s, #2 + .else shrn v22.4h, v5.4s, #2 - + .endif smlal v0.4s, v22.4h, v7.h[7] - tbl v0.16b, {v0.16b, v1.16b}, v25.16b + subs \h, \h, #1 + + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str h0, [\dst] @@ -918,18 +1017,29 @@ L(\type\()_6tap_hv_\isa): mov \lsrc, \src mov \ldst, \dst mov w8, \h - +.ifc \isa, neon_i8mm bl L(\type\()_hv_filter8_\isa) - mov v16.16b, v22.16b + srshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v17.16b, v22.16b + srshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v18.16b, v22.16b + srshr v18.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v19.16b, v22.16b + srshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v20.16b, v22.16b - + srshr v20.8h, v22.8h, #2 +.else + bl L(\type\()_hv_filter8_\isa) + sshr v16.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v17.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v18.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v19.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v20.8h, v22.8h, #2 +.endif .align LOOP_ALIGN 8: ldr q23, [\xmy] @@ -937,12 +1047,17 @@ 
L(\type\()_6tap_hv_\isa): smull v0.4s, v16.4h, v7.h[1] smull2 v1.4s, v16.8h, v7.h[1] +.ifc \isa, neon_dotprod sub v23.16b, v23.16b, v24.16b +.endif mov v16.16b, v17.16b - +.ifc \isa, neon_i8mm + movi v5.4s, #0 + movi v6.4s, #0 +.else mov v5.16b, v27.16b mov v6.16b, v27.16b - +.endif tbl v2.16b, {v23.16b}, v28.16b tbl v3.16b, {v23.16b}, v29.16b @@ -951,14 +1066,16 @@ L(\type\()_6tap_hv_\isa): tbl v4.16b, {v23.16b}, v30.16b mov v17.16b, v18.16b - sdot v5.4s, v2.16b, v26.4b[0] - sdot v6.4s, v3.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] + \dot v6.4s, v3.16b, v26.4b[0] + smlal v0.4s, v18.4h, v7.h[3] smlal2 v1.4s, v18.8h, v7.h[3] mov v18.16b, v19.16b - sdot v5.4s, v3.16b, v26.4b[1] - sdot v6.4s, v4.16b, v26.4b[1] + \dot v5.4s, v3.16b, v26.4b[1] + \dot v6.4s, v4.16b, v26.4b[1] + smlal v0.4s, v19.4h, v7.h[4] smlal2 v1.4s, v19.8h, v7.h[4] mov v19.16b, v20.16b @@ -966,20 +1083,21 @@ L(\type\()_6tap_hv_\isa): smlal v0.4s, v20.4h, v7.h[5] smlal2 v1.4s, v20.8h, v7.h[5] +.ifc \isa, neon_i8mm + srshr v20.8h, v23.8h, #2 +.else sshr v20.8h, v23.8h, #2 -.ifc \type, prep +.endif + subs w8, w8, #1 smlal v0.4s, v20.4h, v7.h[6] smlal2 v1.4s, v20.8h, v7.h[6] +.ifc \type, prep rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 st1 {v0.8h}, [\ldst], \d_strd - subs w8, w8, #1 b.gt 8b add \dst, \dst, #16 .else - subs w8, w8, #1 - smlal v0.4s, v20.4h, v7.h[6] - smlal2 v1.4s, v20.8h, v7.h[6] tbl v0.16b, {v0.16b, v1.16b}, v25.16b sqrshrun v0.8b, v0.8h, #2 st1 {v0.8b}, [\ldst], \d_strd @@ -993,30 +1111,36 @@ L(\type\()_6tap_hv_\isa): .align FUNC_ALIGN L(\type\()_hv_filter8_\isa): - ldr q4, [\lsrc] - add \lsrc, \lsrc, \s_strd + ld1 {v4.16b}, [\lsrc], \s_strd +.ifc \isa, neon_i8mm + movi v22.4s, #0 + movi v23.4s, #0 +.else // neon_dotprod sub v4.16b, v4.16b, v24.16b mov v22.16b, v27.16b mov v23.16b, v27.16b +.endif tbl v2.16b, {v4.16b}, v28.16b tbl v3.16b, {v4.16b}, v29.16b tbl v4.16b, {v4.16b}, v30.16b - sdot v22.4s, v2.16b, v26.4b[0] - sdot v22.4s, v3.16b, v26.4b[1] - sdot v23.4s, v3.16b, v26.4b[0] - sdot v23.4s, v4.16b, v26.4b[1] - shrn v22.4h, v22.4s, #2 - shrn2 v22.8h, v23.4s, #2 + \dot v22.4s, v2.16b, v26.4b[0] + \dot v23.4s, v3.16b, v26.4b[0] + \dot v22.4s, v3.16b, v26.4b[1] + \dot v23.4s, v4.16b, v26.4b[1] + uzp1 v22.8h, v22.8h, v23.8h ret .align FUNC_ALIGN L(\type\()_hv_filter4_\isa): - mov v22.16b, v27.16b ld1 {v4.8b}, [\src], \s_strd +.ifc \isa, neon_i8mm + movi v22.4s, #2 +.else + mov v22.16b, v27.16b sub v4.16b, v4.16b, v24.16b +.endif tbl v2.16b, {v4.16b}, v28.16b - sdot v22.4s, v2.16b, v26.4b[0] - shrn v22.4h, v22.4s, #2 + \dot v22.4s, v2.16b, v26.4b[0] ret .align JUMP_ALIGN @@ -1025,15 +1149,15 @@ L(\type\()_hv_filter4_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 .align LOOP_ALIGN 4: @@ -1041,30 +1165,37 @@ L(\type\()_hv_filter4_\isa): smull v0.4s, v16.4h, v7.h[1] smlal v0.4s, v17.4h, v7.h[2] +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b +.endif mov v16.16b, v17.16b mov v17.16b, v18.16b smlal v0.4s, v18.4h, v7.h[3] smlal v0.4s, v19.4h, v7.h[4] tbl v2.16b, {v4.16b}, v28.16b +.ifc \isa, neon_i8mm + movi v5.4s, #0 +.else mov v5.16b, v27.16b - +.endif mov v18.16b, v19.16b mov v19.16b, v20.16b - sdot v5.4s, v2.16b, v26.4b[0] + 
\dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] +.ifc \isa, neon_i8mm + rshrn v20.4h, v5.4s, #2 +.else shrn v20.4h, v5.4s, #2 -.ifc \type, prep +.endif + subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] +.ifc \type, prep rshrn v0.4h, v0.4s, #6 str d0, [\dst], #8 - subs \h, \h, #1 .else - subs \h, \h, #1 - smlal v0.4s, v20.4h, v7.h[6] - tbl v0.16b, {v0.16b}, v25.16b + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] add \dst, \dst, \d_strd @@ -1079,15 +1210,15 @@ L(\type\()_hv_filter4_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 .align LOOP_ALIGN 2: @@ -1095,26 +1226,36 @@ L(\type\()_hv_filter4_\isa): smull v0.4s, v16.4h, v7.h[1] smlal v0.4s, v17.4h, v7.h[2] + .ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b + .endif mov v16.16b, v17.16b mov v17.16b, v18.16b smlal v0.4s, v18.4h, v7.h[3] smlal v0.4s, v19.4h, v7.h[4] tbl v2.16b, {v4.16b}, v28.16b + .ifc \isa, neon_i8mm + movi v5.4s, #0 + .else mov v5.16b, v27.16b + .endif mov v18.16b, v19.16b mov v19.16b, v20.16b - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] + .ifc \isa, neon_i8mm + rshrn v20.4h, v5.4s, #2 + .else shrn v20.4h, v5.4s, #2 + .endif subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] - tbl v0.16b, {v0.16b}, v25.16b + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str h0, [\dst] @@ -1128,8 +1269,12 @@ L(\type\()_8tap_h_\isa): adr x9, L(\type\()_8tap_h_\isa\()_tbl) ldrh w8, [x9, x8, lsl #1] .ifc \type, put + .ifc \isa, neon_i8mm + movi v27.4s, #34 // special rounding + .else mov w10, #0x2022 // 64 * 128 + 34, bias and rounding for SDOT dup v27.4s, w10 + .endif .endif sub x9, x9, x8 br x9 @@ -1139,25 +1284,25 @@ L(\type\()_8tap_h_\isa): 20: // H - 2xN AARCH64_VALID_JUMP_TARGET add \src, \src, #2 - ldr s6, [\xmx, #2] + ldr s26, [\xmx, #2] .align LOOP_ALIGN 2: ldr d0, [\src] ldr d1, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - + .ifc \isa, neon_dotprod sub v0.8b, v0.8b, v24.8b sub v1.8b, v1.8b, v24.8b - + .endif mov v4.16b, v27.16b mov v5.16b, v27.16b tbl v2.16b, {v0.16b}, v28.16b tbl v3.16b, {v1.16b}, v28.16b - sdot v4.4s, v2.16b, v6.4b[0] - sdot v5.4s, v3.16b, v6.4b[0] + \dot v4.4s, v2.16b, v26.4b[0] + \dot v5.4s, v3.16b, v26.4b[0] uzp1 v4.8h, v4.8h, v5.8h sqshrun v4.8b, v4.8h, #6 @@ -1170,7 +1315,6 @@ L(\type\()_8tap_h_\isa): add \dst, \dst, \d_strd, lsl #1 b.gt 2b ret - .endif .align JUMP_ALIGN @@ -1184,24 +1328,33 @@ L(\type\()_8tap_h_\isa): ldr d0, [\src] ldr d1, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - +.ifc \type\()_\isa, prep_neon_i8mm + movi v4.4s, #0 + movi v5.4s, #0 +.else + .ifc \isa, neon_dotprod sub v0.8b, v0.8b, v24.8b sub v1.8b, v1.8b, v24.8b - + .endif mov v4.16b, v27.16b mov v5.16b, v27.16b - +.endif tbl v2.16b, {v0.16b}, v28.16b tbl v3.16b, {v1.16b}, v28.16b - sdot v4.4s, v2.16b, v26.4b[0] - sdot v5.4s, v3.16b, v26.4b[0] + \dot v4.4s, v2.16b, v26.4b[0] + \dot v5.4s, v3.16b, v26.4b[0] .ifc \type, prep subs \h, \h, #2 + .ifc \isa, neon_i8mm + uzp1 v4.8h, v4.8h, v5.8h + srshr v4.8h, v4.8h, #2 + .else shrn v4.4h, v4.4s, #2 shrn2 v4.8h, v5.4s, #2 + .endif str q4, [\dst], #16 -.else +.else // put uzp1 v4.8h, v4.8h, v5.8h sqshrun v4.8b, v4.8h, #6 subs \h, \h, 
#2 @@ -1226,15 +1379,21 @@ L(\type\()_8tap_h_\isa): ldr q0, [\src] ldr q16, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - +.ifc \type\()_\isa, prep_neon_i8mm + movi v4.4s, #0 + movi v5.4s, #0 + movi v20.4s, #0 + movi v21.4s, #0 +.else + .ifc \isa, neon_dotprod sub v0.16b, v0.16b, v24.16b sub v16.16b, v16.16b, v24.16b - + .endif mov v4.16b, v27.16b mov v5.16b, v27.16b mov v20.16b, v27.16b mov v21.16b, v27.16b - +.endif tbl v1.16b, {v0.16b}, v28.16b tbl v2.16b, {v0.16b}, v29.16b tbl v3.16b, {v0.16b}, v30.16b @@ -1242,23 +1401,28 @@ L(\type\()_8tap_h_\isa): tbl v18.16b, {v16.16b}, v29.16b tbl v19.16b, {v16.16b}, v30.16b - sdot v4.4s, v1.16b, v26.4b[0] - sdot v5.4s, v2.16b, v26.4b[0] - sdot v20.4s, v17.16b, v26.4b[0] - sdot v21.4s, v18.16b, v26.4b[0] - sdot v4.4s, v2.16b, v26.4b[1] - sdot v5.4s, v3.16b, v26.4b[1] - sdot v20.4s, v18.16b, v26.4b[1] - sdot v21.4s, v19.16b, v26.4b[1] + \dot v4.4s, v1.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] + \dot v20.4s, v17.16b, v26.4b[0] + \dot v21.4s, v18.16b, v26.4b[0] + \dot v4.4s, v2.16b, v26.4b[1] + \dot v5.4s, v3.16b, v26.4b[1] + \dot v20.4s, v18.16b, v26.4b[1] + \dot v21.4s, v19.16b, v26.4b[1] uzp1 v4.8h, v4.8h, v5.8h uzp1 v20.8h, v20.8h, v21.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v4.8h, v4.8h, #2 + srshr v20.8h, v20.8h, #2 + .else sshr v4.8h, v4.8h, #2 sshr v20.8h, v20.8h, #2 + .endif subs \h, \h, #2 stp q4, q20, [\dst], #32 -.else +.else // put sqshrun v4.8b, v4.8h, #6 sqshrun v20.8b, v20.8h, #6 subs \h, \h, #2 @@ -1274,50 +1438,60 @@ L(\type\()_8tap_h_\isa): AARCH64_VALID_JUMP_TARGET ldr q29, L(h_tbl_neon_dotprod) + 16 ldr q30, L(h_tbl_neon_dotprod) + 32 - ldr q31, L(h_tbl_neon_dotprod) + 48 ldr d26, [\xmx] .align LOOP_ALIGN 16: - ldp q16, q17, [\src] + ldr q16, [\src] + ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, \s_strd - +.ifc \type\()_\isa, prep_neon_i8mm + movi v6.4s, #0 + movi v7.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 +.else + .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b - + .endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b mov v23.16b, v27.16b - +.endif tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, v30.16b - tbl v3.16b, {v16.16b, v17.16b}, v31.16b - tbl v4.16b, {v17.16b}, v28.16b + tbl v3.16b, {v17.16b}, v28.16b + tbl v4.16b, {v17.16b}, v29.16b - sdot v6.4s, v0.16b, v26.4b[0] - sdot v7.4s, v1.16b, v26.4b[0] - sdot v22.4s, v2.16b, v26.4b[0] - sdot v23.4s, v3.16b, v26.4b[0] - sdot v6.4s, v1.16b, v26.4b[1] - sdot v7.4s, v2.16b, v26.4b[1] - sdot v22.4s, v3.16b, v26.4b[1] - sdot v23.4s, v4.16b, v26.4b[1] + \dot v6.4s, v0.16b, v26.4b[0] + \dot v7.4s, v1.16b, v26.4b[0] + \dot v22.4s, v2.16b, v26.4b[0] + \dot v23.4s, v3.16b, v26.4b[0] + \dot v6.4s, v1.16b, v26.4b[1] + \dot v7.4s, v2.16b, v26.4b[1] + \dot v22.4s, v3.16b, v26.4b[1] + \dot v23.4s, v4.16b, v26.4b[1] uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v6.8h, v6.8h, #2 + srshr v22.8h, v22.8h, #2 + .else sshr v6.8h, v6.8h, #2 sshr v22.8h, v22.8h, #2 + .endif subs \h, \h, #1 stp q6, q22, [\dst], #32 -.else +.else // put sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs \h, \h, #1 - str q6, [\dst] - add \dst, \dst, \d_strd + st1 {v6.16b}, [\dst], \d_strd .endif b.gt 16b ret @@ -1329,7 +1503,6 @@ L(\type\()_8tap_h_\isa): AARCH64_VALID_JUMP_TARGET ldr q29, L(h_tbl_neon_dotprod) + 16 ldr q30, L(h_tbl_neon_dotprod) + 32 - ldr q31, L(h_tbl_neon_dotprod) + 48 ldr d26, [\xmx] .ifc \type, put 
sub \d_strd, \d_strd, \w, uxtw @@ -1339,39 +1512,52 @@ L(\type\()_8tap_h_\isa): .align LOOP_ALIGN 32: - ldp q16, q17, [\src], #16 - + ldr q16, [\src] + ldr q17, [\src, #12] // avoid 2 register TBL for small cores + add \src, \src, #16 +.ifc \type\()_\isa, prep_neon_i8mm + movi v6.4s, #0 + movi v7.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 +.else + .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b - + .endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b mov v23.16b, v27.16b - +.endif tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, v30.16b - tbl v3.16b, {v16.16b, v17.16b}, v31.16b - tbl v4.16b, {v17.16b}, v28.16b + tbl v3.16b, {v17.16b}, v28.16b + tbl v4.16b, {v17.16b}, v29.16b - sdot v6.4s, v0.16b, v26.4b[0] - sdot v7.4s, v1.16b, v26.4b[0] - sdot v22.4s, v2.16b, v26.4b[0] - sdot v23.4s, v3.16b, v26.4b[0] - sdot v6.4s, v1.16b, v26.4b[1] - sdot v7.4s, v2.16b, v26.4b[1] - sdot v22.4s, v3.16b, v26.4b[1] - sdot v23.4s, v4.16b, v26.4b[1] + \dot v6.4s, v0.16b, v26.4b[0] + \dot v7.4s, v1.16b, v26.4b[0] + \dot v22.4s, v2.16b, v26.4b[0] + \dot v23.4s, v3.16b, v26.4b[0] + \dot v6.4s, v1.16b, v26.4b[1] + \dot v7.4s, v2.16b, v26.4b[1] + \dot v22.4s, v3.16b, v26.4b[1] + \dot v23.4s, v4.16b, v26.4b[1] uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v6.8h, v6.8h, #2 + srshr v22.8h, v22.8h, #2 + .else sshr v6.8h, v6.8h, #2 sshr v22.8h, v22.8h, #2 + .endif subs w8, w8, #16 stp q6, q22, [\dst], #32 -.else +.else // put sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs w8, w8, #16 @@ -1397,17 +1583,32 @@ L(\type\()_8tap_h_\isa\()_tbl): .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b) .ifc \type, put .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b) + .hword 0 .endif endfunc .endm // dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) // xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) -filter_8tap_fn prep, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 +filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 // dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) // xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) -filter_8tap_fn put, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 +filter_8tap_fn put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 + +#if HAVE_I8MM +ENABLE_I8MM + +// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) +// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) +filter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 + +// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) +// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) +filter_8tap_fn put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 + +DISABLE_I8MM +#endif // HAVE_I8MM DISABLE_DOTPROD #endif // HAVE_DOTPROD diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S index 9033072a8298..a1a8c92e794d 100644 --- a/third_party/dav1d/src/arm/64/msac.S +++ b/third_party/dav1d/src/arm/64/msac.S @@ -35,14 +35,14 @@ #define CNT 28 #define ALLOW_UPDATE_CDF 32 +#define COEFFS_BASE_OFFSET 30 +#define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET) + const coeffs .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -endconst - -const bits - .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - .short 0x100, 
0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 + // masks8 + .short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E endconst .macro ld1_n d0, d1, src, sz, n @@ -96,13 +96,6 @@ endconst .endif .endm -.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n - urhadd \d0\sz, \s0\sz, \s2\sz -.if \n == 16 - urhadd \d1\sz, \s1\sz, \s3\sz -.endif -.endm - .macro sshl_n d0, d1, s0, s1, s2, s3, sz, n sshl \d0\sz, \s0\sz, \s2\sz .if \n == 16 @@ -129,93 +122,189 @@ endconst function msac_decode_symbol_adapt4_neon, export=1 .macro decode_update sz, szb, n +.if \n == 16 sub sp, sp, #48 +.endif add x8, x0, #RNG ld1_n v0, v1, x1, \sz, \n // cdf - ld1r {v4\sz}, [x8] // rng - movrel x9, coeffs, 30 + ld1r {v29\sz}, [x8] // rng + movrel x9, coeffs, COEFFS_BASE_OFFSET movi v31\sz, #0x7f, lsl #8 // 0x7f00 - sub x9, x9, x2, lsl #1 + sub x10, x9, x2, lsl #1 mvni v30\sz, #0x3f // 0xffc0 - and v7\szb, v4\szb, v31\szb // rng & 0x7f00 - str h4, [sp, #14] // store original u = s->rng + and v7\szb, v29\szb, v31\szb // rng & 0x7f00 +.if \n == 16 + str h29, [sp, #14] // store original u = s->rng +.endif and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0 - ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret) + ld1_n v4, v5, x10, \sz, \n // EC_MIN_PROB * (n_symbols - ret) sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 - add x8, x0, #DIF + 6 + ldr d28, [x0, #DIF] add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret) add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) - ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16) - movrel x8, bits + dup v30\sz, v28.h[3] // dif >> (EC_WIN_SIZE - 16) +.if \n == 8 + ldur q31, [x9, #MASKS8_OFFSET] +.elseif \n == 16 str_n q4, q5, sp, #16, \n // store v values to allow indexed access - - ld1_n v16, v17, x8, .8h, \n - - cmhs_n v2, v3, v6, v6, v4, v5, .8h, \n // c >= v - - and_n v6, v7, v2, v3, v16, v17, .16b, \n // One bit per halfword set in the mask -.if \n == 16 - add v6.8h, v6.8h, v7.8h .endif - addv h6, v6.8h // Aggregate mask bits - ldr w4, [x0, #ALLOW_UPDATE_CDF] - umov w3, v6.h[0] - rbit w3, w3 - clz w15, w3 // ret - cbz w4, L(renorm) + // After the condition starts being true it continues, such that the vector looks like: + // 0, 0, 0 ... -1, -1 + cmhs_n v2, v3, v30, v30, v4, v5, \sz, \n // c >= v +.if \n == 4 + ext v29\szb, v29\szb, v4\szb, #6 // u + umov x15, v2.d[0] + ldr w4, [x0, #ALLOW_UPDATE_CDF] + rev x15, x15 + sub v29\sz, v29\sz, v4\sz // rng = u-v + // rev + clz = count trailing zeros + clz x15, x15 // 16*ret +.elseif \n == 8 + // The final short of the compare is always set. + // Using addv, subtract -0x202*ret from this value to create a lookup table for a short. + // For n == 8: + // -0x202 + -0x202 + ... 
+ 0xF0E + // (0x202*7) | (1 << 8) + // ^-------offset for second byte of the short + and v31\szb, v31\szb, v2\szb + ext v29\szb, v29\szb, v4\szb, #14 // u + addv h31, v31\sz // ((2*ret + 1) << 8) | (2*ret) + ldr w4, [x0, #ALLOW_UPDATE_CDF] + sub v30\sz, v30\sz, v4\sz // (dif >> 48) - v + smov w15, v31.b[0] // 2*ret + sub v29\sz, v29\sz, v4\sz // rng = u-v +.elseif \n == 16 + add v6\sz, v2\sz, v3\sz + addv h31, v6\sz // -n + ret + ldr w4, [x0, #ALLOW_UPDATE_CDF] + smov w15, v31.h[0] +.endif + + cbz w4, 0f + // update_cdf ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols] - movi v5\szb, #0xff .if \n == 16 + // 16 case has a lower bound that guarantees n_symbols > 2 mov w4, #-5 -.else +.elseif \n == 8 mvn w14, w2 mov w4, #-4 cmn w14, #3 // set C if n_symbols <= 2 +.else + // if n_symbols < 4 (or < 6 even) then + // (1 + n_symbols) >> 2 == n_symbols > 2 + add w14, w2, #17 // (1 + n_symbols) + (4 << 2) +.endif + sub_n v16, v17, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0) + orr v2\sz, #0x80, lsl #8 +.if \n == 16 + orr v3\sz, #0x80, lsl #8 .endif - urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768 .if \n == 16 sub w4, w4, w3, lsr #4 // -((count >> 4) + 5) -.else +.elseif \n == 8 lsr w14, w3, #4 // count >> 4 sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4) +.else + neg w4, w14, lsr #2 // -((n_symbols > 2) + 4) + sub w4, w4, w3, lsr #4 // -((count >> 4) + (n_symbols > 2) + 4) .endif - sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i]) + sub_n v2, v3, v2, v3, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i]) dup v6\sz, w4 // -rate sub w3, w3, w3, lsr #5 // count - (count == 32) - sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0) - sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate + sshl_n v2, v3, v2, v3, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate add w3, w3, #1 // count + (count < 32) - add_n v0, v1, v0, v1, v4, v5, \sz, \n // cdf + (32768 - cdf[i]) >> rate + add_n v0, v1, v16, v17, v2, v3, \sz, \n // cdf + (32768 - cdf[i]) >> rate st1_n v0, v1, x1, \sz, \n strh w3, [x1, x2, lsl #1] -.endm - decode_update .4h, .8b, 4 +0: + // renorm +.if \n == 4 + ldr w6, [x0, #CNT] + ldr x7, [x0, #DIF] + mov x4, v29.d[0] // rng (packed) + mov x3, v4.d[0] // v (packed) -L(renorm): - add x8, sp, #16 - add x8, x8, w15, uxtw #1 - ldrh w3, [x8] // v - ldurh w4, [x8, #-2] // u + // Shift 'v'/'rng' for ret into the 16 least sig bits. There is + // garbage in the remaining bits, but we can work around this. + lsr x4, x4, x15 // rng + lsr x3, x3, x15 // v + lsl w5, w4, #16 // rng << 16 + sub x7, x7, x3, lsl #48 // dif - (v << 48) + clz w5, w5 // d = clz(rng << 16) + lsl w4, w4, w5 // rng << d + subs w6, w6, w5 // cnt -= d + lsl x7, x7, x5 // (dif - (v << 48)) << d + strh w4, [x0, #RNG] + b.lo 1f + str w6, [x0, #CNT] + str x7, [x0, #DIF] + lsr w0, w15, #4 + ret +1: + lsr w15, w15, #4 + b L(refill) +.elseif \n == 8 + ldr w6, [x0, #CNT] + tbl v30.8b, {v30.16b}, v31.8b + tbl v29.8b, {v29.16b}, v31.8b + ins v28.h[3], v30.h[0] // dif - (v << 48) + clz v0.4h, v29.4h // d = clz(rng) + umov w5, v0.h[0] + ushl v29.4h, v29.4h, v0.4h // rng << d + + // The vec for clz(rng) is filled with garbage after the first short, + // but ushl/sshl conveniently uses only the first byte for the shift + // amount. 
+ ushl d28, d28, d0 // (dif - (v << 48)) << d + + subs w6, w6, w5 // cnt -= d + str h29, [x0, #RNG] + b.lo 1f + str w6, [x0, #CNT] + str d28, [x0, #DIF] + lsr w0, w15, #1 // ret + ret +1: + lsr w15, w15, #1 // ret + mov x7, v28.d[0] + b L(refill) +.elseif \n == 16 + add x8, sp, w15, sxtw #1 + ldrh w3, [x8, #48] // v + ldurh w4, [x8, #46] // u ldr w6, [x0, #CNT] ldr x7, [x0, #DIF] sub w4, w4, w3 // rng = u - v clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 sub x7, x7, x3, lsl #48 // dif - (v << 48) -L(renorm2): lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d lsl x7, x7, x5 // (dif - (v << 48)) << d str w4, [x0, #RNG] - b.hs 4f + add sp, sp, #48 + b.lo 1f + str w6, [x0, #CNT] + str x7, [x0, #DIF] + add w0, w15, #\n // ret + ret +1: + add w15, w15, #\n // ret + b L(refill) +.endif +.endm + decode_update .4h, .8b, 4 + +L(refill): // refill ldp x3, x4, [x0] // BUF_POS, BUF_END add x5, x3, #8 @@ -243,7 +332,6 @@ L(renorm2): str x7, [x0, #DIF] mov w0, w15 - add sp, sp, #48 ret 5: // pad_with_ones @@ -272,29 +360,26 @@ endfunc function msac_decode_symbol_adapt8_neon, export=1 decode_update .8h, .16b, 8 - b L(renorm) endfunc function msac_decode_symbol_adapt16_neon, export=1 decode_update .8h, .16b, 16 - b L(renorm) endfunc function msac_decode_hi_tok_neon, export=1 ld1 {v0.4h}, [x1] // cdf add x16, x0, #RNG movi v31.4h, #0x7f, lsl #8 // 0x7f00 - movrel x17, coeffs, 30-2*3 + movrel x17, coeffs, COEFFS_BASE_OFFSET-2*3 mvni v30.4h, #0x3f // 0xffc0 ldrh w9, [x1, #6] // count = cdf[n_symbols] ld1r {v3.4h}, [x16] // rng ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret) add x17, x0, #DIF + 6 - mov w13, #-24 + mov w13, #-24*8 and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 ldr w10, [x0, #ALLOW_UPDATE_CDF] ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16) - sub sp, sp, #48 ldr w6, [x0, #CNT] ldr x7, [x0, #DIF] 1: @@ -302,14 +387,14 @@ function msac_decode_hi_tok_neon, export=1 sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret) add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) - str h3, [sp, #14] // store original u = s->rng cmhs v2.4h, v1.4h, v4.4h // c >= v - str q4, [sp, #16] // store v values to allow indexed access - addv h6, v2.4h // -4 + ret - add w13, w13, #5 - smov w15, v6.h[0] - add x8, sp, #16 - add w15, w15, #4 // ret + add w13, w13, #5*8 + ext v18.8b, v3.8b, v4.8b, #6 // u + umov x15, v2.d[0] + rev x15, x15 + sub v18.4h, v18.4h, v4.4h // rng = u-v + // rev + clz = count trailing zeros + clz x15, x15 // 16*ret cbz w10, 2f // update_cdf @@ -317,29 +402,32 @@ function msac_decode_hi_tok_neon, export=1 mov w4, #-5 orr v2.4h, #0x80, lsl #8 // i >= val ? 
-1 : 32768 sub w4, w4, w9, lsr #4 // -((count >> 4) + 5) - sub v4.4h, v2.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) + sub v2.4h, v2.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) dup v6.4h, w4 // -rate sub w9, w9, w9, lsr #5 // count - (count == 32) - sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate + sshl v2.4h, v2.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate add w9, w9, #1 // count + (count < 32) - add v0.4h, v5.4h, v4.4h // cdf[i] + (32768 - cdf[i]) >> rate + add v0.4h, v5.4h, v2.4h // cdf[i] + (32768 - cdf[i]) >> rate st1 {v0.4h}, [x1] and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 strh w9, [x1, #6] 2: - add x8, x8, w15, uxtw #1 - ldrh w3, [x8] // v - ldurh w4, [x8, #-2] // u - sub w4, w4, w3 // rng = u - v - clz w5, w4 // clz(rng) - eor w5, w5, #16 // d = clz(rng) ^ 16 + mov x4, v18.d[0] // rng (packed) + mov x3, v4.d[0] // v (packed) + + // Shift 'v'/'rng' for ret into the 16 least sig bits. There is + // garbage in the remaining bits, but we can work around this. + lsr x4, x4, x15 // rng + lsr x3, x3, x15 // v + lsl w5, w4, #16 // rng << 16 sub x7, x7, x3, lsl #48 // dif - (v << 48) + clz w5, w5 // d = clz(rng << 16) lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d lsl x7, x7, x5 // (dif - (v << 48)) << d - str w4, [x0, #RNG] + strh w4, [x0, #RNG] dup v3.4h, w4 b.hs 5f @@ -366,17 +454,15 @@ function msac_decode_hi_tok_neon, export=1 orr x7, x7, x8 // dif |= next_bits 5: // end - lsl w15, w15, #1 - sub w15, w15, #5 + sub w15, w15, #5*8 lsr x12, x7, #48 adds w13, w13, w15 // carry = tok_br < 3 || tok == 15 dup v1.8h, w12 b.cc 1b // loop if !carry - add w13, w13, #30 + add w13, w13, #30*8 str w6, [x0, #CNT] - add sp, sp, #48 str x7, [x0, #DIF] - lsr w0, w13, #1 + lsr w0, w13, #4 ret 6: // pad_with_ones @@ -405,7 +491,6 @@ endfunc function msac_decode_bool_equi_neon, export=1 ldp w5, w6, [x0, #RNG] // + CNT - sub sp, sp, #48 ldr x7, [x0, #DIF] bic w4, w5, #0xff // r &= 0xff00 add w4, w4, #8 @@ -418,12 +503,20 @@ function msac_decode_bool_equi_neon, export=1 clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 - b L(renorm2) + lsl w4, w4, w5 // rng << d + subs w6, w6, w5 // cnt -= d + lsl x7, x7, x5 // (dif - (v << 48)) << d + str w4, [x0, #RNG] + b.lo L(refill) + + str w6, [x0, #CNT] + str x7, [x0, #DIF] + mov w0, w15 + ret endfunc function msac_decode_bool_neon, export=1 ldp w5, w6, [x0, #RNG] // + CNT - sub sp, sp, #48 ldr x7, [x0, #DIF] lsr w4, w5, #8 // r >> 8 bic w1, w1, #0x3f // f &= ~63 @@ -438,13 +531,21 @@ function msac_decode_bool_neon, export=1 clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 - b L(renorm2) + lsl w4, w4, w5 // rng << d + subs w6, w6, w5 // cnt -= d + lsl x7, x7, x5 // (dif - (v << 48)) << d + str w4, [x0, #RNG] + b.lo L(refill) + + str w6, [x0, #CNT] + str x7, [x0, #DIF] + mov w0, w15 + ret endfunc function msac_decode_bool_adapt_neon, export=1 ldr w9, [x1] // cdf[0-1] ldp w5, w6, [x0, #RNG] // + CNT - sub sp, sp, #48 ldr x7, [x0, #DIF] lsr w4, w5, #8 // r >> 8 and w2, w9, #0xffc0 // f &= ~63 @@ -462,7 +563,7 @@ function msac_decode_bool_adapt_neon, export=1 clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 - cbz w10, L(renorm2) + cbz w10, 1f lsr w2, w9, #16 // count = cdf[1] and w9, w9, #0xffff // cdf[0] @@ -480,5 +581,15 @@ function msac_decode_bool_adapt_neon, export=1 strh w9, [x1] strh w10, [x1, #2] - b L(renorm2) +1: + lsl w4, w4, w5 // rng << d + subs w6, w6, w5 // cnt -= d + lsl x7, x7, x5 // (dif - (v << 48)) << d + str w4, [x0, #RNG] + b.lo L(refill) + + str w6, [x0, #CNT] + str x7, [x0, #DIF] + mov w0, w15 + 
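The renormalisation that previously sat behind the shared L(renorm) label is now inlined into each entry point above. As a reading aid, here is a hedged, self-contained C sketch of what those instruction comments describe; the struct and helper names are illustrative stand-ins, not dav1d's internal API, and the refill stand-in is a no-op for L(refill).

#include <stdint.h>

/* Illustrative stand-in for the decoder state; field names mirror the
 * DIF/RNG/CNT offsets used in the assembly, but this is not dav1d code. */
typedef struct {
    uint64_t dif; /* 64-bit decoder window, top 16 bits hold the active range */
    uint32_t rng; /* current range, 0x8000..0xffff once renormalised */
    int      cnt; /* bits available before a refill is required */
} toy_msac;

static void toy_refill(toy_msac *s) { (void)s; /* stands in for L(refill) */ }

static void toy_renorm(toy_msac *s, uint32_t rng, uint64_t v) {
    /* d = clz(rng) ^ 16, assuming a non-zero rng that fits in 16 bits */
    const int d = __builtin_clz(rng) ^ 16;
    s->dif = (s->dif - (v << 48)) << d;  /* (dif - (v << 48)) << d */
    s->rng = rng << d;                   /* rng << d */
    s->cnt -= d;                         /* cnt -= d */
    if (s->cnt < 0)
        toy_refill(s);                   /* the b.lo path in the assembly */
}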
ret endfunc diff --git a/third_party/dav1d/src/arm/mc.h b/third_party/dav1d/src/arm/mc.h index 7e57fd37cbbf..dabdab35753e 100644 --- a/third_party/dav1d/src/arm/mc.h +++ b/third_party/dav1d/src/arm/mc.h @@ -62,6 +62,7 @@ decl_8tap_fns(neon); decl_8tap_fns(neon_dotprod); +decl_8tap_fns(neon_i8mm); decl_mc_fn(BF(dav1d_put_bilin, neon)); decl_mct_fn(BF(dav1d_prep_bilin, neon)); @@ -109,11 +110,17 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); c->emu_edge = BF(dav1d_emu_edge, neon); -#if ARCH_AARCH64 -#if HAVE_DOTPROD && BITDEPTH == 8 +#if ARCH_AARCH64 && BITDEPTH == 8 +#if HAVE_DOTPROD if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return; init_8tap_fns(neon_dotprod); -#endif // HAVE_DOTPROD && BITDEPTH == 8 -#endif // ARCH_AARCH64 +#endif // HAVE_DOTPROD + +#if HAVE_I8MM + if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return; + + init_8tap_fns(neon_i8mm); +#endif // HAVE_I8MM +#endif // ARCH_AARCH64 && BITDEPTH == 8 } diff --git a/third_party/dav1d/src/cpu.h b/third_party/dav1d/src/cpu.h index d20c5f016804..7205e8e62ff6 100644 --- a/third_party/dav1d/src/cpu.h +++ b/third_party/dav1d/src/cpu.h @@ -82,6 +82,9 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) { #if defined(__VSX__) flags |= DAV1D_PPC_CPU_FLAG_VSX; #endif +#if defined(__POWER9_VECTOR__) + flags |= DAV1D_PPC_CPU_FLAG_PWR9; +#endif #elif ARCH_RISCV #if defined(__riscv_v) flags |= DAV1D_RISCV_CPU_FLAG_V; diff --git a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c index 7427c35592a7..ea371324216d 100644 --- a/third_party/dav1d/src/decode.c +++ b/third_party/dav1d/src/decode.c @@ -1162,7 +1162,7 @@ static int decode_b(Dav1dTaskContext *const t, ts->cdf.m.use_filter_intra[bs]); if (is_filter) { b->y_mode = FILTER_PRED; - b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac, + b->y_angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.filter_intra, 4); } if (DEBUG_BLOCK_INFO) diff --git a/third_party/dav1d/src/ext/x86/x86inc.asm b/third_party/dav1d/src/ext/x86/x86inc.asm index d2bd758e679a..fc490b6f4bfa 100644 --- a/third_party/dav1d/src/ext/x86/x86inc.asm +++ b/third_party/dav1d/src/ext/x86/x86inc.asm @@ -232,7 +232,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %elif PIC call $+5 ; special-cased to not affect the RSB on most CPU:s pop %1 - add %1, (%2)-$+1 + add %1, -$+1+%2 %else mov %1, %2 %endif @@ -864,16 +864,16 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %macro cextern 1 %xdefine %1 mangle(private_prefix %+ _ %+ %1) - CAT_XDEFINE cglobaled_, %1, 1 + CAT_XDEFINE cglobaled_, %1, 2 extern %1 %endmacro -; like cextern, but without the prefix +; Like cextern, but without the prefix. This should be used for symbols from external libraries. %macro cextern_naked 1 %ifdef PREFIX %xdefine %1 mangle(%1) %endif - CAT_XDEFINE cglobaled_, %1, 1 + CAT_XDEFINE cglobaled_, %1, 3 extern %1 %endmacro @@ -1268,12 +1268,27 @@ INIT_XMM %endmacro %macro call_internal 2 %xdefine %%i %2 + %define %%j %%i %ifndef cglobaled_%2 %ifdef cglobaled_%1 %xdefine %%i %1 %endif + %elif FORMAT_ELF + %if ARCH_X86_64 + %if cglobaled_%2 >= 2 + ; Always emit PLT relocations when calling external functions, + ; the linker will eliminate unnecessary PLT indirections anyway. + %define %%j %%i wrt ..plt + %endif + %elif PIC && cglobaled_%2 == 3 + ; Go through the GOT for functions declared using cextern_naked with + ; PIC, as such functions presumably exists in external libraries. 
+ extern _GLOBAL_OFFSET_TABLE_ + LEA eax, $$+_GLOBAL_OFFSET_TABLE_ wrt ..gotpc + %define %%j [eax+%%i wrt ..got] + %endif %endif - call %%i + call %%j LOAD_MM_PERMUTATION %%i %endmacro diff --git a/third_party/dav1d/src/lib.c b/third_party/dav1d/src/lib.c index 3807efdcce19..4d9a2d30e343 100644 --- a/third_party/dav1d/src/lib.c +++ b/third_party/dav1d/src/lib.c @@ -263,7 +263,6 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { f->c = c; f->task_thread.ttd = &c->task_thread; f->lf.last_sharpness = -1; - dav1d_refmvs_init(&f->rf); } for (unsigned m = 0; m < c->n_tc; m++) { @@ -664,7 +663,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { dav1d_free(f->lf.lr_mask); dav1d_free(f->lf.tx_lpf_right_edge[0]); dav1d_free(f->lf.start_of_tile_row); - dav1d_refmvs_clear(&f->rf); + dav1d_free_aligned(f->rf.r); dav1d_free_aligned(f->lf.cdef_line_buf); dav1d_free_aligned(f->lf.lr_line_buf); } diff --git a/third_party/dav1d/src/loopfilter_tmpl.c b/third_party/dav1d/src/loopfilter_tmpl.c index 7cc89643e41e..0a2baf1e868d 100644 --- a/third_party/dav1d/src/loopfilter_tmpl.c +++ b/third_party/dav1d/src/loopfilter_tmpl.c @@ -249,6 +249,8 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride, #include "src/arm/loopfilter.h" #elif ARCH_LOONGARCH64 #include "src/loongarch/loopfilter.h" +#elif ARCH_PPC64LE +#include "src/ppc/loopfilter.h" #elif ARCH_X86 #include "src/x86/loopfilter.h" #endif @@ -265,6 +267,8 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) loop_filter_dsp_init_arm(c); #elif ARCH_LOONGARCH64 loop_filter_dsp_init_loongarch(c); +#elif ARCH_PPC64LE + loop_filter_dsp_init_ppc(c); #elif ARCH_X86 loop_filter_dsp_init_x86(c); #endif diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build index cd19b70c38fc..c75466805392 100644 --- a/third_party/dav1d/src/meson.build +++ b/third_party/dav1d/src/meson.build @@ -73,14 +73,14 @@ libdav1d_tmpl_sources = files( 'recon_tmpl.c', ) -libdav1d_arch_tmpl_sources = [] +libdav1d_arch_tmpl_sources = {} libdav1d_bitdepth_objs = [] # ASM specific sources libdav1d_asm_objs = [] # Arch-specific flags -arch_flags = [] +arch_flags = {} if is_asm_enabled if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm')) @@ -232,9 +232,9 @@ if is_asm_enabled 'loongarch/cpu.c', ) - libdav1d_arch_tmpl_sources += files( + libdav1d_arch_tmpl_sources += {'default': files( 'loongarch/looprestoration_tmpl.c', - ) + )} libdav1d_sources_asm = files( 'loongarch/mc.S', @@ -246,20 +246,25 @@ if is_asm_enabled ) libdav1d_asm_objs += libdav1d_sources_asm elif host_machine.cpu() == 'ppc64le' - arch_flags = ['-maltivec', '-mvsx'] + arch_flags += {'vsx': ['-maltivec', '-mvsx', '-DDAV1D_VSX']} libdav1d_sources += files( 'ppc/cpu.c', ) - libdav1d_arch_tmpl_sources += files( + libdav1d_arch_tmpl_sources += {'vsx': files( 'ppc/cdef_tmpl.c', 'ppc/looprestoration_tmpl.c', - ) + )} + arch_flags += {'pwr9': ['-mcpu=power9', '-DDAV1D_PWR9']} + libdav1d_arch_tmpl_sources += {'pwr9': files( + 'ppc/loopfilter_tmpl.c', + )} elif host_machine.cpu_family().startswith('riscv') libdav1d_sources += files( 'riscv/cpu.c', ) if host_machine.cpu_family() == 'riscv64' libdav1d_sources += files( + 'riscv/64/cpu.S', 'riscv/64/itx.S', ) endif @@ -320,15 +325,17 @@ endforeach # Helper library for each bitdepth and architecture-specific flags foreach bitdepth : dav1d_bitdepths - libdav1d_bitdepth_objs += static_library( - 
'dav1d_arch_bitdepth_@0@'.format(bitdepth), - libdav1d_arch_tmpl_sources, config_h_target, - include_directories: dav1d_inc_dirs, - dependencies : [stdatomic_dependencies], - c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags, - install : false, - build_by_default : false, - ).extract_all_objects(recursive: true) + foreach subarch : libdav1d_arch_tmpl_sources.keys() + libdav1d_bitdepth_objs += static_library( + 'dav1d_arch_bitdepth_@0@_@1@'.format(bitdepth,subarch), + libdav1d_arch_tmpl_sources[subarch], config_h_target, + include_directories: dav1d_inc_dirs, + dependencies : [stdatomic_dependencies], + c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags.get(subarch, []), + install : false, + build_by_default : false, + ).extract_all_objects(recursive: true) + endforeach endforeach # The final dav1d library diff --git a/third_party/dav1d/src/msac.h b/third_party/dav1d/src/msac.h index c3e07e1c7033..eb97650fce4f 100644 --- a/third_party/dav1d/src/msac.h +++ b/third_party/dav1d/src/msac.h @@ -68,7 +68,7 @@ unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f); unsigned dav1d_msac_decode_hi_tok_c(MsacContext *s, uint16_t *cdf); int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k); -/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */ +/* Supported n_symbols ranges: adapt4: 1-3, adapt8: 1-7, adapt16: 3-15 */ #ifndef dav1d_msac_decode_symbol_adapt4 #define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c #endif diff --git a/third_party/dav1d/src/ppc/cpu.c b/third_party/dav1d/src/ppc/cpu.c index fe77057c57f5..53287639de8e 100644 --- a/third_party/dav1d/src/ppc/cpu.c +++ b/third_party/dav1d/src/ppc/cpu.c @@ -40,12 +40,16 @@ COLD unsigned dav1d_get_cpu_flags_ppc(void) { unsigned flags = 0; #if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE unsigned long hw_cap = getauxval(AT_HWCAP); + unsigned long hw_cap2 = getauxval(AT_HWCAP2); #elif defined(HAVE_ELF_AUX_INFO) && ARCH_PPC64LE unsigned long hw_cap = 0; + unsigned long hw_cap2 = 0; elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); + elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2)); #endif #ifdef HAVE_AUX flags |= (hw_cap & PPC_FEATURE_HAS_VSX) ? DAV1D_PPC_CPU_FLAG_VSX : 0; + flags |= (hw_cap2 & PPC_FEATURE2_ARCH_3_00) ? 
DAV1D_PPC_CPU_FLAG_PWR9 : 0; #endif return flags; } diff --git a/third_party/dav1d/src/ppc/cpu.h b/third_party/dav1d/src/ppc/cpu.h index cfd2ff4ff51d..02db7a60acf7 100644 --- a/third_party/dav1d/src/ppc/cpu.h +++ b/third_party/dav1d/src/ppc/cpu.h @@ -30,6 +30,7 @@ enum CpuFlags { DAV1D_PPC_CPU_FLAG_VSX = 1 << 0, + DAV1D_PPC_CPU_FLAG_PWR9 = 1 << 1, }; unsigned dav1d_get_cpu_flags_ppc(void); diff --git a/third_party/dav1d/src/ppc/dav1d_types.h b/third_party/dav1d/src/ppc/dav1d_types.h index 0b4bd72f0e63..9a8bc7a732f2 100644 --- a/third_party/dav1d/src/ppc/dav1d_types.h +++ b/third_party/dav1d/src/ppc/dav1d_types.h @@ -44,6 +44,10 @@ #define i64x2 vector signed long long #define b64x2 vector bool long long +#define i8h_to_i16(v) ((i16x8) vec_unpackh((i8x16)v)) +#define i8l_to_i16(v) ((i16x8) vec_unpackl((i8x16)v)) +#define u8h_to_i16(v) ((i16x8) vec_mergeh((u8x16) v, vec_splat_u8(0))) +#define u8l_to_i16(v) ((i16x8) vec_mergel((u8x16) v, vec_splat_u8(0))) #define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0))) #define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0))) #define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0))) diff --git a/third_party/dav1d/src/ppc/loopfilter.h b/third_party/dav1d/src/ppc/loopfilter.h new file mode 100644 index 000000000000..fc97b375bb7e --- /dev/null +++ b/third_party/dav1d/src/ppc/loopfilter.h @@ -0,0 +1,47 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/loopfilter.h" + +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, pwr9)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, pwr9)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, pwr9)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, pwr9)); + +static ALWAYS_INLINE void loop_filter_dsp_init_ppc(Dav1dLoopFilterDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_PPC_CPU_FLAG_PWR9)) return; + +#if BITDEPTH == 8 + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, pwr9); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, pwr9); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, pwr9); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, pwr9); +#endif +} diff --git a/third_party/dav1d/src/ppc/loopfilter_tmpl.c b/third_party/dav1d/src/ppc/loopfilter_tmpl.c new file mode 100644 index 000000000000..4e658a701a88 --- /dev/null +++ b/third_party/dav1d/src/ppc/loopfilter_tmpl.c @@ -0,0 +1,1704 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * Copyright © 2024, Luca Barbato + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#undef NDEBUG +#include <assert.h> + +#include <stdlib.h> + +#include "common/attributes.h" +#include "common/intops.h" + +#include "src/ppc/dav1d_types.h" +#include "src/ppc/loopfilter.h" + +#if BITDEPTH == 8 +
+#define LOAD4_H(idx) \ + u8x16 idx##0 = vec_xl(0, dst); /* p1_0 p0_0 q0_0 q1_0 */ \ + dst += stridea; \ + u8x16 idx##1 = vec_xl(0, dst); /* p1_1 p0_1 q0_1 q1_1 */ \ + dst += stridea; \ + u8x16 idx##2 = vec_xl(0, dst); /* p1_2 p0_2 q0_2 q1_2 */ \ + dst += stridea; \ + u8x16 idx##3 = vec_xl(0, dst); /* p1_3 p0_3 q0_3 q1_3 */ \ +
+// return idx##_01 and idx##_23 +#define LOAD4_H_SINGLE(idx) \ + LOAD4_H(idx) \ + \ + u8x16 idx##_01 = vec_mergeh(idx##0, idx##1); /* p1_0 p1_1 p0_0 p0_1 q0_0 q0_1 q1_0 q1_1 */ \ + u8x16 idx##_23 = vec_mergeh(idx##2, idx##3); /* p1_2 p1_3 p0_2 p0_3 q0_2 q0_3 q1_2 q1_3 */ + +
+#define DECLARE_ADD_16HL(r, a, b) \ + u16x8 r##h = vec_add(a##h, b##h); \ + u16x8 r##l = vec_add(a##l, b##l); +
+#define ADD_16HL(r, a, b) \ + r##h = vec_add(a##h, b##h); \ + r##l = vec_add(a##l, b##l); +
+#define ADD_AND_SHIFT4(v) \ + v##h = vec_sr(vec_add(v##h, v4u16), v3u16); \ + v##l = vec_sr(vec_add(v##l, v4u16), v3u16); +#define ADD_AND_SHIFT8(v) \ + v##h = vec_sr(vec_add(v##h, v8u16), v4u16); \ + v##l = vec_sr(vec_add(v##l, v8u16), v4u16); +
+#define PACK_AND_SEL(v, m) \ + vec_sel(v, vec_pack(o##v##h, o##v##l), m) +
+#define UNPACK_16(v) \ + u16x8 v##h = u8h_to_u16(v); \ + u16x8 v##l = u8l_to_u16(v); + +
+#define APPLY_4 \ + b8x16 hev = vec_cmpgt(max_a_p1p0_q1q0, H); \ + \ + i8x16 ps1 = (i8x16)vec_xor(p1, s); \ + i8x16 ps0 = (i8x16)vec_xor(p0, s); \ + i8x16 qs0 = (i8x16)vec_xor(q0, s); \ + i8x16 qs1 = (i8x16)vec_xor(q1, s); \ + i8x16 f0 = vec_and(vec_subs(ps1, qs1), hev); \ + i16x8 q0sh = (i16x8)q0h; \ + i16x8 q0sl = (i16x8)q0l; \ + i16x8 p0sh = (i16x8)p0h; \ + i16x8 p0sl = (i16x8)p0l; \ + i16x8 f0h = i8h_to_i16(f0); \ + i16x8 f0l = i8l_to_i16(f0); \ + i16x8 d0h = vec_sub(q0sh, p0sh); \ + i16x8 d0l = vec_sub(q0sl, p0sl); \ + u8x16 v3u8 = vec_splat_u8(3); \ + i16x8 d0h_2 = vec_add(d0h, d0h); \ + i16x8 d0l_2 = vec_add(d0l, d0l); \ + u8x16 v4u8 = vec_splat_u8(4); \ + i16x8 f0_d0h = vec_add(d0h, f0h); \ + i16x8 f0_d0l = vec_add(d0l, f0l); \ + i16x8 fh = vec_add(d0h_2, f0_d0h); \ + i16x8 fl = vec_add(d0l_2, f0_d0l); \ + i8x16 f = vec_packs(fh, fl); \ + i8x16 f1 = vec_adds(f, (i8x16)v4u8); \ + i8x16 f2 = vec_adds(f, (i8x16)v3u8); \ + f1 = vec_sra(f1, v3u8); \ + f2 = vec_sra(f2, v3u8); \ + f1 = vec_and(f1, fm); \ + f2 = vec_and(f2, fm); \ + i8x16 f3 = vec_adds(f1, (i8x16)v1u8); \ + b8x16 m3 = vec_and(~hev, (b8x16)fm); \ + f3 = vec_sra(f3, v1u8); \ + f3 = vec_and(f3, m3); \ + i8x16 op0s = vec_adds(ps0, f2); \ + i8x16 oq0s = vec_subs(qs0, f1); \ + i8x16 oq1s = vec_subs(qs1, f3); \ + i8x16 op1s = vec_adds(ps1, f3); \ + p0 = (u8x16)vec_xor(op0s, s); \ + q0 = (u8x16)vec_xor(oq0s, s); \ + q1 = (u8x16)vec_xor(oq1s, s); \ + p1 = (u8x16)vec_xor(op1s, s); +
+#define APPLY_8 \ + DECLARE_ADD_16HL(p1p0, p1, p0) \ + DECLARE_ADD_16HL(p2q0, p2, q0) \ + DECLARE_ADD_16HL(q1q2, q1, q2) \ + DECLARE_ADD_16HL(p3p3, p3, p3) \ + DECLARE_ADD_16HL(q0q3, q0, q3) \ + DECLARE_ADD_16HL(p3p2, p3, p2) \ + DECLARE_ADD_16HL(p1q1, p1, q1) \ + DECLARE_ADD_16HL(p3p0, p3, p0) \ + DECLARE_ADD_16HL(p0q2, p0, q2) \ + DECLARE_ADD_16HL(q1q3, q1, q3) \ + DECLARE_ADD_16HL(q3q3, q3, q3) \ + DECLARE_ADD_16HL(q0q1q2q3, q0q3, q1q2) \ + DECLARE_ADD_16HL(p2p1p0q0, p1p0, p2q0) \ + DECLARE_ADD_16HL(p3p3p3p2, p3p3, p3p2) \ + DECLARE_ADD_16HL(p3p3p1q1, p3p3, p1q1) \ + DECLARE_ADD_16HL(p3p0q1q2, p3p0, q1q2) \ +
DECLARE_ADD_16HL(p1p0q1q3, p1p0, q1q3) \ + DECLARE_ADD_16HL(p0q2q3q3, p0q2, q3q3) \ + \ + DECLARE_ADD_16HL(op2, p3p3p3p2, p2p1p0q0) \ + DECLARE_ADD_16HL(op1, p3p3p1q1, p2p1p0q0) \ + DECLARE_ADD_16HL(op0, p3p0q1q2, p2p1p0q0) \ + DECLARE_ADD_16HL(oq0, p2p1p0q0, q0q1q2q3) \ + DECLARE_ADD_16HL(oq1, p1p0q1q3, q0q1q2q3) \ + DECLARE_ADD_16HL(oq2, p0q2q3q3, q0q1q2q3) \ + \ + ADD_AND_SHIFT4(op2) \ + ADD_AND_SHIFT4(op1) \ + ADD_AND_SHIFT4(op0) \ + ADD_AND_SHIFT4(oq0) \ + ADD_AND_SHIFT4(oq1) \ + ADD_AND_SHIFT4(oq2) \ + \ + p2 = PACK_AND_SEL(p2, apply_8); \ + p1 = PACK_AND_SEL(p1, apply_8); \ + p0 = PACK_AND_SEL(p0, apply_8); \ + q0 = PACK_AND_SEL(q0, apply_8); \ + q1 = PACK_AND_SEL(q1, apply_8); \ + q2 = PACK_AND_SEL(q2, apply_8); + +#define APPLY_16 \ + DECLARE_ADD_16HL(p6p6, p6, p6) \ + DECLARE_ADD_16HL(p6p5, p6, p5) \ + DECLARE_ADD_16HL(p6p4, p6, p4) \ + DECLARE_ADD_16HL(p4p3, p4, p3) \ + DECLARE_ADD_16HL(p2p1, p2, p1) \ + DECLARE_ADD_16HL(p2q2, p2, q2) \ + DECLARE_ADD_16HL(p3q1, p3, q1) \ + DECLARE_ADD_16HL(p0q0, p0, q0) \ + DECLARE_ADD_16HL(p0q1, p0, q1) \ + DECLARE_ADD_16HL(p1q3, p1, q3) \ + DECLARE_ADD_16HL(p1q0, p1, q0) \ + DECLARE_ADD_16HL(p1q5, p1, q5) \ + DECLARE_ADD_16HL(q3q4, q3, q4) \ + DECLARE_ADD_16HL(q2q5, q2, q5) \ + DECLARE_ADD_16HL(q1q6, q1, q6) \ + DECLARE_ADD_16HL(q0q1, q0, q1) \ + DECLARE_ADD_16HL(q6q6, q6, q6) \ + DECLARE_ADD_16HL(q2q6, q2, q6) \ + DECLARE_ADD_16HL(q3q6, q3, q6) \ + DECLARE_ADD_16HL(q4q6, q4, q6) \ + DECLARE_ADD_16HL(p5q0, p5, q0) \ + \ + DECLARE_ADD_16HL(p6q2, p6, q2) \ + DECLARE_ADD_16HL(p6p6p6p4, p6p6, p6p4) \ + DECLARE_ADD_16HL(p6p5p2p1, p6p5, p2p1) \ + DECLARE_ADD_16HL(p4p3p0q0, p4p3, p0q0) \ + DECLARE_ADD_16HL(p2q2p3q1, p2q2, p3q1) \ + DECLARE_ADD_16HL(p6p5p6p6, p6p5, p6p6) \ + DECLARE_ADD_16HL(p6p5p3q1, p6p5, p3q1) \ + DECLARE_ADD_16HL(p6p6p1q3, p6p6, p1q3) \ + DECLARE_ADD_16HL(q2q5q3q4, q2q5, q3q4) \ + DECLARE_ADD_16HL(p2p1q1q6, p2p1, q1q6) \ + DECLARE_ADD_16HL(p0q0q3q6, p0q0, q3q6) \ + DECLARE_ADD_16HL(q4q6q6q6, q4q6, q6q6) \ + u16x8 q5q6q6q6h = vec_madd(v3u16, q6h, q5h); \ + u16x8 q5q6q6q6l = vec_madd(v3u16, q6l, q5l); \ + DECLARE_ADD_16HL(p0q0q1q6, p0q0, q1q6) \ + DECLARE_ADD_16HL(p0q1q3q4, p0q1, q3q4) \ + \ + DECLARE_ADD_16HL(p6q2p2p1, p6q2, p2p1) \ + DECLARE_ADD_16HL(p1q0q2q5, p1q0, q2q5) \ + DECLARE_ADD_16HL(p0q1p5q0, p0q1, p5q0) \ + DECLARE_ADD_16HL(q0q1q2q6, q0q1, q2q6) \ + DECLARE_ADD_16HL(p3q1q2q6, p3q1, q2q6) \ + DECLARE_ADD_16HL(q2q6q4q6, q2q6, q4q6) \ + DECLARE_ADD_16HL(q3q6p1q5, q3q6, p1q5) \ + \ + DECLARE_ADD_16HL(p4p3p0q0p2p1q1q6, p4p3p0q0, p2p1q1q6) \ + DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0, p6p5p2p1, p4p3p0q0) \ + DECLARE_ADD_16HL(p2p1q1q6q2q5q3q4, p2p1q1q6, q2q5q3q4) \ + DECLARE_ADD_16HL(q2q5q3q4q4q6q6q6, q2q5q3q4, q4q6q6q6) \ + DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0p2q2p3q1, p6p5p2p1p4p3p0q0, p2q2p3q1) \ + DECLARE_ADD_16HL(p6p6p6p4p6p5p2p1p4p3p0q0, p6p6p6p4, p6p5p2p1p4p3p0q0) \ + DECLARE_ADD_16HL(p4p3p0q0p2p1q1q6q2q5q3q4, p4p3p0q0p2p1q1q6, q2q5q3q4) \ + DECLARE_ADD_16HL(p2p1q1q6q2q5q3q4p0q0q3q6, p2p1q1q6q2q5q3q4, p0q0q3q6) \ + DECLARE_ADD_16HL(p0q0q1q6q2q5q3q4q4q6q6q6, p0q0q1q6, q2q5q3q4q4q6q6q6) \ + DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0p0q1q3q4, p6p5p2p1p4p3p0q0, p0q1q3q4) \ + \ + DECLARE_ADD_16HL(op5, p6p6p6p4p6p5p2p1p4p3p0q0, p6p5p6p6) \ + DECLARE_ADD_16HL(op4, p6p6p6p4p6p5p2p1p4p3p0q0, p6p5p3q1) \ + DECLARE_ADD_16HL(op3, p6p6p6p4, p6p5p2p1p4p3p0q0p2q2p3q1) \ + DECLARE_ADD_16HL(op2, p6p6p1q3, p6p5p2p1p4p3p0q0p2q2p3q1) \ + DECLARE_ADD_16HL(op1, p6p5p2p1p4p3p0q0p0q1q3q4, p6q2p2p1) \ + DECLARE_ADD_16HL(op0, p6p5p2p1p4p3p0q0p0q1q3q4, p1q0q2q5) \ 
+ DECLARE_ADD_16HL(oq0, p4p3p0q0p2p1q1q6q2q5q3q4, p0q1p5q0) \ + DECLARE_ADD_16HL(oq1, p4p3p0q0p2p1q1q6q2q5q3q4, q0q1q2q6) \ + DECLARE_ADD_16HL(oq2, p2p1q1q6q2q5q3q4p0q0q3q6, p3q1q2q6) \ + DECLARE_ADD_16HL(oq3, p2p1q1q6q2q5q3q4p0q0q3q6, q2q6q4q6) \ + DECLARE_ADD_16HL(oq4, p0q0q1q6q2q5q3q4q4q6q6q6, q3q6p1q5) \ + DECLARE_ADD_16HL(oq5, p0q0q1q6q2q5q3q4q4q6q6q6, q5q6q6q6) \ + \ + ADD_AND_SHIFT8(op5) \ + ADD_AND_SHIFT8(op4) \ + ADD_AND_SHIFT8(op3) \ + ADD_AND_SHIFT8(op2) \ + ADD_AND_SHIFT8(op1) \ + ADD_AND_SHIFT8(op0) \ + ADD_AND_SHIFT8(oq0) \ + ADD_AND_SHIFT8(oq1) \ + ADD_AND_SHIFT8(oq2) \ + ADD_AND_SHIFT8(oq3) \ + ADD_AND_SHIFT8(oq4) \ + ADD_AND_SHIFT8(oq5) \ + \ + p5 = PACK_AND_SEL(p5, apply_16); \ + p4 = PACK_AND_SEL(p4, apply_16); \ + p3 = PACK_AND_SEL(p3, apply_16); \ + p2 = PACK_AND_SEL(p2, apply_16); \ + p1 = PACK_AND_SEL(p1, apply_16); \ + p0 = PACK_AND_SEL(p0, apply_16); \ + q0 = PACK_AND_SEL(q0, apply_16); \ + q1 = PACK_AND_SEL(q1, apply_16); \ + q2 = PACK_AND_SEL(q2, apply_16); \ + q3 = PACK_AND_SEL(q3, apply_16); \ + q4 = PACK_AND_SEL(q4, apply_16); \ + q5 = PACK_AND_SEL(q5, apply_16); \ + + + +static inline void store_h_4(u8x16 out, uint8_t *dst, int stridea) +{ + u8x16 out1 = (u8x16)vec_splat((u32x4)out, 1); + u8x16 out2 = (u8x16)vec_splat((u32x4)out, 2); + u8x16 out3 = (u8x16)vec_splat((u32x4)out, 3); + vec_xst_len(out, dst, 4); + dst += stridea; + vec_xst_len(out1, dst, 4); + dst += stridea; + vec_xst_len(out2, dst, 4); + dst += stridea; + vec_xst_len(out3, dst, 4); +} + +static inline void store_h_8(u8x16 outa, u8x16 outb, uint8_t *dst, int stridea) +{ + u8x16 out1 = (u8x16)vec_mergel((u64x2)outa, (u64x2)outa); + u8x16 out3 = (u8x16)vec_mergel((u64x2)outb, (u64x2)outb); + vec_xst_len(outa, dst, 6); + dst += stridea; + vec_xst_len(out1, dst, 6); + dst += stridea; + vec_xst_len(outb, dst, 6); + dst += stridea; + vec_xst_len(out3, dst, 6); +} + +// Assume a layout {v}0 {v}1 {v}2 {v}3, produces {v}01 {v}23 +#define MERGEH_4(v) \ + u8x16 v##01 = vec_mergeh(v##0, v##1); \ + u8x16 v##23 = vec_mergeh(v##2, v##3); + +#define MERGEL_4(v) \ + u8x16 v##01 = vec_mergel(v##0, v##1); \ + u8x16 v##23 = vec_mergel(v##2, v##3); + +// produce {v}0123h +#define MERGEH_U16_0123(v) \ + u16x8 v##0123h = vec_mergeh((u16x8)v##01, (u16x8)v##23); + +#define MERGEHL_U16_0123(v) \ + u16x8 v##0123l = vec_mergel((u16x8)v##01, (u16x8)v##23); + +#define MERGE_U16_0123(v) \ + u16x8 v##0123h = vec_mergeh((u16x8)v##01, (u16x8)v##23); \ + u16x8 v##0123l = vec_mergel((u16x8)v##01, (u16x8)v##23); + +// produce {ac,bd}0123h{dir} +#define MERGEH_U32_LINE(dir) \ + u32x4 ac0123h##dir = vec_mergeh((u32x4)a0123##dir, (u32x4)c0123##dir); \ + u32x4 bd0123h##dir = vec_mergeh((u32x4)b0123##dir, (u32x4)d0123##dir); + +#define MERGEL_U32_LINE(dir) \ + u32x4 ac0123l##dir = vec_mergel((u32x4)a0123##dir, (u32x4)c0123##dir); \ + u32x4 bd0123l##dir = vec_mergel((u32x4)b0123##dir, (u32x4)d0123##dir); + + +// produce the pair of mergeh/mergel of {ac,bd}01234{dira}{dirb} +#define MERGE_U32(oh, ol, dira, dirb) \ + oh = (u8x16)vec_mergeh(ac0123##dira##dirb, bd0123##dira##dirb); \ + ol = (u8x16)vec_mergel(ac0123##dira##dirb, bd0123##dira##dirb); + +#define MERGEHL_U8(a, b) \ + u8x16 a##b##h = vec_mergeh(a, b); \ + u8x16 a##b##l = vec_mergel(a, b); + +#define MERGEHL_U16(out, a, b) \ + u8x16 out##h = (u8x16)vec_mergeh((u16x8)a, (u16x8)b); \ + u8x16 out##l = (u8x16)vec_mergel((u16x8)a, (u16x8)b); + +#define MERGEHL_U32(out, a, b) \ + u8x16 out##h = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \ + u8x16 out##l = (u8x16)vec_mergel((u32x4)a, 
(u32x4)b); + +static inline void +loop_filter_h_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, + const ptrdiff_t stridea, b32x4 apply + HIGHBD_DECL_SUFFIX) +{ + dst -= 2; + uint8_t *dst2 = dst; + u8x16 p1, p0, q0, q1; + + LOAD4_H(a) + dst += stridea; + LOAD4_H(b) + dst += stridea; + LOAD4_H(c) + dst += stridea; + LOAD4_H(d) + + MERGEH_4(a) + MERGEH_4(b) + MERGEH_4(c) + MERGEH_4(d) + + MERGEH_U16_0123(a) + MERGEH_U16_0123(b) + MERGEH_U16_0123(c) + MERGEH_U16_0123(d) + + MERGEH_U32_LINE(h) + MERGEL_U32_LINE(h) + + MERGE_U32(p1, p0, h, h) + MERGE_U32(q0, q1, l, h) + + const u8x16 zero = vec_splat_u8(0); + const u8x16 v1u8 = vec_splat_u8(1); + const b8x16 s = (b8x16)vec_splats((uint8_t)128); + + const u8x16 a_p1_p0 = vec_absd(p1, p0); + const u8x16 a_q1_q0 = vec_absd(q1, q0); + const u8x16 a_p0_q0 = vec_absd(p0, q0); + const u8x16 a_p1_q1 = vec_absd(p1, q1); + + u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); + const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); + const u8x16 cmp_I = max_a_p1p0_q1q0; + cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); + const b8x16 ltI = vec_cmple(cmp_I, I); + const b8x16 ltE = vec_cmple(cmp_E, E); + b8x16 fm = vec_and(ltI, ltE); + + fm = vec_and(fm, (b8x16)apply); + if (vec_all_eq(fm, zero)) + return; + + UNPACK_16(p0) + UNPACK_16(q0) + + APPLY_4 + + u8x16 p1p0ab = (u8x16)vec_mergeh(p1, p0); // p1 p0 ... + u8x16 q0q1ab = (u8x16)vec_mergeh(q0, q1); // q0 q1 ... + u8x16 p1p0cd = (u8x16)vec_mergel(p1, p0); // p1 p0 ... + u8x16 q0q1cd = (u8x16)vec_mergel(q0, q1); // q0 q1 ... + + u8x16 outa = (u8x16)vec_mergeh((u16x8)p1p0ab, (u16x8)q0q1ab); // op1 op0 oq0 oq1 ... + u8x16 outb = (u8x16)vec_mergel((u16x8)p1p0ab, (u16x8)q0q1ab); + u8x16 outc = (u8x16)vec_mergeh((u16x8)p1p0cd, (u16x8)q0q1cd); + u8x16 outd = (u8x16)vec_mergel((u16x8)p1p0cd, (u16x8)q0q1cd); + + if (apply[0]) { + store_h_4(outa, dst2, stridea); + } + dst2 += 4 * stridea; + if (apply[1]) { + store_h_4(outb, dst2, stridea); + } + dst2 += 4 * stridea; + if (apply[2]) { + store_h_4(outc, dst2, stridea); + } + dst2 += 4 * stridea; + if (apply[3]) { + store_h_4(outd, dst2, stridea); + } +} + +static inline void +loop_filter_h_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, + const ptrdiff_t stridea, b32x4 apply, b32x4 m6 + HIGHBD_DECL_SUFFIX) +{ + uint8_t *dst2 = dst - 2; + dst -= 3; + u8x16 p2, p1, p0, q0, q1, q2; + + LOAD4_H(a) + dst += stridea; + LOAD4_H(b) + dst += stridea; + LOAD4_H(c) + dst += stridea; + LOAD4_H(d) + + MERGEH_4(a) + MERGEH_4(b) + MERGEH_4(c) + MERGEH_4(d) + + MERGE_U16_0123(a) + MERGE_U16_0123(b) + MERGE_U16_0123(c) + MERGE_U16_0123(d) + + MERGEH_U32_LINE(h) + MERGEL_U32_LINE(h) + MERGEH_U32_LINE(l) + + MERGE_U32(p2, p1, h, h) + MERGE_U32(p0, q0, l, h) + MERGE_U32(q1, q2, h, l) + + const u8x16 F = vec_splat_u8(1); + + const u8x16 zero = vec_splat_u8(0); + const u16x8 v3u16 = vec_splat_u16(3); + const u16x8 v4u16 = vec_splat_u16(4); + const u8x16 v1u8 = vec_splat_u8(1); + const b8x16 s = (b8x16)vec_splats((uint8_t)128); + + const u8x16 a_p1_p0 = vec_absd(p1, p0); + const u8x16 a_q1_q0 = vec_absd(q1, q0); + const u8x16 a_p0_q0 = vec_absd(p0, q0); + const u8x16 a_p1_q1 = vec_absd(p1, q1); + const u8x16 a_p2_p1 = vec_absd(p2, p1); + const u8x16 a_q2_q1 = vec_absd(q2, q1); + const u8x16 a_p2_p0 = vec_absd(p2, p0); + const u8x16 a_q2_q0 = vec_absd(q2, q0); + + u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1); + u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); + const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); + u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0); + u8x16 cmp_I_m6 = 
max_a_p2p1_q2q1; + u8x16 cmp_I_m4 = max_a_p1p0_q1q0; + cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); + cmp_I_m6 = vec_and(cmp_I_m6, (u8x16)m6); + u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m6); + const b8x16 ltE = vec_cmple(cmp_E, E); + const b8x16 ltI = vec_cmple(cmp_I, I); + b8x16 fm = vec_and(ltI, ltE); + + fm = vec_and(fm, (b8x16)apply); + if (vec_all_eq(fm, zero)) + return; + + UNPACK_16(p2) + UNPACK_16(p1) + UNPACK_16(p0) + UNPACK_16(q0) + UNPACK_16(q1) + UNPACK_16(q2) + + m6 = vec_and(m6, (b32x4)fm); + + u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0); + b8x16 apply_6 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m6); + + b8x16 apply_4 = vec_andc(fm, apply_6); + + if (vec_any_ne(apply_4, zero)) { + APPLY_4 + } + + if (vec_any_ne(apply_6, zero)) { + DECLARE_ADD_16HL(p2p2, p2, p2) + DECLARE_ADD_16HL(p2p1, p2, p1) + DECLARE_ADD_16HL(p1p0, p1, p0) + DECLARE_ADD_16HL(p0q0, p0, q0) + DECLARE_ADD_16HL(q0q1, q0, q1) + DECLARE_ADD_16HL(q1q2, q1, q2) + DECLARE_ADD_16HL(p2p2p0q0, p2p2, p0q0) + DECLARE_ADD_16HL(p2p1p1p0, p2p1, p1p0) + DECLARE_ADD_16HL(p1p0q1q2, p1p0, q1q2) + DECLARE_ADD_16HL(p0q0q0q1, p0q0, q0q1) + u16x8 q1q2q2q2h = q2h * 3 + q1h; + u16x8 q1q2q2q2l = q2l * 3 + q1l; + + DECLARE_ADD_16HL(op1, p2p2p0q0, p2p1p1p0) + DECLARE_ADD_16HL(op0, p2p1p1p0, p0q0q0q1) + DECLARE_ADD_16HL(oq0, p1p0q1q2, p0q0q0q1) + DECLARE_ADD_16HL(oq1, p0q0q0q1, q1q2q2q2) + + ADD_AND_SHIFT4(op1) + ADD_AND_SHIFT4(op0) + ADD_AND_SHIFT4(oq0) + ADD_AND_SHIFT4(oq1) + + p1 = PACK_AND_SEL(p1, apply_6); + p0 = PACK_AND_SEL(p0, apply_6); + q0 = PACK_AND_SEL(q0, apply_6); + q1 = PACK_AND_SEL(q1, apply_6); + } + + u8x16 p1p0ab = (u8x16)vec_mergeh(p1, p0); // p1 p0 ... + u8x16 q0q1ab = (u8x16)vec_mergeh(q0, q1); // q0 q1 ... + u8x16 p1p0cd = (u8x16)vec_mergel(p1, p0); // p1 p0 ... + u8x16 q0q1cd = (u8x16)vec_mergel(q0, q1); // q0 q1 ... + + u8x16 outa = (u8x16)vec_mergeh((u16x8)p1p0ab, (u16x8)q0q1ab); // op1 op0 oq0 oq1 ... 
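For readers cross-checking the vec_add chains in the apply_6 branch above, the accumulated taps together with the (x + 4) >> 3 rounding of ADD_AND_SHIFT4 reduce to the scalar form below; this is a hedged helper with illustrative names read off those sums, not dav1d code.

#include <stdint.h>

/* Scalar reading of the wd=6 flat filter outputs op1, op0, oq0, oq1.
 * Inputs are 8-bit pixel values promoted to int. */
static void flat6_taps(int p2, int p1, int p0, int q0, int q1, int q2,
                       uint8_t out[4]) {
    out[0] = (uint8_t)((3 * p2 + 2 * p1 + 2 * p0 +     q0          + 4) >> 3); /* op1 */
    out[1] = (uint8_t)((    p2 + 2 * p1 + 2 * p0 + 2 * q0 +     q1 + 4) >> 3); /* op0 */
    out[2] = (uint8_t)((    p1 + 2 * p0 + 2 * q0 + 2 * q1 +     q2 + 4) >> 3); /* oq0 */
    out[3] = (uint8_t)((    p0 + 2 * q0 + 2 * q1 + 3 * q2          + 4) >> 3); /* oq1 */
}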
+ u8x16 outb = (u8x16)vec_mergel((u16x8)p1p0ab, (u16x8)q0q1ab); + u8x16 outc = (u8x16)vec_mergeh((u16x8)p1p0cd, (u16x8)q0q1cd); + u8x16 outd = (u8x16)vec_mergel((u16x8)p1p0cd, (u16x8)q0q1cd); + + if (apply[0]) { + store_h_4(outa, dst2, stridea); + } + dst2 += 4 * stridea; + if (apply[1]) { + store_h_4(outb, dst2, stridea); + } + dst2 += 4 * stridea; + if (apply[2]) { + store_h_4(outc, dst2, stridea); + } + dst2 += 4 * stridea; + if (apply[3]) { + store_h_4(outd, dst2, stridea); + } +} + +static inline void +loop_filter_h_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, + const ptrdiff_t stridea, b32x4 apply, b32x4 m8 + HIGHBD_DECL_SUFFIX) +{ + uint8_t *dst2 = dst - 3; + dst -= 4; + u8x16 p3, p2, p1, p0, q0, q1, q2, q3; + + LOAD4_H(a) + dst += stridea; + LOAD4_H(b) + dst += stridea; + LOAD4_H(c) + dst += stridea; + LOAD4_H(d) + + MERGEH_4(a) + MERGEH_4(b) + MERGEH_4(c) + MERGEH_4(d) + + MERGE_U16_0123(a) + MERGE_U16_0123(b) + MERGE_U16_0123(c) + MERGE_U16_0123(d) + + MERGEH_U32_LINE(h) + MERGEL_U32_LINE(h) + MERGEH_U32_LINE(l) + MERGEL_U32_LINE(l) + + MERGE_U32(p3, p2, h, h) + MERGE_U32(p1, p0, l, h) + MERGE_U32(q0, q1, h, l) + MERGE_U32(q2, q3, l, l) + + const u8x16 F = vec_splat_u8(1); + + const u8x16 zero = vec_splat_u8(0); + const u16x8 v3u16 = vec_splat_u16(3); + const u16x8 v4u16 = vec_splat_u16(4); + const u8x16 v1u8 = vec_splat_u8(1); + const b8x16 s = (b8x16)vec_splats((uint8_t)128); + + const u8x16 a_p1_p0 = vec_absd(p1, p0); + const u8x16 a_q1_q0 = vec_absd(q1, q0); + const u8x16 a_p0_q0 = vec_absd(p0, q0); + const u8x16 a_p1_q1 = vec_absd(p1, q1); + const u8x16 a_p2_p1 = vec_absd(p2, p1); + const u8x16 a_q2_q1 = vec_absd(q2, q1); + const u8x16 a_p2_p0 = vec_absd(p2, p0); + const u8x16 a_q2_q0 = vec_absd(q2, q0); + const u8x16 a_p3_p0 = vec_absd(p3, p0); + const u8x16 a_q3_q0 = vec_absd(q3, q0); + const u8x16 a_p3_p2 = vec_absd(p3, p2); + const u8x16 a_q3_q2 = vec_absd(q3, q2); + + u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1); + u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2); + u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); + const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); + const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0); + u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0); + u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2); + u8x16 cmp_I_m4 = max_a_p1p0_q1q0; + cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); + cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8); + u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8); + const b8x16 ltE = vec_cmple(cmp_E, E); + const b8x16 ltI = vec_cmple(cmp_I, I); + b8x16 fm = vec_and(ltI, ltE); + + fm = vec_and(fm, (b8x16)apply); + if (vec_all_eq(fm, zero)) + return; + + #define UNPACK_16(v) \ + u16x8 v##h = u8h_to_u16(v); \ + u16x8 v##l = u8l_to_u16(v); + + UNPACK_16(p3) + UNPACK_16(p2) + UNPACK_16(p1) + UNPACK_16(p0) + UNPACK_16(q0) + UNPACK_16(q1) + UNPACK_16(q2) + UNPACK_16(q3) + + m8 = vec_and(m8, (b32x4)fm); + + u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0); + cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in); + b8x16 apply_8 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m8); + + b8x16 apply_4 = vec_andc(fm, apply_8); + + if (vec_any_ne(apply_4, zero)) { + APPLY_4 + } + + if (vec_any_ne(apply_8, zero)) { + APPLY_8 + } + + MERGEHL_U8(p2, p1) // A0 A1 A2 A3 B0 B1 B2 B3 + MERGEHL_U8(p0, q0) + MERGEHL_U8(q1, q2) + + MERGEHL_U16(ab_p2p1p0q0, p2p1h, p0q0h) // A0 p2 p1 p0 q0 | A1 p2 p1 p0 q0 | A2 ... + // B0 ... + MERGEHL_U16(cd_p2p1p0q0, p2p1l, p0q0l) // C0 ... + // D0 ... 
+ MERGEHL_U16(ab_q1q2, q1q2h, q1q2h) // A0 q1 q2 q1 q2 | A1 q1 q2 ... + // B0 ... + MERGEHL_U16(cd_q1q2, q1q2l, q1q2l) // C0 ... + // D0 ... + + MERGEHL_U32(a, ab_p2p1p0q0h, ab_q1q2h) // A0 p2 p1 p0 q0 q1 q2 q1 q2 | A1 .. + // A2 ... | A3 ... + MERGEHL_U32(b, ab_p2p1p0q0l, ab_q1q2l) // B0 ... + // C2 ... + MERGEHL_U32(c, cd_p2p1p0q0h, cd_q1q2h) // C0 ... + // C2 + MERGEHL_U32(d, cd_p2p1p0q0l, cd_q1q2l) // D0 .. + // D2 .. + if (apply[0]) { + store_h_8(ah, al, dst2, stridea); + } + dst2 += 4 * stridea; + if (apply[1]) { + store_h_8(bh, bl, dst2, stridea); + } + dst2 += 4 * stridea; + + if (apply[2]) { + store_h_8(ch, cl, dst2, stridea); + } + dst2 += 4 * stridea; + if (apply[3]) { + store_h_8(dh, dl, dst2, stridea); + } + +} + +static inline void +loop_filter_h_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, + const ptrdiff_t stridea, b32x4 apply, b32x4 m8, b32x4 m16 + HIGHBD_DECL_SUFFIX) +{ + uint8_t *dst2 = dst -6 ; + dst -= 7; + u8x16 p3, p2, p1, p0, q0, q1, q2, q3; + u8x16 p6, p5, p4, q4, q5, q6; + + LOAD4_H(a) + dst += stridea; + LOAD4_H(b) + dst += stridea; + LOAD4_H(c) + dst += stridea; + LOAD4_H(d) + + { + MERGEH_4(a) + MERGEH_4(b) + MERGEH_4(c) + MERGEH_4(d) + + MERGE_U16_0123(a) + MERGE_U16_0123(b) + MERGE_U16_0123(c) + MERGE_U16_0123(d) + + MERGEH_U32_LINE(h) + MERGEL_U32_LINE(h) + MERGEH_U32_LINE(l) + MERGEL_U32_LINE(l) + + MERGE_U32(p6, p5, h, h) + MERGE_U32(p4, p3, l, h) + MERGE_U32(p2, p1, h, l) + MERGE_U32(p0, q0, l, l) + } + { + MERGEL_4(a) + MERGEL_4(b) + MERGEL_4(c) + MERGEL_4(d) + + MERGE_U16_0123(a) + MERGE_U16_0123(b) + MERGE_U16_0123(c) + MERGE_U16_0123(d) + + MERGEH_U32_LINE(h) + MERGEL_U32_LINE(h) + MERGEH_U32_LINE(l) + + MERGE_U32(q1, q2, h, h) + MERGE_U32(q3, q4, l, h) + MERGE_U32(q5, q6, h, l) + } + + const u8x16 F = vec_splat_u8(1); + + const u8x16 zero = vec_splat_u8(0); + const u16x8 v3u16 = vec_splat_u16(3); + const u16x8 v4u16 = vec_splat_u16(4); + const u16x8 v8u16 = vec_splat_u16(8); + const u8x16 v1u8 = vec_splat_u8(1); + const b8x16 s = (b8x16)vec_splats((uint8_t)128); + + const u8x16 a_p6_p0 = vec_absd(p6, p0); + const u8x16 a_p5_p0 = vec_absd(p5, p0); + const u8x16 a_p4_p0 = vec_absd(p4, p0); + const u8x16 a_q4_q0 = vec_absd(q4, q0); + const u8x16 a_q5_q0 = vec_absd(q5, q0); + const u8x16 a_q6_q0 = vec_absd(q6, q0); + + const u8x16 a_p1_p0 = vec_absd(p1, p0); + const u8x16 a_q1_q0 = vec_absd(q1, q0); + const u8x16 a_p0_q0 = vec_absd(p0, q0); + const u8x16 a_p1_q1 = vec_absd(p1, q1); + const u8x16 a_p2_p1 = vec_absd(p2, p1); + const u8x16 a_q2_q1 = vec_absd(q2, q1); + const u8x16 a_p2_p0 = vec_absd(p2, p0); + const u8x16 a_q2_q0 = vec_absd(q2, q0); + const u8x16 a_p3_p0 = vec_absd(p3, p0); + const u8x16 a_q3_q0 = vec_absd(q3, q0); + const u8x16 a_p3_p2 = vec_absd(p3, p2); + const u8x16 a_q3_q2 = vec_absd(q3, q2); + + u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1); + u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2); + u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); + const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); + const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0); + + const u8x16 max_a_p4p0_q4q0 = vec_max(a_p4_p0, a_q4_q0); + const u8x16 max_a_p5p0_q5q0 = vec_max(a_p5_p0, a_q5_q0); + const u8x16 max_a_p6p0_q6q0 = vec_max(a_p6_p0, a_q6_q0); + + b32x4 m8_16 = vec_or(m8, m16); + + u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0); + u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2); + u8x16 cmp_I_m4 = max_a_p1p0_q1q0; + cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); + cmp_I_m8 = vec_and(cmp_I_m8, (b8x16)m8_16); + u8x16 cmp_I = 
vec_max(cmp_I_m4, cmp_I_m8); + const b8x16 ltE = vec_cmple(cmp_E, E); + const b8x16 ltI = vec_cmple(cmp_I, I); + b8x16 fm = vec_and(ltI, ltE); + + fm = vec_and(fm, (b8x16)apply); + if (vec_all_eq(fm, zero)) + return; + + u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0); + u8x16 cmp_flat8out = vec_max(max_a_p6p0_q6q0, max_a_p5p0_q5q0); + + m8_16 = vec_and(m8_16, (b32x4)fm); + m16 = vec_and(m16, (b32x4)fm); + + cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in); + cmp_flat8out = vec_max(max_a_p4p0_q4q0, cmp_flat8out); + b8x16 flat8in = vec_cmple(cmp_flat8in, F); + b8x16 flat8out = vec_cmple(cmp_flat8out, F); + flat8in = vec_and(flat8in, (b8x16)m8_16); + flat8out = vec_and(flat8out, (b8x16)m16); + + b8x16 apply_16 = vec_and(flat8out, flat8in); + b8x16 apply_8 = vec_andc(flat8in, flat8out); + + UNPACK_16(p6) + UNPACK_16(p5) + UNPACK_16(p4) + UNPACK_16(p3) + UNPACK_16(p2) + UNPACK_16(p1) + UNPACK_16(p0) + + b8x16 apply_4 = vec_and(fm, vec_nor(apply_16, apply_8)); + + UNPACK_16(q0) + UNPACK_16(q1) + UNPACK_16(q2) + UNPACK_16(q3) + UNPACK_16(q4) + UNPACK_16(q5) + UNPACK_16(q6) + + if (vec_any_ne(apply_4, zero)) { + APPLY_4 + } + + if (vec_any_ne(apply_16, zero)) { + APPLY_16 + } + + if (vec_any_ne(apply_8, zero)) { + APPLY_8 + } + + MERGEHL_U8(p5, p4) + MERGEHL_U8(p3, p2) + MERGEHL_U8(p1, p0) + MERGEHL_U8(q0, q1) + MERGEHL_U8(q2, q3) + MERGEHL_U8(q4, q5) + + MERGEHL_U16(ab_p5p4p3p2, p5p4h, p3p2h) + MERGEHL_U16(cd_p5p4p3p2, p5p4l, p3p2l) + MERGEHL_U16(ab_p1p0q0q1, p1p0h, q0q1h) + MERGEHL_U16(cd_p1p0q0q1, p1p0l, q0q1l) + MERGEHL_U16(ab_q2q3q4q5, q2q3h, q4q5h) + MERGEHL_U16(cd_q2q3q4q5, q2q3l, q4q5l) + + + MERGEHL_U32(a_p5p4p3p2q2q3q4q5, ab_p5p4p3p2h, ab_q2q3q4q5h) // A0 p5p4p3p2 q2q3q4q5 A1 + // A2 A3 + MERGEHL_U32(a_p1p0q0q1q2q3q4q5, ab_p1p0q0q1h, ab_q2q3q4q5h) // A0 p1p0q0q1 q2q3q4q5 A1 + // A2 A3 + MERGEHL_U32(b_p5p4p3p2q2q3q4q5, ab_p5p4p3p2l, ab_q2q3q4q5l) // B0 p5p4p3p2 q2q3q4q5 B1 + // A2 A3 + MERGEHL_U32(b_p1p0q0q1q2q3q4q5, ab_p1p0q0q1l, ab_q2q3q4q5l) // B0 p1p0q0q1 q2q3q4q5 B1 + // B2 B3 + MERGEHL_U32(c_p5p4p3p2q2q3q4q5, cd_p5p4p3p2h, cd_q2q3q4q5h) // C0 p5p4p3p2 q2q3q4q5 C1 + // C2 C3 + MERGEHL_U32(c_p1p0q0q1q2q3q4q5, cd_p1p0q0q1h, cd_q2q3q4q5h) // C0 p1p0q0q1 q2q3q4q5 C1 + // C2 C3 + MERGEHL_U32(d_p5p4p3p2q2q3q4q5, cd_p5p4p3p2l, cd_q2q3q4q5l) // D0 p5p4p3p2 q2q3q4q5 D1 + // D2 D3 + MERGEHL_U32(d_p1p0q0q1q2q3q4q5, cd_p1p0q0q1l, cd_q2q3q4q5l) // D0 p1p0q0q1 q2q3q4q5 D1 + // D2 D3 + + MERGEHL_U32(a01, a_p5p4p3p2q2q3q4q5h, a_p1p0q0q1q2q3q4q5h) // A0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5 + // A1 + vec_xst_len(a01h, dst2, 12); + dst2 += stridea; + vec_xst_len(a01l, dst2, 12); + dst2 += stridea; + MERGEHL_U32(a23, a_p5p4p3p2q2q3q4q5l, a_p1p0q0q1q2q3q4q5l) // A2 + // A3 + vec_xst_len(a23h, dst2, 12); + dst2 += stridea; + vec_xst_len(a23l, dst2, 12); + dst2 += stridea; + MERGEHL_U32(b01, b_p5p4p3p2q2q3q4q5h, b_p1p0q0q1q2q3q4q5h) // B0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5 + // B1 + vec_xst_len(b01h, dst2, 12); + dst2 += stridea; + vec_xst_len(b01l, dst2, 12); + dst2 += stridea; + MERGEHL_U32(b23, b_p5p4p3p2q2q3q4q5l, b_p1p0q0q1q2q3q4q5l) // B2 + // B3 + vec_xst_len(b23h, dst2, 12); + dst2 += stridea; + vec_xst_len(b23l, dst2, 12); + dst2 += stridea; + MERGEHL_U32(c01, c_p5p4p3p2q2q3q4q5h, c_p1p0q0q1q2q3q4q5h) // C0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5 + // C1 + vec_xst_len(c01h, dst2, 12); + dst2 += stridea; + vec_xst_len(c01l, dst2, 12); + dst2 += stridea; + MERGEHL_U32(c23, c_p5p4p3p2q2q3q4q5l, c_p1p0q0q1q2q3q4q5l) // C2 + // C3 + vec_xst_len(c23h, dst2, 12); + dst2 += stridea; + 
vec_xst_len(c23l, dst2, 12); + dst2 += stridea; + MERGEHL_U32(d01, d_p5p4p3p2q2q3q4q5h, d_p1p0q0q1q2q3q4q5h) // D0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5 + // D1 + vec_xst_len(d01h, dst2, 12); + dst2 += stridea; + vec_xst_len(d01l, dst2, 12); + dst2 += stridea; + MERGEHL_U32(d23, d_p5p4p3p2q2q3q4q5l, d_p1p0q0q1q2q3q4q5l) // D2 + // D3 + vec_xst_len(d23h, dst2, 12); + dst2 += stridea; + vec_xst_len(d23l, dst2, 12); + dst2 += stridea; +} + +static inline void +loop_filter_v_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, + const ptrdiff_t strideb, b32x4 apply + HIGHBD_DECL_SUFFIX) +{ + uint8_t *p1d = dst + strideb * -2; + uint8_t *p0d = dst + strideb * -1; + uint8_t *q0d = dst + strideb * +0; + uint8_t *q1d = dst + strideb * +1; + + u8x16 p1 = vec_xl(0, p1d); + u8x16 p0 = vec_xl(0, p0d); + u8x16 q0 = vec_xl(0, q0d); + u8x16 q1 = vec_xl(0, q1d); + + const u8x16 zero = vec_splat_u8(0); + const u8x16 v1u8 = vec_splat_u8(1); + const b8x16 s = (b8x16)vec_splats((uint8_t)128); + + const u8x16 a_p1_p0 = vec_absd(p1, p0); + const u8x16 a_q1_q0 = vec_absd(q1, q0); + const u8x16 a_p0_q0 = vec_absd(p0, q0); + const u8x16 a_p1_q1 = vec_absd(p1, q1); + + u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); + const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); + const u8x16 cmp_I = max_a_p1p0_q1q0; + cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); + const b8x16 ltI = vec_cmple(cmp_I, I); + const b8x16 ltE = vec_cmple(cmp_E, E); + b8x16 fm = vec_and(ltI, ltE); + + fm = vec_and(fm, (b8x16)apply); + if (vec_all_eq(fm, zero)) + return; + + UNPACK_16(p0) + UNPACK_16(q0) + + APPLY_4 + + vec_xst(p0, 0, p0d); + vec_xst(q0, 0, q0d); + vec_xst(q1, 0, q1d); + vec_xst(p1, 0, p1d); +} + +static inline void +loop_filter_v_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, + const ptrdiff_t strideb, b32x4 apply, b32x4 m6 + HIGHBD_DECL_SUFFIX) +{ + uint8_t *p2d = dst + strideb * -3; + uint8_t *p1d = dst + strideb * -2; + uint8_t *p0d = dst + strideb * -1; + uint8_t *q0d = dst + strideb * +0; + uint8_t *q1d = dst + strideb * +1; + uint8_t *q2d = dst + strideb * +2; + + u8x16 p2 = vec_xl(0, p2d); + u8x16 p1 = vec_xl(0, p1d); + u8x16 p0 = vec_xl(0, p0d); + u8x16 q0 = vec_xl(0, q0d); + u8x16 q1 = vec_xl(0, q1d); + u8x16 q2 = vec_xl(0, q2d); + + const u8x16 F = vec_splat_u8(1); + + const u8x16 zero = vec_splat_u8(0); + const u16x8 v3u16 = vec_splat_u16(3); + const u16x8 v4u16 = vec_splat_u16(4); + const u8x16 v1u8 = vec_splat_u8(1); + const b8x16 s = (b8x16)vec_splats((uint8_t)128); + + const u8x16 a_p1_p0 = vec_absd(p1, p0); + const u8x16 a_q1_q0 = vec_absd(q1, q0); + const u8x16 a_p0_q0 = vec_absd(p0, q0); + const u8x16 a_p1_q1 = vec_absd(p1, q1); + const u8x16 a_p2_p1 = vec_absd(p2, p1); + const u8x16 a_q2_q1 = vec_absd(q2, q1); + const u8x16 a_p2_p0 = vec_absd(p2, p0); + const u8x16 a_q2_q0 = vec_absd(q2, q0); + + u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1); + u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); + const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); + u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0); + u8x16 cmp_I_m6 = max_a_p2p1_q2q1; + u8x16 cmp_I_m4 = max_a_p1p0_q1q0; + cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); + cmp_I_m6 = vec_and(cmp_I_m6, (u8x16)m6); + u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m6); + const b8x16 ltE = vec_cmple(cmp_E, E); + const b8x16 ltI = vec_cmple(cmp_I, I); + b8x16 fm = vec_and(ltI, ltE); + + fm = vec_and(fm, (b8x16)apply); + if (vec_all_eq(fm, zero)) + return; + + UNPACK_16(p2) + UNPACK_16(p1) + UNPACK_16(p0) + UNPACK_16(q0) + UNPACK_16(q1) + UNPACK_16(q2) + + m6 = vec_and(m6, (b32x4)fm); 
+ + u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0); + b8x16 apply_6 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m6); + + b8x16 apply_4 = vec_andc(fm, apply_6); + + if (vec_any_ne(apply_4, zero)) { + APPLY_4 + } + + if (vec_any_ne(apply_6, zero)) { + DECLARE_ADD_16HL(p2p2, p2, p2) + DECLARE_ADD_16HL(p2p1, p2, p1) + DECLARE_ADD_16HL(p1p0, p1, p0) + DECLARE_ADD_16HL(p0q0, p0, q0) + DECLARE_ADD_16HL(q0q1, q0, q1) + DECLARE_ADD_16HL(q1q2, q1, q2) + DECLARE_ADD_16HL(p2p2p0q0, p2p2, p0q0) + DECLARE_ADD_16HL(p2p1p1p0, p2p1, p1p0) + DECLARE_ADD_16HL(p1p0q1q2, p1p0, q1q2) + DECLARE_ADD_16HL(p0q0q0q1, p0q0, q0q1) + u16x8 q1q2q2q2h = q2h * 3 + q1h; + u16x8 q1q2q2q2l = q2l * 3 + q1l; + + DECLARE_ADD_16HL(op1, p2p2p0q0, p2p1p1p0) + DECLARE_ADD_16HL(op0, p2p1p1p0, p0q0q0q1) + DECLARE_ADD_16HL(oq0, p1p0q1q2, p0q0q0q1) + DECLARE_ADD_16HL(oq1, p0q0q0q1, q1q2q2q2) + + ADD_AND_SHIFT4(op1) + ADD_AND_SHIFT4(op0) + ADD_AND_SHIFT4(oq0) + ADD_AND_SHIFT4(oq1) + + p1 = PACK_AND_SEL(p1, apply_6); + p0 = PACK_AND_SEL(p0, apply_6); + q0 = PACK_AND_SEL(q0, apply_6); + q1 = PACK_AND_SEL(q1, apply_6); + } + + vec_xst(p0, 0, p0d); + vec_xst(q0, 0, q0d); + vec_xst(q1, 0, q1d); + vec_xst(p1, 0, p1d); +} + +static inline void +loop_filter_v_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, + const ptrdiff_t strideb, b32x4 apply, b32x4 m8 + HIGHBD_DECL_SUFFIX) + +{ + + uint8_t *p3d = dst + strideb * -4; + uint8_t *p2d = dst + strideb * -3; + uint8_t *p1d = dst + strideb * -2; + uint8_t *p0d = dst + strideb * -1; + uint8_t *q0d = dst + strideb * +0; + uint8_t *q1d = dst + strideb * +1; + uint8_t *q2d = dst + strideb * +2; + uint8_t *q3d = dst + strideb * +3; + + u8x16 p3 = vec_xl(0, p3d); + u8x16 p2 = vec_xl(0, p2d); + u8x16 p1 = vec_xl(0, p1d); + u8x16 p0 = vec_xl(0, p0d); + u8x16 q0 = vec_xl(0, q0d); + u8x16 q1 = vec_xl(0, q1d); + u8x16 q2 = vec_xl(0, q2d); + u8x16 q3 = vec_xl(0, q3d); + + const u8x16 F = vec_splat_u8(1); + + const u8x16 zero = vec_splat_u8(0); + const u16x8 v3u16 = vec_splat_u16(3); + const u16x8 v4u16 = vec_splat_u16(4); + const u8x16 v1u8 = vec_splat_u8(1); + const b8x16 s = (b8x16)vec_splats((uint8_t)128); + + const u8x16 a_p1_p0 = vec_absd(p1, p0); + const u8x16 a_q1_q0 = vec_absd(q1, q0); + const u8x16 a_p0_q0 = vec_absd(p0, q0); + const u8x16 a_p1_q1 = vec_absd(p1, q1); + const u8x16 a_p2_p1 = vec_absd(p2, p1); + const u8x16 a_q2_q1 = vec_absd(q2, q1); + const u8x16 a_p2_p0 = vec_absd(p2, p0); + const u8x16 a_q2_q0 = vec_absd(q2, q0); + const u8x16 a_p3_p0 = vec_absd(p3, p0); + const u8x16 a_q3_q0 = vec_absd(q3, q0); + const u8x16 a_p3_p2 = vec_absd(p3, p2); + const u8x16 a_q3_q2 = vec_absd(q3, q2); + + u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1); + u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2); + u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); + const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); + const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0); + u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0); + u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2); + u8x16 cmp_I_m4 = max_a_p1p0_q1q0; + cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); + cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8); + u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8); + const b8x16 ltE = vec_cmple(cmp_E, E); + const b8x16 ltI = vec_cmple(cmp_I, I); + b8x16 fm = vec_and(ltI, ltE); + + fm = vec_and(fm, (b8x16)apply); + if (vec_all_eq(fm, zero)) + return; + + #define UNPACK_16(v) \ + u16x8 v##h = u8h_to_u16(v); \ + u16x8 v##l = u8l_to_u16(v); + + UNPACK_16(p3) + UNPACK_16(p2) + UNPACK_16(p1) + UNPACK_16(p0) + 
UNPACK_16(q0) + UNPACK_16(q1) + UNPACK_16(q2) + UNPACK_16(q3) + + m8 = vec_and(m8, (b32x4)fm); + + u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0); + cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in); + b8x16 apply_8 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m8); + + b8x16 apply_4 = vec_andc(fm, apply_8); + + if (vec_any_ne(apply_4, zero)) { + APPLY_4 + } + + if (vec_any_ne(apply_8, zero)) { + APPLY_8 + } + + vec_xst(p0, 0, p0d); + vec_xst(q0, 0, q0d); + vec_xst(q1, 0, q1d); + vec_xst(p1, 0, p1d); + vec_xst(q2, 0, q2d); + vec_xst(p2, 0, p2d); +} + +static inline void +loop_filter_v_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, + const ptrdiff_t strideb, b32x4 apply, b32x4 m8, b32x4 m16 + HIGHBD_DECL_SUFFIX) + +{ + + uint8_t *p6d = dst + strideb * -7; + uint8_t *p5d = dst + strideb * -6; + uint8_t *p4d = dst + strideb * -5; + uint8_t *p3d = dst + strideb * -4; + uint8_t *p2d = dst + strideb * -3; + uint8_t *p1d = dst + strideb * -2; + uint8_t *p0d = dst + strideb * -1; + uint8_t *q0d = dst + strideb * +0; + uint8_t *q1d = dst + strideb * +1; + uint8_t *q2d = dst + strideb * +2; + uint8_t *q3d = dst + strideb * +3; + uint8_t *q4d = dst + strideb * +4; + uint8_t *q5d = dst + strideb * +5; + uint8_t *q6d = dst + strideb * +6; + + u8x16 p6 = vec_xl(0, p6d); + u8x16 p5 = vec_xl(0, p5d); + u8x16 p4 = vec_xl(0, p4d); + u8x16 p3 = vec_xl(0, p3d); + u8x16 p2 = vec_xl(0, p2d); + u8x16 p1 = vec_xl(0, p1d); + u8x16 p0 = vec_xl(0, p0d); + u8x16 q0 = vec_xl(0, q0d); + u8x16 q1 = vec_xl(0, q1d); + u8x16 q2 = vec_xl(0, q2d); + u8x16 q3 = vec_xl(0, q3d); + u8x16 q4 = vec_xl(0, q4d); + u8x16 q5 = vec_xl(0, q5d); + u8x16 q6 = vec_xl(0, q6d); + + const u8x16 F = vec_splat_u8(1); + + const u8x16 zero = vec_splat_u8(0); + const u16x8 v3u16 = vec_splat_u16(3); + const u16x8 v4u16 = vec_splat_u16(4); + const u16x8 v8u16 = vec_splat_u16(8); + const u8x16 v1u8 = vec_splat_u8(1); + const b8x16 s = (b8x16)vec_splats((uint8_t)128); + + const u8x16 a_p6_p0 = vec_absd(p6, p0); + const u8x16 a_p5_p0 = vec_absd(p5, p0); + const u8x16 a_p4_p0 = vec_absd(p4, p0); + const u8x16 a_q4_q0 = vec_absd(q4, q0); + const u8x16 a_q5_q0 = vec_absd(q5, q0); + const u8x16 a_q6_q0 = vec_absd(q6, q0); + + const u8x16 a_p1_p0 = vec_absd(p1, p0); + const u8x16 a_q1_q0 = vec_absd(q1, q0); + const u8x16 a_p0_q0 = vec_absd(p0, q0); + const u8x16 a_p1_q1 = vec_absd(p1, q1); + const u8x16 a_p2_p1 = vec_absd(p2, p1); + const u8x16 a_q2_q1 = vec_absd(q2, q1); + const u8x16 a_p2_p0 = vec_absd(p2, p0); + const u8x16 a_q2_q0 = vec_absd(q2, q0); + const u8x16 a_p3_p0 = vec_absd(p3, p0); + const u8x16 a_q3_q0 = vec_absd(q3, q0); + const u8x16 a_p3_p2 = vec_absd(p3, p2); + const u8x16 a_q3_q2 = vec_absd(q3, q2); + + u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1); + u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2); + u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); + const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); + const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0); + + const u8x16 max_a_p4p0_q4q0 = vec_max(a_p4_p0, a_q4_q0); + const u8x16 max_a_p5p0_q5q0 = vec_max(a_p5_p0, a_q5_q0); + const u8x16 max_a_p6p0_q6q0 = vec_max(a_p6_p0, a_q6_q0); + + b32x4 m8_16 = vec_or(m8, m16); + + u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0); + u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2); + u8x16 cmp_I_m4 = max_a_p1p0_q1q0; + cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); + cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8_16); + u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8); + const b8x16 ltE = vec_cmple(cmp_E, E); + const b8x16 ltI = 
vec_cmple(cmp_I, I); + b8x16 fm = vec_and(ltI, ltE); + + fm = vec_and(fm, (b8x16)apply); + if (vec_all_eq(fm, zero)) + return; + + u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0); + u8x16 cmp_flat8out = vec_max(max_a_p6p0_q6q0, max_a_p5p0_q5q0); + + m8_16 = vec_and(m8_16, (b32x4)fm); + m16 = vec_and(m16, (b32x4)fm); + + cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in); + cmp_flat8out = vec_max(max_a_p4p0_q4q0, cmp_flat8out); + b8x16 flat8in = vec_cmple(cmp_flat8in, F); + b8x16 flat8out = vec_cmple(cmp_flat8out, F); + flat8in = vec_and(flat8in, (b8x16)m8_16); + flat8out = vec_and(flat8out, (b8x16)m16); + + b8x16 apply_16 = vec_and(flat8out, flat8in); + b8x16 apply_8 = vec_andc(flat8in, flat8out); + + UNPACK_16(p6) + UNPACK_16(p5) + UNPACK_16(p4) + UNPACK_16(p3) + UNPACK_16(p2) + UNPACK_16(p1) + UNPACK_16(p0) + + b8x16 apply_4 = vec_nor(apply_16, apply_8); + + UNPACK_16(q0) + UNPACK_16(q1) + UNPACK_16(q2) + UNPACK_16(q3) + UNPACK_16(q4) + UNPACK_16(q5) + UNPACK_16(q6) + + if (vec_any_ne(apply_4, zero)) { + APPLY_4 + } + if (vec_any_ne(apply_16, zero)) { + APPLY_16 + } + if (vec_any_ne(apply_8, zero)) { + APPLY_8 + } + + vec_xst(p5, 0, p5d); + vec_xst(p4, 0, p4d); + vec_xst(p3, 0, p3d); + vec_xst(p2, 0, p2d); + vec_xst(p1, 0, p1d); + vec_xst(p0, 0, p0d); + vec_xst(q0, 0, q0d); + vec_xst(q1, 0, q1d); + vec_xst(q2, 0, q2d); + vec_xst(q3, 0, q3d); + vec_xst(q4, 0, q4d); + vec_xst(q5, 0, q5d); +} + +#if defined(DAV1D_VSX) +#define LPF(fn) BF(dav1d_lpf_##fn, vsx) +#elif defined(DAV1D_PWR9) +#define LPF(fn) BF(dav1d_lpf_##fn, pwr9) +#endif + +void LPF(h_sb_y)(pixel *dst, const ptrdiff_t stride, + const uint32_t *const vmask, + const uint8_t (*l)[4], ptrdiff_t b4_stride, + const Av1FilterLUT *lut, const int h + HIGHBD_DECL_SUFFIX) +{ + unsigned vm = vmask[0] | vmask[1] | vmask[2]; + + u32x4 vm0 = vec_splats(vmask[0] | vmask[1] | vmask[2]); + u32x4 vm1 = vec_splats(vmask[1]); + u32x4 vm2 = vec_splats(vmask[2]); + u32x4 mm = (u32x4){1, 2, 4, 8}; + + const u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp); + const u8x16 s0 = vec_splat(sharp, 0); + const u8x16 s1 = vec_splat(sharp, 8); + const u32x4 v4u32 = vec_splat_u32(4); + const u32x4 zero = vec_splat_u32(0); + const u8x16 v1u8 = vec_splat_u8(1); + const u8x16 v2u8 = vec_splat_u8(2); + const u8x16 v4u8 = vec_splat_u8(4); + const uint8_t (*pl)[4] = &l[-1]; + + const u8x16 spread = (u8x16){ + 0x00, 0x00, 0x00, 0x00, + 0x04, 0x04, 0x04, 0x04, + 0x08, 0x08, 0x08, 0x08, + 0x0c, 0x0c, 0x0c, 0x0c, + }; + + for (; + vm; + vm >>= 4, + mm = vec_sl(mm, v4u32), + dst += 4 * 4 * PXSTRIDE(stride), + pl += 4 * b4_stride) { + if (!(vm & 0x0f)) + continue; + u32x4 la = (u32x4)vec_xl(0, (uint8_t *)pl); // l[-1] l[0] ... 
+ u32x4 lb = (u32x4)vec_xl(1 * 4 * b4_stride, (uint8_t *)pl); + u32x4 lc = (u32x4)vec_xl(2 * 4 * b4_stride, (uint8_t *)pl); + u32x4 ld = (u32x4)vec_xl(3 * 4 * b4_stride, (uint8_t *)pl); + + u32x4 Lac = vec_mergeh(la, lc); // la[-1] lb[-1] la[0] lb[0] + u32x4 Lbd = vec_mergeh(lb, ld); // lc[-1] ld[-1] lc[0] ld[0] + + u32x4 wd16 = vec_and(vm2, mm); // vmask[2] & [1,2,4,8] + u32x4 wd8 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8] + u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8] + + u32x4 L_1 = (u32x4)vec_mergeh(Lac, Lbd); // la[-1] lb[-1] lc[-1] ld[-1] + u32x4 L_0 = (u32x4)vec_mergel(Lac, Lbd); // la[ 0] lb[ 0] lc[ 0] ld[ 0] + + b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero); + + u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_1, mask); // if !l[0][0] { l[-1][0] } + + u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ... + + b32x4 m16 = vec_cmpeq(wd16, mm); + b32x4 m8 = vec_cmpeq(wd8, mm); + b32x4 m4 = vec_cmpeq(wd4, mm); + + b32x4 apply = vec_cmpne((u32x4)L, zero); + + if (vec_all_eq((u32x4)L, zero)) + continue; + + u8x16 I = vec_sr(L, s0); // L >> sharp[0] + u8x16 H = vec_sr(L, v4u8); + I = vec_min(I, s1); // min(L >> sharp[0], sharp[1]) + u8x16 E = vec_add(L, v2u8); // L + 2 + I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1) + E = vec_add(E, E); // 2 * (L + 2) + E = vec_add(E, I); // 2 * (L + 2) + limit + + apply = vec_and(m4, apply); + + if (vec_any_ne(wd16, zero)) { + loop_filter_h_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16 HIGHBD_TAIL_SUFFIX); + } else if (vec_any_ne(wd8, zero)) { + loop_filter_h_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8 HIGHBD_TAIL_SUFFIX); + } else { // wd4 == 0 already tested + loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX); + } + } +} + +void LPF(v_sb_y)(pixel *dst, const ptrdiff_t stride, + const uint32_t *const vmask, + const uint8_t (*l)[4], ptrdiff_t b4_stride, + const Av1FilterLUT *lut, const int w + HIGHBD_DECL_SUFFIX) +{ + unsigned vm = vmask[0] | vmask[1] | vmask[2]; + + u32x4 vm0 = vec_splats(vmask[0] | vmask[1] | vmask[2]); + u32x4 vm1 = vec_splats(vmask[1]); + u32x4 vm2 = vec_splats(vmask[2]); + + u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp); + u8x16 s0 = vec_splat(sharp, 0); + u8x16 s1 = vec_splat(sharp, 8); + u32x4 mm = (u32x4){1, 2, 4, 8}; + u32x4 v4u32 = vec_splat_u32(4); + u32x4 zero = vec_splat_u32(0); + u8x16 v1u8 = vec_splat_u8(1); + u8x16 v2u8 = vec_splat_u8(2); + u8x16 v4u8 = vec_splat_u8(4); + const uint8_t (*pl)[4] = l; + const uint8_t (*plb4)[4] = l - b4_stride; + const u8x16 spread = (u8x16){ + 0x00, 0x00, 0x00, 0x00, + 0x04, 0x04, 0x04, 0x04, + 0x08, 0x08, 0x08, 0x08, + 0x0c, 0x0c, 0x0c, 0x0c, + }; + + for (; + vm; + vm >>= 4, + mm = vec_sl(mm, v4u32), + dst += 4 * 4, + pl += 4, + plb4 += 4) { + if (!(vm & 0x0f)) + continue; + u32x4 L_0 = (u32x4)vec_xl(0, (uint8_t *)pl); + u32x4 L_b4 = (u32x4)vec_xl(0, (uint8_t *)plb4); + + u32x4 wd16 = vec_and(vm2, mm); // vmask[2] & [1,2,4,8] + u32x4 wd8 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8] + u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8] + + b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero); + + u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_b4, mask); // if !l[0][0] { l[-b4_stride][0] } + + u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ... 
+ + b32x4 m16 = vec_cmpeq(wd16, mm); + b32x4 m8 = vec_cmpeq(wd8, mm); + b32x4 m4 = vec_cmpeq(wd4, mm); + + b32x4 apply = vec_cmpne((u32x4)L, zero); + + if (vec_all_eq((u32x4)L, zero)) + continue; + + u8x16 I = vec_sr(L, s0); // L >> sharp[0] + u8x16 H = vec_sr(L, v4u8); + I = vec_min(I, s1); // min(L >> sharp[0], sharp[1]) + u8x16 E = vec_add(L, v2u8); // L + 2 + I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1) + E = vec_add(E, E); // 2 * (L + 2) + E = vec_add(E, I); // 2 * (L + 2) + limit + + apply = vec_and(apply, m4); + + if (vec_any_ne(wd16, zero)) { + loop_filter_v_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16 HIGHBD_TAIL_SUFFIX); + } else if (vec_any_ne(wd8, zero)) { + loop_filter_v_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8 HIGHBD_TAIL_SUFFIX); + } else { + loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX); + } + + } +} + +void LPF(h_sb_uv)(pixel *dst, const ptrdiff_t stride, + const uint32_t *const vmask, + const uint8_t (*l)[4], ptrdiff_t b4_stride, + const Av1FilterLUT *lut, const int h + HIGHBD_DECL_SUFFIX) +{ + unsigned vm = vmask[0] | vmask[1]; + u32x4 vm0 = vec_splats(vm); + u32x4 vm1 = vec_splats(vmask[1]); + u32x4 mm = (u32x4){1, 2, 4, 8}; + + const u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp); + const u8x16 s0 = vec_splat(sharp, 0); + const u8x16 s1 = vec_splat(sharp, 8); + const u32x4 v4u32 = vec_splat_u32(4); + const u32x4 zero = vec_splat_u32(0); + const u8x16 v1u8 = vec_splat_u8(1); + const u8x16 v2u8 = vec_splat_u8(2); + const u8x16 v4u8 = vec_splat_u8(4); + const uint8_t (*pl)[4] = &l[-1]; + const u8x16 spread = (u8x16){ + 0x00, 0x00, 0x00, 0x00, + 0x04, 0x04, 0x04, 0x04, + 0x08, 0x08, 0x08, 0x08, + 0x0c, 0x0c, 0x0c, 0x0c, + }; + + for (; + vm; + vm >>= 4, + mm = vec_sl(mm, v4u32), + dst += 4 * 4 * PXSTRIDE(stride), + pl += 4 * b4_stride) { + if (!(vm & 0x0f)) + continue; + u32x4 la = (u32x4)vec_xl(0, (uint8_t *)pl); // l[-1] l[0] ... + u32x4 lb = (u32x4)vec_xl(1 * 4 * b4_stride, (uint8_t *)pl); + u32x4 lc = (u32x4)vec_xl(2 * 4 * b4_stride, (uint8_t *)pl); + u32x4 ld = (u32x4)vec_xl(3 * 4 * b4_stride, (uint8_t *)pl); + + u32x4 Lac = vec_mergeh(la, lc); // la[-1] lb[-1] la[0] lb[0] + u32x4 Lbd = vec_mergeh(lb, ld); // lc[-1] ld[-1] lc[0] ld[0] + + u32x4 wd6 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8] + u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8] + + u32x4 L_1 = (u32x4)vec_mergeh(Lac, Lbd); // la[-1] lb[-1] lc[-1] ld[-1] + u32x4 L_0 = (u32x4)vec_mergel(Lac, Lbd); // la[ 0] lb[ 0] lc[ 0] ld[ 0] + + b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero); + + u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_1, mask); // if !l[0][0] { l[-1][0] } + + u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ... 
+ + b32x4 m6 = vec_cmpeq(wd6, mm); + b32x4 m4 = vec_cmpeq(wd4, mm); + + b32x4 apply = vec_cmpne((u32x4)L, zero); + + if (vec_all_eq((u32x4)L, zero)) + continue; + + u8x16 I = vec_sr(L, s0); // L >> sharp[0] + u8x16 H = vec_sr(L, v4u8); + I = vec_min(I, s1); // min(L >> sharp[0], sharp[1]) + u8x16 E = vec_add(L, v2u8); // L + 2 + I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1) + E = vec_add(E, E); // 2 * (L + 2) + E = vec_add(E, I); // 2 * (L + 2) + limit + + apply = vec_and(m4, apply); + + if (vec_any_ne(wd6, zero)) { + loop_filter_h_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6 HIGHBD_TAIL_SUFFIX); + // loop_filter_h_8 + } else { // wd4 == 0 already tested + loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX); + + // loop_filter_h_4 + } + + } +} + +void LPF(v_sb_uv)(pixel *dst, const ptrdiff_t stride, + const uint32_t *const vmask, + const uint8_t (*l)[4], ptrdiff_t b4_stride, + const Av1FilterLUT *lut, const int w + HIGHBD_DECL_SUFFIX) +{ + unsigned vm = vmask[0] | vmask[1]; + + u32x4 vm0 = vec_splats(vm); + u32x4 vm1 = vec_splats(vmask[1]); + + u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp); + u8x16 s0 = vec_splat(sharp, 0); + u8x16 s1 = vec_splat(sharp, 8); + u32x4 mm = (u32x4){1, 2, 4, 8}; + u32x4 v4u32 = vec_splat_u32(4); + u32x4 zero = vec_splat_u32(0); + u8x16 v1u8 = vec_splat_u8(1); + u8x16 v2u8 = vec_splat_u8(2); + u8x16 v4u8 = vec_splat_u8(4); + const uint8_t (*pl)[4] = l; + const uint8_t (*plb4)[4] = l - b4_stride; + const u8x16 spread = (u8x16){ + 0x00, 0x00, 0x00, 0x00, + 0x04, 0x04, 0x04, 0x04, + 0x08, 0x08, 0x08, 0x08, + 0x0c, 0x0c, 0x0c, 0x0c, + }; + + for (; + vm; + vm >>= 4, + mm = vec_sl(mm, v4u32), + dst += 4 * 4, + pl += 4, + plb4 += 4) { + if (!(vm & 0x0f)) + continue; + u32x4 L_0 = (u32x4)vec_xl(0, (uint8_t *)pl); + u32x4 L_b4 = (u32x4)vec_xl(0, (uint8_t *)plb4); + + u32x4 wd6 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8] + u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8] + + b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero); + + u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_b4, mask); // if !l[0][0] { l[-b4_stride][0] } + + u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ... + + b32x4 m6 = vec_cmpeq(wd6, mm); + b32x4 m4 = vec_cmpeq(wd4, mm); + + b32x4 apply = vec_cmpne((u32x4)L, zero); + + if (vec_all_eq((u32x4)L, zero)) + continue; + + u8x16 I = vec_sr(L, s0); // L >> sharp[0] + u8x16 H = vec_sr(L, v4u8); + I = vec_min(I, s1); // min(L >> sharp[0], sharp[1]) + u8x16 E = vec_add(L, v2u8); // L + 2 + I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1) + E = vec_add(E, E); // 2 * (L + 2) + E = vec_add(E, I); // 2 * (L + 2) + limit + + apply = vec_and(apply, m4); + + if (vec_any_ne(wd6, zero)) { + loop_filter_v_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6 HIGHBD_TAIL_SUFFIX); + } else { + loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX); + } + } +} + +#endif // BITDEPTH diff --git a/third_party/dav1d/src/recon_tmpl.c b/third_party/dav1d/src/recon_tmpl.c index 9d1a0da6bf5c..0afd06c16bc7 100644 --- a/third_party/dav1d/src/recon_tmpl.c +++ b/third_party/dav1d/src/recon_tmpl.c @@ -369,7 +369,7 @@ static int decode_coefs(Dav1dTaskContext *const t, const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ? 
dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode; if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) { - idx = dav1d_msac_decode_symbol_adapt4(&ts->msac, + idx = dav1d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4); *txtp = dav1d_tx_types_per_set[idx + 0]; } else { @@ -412,7 +412,7 @@ static int decode_coefs(Dav1dTaskContext *const t, eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \ break; \ } - case_sz(0, 16, 4, [is_1d]); + case_sz(0, 16, 8, [is_1d]); case_sz(1, 32, 8, [is_1d]); case_sz(2, 64, 8, [is_1d]); case_sz(3, 128, 8, [is_1d]); diff --git a/third_party/dav1d/src/refmvs.c b/third_party/dav1d/src/refmvs.c index 1da024b6301b..40cc4efefc68 100644 --- a/third_party/dav1d/src/refmvs.c +++ b/third_party/dav1d/src/refmvs.c @@ -657,19 +657,19 @@ void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *con { if (rf->n_tile_threads == 1) tile_row_idx = 0; rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx]; - const int uses_2pass = rf->n_tile_threads > 1 && rf->n_frame_threads > 1; - const ptrdiff_t pass_off = (uses_2pass && pass == 2) ? - 35 * rf->r_stride * rf->n_tile_rows : 0; - refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx + pass_off]; + const ptrdiff_t r_stride = rf->rp_stride * 2; + const ptrdiff_t pass_off = (rf->n_frame_threads > 1 && pass == 2) ? + 35 * 2 * rf->n_blocks : 0; + refmvs_block *r = &rf->r[35 * r_stride * tile_row_idx + pass_off]; const int sbsz = rf->sbsz; const int off = (sbsz * sby) & 16; - for (int i = 0; i < sbsz; i++, r += rf->r_stride) + for (int i = 0; i < sbsz; i++, r += r_stride) rt->r[off + 5 + i] = r; rt->r[off + 0] = r; - r += rf->r_stride; + r += r_stride; rt->r[off + 1] = NULL; rt->r[off + 2] = r; - r += rf->r_stride; + r += r_stride; rt->r[off + 3] = NULL; rt->r[off + 4] = r; if (sby & 1) { @@ -805,37 +805,37 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf, /*const*/ refmvs_temporal_block *const rp_ref[7], const int n_tile_threads, const int n_frame_threads) { + const int rp_stride = ((frm_hdr->width[0] + 127) & ~127) >> 3; + const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1; + const int n_blocks = rp_stride * n_tile_rows; + rf->sbsz = 16 << seq_hdr->sb128; rf->frm_hdr = frm_hdr; rf->iw8 = (frm_hdr->width[0] + 7) >> 3; rf->ih8 = (frm_hdr->height + 7) >> 3; rf->iw4 = rf->iw8 << 1; rf->ih4 = rf->ih8 << 1; - - const ptrdiff_t r_stride = ((frm_hdr->width[0] + 127) & ~127) >> 2; - const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1; - if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) { - if (rf->r) dav1d_freep_aligned(&rf->r); - const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1; - /* sizeof(refmvs_block) == 12 but it's accessed using 16-byte loads in asm, - * so add 4 bytes of padding to avoid buffer overreads. 
*/ - rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass) + 4, 64); - if (!rf->r) return DAV1D_ERR(ENOMEM); - rf->r_stride = r_stride; - } - - const ptrdiff_t rp_stride = r_stride >> 1; - if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) { - if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj); - rf->rp_proj = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64); - if (!rf->rp_proj) return DAV1D_ERR(ENOMEM); - rf->rp_stride = rp_stride; - } - rf->n_tile_rows = n_tile_rows; + rf->rp = rp; + rf->rp_stride = rp_stride; rf->n_tile_threads = n_tile_threads; rf->n_frame_threads = n_frame_threads; - rf->rp = rp; - rf->rp_ref = rp_ref; + + if (n_blocks != rf->n_blocks) { + const size_t r_sz = sizeof(*rf->r) * 35 * 2 * n_blocks * (1 + (n_frame_threads > 1)); + const size_t rp_proj_sz = sizeof(*rf->rp_proj) * 16 * n_blocks; + /* Note that sizeof(*rf->r) == 12, but it's accessed using 16-byte unaligned + * loads in save_tmvs() asm which can overread 4 bytes into rp_proj. */ + dav1d_free_aligned(rf->r); + rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, r_sz + rp_proj_sz, 64); + if (!rf->r) { + rf->n_blocks = 0; + return DAV1D_ERR(ENOMEM); + } + + rf->rp_proj = (refmvs_temporal_block*)((uintptr_t)rf->r + r_sz); + rf->n_blocks = n_blocks; + } + const unsigned poc = frm_hdr->frame_offset; for (int i = 0; i < 7; i++) { const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits, @@ -848,6 +848,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf, // temporal MV setup rf->n_mfmvs = 0; + rf->rp_ref = rp_ref; if (frm_hdr->use_ref_frame_mvs && seq_hdr->order_hint_n_bits) { int total = 2; if (rp_ref[0] && ref_ref_poc[0][6] != ref_poc[3] /* alt-of-last != gold */) { @@ -896,18 +897,6 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf, return 0; } -void dav1d_refmvs_init(refmvs_frame *const rf) { - rf->r = NULL; - rf->r_stride = 0; - rf->rp_proj = NULL; - rf->rp_stride = 0; -} - -void dav1d_refmvs_clear(refmvs_frame *const rf) { - if (rf->r) dav1d_freep_aligned(&rf->r); - if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj); -} - static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv, const int bx4, const int bw4, int bh4) { diff --git a/third_party/dav1d/src/refmvs.h b/third_party/dav1d/src/refmvs.h index d63874d3cb7a..d29feedce5ab 100644 --- a/third_party/dav1d/src/refmvs.h +++ b/third_party/dav1d/src/refmvs.h @@ -72,14 +72,14 @@ typedef struct refmvs_frame { int mfmv_ref2ref[3][7]; int n_mfmvs; + int n_blocks; refmvs_temporal_block *rp; /*const*/ refmvs_temporal_block *const *rp_ref; refmvs_temporal_block *rp_proj; ptrdiff_t rp_stride; refmvs_block *r; // 35 x r_stride memory - ptrdiff_t r_stride; - int n_tile_rows, n_tile_threads, n_frame_threads; + int n_tile_threads, n_frame_threads; } refmvs_frame; typedef struct refmvs_tile { @@ -121,10 +121,6 @@ typedef struct Dav1dRefmvsDSPContext { splat_mv_fn splat_mv; } Dav1dRefmvsDSPContext; -// call once per frame thread -void dav1d_refmvs_init(refmvs_frame *rf); -void dav1d_refmvs_clear(refmvs_frame *rf); - // call once per frame int dav1d_refmvs_init_frame(refmvs_frame *rf, const Dav1dSequenceHeader *seq_hdr, diff --git a/third_party/dav1d/src/riscv/64/cpu.S b/third_party/dav1d/src/riscv/64/cpu.S new file mode 100644 index 000000000000..b0e76f622141 --- /dev/null +++ b/third_party/dav1d/src/riscv/64/cpu.S @@ -0,0 +1,44 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d 
authors + * Copyright © 2024, Nathan Egge + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "src/riscv/asm.S" + +// This function detects non-compliant RVV 0.7.1 hardware which reports support +// for the V extension through HWCAP, by intentionally setting tail and mask +// agnostic vector configurations that were only introduced in RVV 0.9 spec. +// Existing non-compliant (pre RVV 1.0) hardware will set the VILL bit in VTYPE +// (indicating an illegal vector configuration) which is stored in the XLEN-1 +// bit position, thus a simple sign check is sufficient for detection. +// This work around is inexpensive and harmless on compliant hardware, but we +// should still consider removing it once all non-compliant RVV 0.7.1 hardware +// is out of service. +function has_compliant_rvv, export=1, ext=v + vsetvli t0, zero, e8, m1, ta, ma + csrr a0, vtype + sgtz a0, a0 + ret +endfunc diff --git a/third_party/dav1d/src/riscv/cpu.c b/third_party/dav1d/src/riscv/cpu.c index 16377109deac..30e135435960 100644 --- a/third_party/dav1d/src/riscv/cpu.c +++ b/third_party/dav1d/src/riscv/cpu.c @@ -38,11 +38,13 @@ #endif +int dav1d_has_compliant_rvv(void); + COLD unsigned dav1d_get_cpu_flags_riscv(void) { unsigned flags = 0; #if defined(HAVE_GETAUXVAL) unsigned long hw_cap = getauxval(AT_HWCAP); - flags |= (hw_cap & HWCAP_RVV) ? DAV1D_RISCV_CPU_FLAG_V : 0; + flags |= (hw_cap & HWCAP_RVV) && dav1d_has_compliant_rvv() ? 
DAV1D_RISCV_CPU_FLAG_V : 0; #endif return flags; diff --git a/third_party/dav1d/src/x86/mc.h b/third_party/dav1d/src/x86/mc.h index b142361daa9a..d1b7ae601e8a 100644 --- a/third_party/dav1d/src/x86/mc.h +++ b/third_party/dav1d/src/x86/mc.h @@ -29,7 +29,6 @@ #include "src/mc.h" #define decl_fn(type, name) \ - decl_##type##_fn(BF(name, sse2)); \ decl_##type##_fn(BF(name, ssse3)); \ decl_##type##_fn(BF(name, avx2)); \ decl_##type##_fn(BF(name, avx512icl)); @@ -108,25 +107,6 @@ decl_fn(resize, dav1d_resize); static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); - if(!(flags & DAV1D_X86_CPU_FLAG_SSE2)) - return; - -#if BITDEPTH == 8 - init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2); - init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2); - init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2); - init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2); - init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2); - init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2); - init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2); - init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2); - init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2); - init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2); - - c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2); - c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2); -#endif - if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm index e5de7ecd9612..3e352c5b3774 100644 --- a/third_party/dav1d/src/x86/mc16_avx512.asm +++ b/third_party/dav1d/src/x86/mc16_avx512.asm @@ -64,8 +64,12 @@ prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78 db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126 spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46 -spel_shuf4b: db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46 db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 + db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78 +spel_shuf4b: db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78 + db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94 + db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110 + db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126 spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78 db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 @@ -240,7 +244,9 @@ BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX @@ -874,20 +880,20 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3 lea stride3q, [strideq*3] jmp wq .prep_w4: - movq xmm0, [srcq+strideq*0] - movhps xmm0, [srcq+strideq*1] - vpbroadcastq ymm1, 
[srcq+strideq*2] - vpbroadcastq ymm2, [srcq+stride3q ] + mov r3d, 0x0c + kmovb k1, r3d +.prep_w4_loop: + movq xm0, [srcq+strideq*0] + movhps xm0, [srcq+strideq*1] + vpbroadcastq ym1, [srcq+strideq*2] + vpunpcklqdq ym0{k1}, ym1, [srcq+stride3q] {1to4} lea srcq, [srcq+strideq*4] - vpblendd ymm0, ymm1, 0x30 - vpblendd ymm0, ymm2, 0xc0 - pmullw ymm0, ym4 - psubw ymm0, ym5 - mova [tmpq], ymm0 + pmullw ym0, ym4 + psubw ym0, ym5 + mova [tmpq], ym0 add tmpq, 32 sub hd, 4 - jg .prep_w4 - vzeroupper + jg .prep_w4_loop RET .prep_w8: movu xm0, [srcq+strideq*0] @@ -1482,16 +1488,16 @@ cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v -cglobal %1_8tap_%2_16bpc +%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to +cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1503,22 +1509,18 @@ DECLARE_REG_TMP 7, 8 %define buf rsp-40 ; red zone %endif -MC_8TAP_FN put, sharp, SHARP, SHARP -MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH -MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP -MC_8TAP_FN put, smooth, SMOOTH, SMOOTH -MC_8TAP_FN put, sharp_regular, SHARP, REGULAR -MC_8TAP_FN put, regular_sharp, REGULAR, SHARP -MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR -MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH -MC_8TAP_FN put, regular, REGULAR, REGULAR +%define PUT_8TAP_FN FN put_8tap, +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc +PUT_8TAP_FN regular, REGULAR, REGULAR -cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my +cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx512icl imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h + add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v + add myd, t1d ; 6tap_v, my, 4tap_v lea r8, [put_avx512icl] movifnidn wd, wm movifnidn hd, hm @@ -1526,6 +1528,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my jnz .h test myd, 0xf00 jnz .v +.put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 @@ -1533,85 +1536,10 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my pop r8 %endif jmp wq -.h_w2: - movzx mxd, mxb - sub srcq, 2 - mova ym2, [spel_h_shuf2a] - pmovsxbw xmm4, [base+subpel_filters+mxq*8] - pshufd xmm3, xmm4, q1111 - pshufd xmm4, xmm4, q2222 -.h_w2_loop: - movu xm1, [srcq+ssq*0] - vinserti32x4 ym1, [srcq+ssq*1], 1 - lea srcq, [srcq+ssq*2] - mova xmm0, xm8 - vpermb ym1, ym2, ym1 - vpdpwssd xmm0, xmm3, xm1 - vextracti32x4 xm1, ym1, 1 - vpdpwssd xmm0, xmm4, xm1 - psrad xmm0, 6 - packusdw xmm0, xmm0 - pminsw xmm0, xm9 - movd [dstq+dsq*0], xmm0 - pextrd [dstq+dsq*1], xmm0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w2_loop - RET -.h_w4: - movzx mxd, mxb - sub srcq, 2 - pmovsxbw xmm0, [base+subpel_filters+mxq*8] - vbroadcasti32x4 ym4, [spel_h_shufA] - vbroadcasti32x4 ym5, [spel_h_shufB] - pshufd xmm0, xmm0, q2211 - vpbroadcastq ym6, xmm0 - vpermq ym7, ymm0, q1111 -.h_w4_loop: - movu xm2, [srcq+ssq*0] - vinserti32x4 ym2, [srcq+ssq*1], 1 - lea srcq, [srcq+ssq*2] - mova ym0, ym8 - 
pshufb ym1, ym2, ym4 - vpdpwssd ym0, ym6, ym1 - pshufb ym2, ym5 - vpdpwssd ym0, ym7, ym2 - psrad ym0, 6 - vextracti32x4 xm1, ym0, 1 - packusdw xm0, xm1 - pminsw xmm0, xm0, xm9 - movq [dstq+dsq*0], xmm0 - movhps [dstq+dsq*1], xmm0 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w4_loop - RET -.h: - test myd, 0xf00 - jnz .hv - mov r7d, r8m - vpbroadcastw m9, r8m - shr r7d, 11 - vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] - cmp wd, 4 - je .h_w4 - jl .h_w2 - shr mxd, 16 - sub srcq, 6 - pmovsxbw xmm0, [base+subpel_filters+mxq*8] - mova [buf], xmm0 - vpbroadcastd m10, xmm0 - vpbroadcastd m11, [buf+ 4] - vpbroadcastd m12, [buf+ 8] - vpbroadcastd m13, [buf+12] - sub wd, 16 - je .h_w16 - jg .h_w32 .h_w8: mova m4, [spel_h_shufA] movu m5, [spel_h_shufB] movu m6, [spel_h_shufC] - mova m7, [spel_h_shufD] .h_w8_loop: movu ym2, [srcq+ssq*0] vinserti32x8 m2, [srcq+ssq*1], 1 @@ -1623,48 +1551,60 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my vpdpwssd m0, m11, m1 vpermb m1, m6, m2 vpdpwssd m0, m12, m1 - vpermb m1, m7, m2 - vpdpwssd m0, m13, m1 psrad m0, 6 vextracti32x8 ym1, m0, 1 packusdw ym0, ym1 - pminsw ym0, ym9 + pminsw ym0, ym15 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8_loop RET -.h_w16: +.h: + vpbroadcastw m15, r8m + test myd, 0xf00 + jnz .hv + mov r7d, r8m + shr r7d, 11 + vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] + cmp wd, 4 + jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4 + shr mxd, 16 + sub srcq, 4 + pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] + mova [buf], xmm0 + vpbroadcastd m10, xmm0 + vpbroadcastd m12, [buf+8] + vpbroadcastd m11, [buf+4] + sub wd, 16 + jl .h_w8 vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] + jg .h_w32 .h_w16_loop: movu ym2, [srcq+ssq*0+ 0] vinserti32x8 m2, [srcq+ssq*1+ 0], 1 - movu ym3, [srcq+ssq*0+16] - vinserti32x8 m3, [srcq+ssq*1+16], 1 + movu ym3, [srcq+ssq*0+12] + vinserti32x8 m3, [srcq+ssq*1+12], 1 lea srcq, [srcq+ssq*2] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 - vpdpwssd m0, m10, m4 ; a0 - pshufb m4, m3, m6 - vpdpwssd m1, m12, m4 ; b2 - pshufb m4, m2, m7 - vpdpwssd m0, m11, m4 ; a1 + vpdpwssd m0, m10, m4 ; a0 b0 pshufb m4, m3, m7 - vpdpwssd m1, m13, m4 ; b3 - shufpd m2, m3, 0x55 - pshufb m4, m2, m6 - vpdpwssd m0, m12, m4 ; a2 - vpdpwssd m1, m10, m4 ; b0 + vpdpwssd m1, m12, m4 ; a2' b2' pshufb m2, m7 - vpdpwssd m0, m13, m2 ; a3 - vpdpwssd m1, m11, m2 ; b1 + pshufb m3, m6 + vpdpwssd m0, m11, m2 ; a1 b1 + vpdpwssd m1, m11, m3 ; a1' b1' + shufpd m2, m3, 0x55 + vpdpwssd m0, m12, m2 ; a2 b2 + vpdpwssd m1, m10, m2 ; a0' b0' psrad m0, 6 psrad m1, 6 packusdw m0, m1 - pminsw m0, m9 + pminsw m0, m15 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] @@ -1673,36 +1613,30 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my RET .h_w32: lea srcq, [srcq+wq*2] - vbroadcasti32x4 m6, [spel_h_shufA] lea dstq, [dstq+wq*2] - vbroadcasti32x4 m7, [spel_h_shufB] neg wq .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] - movu m3, [srcq+r6*2+ 8] + movu m3, [srcq+r6*2+12] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 - pshufb m4, m3, m6 - vpdpwssd m1, m10, m4 ; b0 - vpdpwssd m0, m12, m4 ; a2 - movu m4, [srcq+r6*2+16] - pshufb m3, m7 - vpdpwssd m1, m11, m3 ; b1 - vpdpwssd m0, m13, m3 ; a3 - pshufb m3, m4, m6 - vpdpwssd m1, m12, m3 ; b2 + pshufb m4, m3, m7 + vpdpwssd m1, m12, m4 ; b2 pshufb m2, m7 + pshufb m3, m6 vpdpwssd m0, m11, m2 ; a1 - pshufb m4, m7 - vpdpwssd m1, m13, m4 ; b3 + vpdpwssd m1, m11, m3 ; 
b1 + shufpd m2, m3, 0x55 + vpdpwssd m0, m12, m2 ; a2 + vpdpwssd m1, m10, m2 ; b0 psrad m0, 6 psrad m1, 6 packusdw m0, m1 - pminsw m0, m9 + pminsw m0, m15 mova [dstq+r6*2], m0 add r6, 32 jl .h_w32_loop @@ -1711,6 +1645,711 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my dec hd jg .h_w32_loop0 RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastd m11, [pd_32] + pmovsxbw xmm0, [base+subpel_filters+1+myq*8] + tzcnt r7d, wd + vpbroadcastw m15, r8m + mov r6, ssq + movzx r7d, word [r8+r7*2+table_offset(put, _6tap_v)] + neg r6 + mova [rsp+stack_offset+8], xmm0 + vpbroadcastd m12, xmm0 + add r7, r8 + vpbroadcastd m13, [rsp+stack_offset+12] + vpbroadcastd m14, [rsp+stack_offset+16] + jmp r7 +.v_w2: + movd xmm2, [srcq+r6 *2] + pinsrd xmm2, [srcq+r6 *1], 1 + pinsrd xmm2, [srcq+ssq*0], 2 + pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] + movd xmm0, [srcq+ssq*0] + palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 + punpcklwd xmm1, xmm2, xmm3 ; 01 12 + punpckhwd xmm2, xmm3 ; 23 34 +.v_w2_loop: + movd xmm3, [srcq+ssq*1] + mova xmm4, xm11 + vpdpwssd xmm4, xmm1, xm12 ; a0 b0 + lea srcq, [srcq+ssq*2] + mova xmm1, xmm2 + vpdpwssd xmm4, xmm2, xm13 ; a1 b1 + punpckldq xmm2, xmm0, xmm3 ; 4 5 + movd xmm0, [srcq+ssq*0] + punpckldq xmm3, xmm0 ; 5 6 + punpcklwd xmm2, xmm3 ; 45 56 + vpdpwssd xmm4, xmm2, xm14 ; a2 b2 + psrad xmm4, 6 + packusdw xmm4, xmm4 + pminsw xmm4, xm15 + movd [dstq+dsq*0], xmm4 + pextrd [dstq+dsq*1], xmm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xmm1, [srcq+r6 *2] + vpbroadcastq ymm3, [srcq+r6 *1] + vpbroadcastq ymm2, [srcq+ssq*0] + vpbroadcastq ymm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm1, ymm3, 0x30 + vpblendd ymm3, ymm2, 0x30 + punpcklwd ymm1, ymm3 ; 01 12 + vpblendd ymm2, ymm4, 0x30 + vpblendd ymm4, ymm0, 0x30 + punpcklwd ymm2, ymm4 ; 23 34 +.v_w4_loop: + vpbroadcastq ymm3, [srcq+ssq*1] + mova ymm4, ym11 + vpdpwssd ymm4, ymm1, ym12 ; a0 b0 + lea srcq, [srcq+ssq*2] + mova ymm1, ymm2 + vpdpwssd ymm4, ymm2, ym13 ; a1 b1 + vpblendd ymm2, ymm0, ymm3, 0x30 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm3, ymm0, 0x30 + punpcklwd ymm2, ymm3 ; 45 56 + vpdpwssd ymm4, ymm2, ym14 ; a2 b2 + psrad ymm4, 6 + vextracti128 xmm3, ymm4, 1 + packusdw xmm4, xmm3 + pminsw xmm4, xm15 + movq [dstq+dsq*0], xmm4 + movhps [dstq+dsq*1], xmm4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + vbroadcasti32x4 m0, [srcq+ssq*0] + vinserti32x4 m1, m0, [srcq+r6 *2], 0 + vinserti32x4 m1, [srcq+r6 *1], 1 ; 0 1 2 + vinserti32x4 ym0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + mova m5, [spel_v_shuf8] + vinserti32x4 m0, [srcq+ssq*0], 2 ; 2 3 4 + vpermb m1, m5, m1 ; 01 12 + vpermb m2, m5, m0 ; 23 34 +.v_w8_loop: + vinserti32x4 m0, [srcq+ssq*1], 3 + lea srcq, [srcq+ssq*2] + movu xm3, [srcq+ssq*0] + mova m4, m11 + vpdpwssd m4, m12, m1 ; a0 b0 + vshufi32x4 m0, m3, q1032 ; 4 5 6 + mova m1, m2 + vpdpwssd m4, m13, m2 ; a1 b1 + vpermb m2, m5, m0 ; 45 56 + vpdpwssd m4, m14, m2 ; a2 b2 + psrad m4, 6 + vextracti32x8 ym3, m4, 1 + packusdw ym4, ym3 + pminsw ym4, ym15 + mova [dstq+dsq*0], xm4 + vextracti32x4 [dstq+dsq*1], ym4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + vbroadcasti32x8 m0, [srcq+r6 *1] + vinserti32x8 m1, m0, [srcq+ssq*0], 1 + vinserti32x8 m0, [srcq+r6*2], 0 + mova m6, [spel_v_shuf16] + movu ym3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m3, [srcq+ssq*0], 1 + vpermb m1, m6, m1 ; 12 + vpermb m0, m6, m0 ; 01 + vpermb m3, m6, 
m3 ; 34 + mova m7, [deint_q_shuf] + vpshrdd m2, m1, m3, 16 ; 23 +.v_w16_loop: + mova m5, m11 + vpdpwssd m5, m12, m1 ; b0 + mova m4, m11 + vpdpwssd m4, m12, m0 ; a0 + mova m1, m3 + vpdpwssd m5, m13, m3 ; b1 + mova m0, m2 + vpdpwssd m4, m13, m2 ; a1 + movu ym3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m3, [srcq+ssq*0], 1 + vpermb m3, m6, m3 ; 56 + vpshrdd m2, m1, m3, 16 ; 45 + vpdpwssd m5, m14, m3 ; b2 + vpdpwssd m4, m14, m2 ; a2 + psrad m5, 6 + psrad m4, 6 + packusdw m4, m5 + pminsw m4, m15 + vpermq m4, m7, m4 + mova [dstq+dsq*0], ym4 + vextracti32x8 [dstq+dsq*1], m4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: + lea wd, [hq+wq*8-256] +.v_w32_loop0: + movu m16, [srcq+r6 *2] + movu m17, [srcq+r6 *1] + lea r7, [srcq+ssq*2] + movu m18, [srcq+ssq*0] + movu m19, [srcq+ssq*1] + mov r8, dstq + movu m20, [r7 +ssq*0] + punpcklwd m0, m16, m17 ; 01 + punpckhwd m16, m17 + punpcklwd m1, m17, m18 ; 12 + punpckhwd m17, m18 + punpcklwd m2, m18, m19 ; 23 + punpckhwd m18, m19 + punpcklwd m3, m19, m20 ; 34 + punpckhwd m19, m20 +.v_w32_loop: + mova m4, m11 + vpdpwssd m4, m12, m0 ; a0 + mova m6, m11 + vpdpwssd m6, m12, m16 + mova m5, m11 + vpdpwssd m5, m12, m1 ; b0 + mova m7, m11 + vpdpwssd m7, m12, m17 + mova m0, m2 + vpdpwssd m4, m13, m2 ; a1 + mova m16, m18 + vpdpwssd m6, m13, m18 + mova m1, m3 + vpdpwssd m5, m13, m3 ; b1 + mova m17, m19 + vpdpwssd m7, m13, m19 + movu m19, [r7+ssq*1] + lea r7, [r7+ssq*2] + punpcklwd m2, m20, m19 ; 45 + punpckhwd m18, m20, m19 + movu m20, [r7+ssq*0] + vpdpwssd m4, m14, m2 ; a2 + vpdpwssd m6, m14, m18 + punpcklwd m3, m19, m20 ; 56 + punpckhwd m19, m20 + vpdpwssd m5, m14, m3 ; b2 + vpdpwssd m7, m14, m19 + REPX {psrad x, 6}, m4, m6, m5, m7 + packusdw m4, m6 + packusdw m5, m7 + pminsw m4, m15 + pminsw m5, m15 + mova [r8+dsq*0], m4 + mova [r8+dsq*1], m5 + lea r8, [r8+dsq*2] + sub hd, 2 + jg .v_w32_loop + add srcq, 64 + add dstq, 64 + movzx hd, wb + sub wd, 1<<8 + jg .v_w32_loop0 + vzeroupper + RET +.hv: + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + pmovsxbw xmm1, [base+subpel_filters+1+myq*8] + mov r6, ssq + sub srcq, 2 + neg r6 + test dword r8m, 0x800 + jnz .hv_12bit + vpbroadcastd m10, [pd_2176] + psllw xmm0, 6 + jmp .hv_main +.hv_12bit: + vpbroadcastd m10, [pd_640] + psllw xmm0, 4 + psllw xmm1, 2 +.hv_main: + movu xm4, [srcq+r6 *2] + vinserti32x4 ym4, [srcq+r6 *1], 1 + vinserti32x4 m4, [srcq+ssq*0], 2 + vbroadcasti32x4 m6, [spel_h_shufA] + vinserti32x4 m4, [srcq+ssq*1], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*2] + movu xm5, [srcq+ssq*0] ; 4 + mova [buf+ 0], xmm0 + mova [buf+16], xmm1 + vpbroadcastd m8, [buf+ 4] + vpbroadcastd m9, [buf+ 8] + vpbroadcastd ym12, xmm1 + vpbroadcastd ym13, [buf+20] + vpbroadcastd ym14, [buf+24] + cmp wd, 4 + je .hv_w4 + vbroadcasti32x4 m2, [spel_h_shufA] + mova m3, [spel_h_shuf2b] + mova m1, m10 + pshufb m4, m6 + pshufb xm5, xm6 + punpcklqdq m2, m4, m5 + vpdpwssd m1, m8, m2 ; 04 1_ 2_ 3_ + mova ym6, [spel_h_shuf2a] + punpckhqdq m4, m5 + mova xm5, [spel_shuf2] + vpdpwssd m1, m9, m4 + vpermb m1, m3, m1 ; 01 12 + vextracti32x4 xm2, ym1, 1 ; 23 34 +.hv_w2_loop: + movu xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym3, [srcq+ssq*0], 1 + vpermb ym3, ym6, ym3 + pmaddwd xmm0, xm12, xm1 ; a0 b0 + mova xm4, xm10 + vpdpwssd xm4, xm8, xm3 + vextracti32x4 xm3, ym3, 1 + mova xm1, xm2 + vpdpwssd xmm0, xm13, xm2 ; a1 b1 + vpdpwssd xm4, xm9, xm3 ; 5 6 + vpermt2b xm2, xm5, xm4 ; 45 56 + vpdpwssd xmm0, 
xm14, xm2 ; a2 b2 + psrad xmm0, 10 + packusdw xmm0, xmm0 + pminsw xmm0, xm15 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + vbroadcasti32x4 m7, [spel_h_shufB] + mova ym0, [spel_shuf4a] + pshufb m1, m4, m6 + mova m2, m10 + vpdpwssd m2, m8, m1 + pshufb xm1, xm5, xm6 + mova xm3, xm10 + vpdpwssd xm3, xm8, xm1 + pshufb m4, m7 + pshufb xm5, xm7 + vpdpwssd m2, m9, m4 ; 0 1 2 3 + vpdpwssd xm3, xm9, xm5 ; 4 + mova ym5, [spel_shuf4b] + vpermb m1, m0, m2 ; 01 12 + vshufi32x4 m2, m3, q1032 ; 2 3 4 + vpermb m2, m0, m2 ; 23 34 +.hv_w4_loop: + movu xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym3, [srcq+ssq*0], 1 + pmaddwd ym0, ym12, ym1 ; a0 b0 + mova ym1, ym2 + pshufb ym4, ym3, ym6 + mova ym2, ym10 + vpdpwssd ym2, ym8, ym4 + pshufb ym3, ym7 + vpdpwssd ym0, ym13, ym1 ; a1 b1 + vpdpwssd ym2, ym9, ym3 ; 5 6 + vpermt2b ym2, ym5, ym1 ; 45 56 + vpdpwssd ym0, ym14, ym2 ; a2 b2 + psrad ym0, 10 + vextracti32x4 xm4, ym0, 1 + packusdw xm0, xm4 + pminsw xmm0, xm0, xm15 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + pmovsxbw xmm1, [base+subpel_filters+1+myq*8] + mov r6, ssq + sub srcq, 4 + neg r6 + test dword r8m, 0x800 + jnz .hv_w8_12bit + vpbroadcastd m8, [pd_2176] + psllw xmm0, 6 + jmp .hv_w8_main +.hv_w8_12bit: + vpbroadcastd m8, [pd_640] + psllw xmm0, 4 + psllw xmm1, 2 +.hv_w8_main: + mova [buf+ 0], xmm0 + mova [buf+16], xmm1 + vpbroadcastd m9, xmm0 + vpbroadcastd m10, [buf+ 4] + vpbroadcastd m11, [buf+ 8] + vpbroadcastd m12, xmm1 + vpbroadcastd m13, [buf+20] + vpbroadcastd m14, [buf+24] + cmp wd, 16 + jge .hv_w16 + mova m6, [spel_h_shufA] + movu ym16, [srcq+r6 *2] + vinserti32x8 m16, [srcq+r6 *1], 1 ; 0 1 + movu ym17, [srcq+ssq*0] + vinserti32x8 m17, [srcq+ssq*1], 1 ; 2 3 + lea srcq, [srcq+ssq*2] + movu ym18, [srcq+ssq*0] ; 4 + movu m7, [spel_h_shufC] + vpermb m3, m6, m16 + mova m1, m8 + vpermb m4, m6, m17 + vpdpwssd m1, m9, m3 ; a0 b0 + mova m2, m8 + vpermb m5, m6, m18 + vpdpwssd m2, m9, m4 ; c0 d0 + mova m0, m8 + vpermb m16, m7, m16 + vpdpwssd m0, m9, m5 ; e0 + vpermb m17, m7, m17 + vpdpwssd m1, m11, m16 ; a2 b2 + vpermb m18, m7, m18 + vpdpwssd m2, m11, m17 ; c2 d2 + shufpd m3, m16, 0x55 + vpdpwssd m0, m11, m18 ; e2 + mova m16, [spel_shuf8a] + shufpd m4, m17, 0x55 + vpdpwssd m1, m10, m3 ; a1 b1 + shufpd m5, m18, 0x55 + vpdpwssd m2, m10, m4 ; c1 d1 + vpdpwssd m0, m10, m5 ; e1 + mova m5, [spel_shuf8b] + vpermt2b m1, m16, m2 ; 01 12 + vpermt2b m2, m16, m0 ; 23 34 +.hv_w8_loop: + movu ym18, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m18, [srcq+ssq*0], 1 + mova m0, m8 + vpermb m17, m6, m18 + vpdpwssd m0, m9, m17 ; f0 g0 + vpermb m18, m7, m18 + pmaddwd m16, m12, m1 ; A0 B0 + vpdpwssd m0, m11, m18 ; f2 g2 + shufpd m17, m18, 0x55 + mova m1, m2 + vpdpwssd m16, m13, m2 ; A1 B1 + vpdpwssd m0, m10, m17 ; f1 g1 + vpermt2b m2, m5, m0 ; 45 56 + vpdpwssd m16, m14, m2 ; A2 B2 + psrad m16, 10 + vextracti32x8 ym17, m16, 1 + packusdw ym16, ym17 + pminsw ym16, ym15 + mova [dstq+dsq*0], xm16 + vextracti128 [dstq+dsq*1], ym16, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + vzeroupper + RET +.hv_w16: + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + jg .hv_w32 + vbroadcasti32x8 m6, [srcq+r6 *2+ 8] + vinserti32x8 m2, m6, [srcq+r6 *2+16], 1 + vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0 + movu ym16, 
[srcq+r6 *1+ 0] + movu ym17, [srcq+r6 *1+12] + vinserti32x8 m16, [srcq+ssq*0+ 0], 1 + vinserti32x8 m17, [srcq+ssq*0+12], 1 ; 1 2 + movu ym18, [srcq+ssq*1+ 0] + movu ym19, [srcq+ssq*1+12] + lea srcq, [srcq+ssq*2] + vinserti32x8 m18, [srcq+ssq*0+ 0], 1 + vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 3 4 + pshufb m2, m20 + mova m1, m8 + pshufb m3, m16, m20 + vpdpwssd m1, m11, m2 ; a2 + mova m2, m8 + pshufb m4, m17, m21 + vpdpwssd m2, m9, m3 ; b0 c0 + mova m3, m8 + pshufb m5, m18, m20 + vpdpwssd m3, m11, m4 ; b2' c2' + mova m4, m8 + pshufb m7, m19, m21 + vpdpwssd m4, m9, m5 ; d0 e0 + mova m5, m8 + pshufb m0, m6, m20 + vpdpwssd m5, m11, m7 ; d2' e2' + mova m7, [spel_shuf16] + pshufb m16, m21 + vpdpwssd m1, m9, m0 ; a0 + pshufb m17, m20 + vpdpwssd m2, m10, m16 ; b1 c1 + pshufb m18, m21 + vpdpwssd m3, m10, m17 ; b1' c1' + pshufb m19, m20 + vpdpwssd m4, m10, m18 ; d1 e1 + pshufb m6, m21 + vpdpwssd m5, m10, m19 ; d1' e1' + shufpd m16, m17, 0x55 + vpdpwssd m1, m10, m6 ; a1 + shufpd m18, m19, 0x55 + vpdpwssd m2, m11, m16 ; b2 c2 + vpdpwssd m3, m9, m16 ; b0' c0' + vpdpwssd m4, m11, m18 ; d2 e2 + vpdpwssd m5, m9, m18 ; d0' e0' + pslldq m1, 1 + vpermt2b m2, m7, m3 ; 12 + vpermt2b m4, m7, m5 ; 34 + vpshrdd m1, m2, 16 ; 01 + vpshrdd m3, m2, m4, 16 ; 23 +.hv_w16_loop: + movu ym18, [srcq+ssq*1+ 0] + movu ym19, [srcq+ssq*1+12] + lea srcq, [srcq+ssq*2] + vinserti32x8 m18, [srcq+ssq*0+ 0], 1 + vinserti32x8 m19, [srcq+ssq*0+12], 1 + mova m5, m8 + mova m6, m8 + pshufb m17, m18, m20 + vpdpwssd m5, m9, m17 ; f0 g0 + pshufb m16, m19, m21 + vpdpwssd m6, m11, m16 ; f2' g2' + pmaddwd m17, m12, m2 ; B0 + mova m2, m4 + pmaddwd m16, m12, m1 ; A0 + mova m1, m3 + pshufb m18, m21 + vpdpwssd m5, m10, m18 ; f1 g1 + pshufb m19, m20 + vpdpwssd m6, m10, m19 ; f1' g1' + vpdpwssd m17, m13, m4 ; B1 + vpdpwssd m16, m13, m3 ; A1 + shufpd m18, m19, 0x55 + vpdpwssd m5, m11, m18 ; f2 g2 + vpdpwssd m6, m9, m18 ; f0' g0' + mova m4, m7 + vpermi2b m4, m5, m6 ; 56 + vpshrdd m3, m2, m4, 16 ; 45 + vpdpwssd m17, m14, m4 ; B2 + vpdpwssd m16, m14, m3 ; A2 + psrad m16, 10 + psrad m17, 10 + vshufi32x4 m18, m16, m17, q3232 + vinserti32x8 m16, ym17, 1 + packusdw m16, m18 + pminsw m16, m15 + mova [dstq+dsq*0], ym16 + vextracti32x8 [dstq+dsq*1], m16, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + vzeroupper + RET +.hv_w32: + WIN64_SPILL_XMM 28 + mova m27, [spel_shuf32] + lea wd, [hq+wq*8-256] +.hv_w32_loop0: + movu m16, [srcq+r6 *2+ 0] + movu m7, [srcq+r6 *2+12] + movu m6, [srcq+r6 *1+ 0] + movu m18, [srcq+r6 *1+12] + lea r7, [srcq+ssq*2] + movu m17, [srcq+ssq*0+ 0] + movu m19, [srcq+ssq*0+12] + movu m22, [srcq+ssq*1+ 0] + movu m24, [srcq+ssq*1+12] + mov r8, dstq + movu m23, [r7 +ssq*0+ 0] + movu m25, [r7 +ssq*0+12] + pshufb m1, m16, m20 + mova m0, m8 + pshufb m2, m7, m21 + vpdpwssd m0, m9, m1 ; a0 + mova m1, m8 + pshufb m4, m6, m20 + vpdpwssd m1, m11, m2 ; a2' + mova m2, m8 + pshufb m3, m17, m20 + vpdpwssd m2, m9, m4 ; b0 + mova m4, m8 + pshufb m5, m18, m21 + vpdpwssd m4, m9, m3 ; c0 + mova m3, m8 + pshufb m26, m19, m21 + vpdpwssd m3, m11, m5 ; b2' + mova m5, m8 + pshufb m16, m21 + vpdpwssd m5, m11, m26 ; c2' + pshufb m7, m20 + vpdpwssd m0, m10, m16 ; a1 + pshufb m6, m21 + vpdpwssd m1, m10, m7 ; a1' + pshufb m17, m21 + vpdpwssd m2, m10, m6 ; b1 + pshufb m18, m20 + vpdpwssd m4, m10, m17 ; c1 + pshufb m19, m20 + vpdpwssd m3, m10, m18 ; b1' + shufpd m16, m7, 0x55 + vpdpwssd m5, m10, m19 ; c1' + shufpd m6, m18, 0x55 + vpdpwssd m0, m11, m16 ; a2 + shufpd m17, m19, 0x55 + vpdpwssd m1, m9, m16 ; a0' + pshufb m16, m22, m20 + vpdpwssd m2, m11, m6 ; b2 + 
pshufb m7, m23, m20 + vpdpwssd m4, m11, m17 ; c2 + vpdpwssd m3, m9, m6 ; b0' + mova m6, m8 + vpdpwssd m5, m9, m17 ; c0' + pshufb m17, m24, m21 + vpdpwssd m6, m9, m16 ; d0 + mova m16, m8 + pshufb m26, m25, m21 + vpdpwssd m16, m9, m7 ; e0 + mova m7, m8 + pshufb m22, m21 + vpdpwssd m7, m11, m17 ; d2' + mova m17, m8 + pshufb m23, m21 + vpdpwssd m17, m11, m26 ; e2' + pshufb m24, m20 + vpdpwssd m6, m10, m22 ; d1 + pshufb m25, m20 + vpdpwssd m16, m10, m23 ; e1 + shufpd m22, m24, 0x55 + vpdpwssd m7, m10, m24 ; d1' + shufpd m23, m25, 0x55 + vpdpwssd m17, m10, m25 ; e1' + pslldq m0, 1 + vpdpwssd m6, m11, m22 ; d2 + pslldq m1, 1 + vpdpwssd m16, m11, m23 ; e2 + vpermt2b m2, m27, m4 ; 12 + vpdpwssd m7, m9, m22 ; d0' + vpermt2b m3, m27, m5 ; 12' + vpdpwssd m17, m9, m23 ; e0' + vpshrdd m0, m2, 16 ; 01 + vpermt2b m6, m27, m16 ; 34 + vpshrdd m1, m3, 16 ; 01' + vpermt2b m7, m27, m17 ; 34' + vpshrdd m4, m2, m6, 16 ; 23 + vpshrdd m5, m3, m7, 16 ; 23' +.hv_w32_loop: + movu m22, [r7+ssq*1+ 0] + movu m24, [r7+ssq*1+12] + lea r7, [r7+ssq*2] + movu m23, [r7+ssq*0+ 0] + movu m25, [r7+ssq*0+12] + pmaddwd m17, m12, m2 ; B0 + mova m2, m6 + pmaddwd m19, m12, m3 ; B0' + mova m3, m7 + pmaddwd m16, m12, m0 ; A0 + mova m0, m4 + pmaddwd m18, m12, m1 ; A0' + mova m1, m5 + vpdpwssd m17, m13, m6 ; B1 + vpdpwssd m19, m13, m7 ; B1' + mova m6, m8 + vpdpwssd m16, m13, m4 ; A1 + pshufb m4, m22, m20 + vpdpwssd m18, m13, m5 ; A1' + pshufb m7, m23, m20 + vpdpwssd m6, m9, m4 ; f0 + mova m4, m8 + pshufb m5, m24, m21 + vpdpwssd m4, m9, m7 ; g0 + mova m7, m8 + pshufb m26, m25, m21 + vpdpwssd m7, m11, m5 ; f2' + mova m5, m8 + pshufb m22, m21 + vpdpwssd m5, m11, m26 ; g2' + pshufb m23, m21 + vpdpwssd m6, m10, m22 ; f1 + pshufb m24, m20 + vpdpwssd m4, m10, m23 ; g1 + pshufb m25, m20 + vpdpwssd m7, m10, m24 ; f1' + shufpd m22, m24, 0x55 + vpdpwssd m5, m10, m25 ; g1' + shufpd m23, m25, 0x55 + vpdpwssd m6, m11, m22 ; f2 + vpdpwssd m4, m11, m23 ; g2 + vpdpwssd m7, m9, m22 ; f0' + vpdpwssd m5, m9, m23 ; g0' + vpermt2b m6, m27, m4 ; 56 + vpermt2b m7, m27, m5 ; 56' + vpdpwssd m17, m14, m6 ; B2 + vpshrdd m4, m2, m6, 16 ; 45 + vpdpwssd m19, m14, m7 ; B2' + vpshrdd m5, m3, m7, 16 ; 45' + vpdpwssd m16, m14, m4 ; A2 + vpdpwssd m18, m14, m5 ; A2' + REPX {psrad x, 10}, m17, m19, m16, m18 + packusdw m17, m19 + packusdw m16, m18 + pminsw m17, m15 + pminsw m16, m15 + mova [r8+dsq*0], m16 + mova [r8+dsq*1], m17 + lea r8, [r8+dsq*2] + sub hd, 2 + jg .hv_w32_loop + add srcq, 64 + add dstq, 64 + movzx hd, wb + sub wd, 1<<8 + jg .hv_w32_loop0 + RET + +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc +PUT_8TAP_FN sharp, SHARP, SHARP + +cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx512icl] + movifnidn wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put .v: movzx mxd, myb shr myd, 16 @@ -1852,9 +2491,9 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my jg .v_w8_loop RET .v_w16: - vbroadcasti32x8 m1, [srcq+ssq*1] - vinserti32x8 m0, m1, [srcq+ssq*0], 0 - vinserti32x8 m1, [srcq+ssq*2], 1 + vbroadcasti32x8 m0, [srcq+ssq*1] + vinserti32x8 m1, m0, [srcq+ssq*2], 1 + vinserti32x8 m0, [srcq+ssq*0], 0 mova m8, [spel_v_shuf16] add srcq, r6 movu 
ym3, [srcq+ssq*0] @@ -1862,36 +2501,35 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my movu ym5, [srcq+ssq*2] add srcq, r6 vinserti32x8 m5, [srcq+ssq*0], 1 - vpermb m0, m8, m0 ; 01 vpermb m1, m8, m1 ; 12 + vpermb m0, m8, m0 ; 01 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 mova m9, [deint_q_shuf] vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: - mova m6, m10 mova m7, m10 - vpdpwssd m6, m12, m0 ; a0 - mova m0, m2 vpdpwssd m7, m12, m1 ; b0 + mova m6, m10 + vpdpwssd m6, m12, m0 ; a0 mova m1, m3 - vpdpwssd m6, m13, m2 ; a1 - mova m2, m4 vpdpwssd m7, m13, m3 ; b1 + mova m0, m2 + vpdpwssd m6, m13, m2 ; a1 mova m3, m5 - vpdpwssd m6, m14, m4 ; a2 - mova m4, m5 vpdpwssd m7, m14, m5 ; b2 + mova m2, m4 + vpdpwssd m6, m14, m4 ; a2 movu ym5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m5, m8, m5 ; 78 - vpshrdd m4, m5, 16 ; 67 - vpdpwssd m6, m15, m4 ; a3 + vpshrdd m4, m3, m5, 16 ; 67 vpdpwssd m7, m15, m5 ; b3 - psrad m6, 6 + vpdpwssd m6, m15, m4 ; a3 psrad m7, 6 + psrad m6, 6 packusdw m6, m7 pminsw m6, m11 vpermq m6, m9, m6 @@ -1904,22 +2542,19 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my .v_w32: .v_w64: .v_w128: -%if WIN64 - movaps [rsp+stack_offset+8], xmm6 -%endif + WIN64_SPILL_XMM 23 lea wd, [hq+wq*8-256] - mov r7, srcq - mov r8, dstq .v_w32_loop0: movu m16, [srcq+ssq*0] movu m17, [srcq+ssq*1] + lea r7, [srcq+r6 ] movu m18, [srcq+ssq*2] - add srcq, r6 - movu m19, [srcq+ssq*0] - movu m20, [srcq+ssq*1] - movu m21, [srcq+ssq*2] - add srcq, r6 - movu m22, [srcq+ssq*0] + movu m19, [r7 +ssq*0] + mov r8, dstq + movu m20, [r7 +ssq*1] + movu m21, [r7 +ssq*2] + add r7, r6 + movu m22, [r7 +ssq*0] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l @@ -1957,11 +2592,11 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h - movu m21, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] + movu m21, [r7+ssq*1] + lea r7, [r7+ssq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h - movu m22, [srcq+ssq*0] + movu m22, [r7+ssq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l @@ -1973,25 +2608,192 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my packusdw m7, m9 pminsw m6, m11 pminsw m7, m11 - mova [dstq+dsq*0], m6 - mova [dstq+dsq*1], m7 - lea dstq, [dstq+dsq*2] + mova [r8+dsq*0], m6 + mova [r8+dsq*1], m7 + lea r8, [r8+dsq*2] sub hd, 2 jg .v_w32_loop - add r7, 64 - add r8, 64 + add srcq, 64 + add dstq, 64 movzx hd, wb - mov srcq, r7 - mov dstq, r8 sub wd, 1<<8 jg .v_w32_loop0 -%if WIN64 - movaps xmm6, [rsp+stack_offset+8] -%endif - vzeroupper + RET +.h_w2: + RESET_STACK_STATE + mova ym2, [spel_h_shuf2a] + sub srcq, 2 + pshufd xmm3, xmm0, q1111 + pshufd xmm4, xmm0, q2222 +.h_w2_loop: + movu xm1, [srcq+ssq*0] + vinserti32x4 ym1, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + mova xmm0, xm8 + vpermb ym1, ym2, ym1 + vpdpwssd xmm0, xmm3, xm1 + vextracti32x4 xm1, ym1, 1 + vpdpwssd xmm0, xmm4, xm1 + psrad xmm0, 6 + packusdw xmm0, xmm0 + pminsw xmm0, xm15 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + jl .h_w2 + vbroadcasti32x4 ym4, [spel_h_shufA] + vbroadcasti32x4 ym5, [spel_h_shufB] + sub srcq, 2 + pshufd xmm0, xmm0, q2211 + vpbroadcastq ym6, xmm0 + vpermq ym7, ymm0, q1111 +.h_w4_loop: + movu xm2, [srcq+ssq*0] + vinserti32x4 ym2, 
[srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + mova ym0, ym8 + pshufb ym1, ym2, ym4 + vpdpwssd ym0, ym6, ym1 + pshufb ym2, ym5 + vpdpwssd ym0, ym7, ym2 + psrad ym0, 6 + vextracti32x4 xm1, ym0, 1 + packusdw xm0, xm1 + pminsw xmm0, xm0, xm15 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + mova m4, [spel_h_shufA] + movu m5, [spel_h_shufB] + movu m6, [spel_h_shufC] + mova m7, [spel_h_shufD] +.h_w8_loop: + movu ym2, [srcq+ssq*0] + vinserti32x8 m2, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + mova m0, m8 + vpermb m1, m4, m2 + vpdpwssd m0, m10, m1 + vpermb m1, m5, m2 + vpdpwssd m0, m11, m1 + vpermb m1, m6, m2 + vpdpwssd m0, m12, m1 + vpermb m1, m7, m2 + vpdpwssd m0, m13, m1 + psrad m0, 6 + vextracti32x8 ym1, m0, 1 + packusdw ym0, ym1 + pminsw ym0, ym15 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8_loop + RET +.h: + vpbroadcastw m15, r8m + test myd, 0xf00 + jnz .hv + mov r7d, r8m + shr r7d, 11 + vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] + cmp wd, 4 + jle .h_w4 + shr mxd, 16 + sub srcq, 6 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + mova [buf], xmm0 + vpbroadcastd m10, xmm0 + vpbroadcastd m11, [buf+ 4] + vpbroadcastd m12, [buf+ 8] + vpbroadcastd m13, [buf+12] + sub wd, 16 + jl .h_w8 + vbroadcasti32x4 m6, [spel_h_shufA] + vbroadcasti32x4 m7, [spel_h_shufB] + jg .h_w32 +.h_w16_loop: + movu ym2, [srcq+ssq*0+ 0] + vinserti32x8 m2, [srcq+ssq*1+ 0], 1 + movu ym3, [srcq+ssq*0+16] + vinserti32x8 m3, [srcq+ssq*1+16], 1 + lea srcq, [srcq+ssq*2] + mova m0, m8 + mova m1, m8 + pshufb m4, m2, m6 + vpdpwssd m0, m10, m4 ; a0 + pshufb m4, m3, m6 + vpdpwssd m1, m12, m4 ; b2 + pshufb m4, m2, m7 + vpdpwssd m0, m11, m4 ; a1 + pshufb m4, m3, m7 + vpdpwssd m1, m13, m4 ; b3 + shufpd m2, m3, 0x55 + pshufb m4, m2, m6 + vpdpwssd m0, m12, m4 ; a2 + vpdpwssd m1, m10, m4 ; b0 + pshufb m2, m7 + vpdpwssd m0, m13, m2 ; a3 + vpdpwssd m1, m11, m2 ; b1 + psrad m0, 6 + psrad m1, 6 + packusdw m0, m1 + pminsw m0, m15 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + neg wq +.h_w32_loop0: + mov r6, wq +.h_w32_loop: + movu m2, [srcq+r6*2+ 0] + movu m3, [srcq+r6*2+ 8] + mova m0, m8 + mova m1, m8 + pshufb m4, m2, m6 + vpdpwssd m0, m10, m4 ; a0 + pshufb m4, m3, m6 + vpdpwssd m1, m10, m4 ; b0 + vpdpwssd m0, m12, m4 ; a2 + movu m4, [srcq+r6*2+16] + pshufb m3, m7 + vpdpwssd m1, m11, m3 ; b1 + vpdpwssd m0, m13, m3 ; a3 + pshufb m3, m4, m6 + vpdpwssd m1, m12, m3 ; b2 + pshufb m2, m7 + vpdpwssd m0, m11, m2 ; a1 + pshufb m4, m7 + vpdpwssd m1, m13, m4 ; b3 + psrad m0, 6 + psrad m1, 6 + packusdw m0, m1 + pminsw m0, m15 + mova [dstq+r6*2], m0 + add r6, 32 + jl .h_w32_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w32_loop0 RET .hv: - vpbroadcastw m11, r8m cmp wd, 4 jg .hv_w8 movzx mxd, mxb @@ -2018,10 +2820,10 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my mova [buf+16], xmm1 vpbroadcastd m8, [buf+ 4] vpbroadcastd m9, [buf+ 8] - vpbroadcastd ym12, xmm1 - vpbroadcastd ym13, [buf+20] - vpbroadcastd ym14, [buf+24] - vpbroadcastd ym15, [buf+28] + vpbroadcastd ym11, xmm1 + vpbroadcastd ym12, [buf+20] + vpbroadcastd ym13, [buf+24] + vpbroadcastd ym14, [buf+28] movu xm4, [srcq+ssq*0] vinserti32x4 ym4, [srcq+ssq*1], 1 vinserti32x4 m4, [srcq+ssq*2], 2 @@ -2053,19 +2855,19 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my vinserti32x4 ym5, [srcq+ssq*0], 
1 mova xm4, xm10 vpermb ym5, ym6, ym5 - pmaddwd xmm0, xm12, xm1 ; a0 b0 + pmaddwd xmm0, xm11, xm1 ; a0 b0 vpdpwssd xm4, xm8, xm5 vextracti32x4 xm5, ym5, 1 mova xm1, xm2 - vpdpwssd xmm0, xm13, xm2 ; a1 b1 + vpdpwssd xmm0, xm12, xm2 ; a1 b1 vpdpwssd xm4, xm9, xm5 ; 7 8 mova xm2, xm3 - vpdpwssd xmm0, xm14, xm3 ; a2 b2 + vpdpwssd xmm0, xm13, xm3 ; a2 b2 vpermt2b xm3, xm7, xm4 ; 67 78 - vpdpwssd xmm0, xm15, xm3 ; a3 b3 + vpdpwssd xmm0, xm14, xm3 ; a3 b3 psrad xmm0, 10 packusdw xmm0, xmm0 - pminsw xmm0, xm11 + pminsw xmm0, xm15 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] @@ -2095,22 +2897,22 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my movu xm18, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 ym18, [srcq+ssq*0], 1 - mova ym4, ym10 - pshufb ym17, ym18, ym19 - pmaddwd ym16, ym12, ym1 ; a0 b0 - vpdpwssd ym4, ym8, ym17 - pshufb ym18, ym20 + pmaddwd ym16, ym11, ym1 ; a0 b0 mova ym1, ym2 - vpdpwssd ym16, ym13, ym2 ; a1 b1 - vpdpwssd ym4, ym9, ym18 ; 7 8 mova ym2, ym3 - vpdpwssd ym16, ym14, ym3 ; a2 b2 - vpermt2b ym3, ym7, ym4 ; 67 78 - vpdpwssd ym16, ym15, ym3 ; a3 b3 + pshufb ym17, ym18, ym19 + mova ym3, ym10 + vpdpwssd ym3, ym8, ym17 + pshufb ym18, ym20 + vpdpwssd ym16, ym12, ym1 ; a1 b1 + vpdpwssd ym3, ym9, ym18 ; 7 8 + vpdpwssd ym16, ym13, ym2 ; a2 b2 + vpermt2b ym3, ym7, ym2 ; 67 78 + vpdpwssd ym16, ym14, ym3 ; a3 b3 psrad ym16, 10 vextracti128 xm17, ym16, 1 packusdw xm16, xm17 - pminsw xm16, xm11 + pminsw xm16, xm15 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 lea dstq, [dstq+dsq*2] @@ -2141,17 +2943,16 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my .hv_w8_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 - vpbroadcastd m12, xmm0 - vpbroadcastd m13, [buf+ 4] - vpbroadcastd m14, [buf+ 8] - vpbroadcastd m15, [buf+12] + vpbroadcastd m11, xmm0 + vpbroadcastd m12, [buf+ 4] + vpbroadcastd m13, [buf+ 8] + vpbroadcastd m14, [buf+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [buf+20] vpbroadcastd m18, [buf+24] vpbroadcastd m19, [buf+28] - cmp wd, 16 - je .hv_w16 - jg .hv_w32 + cmp wd, 8 + jg .hv_w16 mova m5, [spel_h_shufA] movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 @@ -2166,43 +2967,43 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my movu m7, [spel_h_shufC] vpermb m8, m5, m0 mova m1, m10 - vpdpwssd m1, m12, m8 ; a0 b0 + vpdpwssd m1, m11, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 - vpdpwssd m2, m12, m8 ; c0 d0 + vpdpwssd m2, m11, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 - vpdpwssd m3, m12, m8 ; e0 f0 + vpdpwssd m3, m11, m8 ; e0 f0 vpermb m8, m5, m21 mova m4, m10 - vpdpwssd m4, m12, m8 ; g0 + vpdpwssd m4, m11, m8 ; g0 vpermb m8, m6, m0 - vpdpwssd m1, m13, m8 ; a1 b1 + vpdpwssd m1, m12, m8 ; a1 b1 vpermb m8, m6, m9 - vpdpwssd m2, m13, m8 ; c1 d1 + vpdpwssd m2, m12, m8 ; c1 d1 vpermb m8, m6, m20 - vpdpwssd m3, m13, m8 ; e1 f1 + vpdpwssd m3, m12, m8 ; e1 f1 vpermb m8, m6, m21 - vpdpwssd m4, m13, m8 ; g1 + vpdpwssd m4, m12, m8 ; g1 vpermb m8, m7, m0 - vpdpwssd m1, m14, m8 ; a2 b2 + vpdpwssd m1, m13, m8 ; a2 b2 vpermb m8, m7, m9 - vpdpwssd m2, m14, m8 ; c2 d2 + vpdpwssd m2, m13, m8 ; c2 d2 vpermb m8, m7, m20 - vpdpwssd m3, m14, m8 ; e2 f2 + vpdpwssd m3, m13, m8 ; e2 f2 vpermb m8, m7, m21 - vpdpwssd m4, m14, m8 ; g2 + vpdpwssd m4, m13, m8 ; g2 mova m8, [spel_h_shufD] vpermb m0, m8, m0 - vpdpwssd m1, m15, m0 ; a3 b3 + vpdpwssd m1, m14, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 - vpdpwssd m2, m15, m9 ; c3 d3 + vpdpwssd m2, m14, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 - 
vpdpwssd m3, m15, m20 ; e3 f3 + vpdpwssd m3, m14, m20 ; e3 f3 vpermb m21, m8, m21 - vpdpwssd m4, m15, m21 ; g3 + vpdpwssd m4, m14, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 @@ -2212,24 +3013,24 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my vinserti32x8 m0, [srcq+ssq*0], 1 mova m4, m10 vpermb m21, m5, m0 - vpdpwssd m4, m12, m21 ; h0 i0 + vpdpwssd m4, m11, m21 ; h0 i0 vpermb m21, m6, m0 pmaddwd m20, m16, m1 ; A0 B0 - vpdpwssd m4, m13, m21 ; h1 i1 + vpdpwssd m4, m12, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 - vpdpwssd m4, m14, m21 ; h2 i2 + vpdpwssd m4, m13, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 - vpdpwssd m4, m15, m21 ; h3 i3 + vpdpwssd m4, m14, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 psrad m20, 10 vextracti32x8 ym21, m20, 1 packusdw ym20, ym21 - pminsw ym20, ym11 + pminsw ym20, ym15 mova [dstq+dsq*0], xm20 vextracti128 [dstq+dsq*1], ym20, 1 lea dstq, [dstq+dsq*2] @@ -2239,86 +3040,90 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my RET .hv_w16: WIN64_SPILL_XMM 26 + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + add wd, wd + mova m9, [spel_shuf16] + lea wd, [hq+wq*8-256] +.hv_w16_loop0: vbroadcasti32x8 m5, [srcq+ssq*0+ 8] vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 movu ym6, [srcq+ssq*1+ 0] movu ym7, [srcq+ssq*1+16] + lea r7, [srcq+r6] vinserti32x8 m6, [srcq+ssq*2+ 0], 1 vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 - add srcq, r6 - movu ym22, [srcq+ssq*0+ 0] - movu ym23, [srcq+ssq*0+16] - vinserti32x8 m22, [srcq+ssq*1+ 0], 1 - vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4 - movu ym24, [srcq+ssq*2+ 0] - movu ym25, [srcq+ssq*2+16] - add srcq, r6 - vinserti32x8 m24, [srcq+ssq*0+ 0], 1 - vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6 - vbroadcasti32x4 m20, [spel_h_shufA] - vbroadcasti32x4 m21, [spel_h_shufB] - mova m9, [spel_shuf16] + movu ym22, [r7 +ssq*0+ 0] + movu ym23, [r7 +ssq*0+16] + mov r8, dstq + vinserti32x8 m22, [r7 +ssq*1+ 0], 1 + vinserti32x8 m23, [r7 +ssq*1+16], 1 ; 3 4 + movu ym24, [r7 +ssq*2+ 0] + movu ym25, [r7 +ssq*2+16] + add r7, r6 + vinserti32x8 m24, [r7 +ssq*0+ 0], 1 + vinserti32x8 m25, [r7 +ssq*0+16], 1 ; 5 6 pshufb m0, m4, m20 mova m1, m10 - vpdpwssd m1, m12, m0 ; a0 + vpdpwssd m1, m11, m0 ; a0 pshufb m0, m6, m20 mova m2, m10 - vpdpwssd m2, m12, m0 ; b0 + vpdpwssd m2, m11, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 - vpdpwssd m3, m14, m0 ; c2 + vpdpwssd m3, m13, m0 ; c2 pshufb m0, m4, m21 - vpdpwssd m1, m13, m0 ; a1 + vpdpwssd m1, m12, m0 ; a1 pshufb m0, m6, m21 - vpdpwssd m2, m13, m0 ; b1 + vpdpwssd m2, m12, m0 ; b1 pshufb m0, m7, m21 - vpdpwssd m3, m15, m0 ; c3 + vpdpwssd m3, m14, m0 ; c3 pshufb m0, m5, m20 - vpdpwssd m1, m14, m0 ; a2 + vpdpwssd m1, m13, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 - vpdpwssd m2, m14, m7 ; b2 - vpdpwssd m3, m12, m7 ; c0 + vpdpwssd m2, m13, m7 ; b2 + vpdpwssd m3, m11, m7 ; c0 pshufb m5, m21 - vpdpwssd m1, m15, m5 ; a3 + vpdpwssd m1, m14, m5 ; a3 pshufb m6, m21 - vpdpwssd m2, m15, m6 ; b3 - vpdpwssd m3, m13, m6 ; c1 + vpdpwssd m2, m14, m6 ; b3 + vpdpwssd m3, m12, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 - vpdpwssd m4, m12, m0 ; d0 + vpdpwssd m4, m11, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 - vpdpwssd m5, m14, m0 ; e2 + vpdpwssd m5, m13, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 - vpdpwssd m6, m12, m0 ; f0 + vpdpwssd m6, m11, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 - vpdpwssd m7, m14, m0 ; g2 + 
vpdpwssd m7, m13, m0 ; g2 pshufb m0, m22, m21 - vpdpwssd m4, m13, m0 ; d1 + vpdpwssd m4, m12, m0 ; d1 pshufb m0, m23, m21 - vpdpwssd m5, m15, m0 ; e3 + vpdpwssd m5, m14, m0 ; e3 pshufb m0, m24, m21 - vpdpwssd m6, m13, m0 ; f1 + vpdpwssd m6, m12, m0 ; f1 pshufb m0, m25, m21 - vpdpwssd m7, m15, m0 ; g3 + vpdpwssd m7, m14, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 - vpdpwssd m4, m14, m23 ; d2 - vpdpwssd m5, m12, m23 ; e0 + vpdpwssd m4, m13, m23 ; d2 + vpdpwssd m5, m11, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 - vpdpwssd m6, m14, m25 ; f2 - vpdpwssd m7, m12, m25 ; g0 + vpdpwssd m6, m13, m25 ; f2 + vpdpwssd m7, m11, m25 ; g0 pshufb m22, m21 - vpdpwssd m4, m15, m22 ; d3 - vpdpwssd m5, m13, m22 ; e1 + vpdpwssd m4, m14, m22 ; d3 + vpdpwssd m5, m12, m22 ; e1 pshufb m24, m21 - vpdpwssd m6, m15, m24 ; f3 - vpdpwssd m7, m13, m24 ; g1 + vpdpwssd m6, m14, m24 ; f3 + vpdpwssd m7, m12, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 @@ -2327,38 +3132,38 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my vpshrdd m3, m2, m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: - movu ym24, [srcq+ssq*1+ 0] - movu ym25, [srcq+ssq*1+16] - lea srcq, [srcq+ssq*2] - vinserti32x8 m24, [srcq+ssq*0+ 0], 1 - vinserti32x8 m25, [srcq+ssq*0+16], 1 + movu ym24, [r7+ssq*1+ 0] + movu ym25, [r7+ssq*1+16] + lea r7, [r7+ssq*2] + vinserti32x8 m24, [r7+ssq*0+ 0], 1 + vinserti32x8 m25, [r7+ssq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 - vpdpwssd m7, m12, m0 ; h0 + vpdpwssd m7, m11, m0 ; h0 pshufb m0, m25, m20 - vpdpwssd m8, m14, m0 ; i2 + vpdpwssd m8, m13, m0 ; i2 pmaddwd m22, m16, m1 ; A0 mova m1, m3 pmaddwd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 - vpdpwssd m7, m13, m0 ; h1 + vpdpwssd m7, m12, m0 ; h1 pshufb m0, m25, m21 - vpdpwssd m8, m15, m0 ; i3 + vpdpwssd m8, m14, m0 ; i3 vpdpwssd m22, m17, m3 ; A1 mova m3, m5 vpdpwssd m23, m17, m4 ; B1 mova m4, m6 shufpd m24, m25, 0x55 pshufb m25, m24, m20 - vpdpwssd m7, m14, m25 ; h2 - vpdpwssd m8, m12, m25 ; i0 + vpdpwssd m7, m13, m25 ; h2 + vpdpwssd m8, m11, m25 ; i0 vpdpwssd m22, m18, m5 ; A2 vpdpwssd m23, m18, m6 ; B2 pshufb m24, m21 - vpdpwssd m7, m15, m24 ; h3 - vpdpwssd m8, m13, m24 ; i1 + vpdpwssd m7, m14, m24 ; h3 + vpdpwssd m8, m12, m24 ; i1 vpermt2b m7, m9, m8 ; 78 vpshrdd m5, m6, m7, 16 ; 67 vpdpwssd m22, m19, m5 ; A3 @@ -2369,187 +3174,17 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my vshufi32x4 m0, m22, m23, q3232 vinserti32x8 m22, ym23, 1 packusdw m22, m0 - pminsw m22, m11 - mova [dstq+dsq*0], ym22 - vextracti32x8 [dstq+dsq*1], m22, 1 - lea dstq, [dstq+dsq*2] + pminsw m22, m15 + mova [r8+dsq*0], ym22 + vextracti32x8 [r8+dsq*1], m22, 1 + lea r8, [r8+dsq*2] sub hd, 2 jg .hv_w16_loop - RET -.hv_w32: - WIN64_SPILL_XMM 32 - vbroadcasti32x4 m20, [spel_h_shufA] - vbroadcasti32x4 m21, [spel_h_shufB] - mova m22, [spel_shuf32] - lea wd, [hq+wq*8-256] - mov r7, srcq - mov r8, dstq -.hv_w32_loop0: - movu m6, [srcq+ssq*0+ 0] - movu m7, [srcq+ssq*0+ 8] - movu m8, [srcq+ssq*0+16] - mova m0, m10 - mova m23, m10 - pshufb m9, m6, m20 - vpdpwssd m0, m12, m9 ; a0l - pshufb m9, m7, m20 - vpdpwssd m23, m12, m9 ; a0h - vpdpwssd m0, m14, m9 ; a2l - pshufb m7, m21 - vpdpwssd m23, m13, m7 ; a1h - vpdpwssd m0, m15, m7 ; a3l - pshufb m7, m8, m20 - vpdpwssd m23, m14, m7 ; a2h - pshufb m6, m21 - vpdpwssd m0, m13, m6 ; a1l - pshufb m8, m21 - vpdpwssd m23, m15, m8 ; a3h -%macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2] - movu m6, [srcq+%3*%4+ 0] - movu m7, [srcq+%3*%4+ 8] - movu m8, 
[srcq+%3*%4+16] -%if %4 == 2 - add srcq, r6 -%endif - movu m29, [srcq+%3*%5+ 0] - movu m30, [srcq+%3*%5+ 8] - movu m31, [srcq+%3*%5+16] -%if %5 == 2 - add srcq, r6 -%endif - mova m%1, m10 - mova m9, m10 - pshufb m%2, m6, m20 - vpdpwssd m%1, m12, m%2 ; x0l - pshufb m%2, m29, m20 - vpdpwssd m9, m12, m%2 ; y0l - pshufb m6, m21 - vpdpwssd m%1, m13, m6 ; x1l - pshufb m29, m21 - vpdpwssd m9, m13, m29 ; y1l - pshufb m6, m7, m20 - mova m%2, m10 - vpdpwssd m%2, m12, m6 ; x0h - pshufb m29, m30, m20 - vpdpwssd m%1, m14, m6 ; y2l - mova m6, m10 - vpdpwssd m6, m12, m29 ; x0h - pshufb m7, m21 - vpdpwssd m9, m14, m29 ; y2l - pshufb m30, m21 - vpdpwssd m%2, m13, m7 ; x1h - vpdpwssd m%1, m15, m7 ; x3l - pshufb m7, m8, m20 - vpdpwssd m6, m13, m30 ; y1h - vpdpwssd m9, m15, m30 ; y3l - pshufb m30, m31, m20 - vpdpwssd m%2, m14, m7 ; x2h - pshufb m8, m21 - vpdpwssd m6, m14, m30 ; y2h - pshufb m31, m21 - vpdpwssd m%2, m15, m8 ; x3h - vpdpwssd m6, m15, m31 ; y3h -%if %1 == 1 - vpermt2b m0, m22, m%1 ; 01l - vpermt2b m23, m22, m%2 ; 01h -%endif - vpermt2b m%1, m22, m9 ; xyl - vpermt2b m%2, m22, m6 ; xyh -%endmacro - PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12 - PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34 - PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56 - vpshrdd m2, m1, m3, 16 ; 23l - vpshrdd m25, m24, m26, 16 ; 23h - vpshrdd m4, m3, m5, 16 ; 45l - vpshrdd m27, m26, m28, 16 ; 45h -.hv_w32_loop: - movu m7, [srcq+ssq*1+ 0] - movu m9, [srcq+ssq*2+ 0] - movu m6, [srcq+ssq*1+ 8] - movu m8, [srcq+ssq*2+ 8] - mova m29, m10 - mova m31, m10 - pshufb m30, m7, m20 - vpdpwssd m29, m12, m30 ; h0l - pshufb m30, m9, m20 - vpdpwssd m31, m12, m30 ; i0l - pshufb m7, m21 - vpdpwssd m29, m13, m7 ; h1l - pshufb m9, m21 - vpdpwssd m31, m13, m9 ; i1l - pshufb m7, m6, m20 - vpdpwssd m29, m14, m7 ; h2l - pshufb m9, m8, m20 - vpdpwssd m31, m14, m9 ; i2l - pshufb m6, m21 - vpdpwssd m29, m15, m6 ; h3l - pshufb m8, m21 - vpdpwssd m31, m15, m8 ; i3l - mova m30, m10 - vpdpwssd m30, m12, m7 ; h0h - movu m7, [srcq+ssq*1+16] - lea srcq, [srcq+ssq*2] - vpermt2b m29, m22, m31 ; 78l - mova m31, m10 - vpdpwssd m31, m12, m9 ; i0h - movu m9, [srcq+ssq*0+16] - vpdpwssd m30, m13, m6 ; h1h - pshufb m6, m7, m20 - vpdpwssd m31, m13, m8 ; i1h - pshufb m8, m9, m20 - vpdpwssd m30, m14, m6 ; h2h - pmaddwd m6, m16, m0 ; A0l - pshufb m7, m21 - vpdpwssd m31, m14, m8 ; i2h - pmaddwd m8, m16, m23 ; A0h - pshufb m9, m21 - vpdpwssd m30, m15, m7 ; h3h - pmaddwd m7, m16, m1 ; B0l - vpdpwssd m31, m15, m9 ; i3h - pmaddwd m9, m16, m24 ; B0h - mova m0, m2 - vpdpwssd m6, m17, m2 ; A1l - mova m23, m25 - vpdpwssd m8, m17, m25 ; A1h - mova m1, m3 - vpdpwssd m7, m17, m3 ; B1l - mova m24, m26 - vpdpwssd m9, m17, m26 ; B1h - vpermt2b m30, m22, m31 ; 78h - vpdpwssd m6, m18, m4 ; A2l - mova m2, m4 - vpdpwssd m8, m18, m27 ; A2h - mova m25, m27 - vpdpwssd m7, m18, m5 ; B2l - mova m3, m5 - vpdpwssd m9, m18, m28 ; B2h - mova m26, m28 - vpshrdd m4, m5, m29, 16 ; 67l - vpdpwssd m6, m19, m4 ; A3l - vpshrdd m27, m28, m30, 16 ; 67h - vpdpwssd m8, m19, m27 ; A3h - mova m5, m29 - vpdpwssd m7, m19, m29 ; B3l - mova m28, m30 - vpdpwssd m9, m19, m30 ; B3h - REPX {psrad x, 10}, m6, m8, m7, m9 - packusdw m6, m8 - packusdw m7, m9 - pminsw m6, m11 - pminsw m7, m11 - mova [dstq+dsq*0], m6 - mova [dstq+dsq*1], m7 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .hv_w32_loop - add r7, 64 - add r8, 64 + add srcq, 32 + add dstq, 32 movzx hd, wb - mov srcq, r7 - mov dstq, r8 sub wd, 1<<8 - jg .hv_w32_loop0 + jg .hv_w16_loop0 RET %if WIN64 @@ -2558,17 +3193,776 @@ DECLARE_REG_TMP 6, 4 DECLARE_REG_TMP 6, 7 %endif -MC_8TAP_FN prep, sharp, 
SHARP, SHARP -MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH -MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP -MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH -MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR -MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP -MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR -MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH -MC_8TAP_FN prep, regular, REGULAR, REGULAR +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc +PREP_8TAP_FN regular, REGULAR, REGULAR -cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 +cglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my +%define base r7-prep_avx512icl + imul mxd, mxm, 0x010101 + add mxd, t0d ; 6tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 6tap_v, my, 4tap_v + lea r7, [prep_avx512icl] + mov wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v +.prep: + tzcnt wd, wd + mov r5d, r7m ; bitdepth_max + vpbroadcastd m5, [pw_8192] + movzx wd, word [r7+wq*2+table_offset(prep,)] + shr r5d, 11 + vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] + add wq, r7 + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h_w8: + mova m6, [spel_h_shufA] + movu m7, [spel_h_shufC] + mova m8, [prep_endB] +.h_w8_loop: + movu ym4, [srcq+ssq*0] + vinserti32x8 m4, [srcq+ssq*1], 1 + movu ym5, [srcq+ssq*2] + vinserti32x8 m5, [srcq+r6 ], 1 + lea srcq, [srcq+ssq*4] + mova m0, m10 + mova m1, m10 + vpermb m2, m6, m4 + vpermb m3, m6, m5 + vpdpwssd m0, m12, m2 ; a0 b0 + vpdpwssd m1, m12, m3 ; c0 d0 + vpermb m4, m7, m4 + vpermb m5, m7, m5 + vpdpwssd m0, m14, m4 ; a2 b2 + vpdpwssd m1, m14, m5 ; c2 d2 + shufpd m2, m4, 0x55 + shufpd m3, m5, 0x55 + vpdpwssd m0, m13, m2 ; a1 b1 + vpdpwssd m1, m13, m3 ; c1 d1 + vpermt2b m0, m8, m1 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8_loop + RET +.h: + vpbroadcastd m10, [prep_8tap_rnd] + test myd, 0xf00 + jnz .hv + lea r6, [ssq*3] + cmp wd, 4 + je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4 + shr mxd, 16 + pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] + mov r5d, r7m + sub srcq, 4 + shr r5d, 11 + psllw xmm0, [base+prep_hv_shift+r5*8] + mova [tmpq], xmm0 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [tmpq+ 4] + vpbroadcastd m14, [tmpq+ 8] + cmp wd, 16 + jl .h_w8 + vbroadcasti32x4 m5, [spel_h_shufA] + vbroadcasti32x4 m6, [spel_h_shufB] + mova m7, [prep_endC] + jg .h_w32 +.h_w16_loop: + movu ym2, [srcq+ssq*0+ 0] + vinserti32x8 m2, [srcq+ssq*1+ 0], 1 + movu ym3, [srcq+ssq*0+12] + vinserti32x8 m3, [srcq+ssq*1+12], 1 + lea srcq, [srcq+ssq*2] + mova m0, m10 + mova m1, m10 + pshufb m4, m2, m5 ; 01 + vpdpwssd m0, m12, m4 ; a0 b0 + pshufb m4, m3, m6 ; 89 + vpdpwssd m1, m14, m4 ; a2' b2' + pshufb m2, m6 ; 23 + pshufb m3, m5 ; 67 + vpdpwssd m0, m13, m2 ; a1 b1 + vpdpwssd m1, m13, m3 ; a1' b1' + shufpd m2, m3, 0x55 ; 45 + vpdpwssd m0, m14, m2 ; a2 b2 + vpdpwssd m1, m12, m2 ; a0' b0' + vpermt2b m0, m7, m1 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + lea srcq, [srcq+wq*2] + neg wq +.h_w32_loop0: + mov r6, wq +.h_w32_loop: + movu m2, [srcq+r6*2+ 0] + movu m3, [srcq+r6*2+12] + mova m0, m10 + mova m1, m10 + pshufb m4, m2, m5 + vpdpwssd m0, m12, m4 + pshufb m4, m3, m6 + vpdpwssd m1, m14, m4 + pshufb m2, m6 + pshufb m3, m5 + vpdpwssd m0, m13, m2 + vpdpwssd m1, m13, m3 + shufpd m2, m3, 0x55 + vpdpwssd m0, m14, m2 + vpdpwssd m1, m12, m2 + vpermt2b m0, m7, m1 + mova 
[tmpq], m0 + add tmpq, 64 + add r6, 32 + jl .h_w32_loop + add srcq, ssq + dec hd + jg .h_w32_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + mov r5d, r7m + vpbroadcastd m10, [prep_8tap_rnd] + pmovsxbw xmm0, [base+subpel_filters+1+myq*8] + tzcnt r6d, wd + shr r5d, 11 + movzx r6d, word [r7+r6*2+table_offset(prep, _6tap_v)] + psllw xmm0, [base+prep_hv_shift+r5*8] + add r7, r6 + mova [tmpq], xmm0 + vpbroadcastd m12, xmm0 + mov r6, ssq + vpbroadcastd m13, [tmpq+ 4] + neg r6 + vpbroadcastd m14, [tmpq+ 8] + jmp r7 +.v_w4: + mov r3d, 0x330c + movq xm1, [srcq+r6 *2] + kmovw k1, r3d + vpbroadcastq ym1{k1}, [srcq+r6 *1] + vpbroadcastq m2, [srcq+ssq*0] + vinserti32x4 m1{k1}, m2, [srcq+ssq*1], 3 + movq xm0, [srcq+ssq*2] + mova ym4, [prep_endA] + valignq m0, m1, 2 + punpcklwd m1, m0 ; 01 12 23 34 +.v_w4_loop: + lea srcq, [srcq+ssq*4] + movq xm2, [srcq+r6 *1] + vpbroadcastq ym2{k1}, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + vinserti32x4 m2{k1}, m3, [srcq+ssq*2], 3 + mova m3, m10 + vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 + valignq m0, m2, m0, 6 ; 4 5 6 7 + punpcklwd m0, m2 ; 45 56 67 78 + vpdpwssd m3, m14, m0 ; a2 b2 c2 d2 + vshufi32x4 m1, m0, q1032 ; 23 34 45 56 + vpdpwssd m3, m13, m1 ; a1 b1 c1 d1 + mova m1, m0 + mova m0, m2 + vpermb m3, m4, m3 + mova [tmpq], ym3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + vbroadcasti32x4 ym1, [srcq+r6 *1] + mov r3d, 0x33 + vbroadcasti32x4 m2, [srcq+ssq*0] + kmovb k1, r3d + mova m6, [spel_v_shuf8] + vinserti64x2 m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2 + vbroadcasti32x4 ym0, [srcq+ssq*1] + vinserti64x2 m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4 + mova m7, [prep_endB] + vpermb m1, m6, m1 ; 01 12 + vpermb m2, m6, m0 ; 23 34 +.v_w8_loop: + lea srcq, [srcq+ssq*4] + vbroadcasti32x4 ym3, [srcq+r6 *1] + movu xm4, [srcq+ssq*0] + vshufi64x2 m3{k1}, m0, m4, q1032 ; 4 5 6 + vbroadcasti32x4 ym0, [srcq+ssq*1] + vinserti64x2 m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8 + mova m4, m10 + vpdpwssd m4, m12, m1 ; a0 b0 + mova m5, m10 + vpdpwssd m5, m12, m2 ; c0 d0 + vpermb m1, m6, m3 ; 45 56 + vpdpwssd m4, m13, m2 ; a1 b1 + vpermb m2, m6, m0 ; 67 78 + vpdpwssd m5, m13, m1 ; c1 d1 + vpdpwssd m4, m14, m1 ; a2 b2 + vpdpwssd m5, m14, m2 ; c2 d2 + vpermt2b m4, m7, m5 + mova [tmpq], m4 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + vbroadcasti32x8 m0, [srcq+r6 *1] + vinserti32x8 m1, m0, [srcq+ssq*0], 1 ; 1 2 + vinserti32x8 m0, [srcq+r6 *2], 0 ; 0 1 + mova m6, [spel_v_shuf16] + movu ym3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m3, [srcq+ssq*0], 1 ; 3 4 + mova m7, [prep_endA] + vpermb m1, m6, m1 ; 12 + vpermb m0, m6, m0 ; 01 + vpermb m3, m6, m3 ; 34 + vpshrdd m2, m1, m3, 16 ; 23 +.v_w16_loop: + mova m5, m10 + vpdpwssd m5, m12, m1 ; b0 + mova m4, m10 + vpdpwssd m4, m12, m0 ; a0 + mova m1, m3 + vpdpwssd m5, m13, m3 ; b1 + movu ym3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpdpwssd m4, m13, m2 ; a1 + vinserti32x8 m3, [srcq+ssq*0], 1 + mova m0, m2 + vpermb m3, m6, m3 ; 56 + vpshrdd m2, m1, m3, 16 ; 45 + vpdpwssd m5, m14, m3 ; b2 + vpdpwssd m4, m14, m2 ; a2 + vpermt2b m4, m7, m5 + mova [tmpq], m4 + add tmpq, 64 + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: +%if WIN64 + push r8 +%endif + mova m11, [prep_endC] + lea r5, [hq+wq*8-256] +.v_w32_loop0: + movu m4, [srcq+r6 *2] + movu m5, [srcq+r6 *1] + lea r7, [srcq+ssq*2] + movu m6, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + mov r8, tmpq + movu m8, [r7 +ssq*0] + punpcklwd m0, m4, m5 ; 01 + punpckhwd m4, m5 + punpcklwd m1, m5, m6 ; 12 + punpckhwd m5, m6 + punpcklwd m2, m6, m7 ; 23 + punpckhwd m6, 
m7 + punpcklwd m3, m7, m8 ; 34 + punpckhwd m7, m8 +.v_w32_loop: + mova m16, m10 + movu m9, [r7+ssq*1] + mova m18, m10 + vpdpwssd m16, m12, m0 ; a0 + mova m17, m10 + vpdpwssd m18, m12, m4 + mova m19, m10 + vpdpwssd m17, m12, m1 ; b0 + lea r7, [r7+ssq*2] + vpdpwssd m19, m12, m5 + mova m0, m2 + vpdpwssd m16, m13, m2 ; a1 + punpcklwd m2, m8, m9 ; 45 + mova m4, m6 + vpdpwssd m18, m13, m6 + punpckhwd m6, m8, m9 + movu m8, [r7+ssq*0] + vpdpwssd m17, m13, m3 ; b1 + mova m1, m3 + vpdpwssd m19, m13, m7 + mova m5, m7 + vpdpwssd m16, m14, m2 ; a2 + punpcklwd m3, m9, m8 ; 56 + vpdpwssd m18, m14, m6 + punpckhwd m7, m9, m8 + vpdpwssd m17, m14, m3 ; b2 + vpdpwssd m19, m14, m7 + vpermt2b m16, m11, m18 + vpermt2b m17, m11, m19 + mova [r8+wq*0], m16 + mova [r8+wq*2], m17 + lea r8, [r8+wq*4] + sub hd, 2 + jg .v_w32_loop + add srcq, 64 + add tmpq, 64 + movzx hd, r5b + sub r5d, 1<<8 + jg .v_w32_loop0 +%if WIN64 + pop r8 +%endif + vzeroupper + RET +.hv_w4: + movzx mxd, mxb + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + mov r5d, r7m + pmovsxbw xmm1, [base+subpel_filters+1+myq*8] + mov r6, ssq + sub srcq, 2 + shr r5d, 11 + neg r6 + psllw xmm0, [base+prep_hv_shift+r5*8] + psllw xmm1, 2 + mova [tmpq+ 0], xmm0 + mova [tmpq+16], xmm1 + vpbroadcastd m8, [tmpq+ 4] + mov r3d, 0xf0 + vpbroadcastd m9, [tmpq+ 8] + vpbroadcastd m12, xmm1 + movu xm3, [srcq+r6 *2] + kmovb k1, r3d + vinserti32x4 ym3, [srcq+r6 *1], 1 + vbroadcasti32x4 m2, [srcq+ssq*0] + vinserti64x2 m3{k1}, m2, [srcq+ssq*1], 3 + movu xm4, [srcq+ssq*2] + vbroadcasti32x4 m5, [spel_h_shufA] + vbroadcasti32x4 m6, [spel_h_shufB] + mova m1, m11 + mova m15, [spel_shuf4a] + mova xm2, xm11 + pshufb m0, m3, m5 + vpdpwssd m1, m8, m0 + pshufb xm0, xm4, xm5 + vpdpwssd xm2, xm8, xm0 + vpbroadcastd m13, [tmpq+20] + pshufb m3, m6 + vpbroadcastd m14, [tmpq+24] + pshufb xm4, xm6 + mova m7, [spel_shuf4b] + vpdpwssd m1, m9, m3 ; 0 1 2 3 + vpdpwssd xm2, xm9, xm4 ; 4 + vpermt2b m1, m15, m2 ; 01 12 23 34 + mova ym15, [prep_endA] +.hv_w4_loop: + lea srcq, [srcq+ssq*4] + movu xm4, [srcq+r6 *1] + vinserti32x4 ym4, [srcq+ssq*0], 1 + vbroadcasti32x4 m3, [srcq+ssq*1] + vinserti64x2 m4{k1}, m3, [srcq+ssq*2], 3 + mova m2, m11 + pshufb m3, m4, m5 + vpdpwssd m2, m8, m3 + mova m3, m10 + vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 + pshufb m4, m6 + vpdpwssd m2, m9, m4 ; 5 6 7 8 + mova m4, m1 + vpermt2b m1, m7, m2 ; 45 56 67 78 + vpdpwssd m3, m14, m1 ; a2 b2 c2 d2 + vshufi32x4 m4, m1, q1032 ; 23 34 45 56 + vpdpwssd m3, m13, m4 ; a1 b1 c1 d1 + vpermb m3, m15, m3 + mova [tmpq], ym3 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + mova m8, [spel_h_shufA] + movu ym18, [srcq+r6 *2] + vinserti32x8 m18, [srcq+r6 *1], 1 ; 0 1 + movu ym19, [srcq+ssq*0] + vinserti32x8 m19, [srcq+ssq*1], 1 ; 2 3 + movu ym20, [srcq+ssq*2] ; 4 + movu m9, [spel_h_shufC] + mova m21, [spel_shuf8a] + mova m0, [spel_shuf8b] + vpermb m4, m8, m18 + mova m1, m10 + vpermb m5, m8, m19 + vpdpwssd m1, m12, m4 ; a0 b0 + mova m2, m10 + vpermb m6, m8, m20 + vpdpwssd m2, m12, m5 ; c0 d0 + mova m3, m10 + vpermb m18, m9, m18 + vpdpwssd m3, m12, m6 ; e0 + mova m7, [prep_endB] + vpermb m19, m9, m19 + vpdpwssd m1, m14, m18 ; a2 b2 + vpermb m20, m9, m20 + vpdpwssd m2, m14, m19 ; c2 d2 + shufpd m4, m18, 0x55 + vpdpwssd m3, m14, m20 ; e2 + shufpd m5, m19, 0x55 + vpdpwssd m1, m13, m4 ; a1 b1 + shufpd m6, m20, 0x55 + vpdpwssd m2, m13, m5 ; c1 d1 + vpdpwssd m3, m13, m6 ; e1 + vpermt2b m1, m21, m2 ; 01 12 + vpermt2b m2, m21, m3 ; 23 34 +.hv_w8_loop: + lea srcq, [srcq+ssq*4] + movu ym18, 
[srcq+r6 *1] + vinserti32x8 m18, [srcq+ssq*0], 1 + movu ym19, [srcq+ssq*1] + vinserti32x8 m19, [srcq+ssq*2], 1 + mova m3, m10 + vpermb m5, m8, m18 + mova m4, m10 + vpermb m6, m8, m19 + vpdpwssd m3, m12, m5 ; f0 g0 + mova m20, m11 + vpdpwssd m4, m12, m6 ; h0 i0 + mova m21, m11 + vpdpwssd m20, m15, m1 ; A0 B0 + vpermb m18, m9, m18 + vpdpwssd m21, m15, m2 ; C0 D0 + vpermb m19, m9, m19 + vpdpwssd m3, m14, m18 ; f2 g2 + vpdpwssd m4, m14, m19 ; h2 i2 + shufpd m5, m18, 0x55 + vpdpwssd m20, m16, m2 ; A1 B1 + shufpd m6, m19, 0x55 + vpdpwssd m3, m13, m5 ; f1 g1 + vpdpwssd m4, m13, m6 ; h1 i1 + vpermt2b m2, m0, m3 ; 45 56 + vpdpwssd m21, m16, m2 ; C1 D1 + mova m1, m2 + vpermt2b m2, m0, m4 ; 67 78 + vpdpwssd m20, m17, m1 ; A2 B2 + vpdpwssd m21, m17, m2 ; A2 B2 + vpermt2b m20, m7, m21 + mova [tmpq], m20 + add tmpq, 64 + sub hd, 4 + jg .hv_w8_loop + vzeroupper + RET +.hv: + vpbroadcastd m11, [pd_128] + cmp wd, 4 + je .hv_w4 + shr mxd, 16 + pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + mov r5d, r7m + pmovsxbw xmm1, [base+subpel_filters+1+myq*8] + mov r6, ssq + sub srcq, 4 + shr r5d, 11 + neg r6 + psllw xmm0, [base+prep_hv_shift+r5*8] + psllw xmm1, 2 + mova [tmpq+ 0], xmm0 + mova [tmpq+16], xmm1 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [tmpq+ 4] + vpbroadcastd m14, [tmpq+ 8] + vpbroadcastd m15, xmm1 + vpbroadcastd m16, [tmpq+20] + vpbroadcastd m17, [tmpq+24] + cmp wd, 16 + jl .hv_w8 + vbroadcasti32x4 m8, [spel_h_shufA] + vbroadcasti32x4 m9, [spel_h_shufB] + jg .hv_w32 + vbroadcasti32x8 m6, [srcq+r6 *2+ 8] + vinserti32x8 m2, m6, [srcq+r6 *2+16], 1 + vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0 + movu ym18, [srcq+r6 *1+ 0] + movu ym19, [srcq+r6 *1+12] + vinserti32x8 m18, [srcq+ssq*0+ 0], 1 + vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 1 2 + movu ym20, [srcq+ssq*1+ 0] + movu ym21, [srcq+ssq*1+12] + lea srcq, [srcq+ssq*2] + vinserti32x8 m20, [srcq+ssq*0+ 0], 1 + vinserti32x8 m21, [srcq+ssq*0+12], 1 ; 3 4 + pshufb m2, m8 + mova m1, m10 + pshufb m3, m18, m8 + vpdpwssd m1, m14, m2 ; a2 + mova m2, m10 + pshufb m4, m19, m9 + vpdpwssd m2, m12, m3 ; b0 c0 + mova m3, m10 + pshufb m5, m20, m8 + vpdpwssd m3, m14, m4 ; b2' c2' + mova m4, m10 + pshufb m7, m21, m9 + vpdpwssd m4, m12, m5 ; d0 e0 + mova m5, m10 + pshufb m0, m6, m8 + vpdpwssd m5, m14, m7 ; d2' e2' + mova m7, [spel_shuf16] + pshufb m18, m9 + vpdpwssd m1, m12, m0 ; a0 + pshufb m19, m8 + vpdpwssd m2, m13, m18 ; b1 c1 + pshufb m20, m9 + vpdpwssd m3, m13, m19 ; b1' c1' + pshufb m21, m8 + vpdpwssd m4, m13, m20 ; d1 e1 + pshufb m6, m9 + vpdpwssd m5, m13, m21 ; d1' e1' + mova m0, [prep_endB] + shufpd m18, m19, 0x55 + vpdpwssd m1, m13, m6 ; a1 + shufpd m20, m21, 0x55 + vpdpwssd m2, m14, m18 ; b2 c2 + vpdpwssd m3, m12, m18 ; b0' c0' + vpdpwssd m4, m14, m20 ; d2 e2 + vpdpwssd m5, m12, m20 ; d0' e0' + pslldq m1, 1 + vpermt2b m2, m7, m3 ; 12 + vpermt2b m4, m7, m5 ; 34 + vpshrdd m1, m2, 16 ; 01 + vpshrdd m3, m2, m4, 16 ; 23 +.hv_w16_loop: + movu ym18, [srcq+ssq*1+ 0] + movu ym19, [srcq+ssq*1+12] + lea srcq, [srcq+ssq*2] + vinserti32x8 m18, [srcq+ssq*0+ 0], 1 + vinserti32x8 m19, [srcq+ssq*0+12], 1 + mova m5, m10 + mova m6, m10 + pshufb m21, m18, m8 + vpdpwssd m5, m12, m21 ; f0 g0 + pshufb m20, m19, m9 + mova m21, m11 + vpdpwssd m6, m14, m20 ; f2' g2' + mova m20, m11 + vpdpwssd m21, m15, m2 ; B0 + mova m2, m4 + vpdpwssd m20, m15, m1 ; A0 + mova m1, m3 + pshufb m18, m9 + vpdpwssd m5, m13, m18 ; f1 g1 + pshufb m19, m8 + vpdpwssd m6, m13, m19 ; f1' g1' + vpdpwssd m21, m16, m4 ; B1 + vpdpwssd m20, m16, m3 ; A1 + shufpd m18, 
m19, 0x55 + vpdpwssd m5, m14, m18 ; f2 g2 + vpdpwssd m6, m12, m18 ; f0' g0' + mova m4, m7 + vpermi2b m4, m5, m6 ; 56 + vpshrdd m3, m2, m4, 16 ; 45 + vpdpwssd m21, m17, m4 ; B2 + vpdpwssd m20, m17, m3 ; A2 + vpermt2b m20, m0, m21 + mova [tmpq], m20 + add tmpq, 64 + sub hd, 2 + jg .hv_w16_loop + vzeroupper + RET +.hv_w32: + WIN64_SPILL_XMM 29 +%if WIN64 + push r8 +%endif + mova m27, [spel_shuf32] + lea r5d, [hq+wq*8-256] + mova m28, [prep_endC] +.hv_w32_loop0: + movu m18, [srcq+r6 *2+ 0] + movu m7, [srcq+r6 *2+12] + movu m6, [srcq+r6 *1+ 0] + movu m20, [srcq+r6 *1+12] + lea r7, [srcq+ssq*2] + movu m19, [srcq+ssq*0+ 0] + movu m21, [srcq+ssq*0+12] + movu m22, [srcq+ssq*1+ 0] + movu m24, [srcq+ssq*1+12] + mov r8, tmpq + movu m23, [r7 +ssq*0+ 0] + movu m25, [r7 +ssq*0+12] + pshufb m1, m18, m8 + mova m0, m10 + pshufb m2, m7, m9 + vpdpwssd m0, m12, m1 ; a0 + mova m1, m10 + pshufb m4, m6, m8 + vpdpwssd m1, m14, m2 ; a2' + mova m2, m10 + pshufb m3, m19, m8 + vpdpwssd m2, m12, m4 ; b0 + mova m4, m10 + pshufb m5, m20, m9 + vpdpwssd m4, m12, m3 ; c0 + mova m3, m10 + pshufb m26, m21, m9 + vpdpwssd m3, m14, m5 ; b2' + mova m5, m10 + pshufb m18, m9 + vpdpwssd m5, m14, m26 ; c2' + pshufb m7, m8 + vpdpwssd m0, m13, m18 ; a1 + pshufb m6, m9 + vpdpwssd m1, m13, m7 ; a1' + pshufb m19, m9 + vpdpwssd m2, m13, m6 ; b1 + pshufb m20, m8 + vpdpwssd m4, m13, m19 ; c1 + pshufb m21, m8 + vpdpwssd m3, m13, m20 ; b1' + shufpd m18, m7, 0x55 + vpdpwssd m5, m13, m21 ; c1' + shufpd m6, m20, 0x55 + vpdpwssd m0, m14, m18 ; a2 + shufpd m19, m21, 0x55 + vpdpwssd m1, m12, m18 ; a0' + pshufb m18, m22, m8 + vpdpwssd m2, m14, m6 ; b2 + pshufb m7, m23, m8 + vpdpwssd m4, m14, m19 ; c2 + vpdpwssd m3, m12, m6 ; b0' + mova m6, m10 + vpdpwssd m5, m12, m19 ; c0' + pshufb m19, m24, m9 + vpdpwssd m6, m12, m18 ; d0 + mova m18, m10 + pshufb m26, m25, m9 + vpdpwssd m18, m12, m7 ; e0 + mova m7, m10 + pshufb m22, m9 + vpdpwssd m7, m14, m19 ; d2' + mova m19, m10 + pshufb m23, m9 + vpdpwssd m19, m14, m26 ; e2' + pshufb m24, m8 + vpdpwssd m6, m13, m22 ; d1 + pshufb m25, m8 + vpdpwssd m18, m13, m23 ; e1 + shufpd m22, m24, 0x55 + vpdpwssd m7, m13, m24 ; d1' + shufpd m23, m25, 0x55 + vpdpwssd m19, m13, m25 ; e1' + pslldq m0, 1 + vpdpwssd m6, m14, m22 ; d2 + pslldq m1, 1 + vpdpwssd m18, m14, m23 ; e2 + vpermt2b m2, m27, m4 ; 12 + vpdpwssd m7, m12, m22 ; d0' + vpermt2b m3, m27, m5 ; 12' + vpdpwssd m19, m12, m23 ; e0' + vpshrdd m0, m2, 16 ; 01 + vpermt2b m6, m27, m18 ; 34 + vpshrdd m1, m3, 16 ; 01' + vpermt2b m7, m27, m19 ; 34' + vpshrdd m4, m2, m6, 16 ; 23 + vpshrdd m5, m3, m7, 16 ; 23' +.hv_w32_loop: + movu m22, [r7+ssq*1+ 0] + movu m24, [r7+ssq*1+12] + lea r7, [r7+ssq*2] + movu m23, [r7+ssq*0+ 0] + movu m25, [r7+ssq*0+12] + mova m19, m11 + vpdpwssd m19, m15, m2 ; B0 + mova m21, m11 + vpdpwssd m21, m15, m3 ; B0' + mova m18, m11 + vpdpwssd m18, m15, m0 ; A0 + mova m20, m11 + vpdpwssd m20, m15, m1 ; A0' + mova m2, m6 + vpdpwssd m19, m16, m6 ; B1 + mova m3, m7 + vpdpwssd m21, m16, m7 ; B1' + mova m0, m4 + vpdpwssd m18, m16, m4 ; A1 + mova m1, m5 + pshufb m4, m22, m8 + vpdpwssd m20, m16, m5 ; A1' + mova m6, m10 + pshufb m7, m23, m8 + vpdpwssd m6, m12, m4 ; f0 + mova m4, m10 + pshufb m5, m24, m9 + vpdpwssd m4, m12, m7 ; g0 + mova m7, m10 + pshufb m26, m25, m9 + vpdpwssd m7, m14, m5 ; f2' + mova m5, m10 + pshufb m22, m9 + vpdpwssd m5, m14, m26 ; g2' + pshufb m23, m9 + vpdpwssd m6, m13, m22 ; f1 + pshufb m24, m8 + vpdpwssd m4, m13, m23 ; g1 + pshufb m25, m8 + vpdpwssd m7, m13, m24 ; f1' + shufpd m22, m24, 0x55 + vpdpwssd m5, m13, m25 ; g1' + shufpd m23, m25, 
0x55 + vpdpwssd m6, m14, m22 ; f2 + vpdpwssd m4, m14, m23 ; g2 + vpdpwssd m7, m12, m22 ; f0' + vpdpwssd m5, m12, m23 ; g0' + vpermt2b m6, m27, m4 ; 56 + vpermt2b m7, m27, m5 ; 56' + vpdpwssd m19, m17, m6 ; B2 + vpshrdd m4, m2, m6, 16 ; 45 + vpdpwssd m21, m17, m7 ; B2' + vpshrdd m5, m3, m7, 16 ; 45' + vpdpwssd m18, m17, m4 ; A2 + vpdpwssd m20, m17, m5 ; A2' + vpermt2b m19, m28, m21 + vpermt2b m18, m28, m20 + mova [r8+wq*0], m18 + mova [r8+wq*2], m19 + lea r8, [r8+wq*4] + sub hd, 2 + jg .hv_w32_loop + add srcq, 64 + add tmpq, 64 + movzx hd, r5b + sub r5d, 1<<8 + jg .hv_w32_loop0 +%if WIN64 + pop r8 +%endif + RET + +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc +PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc +PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc +PREP_8TAP_FN sharp, SHARP, SHARP + +cglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my %define base r7-prep_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h @@ -2580,20 +3974,239 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 test mxd, 0xf00 jnz .h test myd, 0xf00 - jnz .v - tzcnt wd, wd - mov r5d, r7m ; bitdepth_max - vpbroadcastd m5, [pw_8192] - movzx wd, word [r7+wq*2+table_offset(prep,)] + jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + mov r5d, r7m + vpbroadcastd m10, [prep_8tap_rnd] + pmovsxbw xmm0, [base+subpel_filters+myq*8] + tzcnt r6d, wd shr r5d, 11 - vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] - add wq, r7 + movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] + psllw xmm0, [base+prep_hv_shift+r5*8] + add r7, r6 lea r6, [strideq*3] + sub srcq, r6 + mova [tmpq], xmm0 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [tmpq+ 4] + vpbroadcastd m14, [tmpq+ 8] + vpbroadcastd m15, [tmpq+12] + jmp r7 +.v_w4: + mov r3d, 0x330c + movq xm1, [srcq+strideq*0] + kmovw k1, r3d + vpbroadcastq ym1{k1}, [srcq+strideq*1] + vpbroadcastq m0, [srcq+r6 ] + vinserti32x4 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3 + lea srcq, [srcq+strideq*4] + vpbroadcastq ym0{k1}, [srcq+strideq*0] + vpbroadcastq m2, [srcq+strideq*1] + vinserti32x4 m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6 + mova ym5, [prep_endA] + vshufi32x4 m3, m1, m0, q1021 ; 1 2 3 4 + vshufi32x4 m2, m1, m0, q2132 ; 2 3 4 5 + punpcklwd m1, m3 ; 01 12 23 34 + punpcklwd m2, m0 ; 23 34 45 56 +.v_w4_loop: + movq xm4, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + vpbroadcastq ym4{k1}, [srcq+strideq*0] + vpbroadcastq m3, [srcq+strideq*1] + vinserti32x4 m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a + mova m3, m10 + vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 + valignq m1, m4, m0, 6 ; 6 7 8 9 + vpdpwssd m3, m13, m2 ; a1 b1 c1 d1 + mova m0, m4 + punpcklwd m4, m1, m4 ; 67 78 89 9a + vpdpwssd m3, m15, m4 ; a3 b3 c3 d3 + vshufi32x4 m1, m2, m4, q1032 ; 45 56 67 78 + vpdpwssd m3, m14, m1 ; a2 b2 c2 d2 + mova m2, m4 + vpermb m3, m5, m3 + mova [tmpq], ym3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movu xm0, [srcq+strideq*0] + mov r3d, 0x33 + vbroadcasti32x4 ym1, [srcq+strideq*1] + kmovb k1, r3d + mova m7, [spel_v_shuf8] + vinserti64x2 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 + add srcq, r6 + vbroadcasti32x4 ym2, [srcq+strideq*0] + vbroadcasti32x4 m3, [srcq+strideq*1] + vbroadcasti32x4 ym0, [srcq+strideq*2] + vshufi64x2 m2{k1}, m1, m3, q1032 ; 2 3 4 + vinserti64x2 m0{k1}, m3, [srcq+r6], 2 ; 4 5 6 + mova m8, [prep_endB] + vpermb m1, m7, m1 ; 01 12 + vpermb m2, m7, m2 ; 23 34 + 
vpermb m3, m7, m0 ; 45 56 +.v_w8_loop: + lea srcq, [srcq+strideq*4] + vbroadcasti32x4 ym4, [srcq+strideq*0] + movu xm5, [srcq+strideq*1] + vshufi64x2 m4{k1}, m0, m5, q1032 ; 6 7 8 + vbroadcasti32x4 ym0, [srcq+strideq*2] + vinserti64x2 m0{k1}, m5, [srcq+r6], 2 ; 8 9 a + mova m5, m10 + vpdpwssd m5, m12, m1 ; a0 b0 + mova m6, m10 + vpdpwssd m6, m12, m2 ; c0 d0 + mova m1, m3 + vpdpwssd m5, m13, m2 ; c1 d1 + vpdpwssd m6, m13, m3 ; c1 d1 + vpermb m2, m7, m4 ; 67 78 + vpdpwssd m5, m14, m3 ; a2 b2 + vpermb m3, m7, m0 ; 89 9a + vpdpwssd m6, m14, m2 ; c2 d2 + vpdpwssd m5, m15, m2 ; a3 b3 + vpdpwssd m6, m15, m3 ; c3 d3 + vpermt2b m5, m8, m6 + mova [tmpq], m5 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + vbroadcasti32x8 m0, [srcq+strideq*1] + vinserti32x8 m1, m0, [srcq+strideq*2], 1 + vinserti32x8 m0, [srcq+strideq*0], 0 + mova m8, [spel_v_shuf16] + add srcq, r6 + movu ym3, [srcq+strideq*0] + vinserti32x8 m3, [srcq+strideq*1], 1 + movu ym5, [srcq+strideq*2] + add srcq, r6 + vinserti32x8 m5, [srcq+strideq*0], 1 + mova m11, [prep_endA] + vpermb m1, m8, m1 ; 12 + vpermb m0, m8, m0 ; 01 + vpermb m3, m8, m3 ; 34 + vpermb m5, m8, m5 ; 56 + vpshrdd m2, m1, m3, 16 ; 23 + vpshrdd m4, m3, m5, 16 ; 45 +.v_w16_loop: + mova m7, m10 + vpdpwssd m7, m12, m1 ; b0 + mova m6, m10 + vpdpwssd m6, m12, m0 ; a0 + mova m1, m3 + vpdpwssd m7, m13, m3 ; b1 + mova m0, m2 + vpdpwssd m6, m13, m2 ; a1 + mova m3, m5 + vpdpwssd m7, m14, m5 ; b2 + mova m2, m4 + vpdpwssd m6, m14, m4 ; a2 + movu ym5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti32x8 m5, [srcq+strideq*0], 1 + vpermb m5, m8, m5 ; 78 + vpshrdd m4, m3, m5, 16 ; 67 + vpdpwssd m7, m15, m5 ; b3 + vpdpwssd m6, m15, m4 ; a3 + vpermt2b m6, m11, m7 + mova [tmpq], m6 + add tmpq, 64 + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: + WIN64_PUSH_XMM 23 %if WIN64 - pop r7 + push r8 %endif - jmp wq + mova m11, [prep_endC] + lea r5, [hq+wq*8-256] +.v_w32_loop0: + movu m16, [srcq+strideq*0] + movu m17, [srcq+strideq*1] + lea r7, [srcq+r6] + movu m18, [srcq+strideq*2] + movu m19, [r7 +strideq*0] + mov r8, tmpq + movu m20, [r7 +strideq*1] + movu m21, [r7 +strideq*2] + add r7, r6 + movu m22, [r7 +strideq*0] + punpcklwd m0, m16, m17 ; 01l + punpckhwd m16, m17 ; 01h + punpcklwd m1, m17, m18 ; 12l + punpckhwd m17, m18 ; 12h + punpcklwd m2, m18, m19 ; 23l + punpckhwd m18, m19 ; 23h + punpcklwd m3, m19, m20 ; 34l + punpckhwd m19, m20 ; 34h + punpcklwd m4, m20, m21 ; 45l + punpckhwd m20, m21 ; 45h + punpcklwd m5, m21, m22 ; 56l + punpckhwd m21, m22 ; 56h +.v_w32_loop: + mova m6, m10 + vpdpwssd m6, m12, m0 ; a0l + mova m8, m10 + vpdpwssd m8, m12, m16 ; a0h + mova m7, m10 + vpdpwssd m7, m12, m1 ; b0l + mova m9, m10 + vpdpwssd m9, m12, m17 ; b0h + mova m0, m2 + vpdpwssd m6, m13, m2 ; a1l + mova m16, m18 + vpdpwssd m8, m13, m18 ; a1h + mova m1, m3 + vpdpwssd m7, m13, m3 ; b1l + mova m17, m19 + vpdpwssd m9, m13, m19 ; b1h + mova m2, m4 + vpdpwssd m6, m14, m4 ; a2l + mova m18, m20 + vpdpwssd m8, m14, m20 ; a2h + mova m3, m5 + vpdpwssd m7, m14, m5 ; b2l + mova m19, m21 + vpdpwssd m9, m14, m21 ; b2h + movu m21, [r7+strideq*1] + lea r7, [r7+strideq*2] + punpcklwd m4, m22, m21 ; 67l + punpckhwd m20, m22, m21 ; 67h + movu m22, [r7+strideq*0] + vpdpwssd m6, m15, m4 ; a3l + vpdpwssd m8, m15, m20 ; a3h + punpcklwd m5, m21, m22 ; 78l + punpckhwd m21, m22 ; 78h + vpdpwssd m7, m15, m5 ; b3l + vpdpwssd m9, m15, m21 ; b3h + vpermt2b m6, m11, m8 + vpermt2b m7, m11, m9 + mova [r8+wq*0], m6 + mova [r8+wq*2], m7 + lea r8, [r8+wq*4] + sub hd, 2 + jg .v_w32_loop + add srcq, 64 + add 
tmpq, 64 + movzx hd, r5b + sub r5d, 1<<8 + jg .v_w32_loop0 +%if WIN64 + pop r8 +%endif + RET .h_w4: + RESET_STACK_STATE movzx mxd, mxb sub srcq, 2 pmovsxbw xmm0, [base+subpel_filters+mxq*8] @@ -2623,27 +4236,6 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 sub hd, 4 jg .h_w4_loop RET -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m10, [prep_8tap_rnd] - lea r6, [strideq*3] - cmp wd, 4 - je .h_w4 - shr mxd, 16 - pmovsxbw xmm0, [base+subpel_filters+mxq*8] - mov r5d, r7m - sub srcq, 6 - shr r5d, 11 - psllw xmm0, [base+prep_hv_shift+r5*8] - mova [tmpq], xmm0 - vpbroadcastd m12, xmm0 - vpbroadcastd m13, [tmpq+ 4] - vpbroadcastd m14, [tmpq+ 8] - vpbroadcastd m15, [tmpq+12] - cmp wd, 16 - je .h_w16 - jg .h_w32 .h_w8: mova m6, [spel_h_shufA] movu m7, [spel_h_shufB] @@ -2680,10 +4272,30 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 sub hd, 4 jg .h_w8_loop RET -.h_w16: +.h: + vpbroadcastd m10, [prep_8tap_rnd] + test myd, 0xf00 + jnz .hv + lea r6, [strideq*3] + cmp wd, 4 + je .h_w4 + shr mxd, 16 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + mov r5d, r7m + sub srcq, 6 + shr r5d, 11 + psllw xmm0, [base+prep_hv_shift+r5*8] + mova [tmpq], xmm0 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [tmpq+ 4] + vpbroadcastd m14, [tmpq+ 8] + vpbroadcastd m15, [tmpq+12] + cmp wd, 16 + jl .h_w8 vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] mova m11, [prep_endC] + jg .h_w32 .h_w16_loop: movu ym2, [srcq+strideq*0+ 0] vinserti32x8 m2, [srcq+strideq*1+ 0], 1 @@ -2714,11 +4326,8 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 jg .h_w16_loop RET .h_w32: - vbroadcasti32x4 m6, [spel_h_shufA] lea srcq, [srcq+wq*2] - vbroadcasti32x4 m7, [spel_h_shufB] neg wq - mova m11, [prep_endC] .h_w32_loop0: mov r6, wq .h_w32_loop: @@ -2750,238 +4359,8 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 dec hd jg .h_w32_loop0 RET -.v: - movzx mxd, myb - shr myd, 16 - cmp hd, 4 - cmove myd, mxd - mov r5d, r7m - vpbroadcastd m10, [prep_8tap_rnd] - pmovsxbw xmm0, [base+subpel_filters+myq*8] - tzcnt r6d, wd - shr r5d, 11 - movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] - psllw xmm0, [base+prep_hv_shift+r5*8] - add r7, r6 - lea r6, [strideq*3] - sub srcq, r6 - mova [tmpq], xmm0 - vpbroadcastd m12, xmm0 - vpbroadcastd m13, [tmpq+ 4] - vpbroadcastd m14, [tmpq+ 8] - vpbroadcastd m15, [tmpq+12] - jmp r7 -.v_w4: - movq xmm1, [srcq+strideq*0] - vpbroadcastq ymm0, [srcq+strideq*1] - vpbroadcastq ymm2, [srcq+strideq*2] - add srcq, r6 - vpbroadcastq ymm4, [srcq+strideq*0] - vpbroadcastq ymm3, [srcq+strideq*1] - vpbroadcastq ymm5, [srcq+strideq*2] - mova xm11, [prep_endA] - add srcq, r6 - vpblendd ymm1, ymm0, 0x30 - vpblendd ymm0, ymm2, 0x30 - punpcklwd ymm1, ymm0 ; 01 12 - vpbroadcastq ymm0, [srcq+strideq*0] - vpblendd ymm2, ymm4, 0x30 - vpblendd ymm4, ymm3, 0x30 - punpcklwd ymm2, ymm4 ; 23 34 - vpblendd ymm3, ymm5, 0x30 - vpblendd ymm5, ymm0, 0x30 - punpcklwd ymm3, ymm5 ; 45 56 -.v_w4_loop: - vpbroadcastq ymm5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - mova ymm4, ym10 - vpdpwssd ymm4, ym12, ymm1 ; a0 b0 - mova ymm1, ymm2 - vpdpwssd ymm4, ym13, ymm2 ; a1 b1 - mova ymm2, ymm3 - vpdpwssd ymm4, ym14, ymm3 ; a2 b2 - vpblendd ymm3, ymm0, ymm5, 0x30 - vpbroadcastq ymm0, [srcq+strideq*0] - vpblendd ymm5, ymm0, 0x30 - punpcklwd ymm3, ymm5 ; 67 78 - vpdpwssd ymm4, ym15, ymm3 ; a3 b3 - vpermb ymm4, ym11, ymm4 - mova [tmpq], xmm4 - add tmpq, 16 - sub hd, 2 - jg .v_w4_loop - vzeroupper - RET -.v_w8: - 
vbroadcasti32x4 m2, [srcq+strideq*2] - vinserti32x4 m1, m2, [srcq+strideq*0], 0 - vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2 - add srcq, r6 - vinserti32x4 ym2, [srcq+strideq*0], 1 - vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4 - mova m6, [spel_v_shuf8] - movu xm0, [srcq+strideq*1] - vinserti32x4 ym0, [srcq+strideq*2], 1 - add srcq, r6 - vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 - mova ym11, [prep_endB] - vpermb m1, m6, m1 ; 01 12 - vpermb m2, m6, m2 ; 23 34 - vpermb m3, m6, m0 ; 45 56 -.v_w8_loop: - vinserti32x4 m0, [srcq+strideq*1], 3 - lea srcq, [srcq+strideq*2] - movu xm5, [srcq+strideq*0] - mova m4, m10 - vpdpwssd m4, m12, m1 ; a0 b0 - mova m1, m2 - vshufi32x4 m0, m5, q1032 ; 6 7 8 - vpdpwssd m4, m13, m2 ; a1 b1 - mova m2, m3 - vpdpwssd m4, m14, m3 ; a2 b2 - vpermb m3, m6, m0 ; 67 78 - vpdpwssd m4, m15, m3 ; a3 b3 - vpermb m4, m11, m4 - mova [tmpq], ym4 - add tmpq, 32 - sub hd, 2 - jg .v_w8_loop - RET -.v_w16: - vbroadcasti32x8 m1, [srcq+strideq*1] - vinserti32x8 m0, m1, [srcq+strideq*0], 0 - vinserti32x8 m1, [srcq+strideq*2], 1 - mova m8, [spel_v_shuf16] - add srcq, r6 - movu ym3, [srcq+strideq*0] - vinserti32x8 m3, [srcq+strideq*1], 1 - movu ym5, [srcq+strideq*2] - add srcq, r6 - vinserti32x8 m5, [srcq+strideq*0], 1 - mova m11, [prep_endA] - vpermb m0, m8, m0 ; 01 - vpermb m1, m8, m1 ; 12 - vpermb m3, m8, m3 ; 34 - vpermb m5, m8, m5 ; 56 - vpshrdd m2, m1, m3, 16 ; 23 - vpshrdd m4, m3, m5, 16 ; 45 -.v_w16_loop: - mova m6, m10 - mova m7, m10 - vpdpwssd m6, m12, m0 ; a0 - mova m0, m2 - vpdpwssd m7, m12, m1 ; b0 - mova m1, m3 - vpdpwssd m6, m13, m2 ; a1 - mova m2, m4 - vpdpwssd m7, m13, m3 ; b1 - mova m3, m5 - vpdpwssd m6, m14, m4 ; a2 - mova m4, m5 - vpdpwssd m7, m14, m5 ; b2 - movu ym5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vinserti32x8 m5, [srcq+strideq*0], 1 - vpermb m5, m8, m5 ; 78 - vpshrdd m4, m5, 16 ; 67 - vpdpwssd m6, m15, m4 ; a3 - vpdpwssd m7, m15, m5 ; b3 - vpermt2b m6, m11, m7 - mova [tmpq], m6 - add tmpq, 64 - sub hd, 2 - jg .v_w16_loop - RET -.v_w32: -.v_w64: -.v_w128: -%if WIN64 - PUSH r8 - movaps [rsp+stack_offset+8], xmm6 -%endif - lea r5, [hq+wq*8-256] - mov r7, srcq - mov r8, tmpq -.v_w32_loop0: - movu m16, [srcq+strideq*0] - movu m17, [srcq+strideq*1] - movu m18, [srcq+strideq*2] - add srcq, r6 - movu m19, [srcq+strideq*0] - movu m20, [srcq+strideq*1] - movu m21, [srcq+strideq*2] - add srcq, r6 - movu m22, [srcq+strideq*0] - mova m11, [prep_endC] - punpcklwd m0, m16, m17 ; 01l - punpckhwd m16, m17 ; 01h - punpcklwd m1, m17, m18 ; 12l - punpckhwd m17, m18 ; 12h - punpcklwd m2, m18, m19 ; 23l - punpckhwd m18, m19 ; 23h - punpcklwd m3, m19, m20 ; 34l - punpckhwd m19, m20 ; 34h - punpcklwd m4, m20, m21 ; 45l - punpckhwd m20, m21 ; 45h - punpcklwd m5, m21, m22 ; 56l - punpckhwd m21, m22 ; 56h -.v_w32_loop: - mova m6, m10 - vpdpwssd m6, m12, m0 ; a0l - mova m8, m10 - vpdpwssd m8, m12, m16 ; a0h - mova m7, m10 - vpdpwssd m7, m12, m1 ; b0l - mova m9, m10 - vpdpwssd m9, m12, m17 ; b0h - mova m0, m2 - vpdpwssd m6, m13, m2 ; a1l - mova m16, m18 - vpdpwssd m8, m13, m18 ; a1h - mova m1, m3 - vpdpwssd m7, m13, m3 ; b1l - mova m17, m19 - vpdpwssd m9, m13, m19 ; b1h - mova m2, m4 - vpdpwssd m6, m14, m4 ; a2l - mova m18, m20 - vpdpwssd m8, m14, m20 ; a2h - mova m3, m5 - vpdpwssd m7, m14, m5 ; b2l - mova m19, m21 - vpdpwssd m9, m14, m21 ; b2h - movu m21, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - punpcklwd m4, m22, m21 ; 67l - punpckhwd m20, m22, m21 ; 67h - movu m22, [srcq+strideq*0] - vpdpwssd m6, m15, m4 ; a3l - vpdpwssd m8, m15, m20 ; a3h - punpcklwd m5, m21, 
m22 ; 78l - punpckhwd m21, m22 ; 78h - vpdpwssd m7, m15, m5 ; b3l - vpdpwssd m9, m15, m21 ; b3h - vpermt2b m6, m11, m8 - vpermt2b m7, m11, m9 - mova [tmpq+wq*0], m6 - mova [tmpq+wq*2], m7 - lea tmpq, [tmpq+wq*4] - sub hd, 2 - jg .v_w32_loop - add r7, 64 - add r8, 64 - movzx hd, r5b - mov srcq, r7 - mov tmpq, r8 - sub r5d, 1<<8 - jg .v_w32_loop0 -%if WIN64 - movaps xmm6, [rsp+stack_offset+8] - POP r8 -%endif - vzeroupper - RET .hv: + vpbroadcastd m11, [pd_128] cmp wd, 4 jg .hv_w8 movzx mxd, mxb @@ -2998,65 +4377,66 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 sub srcq, r6 psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 - vpbroadcastd m10, [prep_8tap_rnd] - vpbroadcastd ym11, [pd_128] - mova xm21, [prep_endA] mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 + vpbroadcastd m12, xmm1 + movu xm16, [srcq+strideq*0] + mov r3d, 0xff0 + vinserti128 ym16, [srcq+strideq*1], 1 + kmovw k1, r3d + vbroadcasti32x4 m18, [srcq+strideq*2] + add srcq, r6 + vinserti64x2 m16{k1}, m18, [srcq+strideq*0], 3 + movu xm17, [srcq+strideq*1] + vbroadcasti32x4 ym18, [srcq+strideq*2] + add srcq, r6 + vinserti32x4 m17{k1}, m18, [srcq+strideq*0], 2 + vbroadcasti32x4 m5, [spel_h_shufA] + vbroadcasti32x4 m6, [spel_h_shufB] vpbroadcastd m8, [tmpq+ 4] vpbroadcastd m9, [tmpq+ 8] - vpbroadcastd ym12, xmm1 - vpbroadcastd ym13, [tmpq+20] - vpbroadcastd ym14, [tmpq+24] - vpbroadcastd ym15, [tmpq+28] - movu xm4, [srcq+strideq*0] - vinserti32x4 ym4, [srcq+strideq*1], 1 - vinserti32x4 m4, [srcq+strideq*2], 2 - add srcq, r6 - vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3 - movu xm0, [srcq+strideq*1] - vinserti32x4 ym0, [srcq+strideq*2], 1 - add srcq, r6 - vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 - vbroadcasti32x4 m19, [spel_h_shufA] - vbroadcasti32x4 m20, [spel_h_shufB] - mova ym6, [spel_shuf4a] - mova ym7, [spel_shuf4b] + mova m1, m10 + mova m19, [spel_shuf4a] mova m2, m10 - mova m3, m10 - pshufb m1, m4, m19 - vpdpwssd m2, m8, m1 - pshufb m1, m0, m19 - vpdpwssd m3, m8, m1 - pshufb m4, m20 - vpdpwssd m2, m9, m4 - pshufb m0, m20 - vpdpwssd m3, m9, m0 - vpermb m1, m6, m2 ; 01 12 - vshufi32x4 m2, m3, q1032 - vpermb m3, m6, m3 ; 45 56 - vpermb m2, m6, m2 ; 23 34 + pshufb m0, m16, m5 + vpdpwssd m1, m8, m0 + pshufb m0, m17, m5 + vpdpwssd m2, m8, m0 + vpbroadcastd m13, [tmpq+20] + pshufb m16, m6 + vpbroadcastd m14, [tmpq+24] + pshufb m17, m6 + vpbroadcastd m15, [tmpq+28] + vpdpwssd m1, m9, m16 ; 0 1 2 3 + vpdpwssd m2, m9, m17 ; 4 5 6 + mova m7, [spel_shuf4b] + vpermt2b m1, m19, m2 ; 01 12 23 34 + vpermb m2, m19, m2 ; 45 56 + mova ym19, [prep_endA] + vshufi32x4 m2, m1, m2, q1032 ; 23 34 45 56 .hv_w4_loop: - movu xm18, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vinserti128 ym18, [srcq+strideq*0], 1 - mova ym16, ym11 - mova ym4, ym10 - pshufb ym17, ym18, ym19 - vpdpwssd ym16, ym12, ym1 ; a0 b0 - vpdpwssd ym4, ym8, ym17 - pshufb ym18, ym20 - mova ym1, ym2 - vpdpwssd ym16, ym13, ym2 ; a1 b1 - vpdpwssd ym4, ym9, ym18 ; 7 8 - mova ym2, ym3 - vpdpwssd ym16, ym14, ym3 ; a2 b2 - vpermt2b ym3, ym7, ym4 ; 67 78 - vpdpwssd ym16, ym15, ym3 ; a3 b3 - vpermb ym16, ym21, ym16 - mova [tmpq], xm16 - add tmpq, 16 - sub hd, 2 + movu xm17, [srcq+strideq*1] + vinserti128 ym17, [srcq+strideq*2], 1 + vbroadcasti32x4 m16, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + vinserti64x2 m17{k1}, m16, [srcq+strideq*0], 3 + mova m18, m10 + pshufb m16, m17, m5 + vpdpwssd m18, m8, m16 + mova m16, m11 + vpdpwssd m16, m12, m1 ; a0 b0 c0 d0 + pshufb m17, m6 + vpdpwssd m18, m9, m17 ; 7 8 9 a + mova m1, m2 + vpdpwssd m16, m13, m2 ; a1 b1 c1 d1 + 
vpermt2b m2, m7, m18 ; 67 78 89 9a + vpdpwssd m16, m15, m2 ; a3 b3 c3 d3 + vshufi32x4 m1, m2, q1032 ; 45 56 67 78 + vpdpwssd m16, m14, m1 ; a2 b2 c2 d2 + vpermb m16, m19, m16 + mova [tmpq], ym16 + add tmpq, 32 + sub hd, 4 jg .hv_w4_loop vzeroupper RET @@ -3073,8 +4453,6 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 sub srcq, 6 shr r5d, 11 sub srcq, r6 - vpbroadcastd m10, [prep_8tap_rnd] - vpbroadcastd m11, [pd_128] psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 mova [tmpq+ 0], xmm0 @@ -3087,10 +4465,9 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 vpbroadcastd m17, [tmpq+20] vpbroadcastd m18, [tmpq+24] vpbroadcastd m19, [tmpq+28] - cmp wd, 16 - je .hv_w16 - jg .hv_w32 - WIN64_SPILL_XMM 23 + cmp wd, 8 + jg .hv_w16 + WIN64_SPILL_XMM 23 mova m5, [spel_h_shufA] movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 @@ -3174,28 +4551,35 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 jg .hv_w8_loop RET .hv_w16: - WIN64_SPILL_XMM 27 + WIN64_SPILL_XMM 27 +%if WIN64 + push r8 +%endif + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + add wd, wd + mova m9, [spel_shuf16] + mova m26, [prep_endB] + lea r5d, [hq+wq*8-256] +.hv_w16_loop0: vbroadcasti32x8 m5, [srcq+strideq*0+ 8] vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 movu ym6, [srcq+strideq*1+ 0] movu ym7, [srcq+strideq*1+16] + lea r7, [srcq+r6] vinserti32x8 m6, [srcq+strideq*2+ 0], 1 vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 - add srcq, r6 - movu ym22, [srcq+strideq*0+ 0] - movu ym23, [srcq+strideq*0+16] - vinserti32x8 m22, [srcq+strideq*1+ 0], 1 - vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4 - movu ym24, [srcq+strideq*2+ 0] - movu ym25, [srcq+strideq*2+16] - add srcq, r6 - vinserti32x8 m24, [srcq+strideq*0+ 0], 1 - vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6 - vbroadcasti32x4 m20, [spel_h_shufA] - vbroadcasti32x4 m21, [spel_h_shufB] - mova m9, [spel_shuf16] - mova m26, [prep_endB] + movu ym22, [r7 +strideq*0+ 0] + movu ym23, [r7 +strideq*0+16] + mov r8, tmpq + vinserti32x8 m22, [r7 +strideq*1+ 0], 1 + vinserti32x8 m23, [r7 +strideq*1+16], 1 ; 3 4 + movu ym24, [r7 +strideq*2+ 0] + movu ym25, [r7 +strideq*2+16] + add r7, r6 + vinserti32x8 m24, [r7 +strideq*0+ 0], 1 + vinserti32x8 m25, [r7 +strideq*0+16], 1 ; 5 6 pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m12, m0 ; a0 @@ -3264,11 +4648,11 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 vpshrdd m3, m2, m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: - movu ym24, [srcq+strideq*1+ 0] - movu ym25, [srcq+strideq*1+16] - lea srcq, [srcq+strideq*2] - vinserti32x8 m24, [srcq+strideq*0+ 0], 1 - vinserti32x8 m25, [srcq+strideq*0+16], 1 + movu ym24, [r7+strideq*1+ 0] + movu ym25, [r7+strideq*1+16] + lea r7, [r7+strideq*2] + vinserti32x8 m24, [r7+strideq*0+ 0], 1 + vinserti32x8 m25, [r7+strideq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 @@ -3304,139 +4688,19 @@ cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 vpdpwssd m23, m19, m7 ; B3 mova m6, m7 vpermt2b m22, m26, m23 - mova [tmpq], m22 - add tmpq, 64 + mova [r8+wq*0], ym22 + vextracti32x8 [r8+wq*1], m22, 1 + lea r8, [r8+wq*2] sub hd, 2 jg .hv_w16_loop - RET -.hv_w32: -%if WIN64 - PUSH r8 - %assign regs_used regs_used + 1 - WIN64_SPILL_XMM 32 -%endif - vbroadcasti32x4 m20, [spel_h_shufA] - vbroadcasti32x4 m21, [spel_h_shufB] - mova m22, [spel_shuf32] - lea r5d, [hq+wq*8-256] - mov r7, srcq - mov 
r8, tmpq -.hv_w32_loop0: - movu m6, [srcq+strideq*0+ 0] - movu m7, [srcq+strideq*0+ 8] - movu m8, [srcq+strideq*0+16] - mova m0, m10 - mova m23, m10 - pshufb m9, m6, m20 - vpdpwssd m0, m12, m9 ; a0l - pshufb m9, m7, m20 - vpdpwssd m23, m12, m9 ; a0h - vpdpwssd m0, m14, m9 ; a2l - pshufb m7, m21 - vpdpwssd m23, m13, m7 ; a1h - vpdpwssd m0, m15, m7 ; a3l - pshufb m7, m8, m20 - vpdpwssd m23, m14, m7 ; a2h - pshufb m6, m21 - vpdpwssd m0, m13, m6 ; a1l - pshufb m8, m21 - vpdpwssd m23, m15, m8 ; a3h - PUT_8TAP_HV_W32 1, 24, strideq, 1, 2 ; 12 - PUT_8TAP_HV_W32 3, 26, strideq, 0, 1 ; 34 - PUT_8TAP_HV_W32 5, 28, strideq, 2, 0 ; 56 - vpshrdd m2, m1, m3, 16 ; 23l - vpshrdd m25, m24, m26, 16 ; 23h - vpshrdd m4, m3, m5, 16 ; 45l - vpshrdd m27, m26, m28, 16 ; 45h -.hv_w32_loop: - movu m7, [srcq+strideq*1+ 0] - movu m9, [srcq+strideq*2+ 0] - movu m6, [srcq+strideq*1+ 8] - movu m8, [srcq+strideq*2+ 8] - mova m29, m10 - mova m31, m10 - pshufb m30, m7, m20 - vpdpwssd m29, m12, m30 ; h0l - pshufb m30, m9, m20 - vpdpwssd m31, m12, m30 ; i0l - pshufb m7, m21 - vpdpwssd m29, m13, m7 ; h1l - pshufb m9, m21 - vpdpwssd m31, m13, m9 ; i1l - pshufb m7, m6, m20 - vpdpwssd m29, m14, m7 ; h2l - pshufb m9, m8, m20 - vpdpwssd m31, m14, m9 ; i2l - pshufb m6, m21 - vpdpwssd m29, m15, m6 ; h3l - pshufb m8, m21 - vpdpwssd m31, m15, m8 ; i3l - mova m30, m10 - vpdpwssd m30, m12, m7 ; h0h - movu m7, [srcq+strideq*1+16] - lea srcq, [srcq+strideq*2] - vpermt2b m29, m22, m31 ; 78l - mova m31, m10 - vpdpwssd m31, m12, m9 ; i0h - movu m9, [srcq+strideq*0+16] - vpdpwssd m30, m13, m6 ; h1h - pshufb m6, m7, m20 - vpdpwssd m31, m13, m8 ; i1h - pshufb m8, m9, m20 - vpdpwssd m30, m14, m6 ; h2h - mova m6, m11 - vpdpwssd m6, m16, m0 ; A0l - pshufb m7, m21 - vpdpwssd m31, m14, m8 ; i2h - mova m8, m11 - vpdpwssd m8, m16, m23 ; A0h - pshufb m9, m21 - vpdpwssd m30, m15, m7 ; h3h - mova m7, m11 - vpdpwssd m7, m16, m1 ; B0l - vpdpwssd m31, m15, m9 ; i3h - mova m9, m11 - vpdpwssd m9, m16, m24 ; B0h - mova m0, m2 - vpdpwssd m6, m17, m2 ; A1l - mova m23, m25 - vpdpwssd m8, m17, m25 ; A1h - mova m1, m3 - vpdpwssd m7, m17, m3 ; B1l - mova m24, m26 - vpdpwssd m9, m17, m26 ; B1h - vpermt2b m30, m22, m31 ; 78h - mova m31, [prep_endC] - vpdpwssd m6, m18, m4 ; A2l - mova m2, m4 - vpdpwssd m8, m18, m27 ; A2h - mova m25, m27 - vpdpwssd m7, m18, m5 ; B2l - mova m3, m5 - vpdpwssd m9, m18, m28 ; B2h - mova m26, m28 - vpshrdd m4, m5, m29, 16 ; 67l - vpdpwssd m6, m19, m4 ; A3l - vpshrdd m27, m28, m30, 16 ; 67h - vpdpwssd m8, m19, m27 ; A3h - mova m5, m29 - vpdpwssd m7, m19, m29 ; B3l - mova m28, m30 - vpdpwssd m9, m19, m30 ; B3h - vpermt2b m6, m31, m8 - vpermt2b m7, m31, m9 - mova [tmpq+wq*0], m6 - mova [tmpq+wq*2], m7 - lea tmpq, [tmpq+wq*4] - sub hd, 2 - jg .hv_w32_loop - add r7, 64 - add r8, 64 + add srcq, 32 + add tmpq, 32 movzx hd, r5b - mov srcq, r7 - mov tmpq, r8 sub r5d, 1<<8 - jg .hv_w32_loop0 + jg .hv_w16_loop0 +%if WIN64 + pop r8 +%endif RET %if WIN64 diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm index a447a8016138..deeb6dbc45b8 100644 --- a/third_party/dav1d/src/x86/mc_sse.asm +++ b/third_party/dav1d/src/x86/mc_sse.asm @@ -55,10 +55,12 @@ subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +subpel_h_shufD: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +subpel_h_shufE: db 2, 3, 
3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +subpel_h_shufF: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 -bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 rescale_mul: dd 0, 1, 2, 3 resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 @@ -238,7 +240,6 @@ BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16 %endrep %endmacro -%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep) %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put) %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep) @@ -277,9 +278,6 @@ BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 %endif %endmacro -HV_JMP_TABLE prep, 8tap, sse2, 1, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128 @@ -442,7 +440,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 0x00ff00ff - mova m4, [base+bilin_h_shuf8] + mova m4, [base+subpel_h_shufD] mova m0, [base+bilin_h_shuf4] add mxyd, 0x00100010 movd m5, mxyd @@ -900,56 +898,6 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy %endif RET -%macro PSHUFB_BILIN_H8 2 ; dst, src - %if cpuflag(ssse3) - pshufb %1, %2 - %else - psrldq %2, %1, 1 - punpcklbw %1, %2 - %endif -%endmacro - -%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp - %if cpuflag(ssse3) - pshufb %1, %2 - %else - psrldq %2, %1, 1 - punpckhbw %3, %1, %2 - punpcklbw %1, %2 - punpcklqdq %1, %3 - %endif -%endmacro - -%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero - %if cpuflag(ssse3) - pmaddubsw %1, %2 - %else - %if %5 == 1 - pxor %3, %3 - %endif - punpckhbw %4, %1, %3 - punpcklbw %1, %1, %3 - pmaddwd %4, %2 - pmaddwd %1, %2 - packssdw %1, %4 - %endif -%endmacro - -%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift - %if cpuflag(ssse3) - pmulhrsw %1, %2 - %else - punpckhwd %3, %1, %4 - punpcklwd %1, %4 - pmaddwd %3, %2 - pmaddwd %1, %2 - psrad %3, %5 - psrad %1, %5 - packssdw %1, %3 - %endif -%endmacro - -%macro PREP_BILIN 0 %if ARCH_X86_32 %define base r6-prep%+SUFFIX %else @@ -958,7 +906,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx - LEA r6, prep%+SUFFIX + LEA r6, prep_ssse3 tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd @@ -967,10 +915,6 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 test mxyd, mxyd jnz .v .prep: -%if notcpuflag(ssse3) - add r6, prep_ssse3 - prep_sse2 - jmp prep_ssse3 -%else movzx wd, word [r6+wq*2+table_offset(prep,)] pxor m4, m4 add wq, r6 @@ -1070,34 +1014,22 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 dec hd jg .prep_w32_vloop RET -%endif .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] -%if cpuflag(ssse3) imul mxyd, 0x00ff00ff - mova m4, [base+bilin_h_shuf8] + mova m4, [base+subpel_h_shufD] add mxyd, 0x00100010 -%else - imul mxyd, 
0xffff - add mxyd, 16 -%endif movd m5, mxyd mov mxyd, r6m ; my pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] -%if notcpuflag(ssse3) - WIN64_SPILL_XMM 8 - pxor m6, m6 -%endif add wq, r6 jmp wq .h_w4: -%if cpuflag(ssse3) mova m4, [base+bilin_h_shuf4] -%endif lea stride3q, [strideq*3] .h_w4_loop: movq m0, [srcq+strideq*0] @@ -1105,10 +1037,10 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movq m1, [srcq+strideq*2] movhps m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - PSHUFB_BILIN_H4 m0, m4, m2 - PMADDUBSW m0, m5, m6, m2, 0 - PSHUFB_BILIN_H4 m1, m4, m2 - PMADDUBSW m1, m5, m6, m2, 0 + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 mova [tmpq+0 ], m0 mova [tmpq+16], m1 add tmpq, 32 @@ -1123,14 +1055,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - PSHUFB_BILIN_H8 m0, m4 - PSHUFB_BILIN_H8 m1, m4 - PSHUFB_BILIN_H8 m2, m4 - PSHUFB_BILIN_H8 m3, m4 - PMADDUBSW m0, m5, m6, m7, 0 - PMADDUBSW m1, m5, m6, m7, 0 - PMADDUBSW m2, m5, m6, m7, 0 - PMADDUBSW m3, m5, m6, m7, 0 + REPX {pshufb x, m4}, m0, m1, m2, m3 + REPX {pmaddubsw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 @@ -1145,14 +1071,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movu m2, [srcq+strideq*1+8*0] movu m3, [srcq+strideq*1+8*1] lea srcq, [srcq+strideq*2] - PSHUFB_BILIN_H8 m0, m4 - PSHUFB_BILIN_H8 m1, m4 - PSHUFB_BILIN_H8 m2, m4 - PSHUFB_BILIN_H8 m3, m4 - PMADDUBSW m0, m5, m6, m7, 0 - PMADDUBSW m1, m5, m6, m7, 0 - PMADDUBSW m2, m5, m6, m7, 0 - PMADDUBSW m3, m5, m6, m7, 0 + REPX {pshufb x, m4}, m0, m1, m2, m3 + REPX {pmaddubsw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 @@ -1178,14 +1098,8 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movu m1, [srcq+r6+8*1] movu m2, [srcq+r6+8*2] movu m3, [srcq+r6+8*3] - PSHUFB_BILIN_H8 m0, m4 - PSHUFB_BILIN_H8 m1, m4 - PSHUFB_BILIN_H8 m2, m4 - PSHUFB_BILIN_H8 m3, m4 - PMADDUBSW m0, m5, m6, m7, 0 - PMADDUBSW m1, m5, m6, m7, 0 - PMADDUBSW m2, m5, m6, m7, 0 - PMADDUBSW m3, m5, m6, m7, 0 + REPX {pshufb x, m4}, m0, m1, m2, m3 + REPX {pmaddubsw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 @@ -1198,18 +1112,9 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .h_w32_vloop RET .v: -%if notcpuflag(ssse3) - WIN64_SPILL_XMM 8 -%endif movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] -%if cpuflag(ssse3) imul mxyd, 0x00ff00ff add mxyd, 0x00100010 -%else - imul mxyd, 0xffff - pxor m6, m6 - add mxyd, 16 -%endif add wq, r6 lea stride3q, [strideq*3] movd m5, mxyd @@ -1225,13 +1130,13 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 punpckldq m0, m1 punpckldq m1, m2 punpcklbw m0, m1 ; 01 12 - PMADDUBSW m0, m5, m6, m7, 0 + pmaddubsw m0, m5 mova [tmpq+16*0], m0 movd m0, [srcq+strideq*0] punpckldq m2, m3 punpckldq m3, m0 punpcklbw m2, m3 ; 23 34 - PMADDUBSW m2, m5, m6, m7, 0 + pmaddubsw m2, m5 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 4 @@ -1246,15 +1151,15 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 lea srcq, [srcq+strideq*4] punpcklbw m0, m1 ; 01 punpcklbw m1, m2 ; 12 - PMADDUBSW m0, m5, m6, m7, 0 - PMADDUBSW m1, m5, m6, m7, 0 + pmaddubsw m0, m5 + pmaddubsw m1, m5 mova [tmpq+16*0], m0 movq m0, [srcq+strideq*0] punpcklbw m2, m3 ; 23 punpcklbw m3, m0 ; 34 - 
PMADDUBSW m2, m5, m6, m7, 0 + pmaddubsw m2, m5 mova [tmpq+16*1], m1 - PMADDUBSW m3, m5, m6, m7, 0 + pmaddubsw m3, m5 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 @@ -1270,27 +1175,27 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 lea srcq, [srcq+strideq*4] punpcklbw m4, m0, m1 punpckhbw m0, m1 - PMADDUBSW m4, m5, m6, m7, 0 - PMADDUBSW m0, m5, m6, m7, 0 + pmaddubsw m4, m5 + pmaddubsw m0, m5 mova [tmpq+16*0], m4 punpcklbw m4, m1, m2 punpckhbw m1, m2 - PMADDUBSW m4, m5, m6, m7, 0 + pmaddubsw m4, m5 mova [tmpq+16*1], m0 movu m0, [srcq+strideq*0] - PMADDUBSW m1, m5, m6, m7, 0 + pmaddubsw m1, m5 mova [tmpq+16*2], m4 punpcklbw m4, m2, m3 punpckhbw m2, m3 - PMADDUBSW m4, m5, m6, m7, 0 + pmaddubsw m4, m5 mova [tmpq+16*3], m1 - PMADDUBSW m2, m5, m6, m7, 0 + pmaddubsw m2, m5 mova [tmpq+16*4], m4 punpcklbw m4, m3, m0 punpckhbw m3, m0 - PMADDUBSW m4, m5, m6, m7, 0 + pmaddubsw m4, m5 mova [tmpq+16*5], m2 - PMADDUBSW m3, m5, m6, m7, 0 + pmaddubsw m3, m5 mova [tmpq+16*6], m4 mova [tmpq+16*7], m3 add tmpq, 16*8 @@ -1325,29 +1230,29 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 lea srcq, [srcq+strideq*2] punpcklbw m4, m0, m2 punpckhbw m0, m2 - PMADDUBSW m4, m5, m6, m7, 0 - PMADDUBSW m0, m5, m6, m7, 0 + pmaddubsw m4, m5 + pmaddubsw m0, m5 mova [tmpq+16*0], m4 mova [tmpq+16*1], m0 movu m0, [srcq+strideq*0+16*0] punpcklbw m4, m1, m3 punpckhbw m1, m3 - PMADDUBSW m4, m5, m6, m7, 0 - PMADDUBSW m1, m5, m6, m7, 0 + pmaddubsw m4, m5 + pmaddubsw m1, m5 mova [tmpq+16*2], m4 mova [tmpq+16*3], m1 movu m1, [srcq+strideq*0+16*1] add tmpq, r6 punpcklbw m4, m2, m0 punpckhbw m2, m0 - PMADDUBSW m4, m5, m6, m7, 0 - PMADDUBSW m2, m5, m6, m7, 0 + pmaddubsw m4, m5 + pmaddubsw m2, m5 mova [tmpq+16*0], m4 mova [tmpq+16*1], m2 punpcklbw m4, m3, m1 punpckhbw m3, m1 - PMADDUBSW m4, m5, m6, m7, 0 - PMADDUBSW m3, m5, m6, m7, 0 + pmaddubsw m4, m5 + pmaddubsw m3, m5 mova [tmpq+16*2], m4 mova [tmpq+16*3], m3 add tmpq, r6 @@ -1374,51 +1279,36 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] -%if cpuflag(ssse3) imul mxyd, 0x08000800 WIN64_SPILL_XMM 8 -%else - or mxyd, 1<<16 - WIN64_SPILL_XMM 9 - %if ARCH_X86_64 - mova m8, [base+pw_8] - %else - %define m8 [base+pw_8] - %endif - pxor m7, m7 -%endif movd m6, mxyd add wq, r6 pshufd m6, m6, q0000 jmp wq .hv_w4: -%if cpuflag(ssse3) mova m4, [base+bilin_h_shuf4] movddup m0, [srcq+strideq*0] -%else - movhps m0, [srcq+strideq*0] -%endif lea r3, [strideq*3] - PSHUFB_BILIN_H4 m0, m4, m3 - PMADDUBSW m0, m5, m7, m4, 0 ; _ 0 + pshufb m0, m4 + pmaddubsw m0, m5 ; _ 0 .hv_w4_loop: movq m1, [srcq+strideq*1] movhps m1, [srcq+strideq*2] movq m2, [srcq+r3 ] lea srcq, [srcq+strideq*4] movhps m2, [srcq+strideq*0] - PSHUFB_BILIN_H4 m1, m4, m3 - PSHUFB_BILIN_H4 m2, m4, m3 - PMADDUBSW m1, m5, m7, m4, 0 ; 1 2 - PMADDUBSW m2, m5, m7, m4, 0 ; 3 4 + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 ; 1 2 + pmaddubsw m2, m5 ; 3 4 shufpd m0, m1, 0x01 ; 0 1 shufpd m3, m1, m2, 0x01 ; 2 3 psubw m1, m0 - PMULHRSW m1, m6, m4, m8, 4 + pmulhrsw m1, m6 paddw m1, m0 mova m0, m2 psubw m2, m3 - PMULHRSW m2, m6, m4, m8, 4 + pmulhrsw m2, m6 paddw m2, m3 mova [tmpq+16*0], m1 mova [tmpq+16*1], m2 @@ -1428,22 +1318,22 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 RET .hv_w8: movu m0, [srcq+strideq*0] - PSHUFB_BILIN_H8 m0, m4 - PMADDUBSW m0, 
m5, m7, m4, 0 ; 0 + pshufb m0, m4 + pmaddubsw m0, m5 ; 0 .hv_w8_loop: movu m1, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu m2, [srcq+strideq*0] - PSHUFB_BILIN_H8 m1, m4 - PSHUFB_BILIN_H8 m2, m4 - PMADDUBSW m1, m5, m7, m4, 0 ; 1 - PMADDUBSW m2, m5, m7, m4, 0 ; 2 + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 ; 1 + pmaddubsw m2, m5 ; 2 psubw m3, m1, m0 - PMULHRSW m3, m6, m4, m8, 4 + pmulhrsw m3, m6 paddw m3, m0 mova m0, m2 psubw m2, m1 - PMULHRSW m2, m6, m4, m8, 4 + pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+16*0], m3 mova [tmpq+16*1], m2 @@ -1467,9 +1357,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 xor r3d, r3d mov r5d, 32 .hv_w16_start: -%if ARCH_X86_64 || cpuflag(ssse3) mov r6, srcq -%endif %if ARCH_X86_64 %if WIN64 PUSH r7 @@ -1479,39 +1367,39 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .hv_w16_hloop: movu m0, [srcq+strideq*0+8*0] movu m1, [srcq+strideq*0+8*1] - PSHUFB_BILIN_H8 m0, m4 - PSHUFB_BILIN_H8 m1, m4 - PMADDUBSW m0, m5, m7, m4, 0 ; 0a - PMADDUBSW m1, m5, m7, m4, 0 ; 0b + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 ; 0a + pmaddubsw m1, m5 ; 0b .hv_w16_vloop: movu m2, [srcq+strideq*1+8*0] - PSHUFB_BILIN_H8 m2, m4 - PMADDUBSW m2, m5, m7, m4, 0 ; 1a + pshufb m2, m4 + pmaddubsw m2, m5 ; 1a psubw m3, m2, m0 - PMULHRSW m3, m6, m4, m8, 4 + pmulhrsw m3, m6 paddw m3, m0 mova [tmpq+16*0], m3 movu m3, [srcq+strideq*1+8*1] lea srcq, [srcq+strideq*2] - PSHUFB_BILIN_H8 m3, m4 - PMADDUBSW m3, m5, m7, m4, 0 ; 1b + pshufb m3, m4 + pmaddubsw m3, m5 ; 1b psubw m0, m3, m1 - PMULHRSW m0, m6, m4, m8, 4 + pmulhrsw m0, m6 paddw m0, m1 mova [tmpq+16*1], m0 add tmpq, r5 movu m0, [srcq+strideq*0+8*0] - PSHUFB_BILIN_H8 m0, m4 - PMADDUBSW m0, m5, m7, m4, 0 ; 2a + pshufb m0, m4 + pmaddubsw m0, m5 ; 2a psubw m1, m0, m2 - PMULHRSW m1, m6, m4, m8, 4 + pmulhrsw m1, m6 paddw m1, m2 mova [tmpq+16*0], m1 movu m1, [srcq+strideq*0+8*1] - PSHUFB_BILIN_H8 m1, m4 - PMADDUBSW m1, m5, m7, m4, 0 ; 2b + pshufb m1, m4 + pmaddubsw m1, m5 ; 2b psubw m2, m1, m3 - PMULHRSW m2, m6, m4, m8, 4 + pmulhrsw m2, m6 paddw m2, m3 mova [tmpq+16*1], m2 add tmpq, r5 @@ -1523,19 +1411,12 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 add r7, 2*16 mov srcq, r6 mov tmpq, r7 -%elif cpuflag(ssse3) +%else mov tmpq, tmpm add r6, 16 add tmpq, 2*16 mov srcq, r6 mov tmpm, tmpq -%else - mov srcq, srcm - mov tmpq, tmpm - add srcq, 16 - add tmpq, 2*16 - mov srcm, srcq - mov tmpm, tmpq %endif sub r3d, 1<<8 jg .hv_w16_hloop @@ -1543,14 +1424,13 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 POP r7 %endif RET -%endmacro ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro FN 4 ; prefix, type, type_h, type_v +%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 @@ -1558,8 +1438,8 @@ cglobal %1_%2_8bpc %else mov t1d, FILTER_%4 %endif -%ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) +%if %0 == 5 ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro @@ -1571,16 +1451,6 @@ DECLARE_REG_TMP 4, 5 DECLARE_REG_TMP 7, 8 %endif -FN put_8tap, sharp, SHARP, SHARP -FN put_8tap, sharp_smooth, SHARP, SMOOTH -FN put_8tap, smooth_sharp, SMOOTH, SHARP -FN put_8tap, smooth, SMOOTH, SMOOTH -FN put_8tap, sharp_regular, SHARP, REGULAR -FN put_8tap, regular_sharp, REGULAR, SHARP -FN put_8tap, 
smooth_regular, SMOOTH, REGULAR -FN put_8tap, regular_smooth, REGULAR, SMOOTH -FN put_8tap, regular, REGULAR, REGULAR - %if ARCH_X86_32 %define base_reg r1 %define base base_reg-put_ssse3 @@ -1589,6 +1459,755 @@ FN put_8tap, regular, REGULAR, REGULAR %define base 0 %endif +%define PUT_8TAP_FN FN put_8tap, +PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc +PUT_8TAP_FN regular, REGULAR, REGULAR + +cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ns + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h +%if ARCH_X86_64 + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v +%else + imul ssd, mym, 0x010101 + add ssd, t1d ; 8tap_v, my, 4tap_v + mov srcq, srcm +%endif + mov wd, wm + movifnidn hd, hm + LEA base_reg, put_ssse3 + test mxd, 0xf00 + jnz .h +%if ARCH_X86_32 + test ssd, 0xf00 +%else + test myd, 0xf00 +%endif + jnz .v +.put: + tzcnt wd, wd + movzx wd, word [base_reg+wq*2+table_offset(put,)] + movifnidn ssq, ssmp + add wq, base_reg + movifnidn dsq, dsmp +%if WIN64 + pop r8 +%endif + lea r6, [ssq*3] + jmp wq +.h: +%if ARCH_X86_32 + test ssd, 0xf00 +%else + test myd, 0xf00 +%endif + jnz .hv + movifnidn ssq, ssmp + mova m5, [base+pw_34] ; 2 + (8 << 2) + cmp wd, 4 + jle mangle(private_prefix %+ _put_8tap_8bpc %+ SUFFIX).h_w4 + WIN64_SPILL_XMM 11 +%if ARCH_X86_64 + mova m8, [base+subpel_h_shufD] + mova m9, [base+subpel_h_shufE] + mova m10, [base+subpel_h_shufF] +%endif + shr mxd, 16 + sub srcq, 2 + movq m7, [base_reg-put_ssse3+subpel_filters+1+mxq*8] + punpcklwd m7, m7 + pshufd m4, m7, q0000 + pshufd m6, m7, q1111 + pshufd m7, m7, q2222 + sub wd, 16 + jge .h_w16 +%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2] +%if ARCH_X86_32 + pshufb %2, %1, [base+subpel_h_shufD] + pshufb %3, %1, [base+subpel_h_shufE] + pshufb %1, [base+subpel_h_shufF] +%else + pshufb %2, %1, m8 + pshufb %3, %1, m9 + pshufb %1, m10 +%endif + pmaddubsw %2, m4 + pmaddubsw %3, m6 + pmaddubsw %1, m7 + paddw %2, m5 + paddw %2, %3 + paddw %1, %2 + psraw %1, 6 +%endmacro +%if ARCH_X86_32 + mov r4, dsm +%endif +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + PUT_6TAP_H m0, m2, m3 + PUT_6TAP_H m1, m2, m3 + packuswb m0, m1 +%if ARCH_X86_32 + movq [dstq+r4*0], m0 + movhps [dstq+r4*1], m0 + lea dstq, [dstq+r4*2] +%else + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] +%endif + sub hd, 2 + jg .h_w8 + RET +.h_w16: + add srcq, wq + add dstq, wq + neg wq +.h_w16_loop_v: + mov r6, wq +.h_w16_loop_h: + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + PUT_6TAP_H m0, m2, m3 + PUT_6TAP_H m1, m2, m3 + packuswb m0, m1 + mova [dstq+r6], m0 + add r6, 16 + jle .h_w16_loop_h + add srcq, ssq + add dstq, dsmp + dec hd + jg .h_w16_loop_v + RET +.v: +%if ARCH_X86_32 + %define dsq r4 + %define m8 [base+pw_512] + movzx mxd, ssb + shr ssd, 16 + cmp hd, 6 + cmovs ssd, mxd + movq m7, [base_reg-put_ssse3+subpel_filters+1+ssq*8] + mov ssq, ssm + punpcklwd m7, m7 + pshufd m5, m7, q0000 + mov r6, ssq + pshufd m6, m7, q1111 + neg r6 + pshufd m7, m7, q2222 + cmp wd, 4 + jge .v_w4 +%else + WIN64_SPILL_XMM 9, 12 + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m7, [base_reg-put_ssse3+subpel_filters+1+myq*8] + mova m8, [base+pw_512] + punpcklwd m7, m7 + pshufd m5, m7, q0000 + mov nsq, ssq + pshufd m6, m7, q1111 + neg nsq + pshufd m7, m7, q2222 + cmp wd, 4 + je .v_w4 + jg .v_w8 +%endif +.v_w2: +%if ARCH_X86_32 + mov dsq, dsm + movd m1, [srcq+r6 *2] + 
movd m3, [srcq+r6 *1] +%else + movd m1, [srcq+nsq*2] + movd m3, [srcq+nsq*1] +%endif + movd m2, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m0, [srcq+ssq*0] + punpcklwd m1, m3 ; 0 1 + punpcklwd m3, m2 ; 1 2 + punpcklwd m2, m4 ; 2 3 + punpcklwd m4, m0 ; 3 4 + punpcklbw m1, m3 ; 01 12 + punpcklbw m2, m4 ; 23 34 +.v_w2_loop: + movd m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m4, m1, m5 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m6 ; a1 b1 + paddw m4, m2 + punpcklwd m2, m0, m3 ; 4 5 + movd m0, [srcq+ssq*0] + punpcklwd m3, m0 ; 5 6 + punpcklbw m2, m3 ; 67 78 + pmaddubsw m3, m2, m7 ; a2 b2 + paddw m4, m3 + pmulhrsw m4, m8 + packuswb m4, m4 + movd r6d, m4 + mov [dstq+dsq*0], r6w + shr r6d, 16 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: +%if ARCH_X86_32 + shl wd, 14 + lea srcq, [srcq+r6*2] + lea r6d, [hq+wq-(1<<16)] + mov srcm, srcq + mov dsq, dsm +.v_w4_loop0: + movd m1, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +%else + movd m1, [srcq+nsq*2] + movd m3, [srcq+nsq*1] +%endif + movd m2, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m0, [srcq+ssq*0] + punpckldq m1, m3 ; 0 1 + punpckldq m3, m2 ; 1 2 + punpckldq m2, m4 ; 2 3 + punpckldq m4, m0 ; 3 4 + punpcklbw m1, m3 ; 01 12 + punpcklbw m2, m4 ; 23 34 +.v_w4_loop: + movd m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m4, m1, m5 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m6 ; a1 b1 + paddw m4, m2 + punpckldq m2, m0, m3 ; 4 5 + movd m0, [srcq+ssq*0] + punpckldq m3, m0 ; 5 6 + punpcklbw m2, m3 ; 67 78 + pmaddubsw m3, m2, m7 ; a2 b2 + paddw m4, m3 + pmulhrsw m4, m8 + packuswb m4, m4 + movd [dstq+dsq*0], m4 + psrlq m4, 32 + movd [dstq+dsq*1], m4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop +%if ARCH_X86_32 + mov srcq, srcm + mov dstq, dstm + movzx hd, r6w + add srcq, 4 + add dstq, 4 + mov srcm, srcq + mov dstm, dstq + sub r6d, 1<<16 + jg .v_w4_loop0 +%endif + RET +%if ARCH_X86_64 +.v_w8: + WIN64_PUSH_XMM 12 + shl wd, 5 + lea r6d, [hq+wq-256] +.v_w8_loop0: + movq m1, [srcq+nsq*2] + movq m2, [srcq+nsq*1] + lea r4, [srcq+ssq*2] + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] + mov r7, dstq + movq m0, [r4 +ssq*0] + punpcklbw m1, m2 ; 01 + punpcklbw m2, m3 ; 12 + punpcklbw m3, m4 ; 23 + punpcklbw m4, m0 ; 34 +.v_w8_loop: + pmaddubsw m10, m1, m5 ; a0 + mova m1, m3 + pmaddubsw m11, m2, m5 ; b0 + mova m2, m4 + pmaddubsw m3, m6 ; a1 + pmaddubsw m4, m6 ; b1 + paddw m10, m3 + paddw m11, m4 + movq m4, [r4+ssq*1] + lea r4, [r4+ssq*2] + punpcklbw m3, m0, m4 ; 67 + movq m0, [r4+ssq*0] + punpcklbw m4, m0 ; 78 + pmaddubsw m9, m3, m7 ; a2 + paddw m10, m9 + pmaddubsw m9, m4, m7 ; b2 + paddw m11, m9 + pmulhrsw m10, m8 + pmulhrsw m11, m8 + packuswb m10, m11 + movq [r7+dsq*0], m10 + movhps [r7+dsq*1], m10 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .v_w8_loop + add srcq, 8 + add dstq, 8 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_w8_loop0 + RET +%endif ;ARCH_X86_64 +.hv: + RESET_STACK_STATE + cmp wd, 4 + jg .hv_w8 +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + movd m1, [base_reg-put_ssse3+subpel_filters+2+mxq*8] +%if ARCH_X86_32 + movzx mxd, ssb + shr ssd, 16 + cmp hd, 6 + cmovs ssd, mxd + movq m0, [base_reg-put_ssse3+subpel_filters+1+ssq*8] + mov ssq, ssmp + ALLOC_STACK -mmsize*4 + %define m8 [rsp+mmsize*0] + %define m9 [rsp+mmsize*1] + %define m10 [rsp+mmsize*2] + punpcklbw m0, m0 + sub srcq, ssq + psraw m0, 8 ; sign-extend + sub srcq, ssq + pshufd m2, m0, q0000 + mova m8, m2 + pshufd m2, m0, q1111 + mova m9, m2 + pshufd m2, 
m0, q2222 + mova m10, m2 +%else + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg-put_ssse3+subpel_filters+1+myq*8] + WIN64_SPILL_XMM 11, 14 + mov nsq, ssq + punpcklbw m0, m0 + neg nsq + psraw m0, 8 ; sign-extend + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 +%endif + cmp wd, 4 + je .hv_w4 +.hv_w2: + mova m5, [base+subpel_h_shuf4] + mova m6, [base+pw_34] + pshufd m7, m1, q0000 +%if ARCH_X86_32 + movq m2, [srcq+ssq*0] + movhps m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov dsq, [rstk+stack_offset+gprsize*2] +%else + movq m2, [srcq+nsq*2] + movhps m2, [srcq+nsq*1] ; 0 1 +%endif + movq m1, [srcq+ssq*0] + movhps m1, [srcq+ssq*1] ; 2 3 + lea srcq, [srcq+ssq*2] + movq m0, [srcq+ssq*0] ; 4 + REPX {pshufb x, m5}, m2, m1, m0 + REPX {pmaddubsw x, m7}, m2, m1, m0 + phaddw m2, m1 + phaddw m0, m0 + paddw m2, m6 + paddw m0, m6 + psraw m2, 2 ; 0 1 2 3 + psraw m0, 2 + palignr m0, m2, 4 ; 1 2 3 4 + punpcklwd m1, m2, m0 ; 01 12 + punpckhwd m2, m0 ; 23 34 +.hv_w2_loop: + movq m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps m3, [srcq+ssq*0] ; 5 6 + pshufb m3, m5 + pmaddubsw m3, m7 + pmaddwd m4, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + phaddw m3, m3 + paddw m3, m6 + psraw m3, 2 + paddd m4, m2 + palignr m2, m3, m0, 12 ; 4 5 + mova m0, m3 + punpcklwd m2, m3 ; 45 56 + pmaddwd m3, m10, m2 ; a2 b2 + paddd m4, m3 + psrad m4, 10 + packssdw m4, m5 + packuswb m4, m4 + movd r6d, m4 + mov [dstq+dsq*0], r6w + shr r6d, 16 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: +%if ARCH_X86_32 + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov dsq, [rstk+stack_offset+gprsize*2] + %define m11 [base+pw_34] + %define m12 [base+subpel_h_shufA] + %define m13 [rsp+mmsize*3] + pshufd m1, m1, q0000 + mova m13, m1 +%else + WIN64_PUSH_XMM 14 + movq m3, [srcq+nsq*2] + movq m4, [srcq+nsq*1] + pshufd m13, m1, q0000 + mova m12, [base+subpel_h_shufA] + mova m11, [base+pw_34] +%endif + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m2, [srcq+ssq*0] +%if ARCH_X86_32 + mova m5, m12 + mova m6, m13 + REPX {pshufb x, m5 }, m3, m4, m0, m1, m2 + mova m5, m11 + REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2 +%else + REPX {pshufb x, m12}, m3, m4, m0, m1, m2 + REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2 +%endif + phaddw m3, m0 ; 0 2 + phaddw m4, m1 ; 1 3 + phaddw m0, m2 ; 2 4 +%if ARCH_X86_32 + REPX {paddw x, m5 }, m3, m4, m0 +%else + REPX {paddw x, m11}, m3, m4, m0 +%endif + REPX {psraw x, 2 }, m3, m4, m0 + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w4_loop: + movq m7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m6, [srcq+ssq*0] + pshufb m7, m12 + pshufb m6, m12 + pmaddubsw m7, m13 + pmaddubsw m6, m13 + pmaddwd m5, m8, m1 ; a0 + mova m1, m3 + phaddw m7, m6 ; 5 6 + pmaddwd m6, m8, m2 ; b0 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddw m7, m11 + psraw m7, 2 + paddd m5, m3 + paddd m6, m4 + shufpd m4, m0, m7, 0x01 ; 4 5 + mova m0, m7 + punpcklwd m3, m4, m7 ; 45 + punpckhwd m4, m7 ; 56 + pmaddwd m7, m10, m3 ; a2 + paddd m5, m7 + pmaddwd m7, m10, m4 ; b2 + paddd m6, m7 + psrad m5, 10 + psrad m6, 10 + packssdw m5, m6 + packuswb m5, m5 + movd [dstq+dsq*0], m5 + psrlq m5, 32 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + RESET_STACK_STATE + shr mxd, 16 + sub srcq, 2 +%if ARCH_X86_32 + movq m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8] + movzx mxd, ssb + shr 
ssd, 16 + cmp hd, 6 + cmovs ssd, mxd + movq m1, [base_reg-put_ssse3+subpel_filters+1+ssq*8] + shl wd, 13 + mov ssq, ssm + lea r6d, [hq+wq-(1<<16)] +%assign regs_used 5 + ALLOC_STACK -mmsize*16 +%assign regs_used 7 + mov dsq, [rstk+stack_offset+gprsize*2] + sub srcq, ssq + sub srcq, ssq +%if STACK_ALIGNMENT < 16 + %define srcm [esp+mmsize*15+gprsize*0] + %define dstm [esp+mmsize*15+gprsize*1] + mov dstm, dstq +%endif + mov srcm, srcq +%else + ALLOC_STACK 16*6, 16 + movq m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m1, [base_reg-put_ssse3+subpel_filters+1+myq*8] + mov nsq, ssq + shl wd, 13 + neg nsq + lea r6d, [hq+wq-(1<<16)] +%endif + mova m7, [base+pw_34] + punpcklwd m0, m0 + punpcklbw m1, m1 + psraw m1, 8 ; sign-extend + pshufd m2, m0, q0000 + mova [rsp+16*0], m2 + pshufd m2, m0, q1111 + mova [rsp+16*1], m2 + pshufd m0, m0, q2222 + mova [rsp+16*2], m0 + pshufd m2, m1, q0000 + mova [rsp+16*3], m2 + pshufd m2, m1, q1111 + mova [rsp+16*4], m2 + pshufd m1, m1, q2222 + mova [rsp+16*5], m1 +%macro HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \ + [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3] + pshufb %2, %1, %4 + pshufb %1, %5 + pmaddubsw %3, %2, %6 + shufps %2, %1, q2121 + pmaddubsw %1, %8 + pmaddubsw %2, %7 + paddw %3, m7 + paddw %1, %3 + paddw %1, %2 + psraw %1, 2 +%endmacro +.hv_w8_loop0: + mova m2, [base+subpel_h_shufD] + mova m3, [base+subpel_h_shufF] + mova m4, [rsp+16*0] +%if ARCH_X86_32 + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + HV_H_6TAP m0, m5, m6, m2, m3, m4 + HV_H_6TAP m1, m5, m6, m2, m3, m4 + movu m5, [srcq+ssq*0] + punpcklwd m6, m0, m1 ; 01 + punpckhwd m0, m1 + mova [rsp+16* 6], m6 + mova [rsp+16* 7], m0 + HV_H_6TAP m5, m0, m6, m2, m3, m4 + movu m0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m6, m1, m5 ; 12 + punpckhwd m1, m5 + mova [rsp+16* 8], m6 + mova [rsp+16* 9], m1 + HV_H_6TAP m0, m1, m6, m2, m3, m4 + movu m1, [srcq+ssq*0] + punpcklwd m6, m5, m0 ; 23 + punpckhwd m5, m0 + mova [rsp+16*10], m6 + mova [rsp+16*11], m5 + HV_H_6TAP m1, m5, m6, m2, m3, m4 + mova [rsp+16*14], m1 + punpcklwd m6, m0, m1 ; 34 + punpckhwd m0, m1 + mova [rsp+16*12], m6 + mova [rsp+16*13], m0 +.hv_w8_loop: + mova m3, [rsp+16* 3] + pmaddwd m0, m3, [rsp+16* 6] ; a0 + pmaddwd m2, m3, [rsp+16* 7] ; a0' + pmaddwd m1, m3, [rsp+16* 8] ; b0 + pmaddwd m3, [rsp+16* 9] ; b0' + mova m6, [rsp+16* 4] + mova m4, [rsp+16*10] + mova m5, [rsp+16*11] + mova [rsp+16* 6], m4 + pmaddwd m4, m6 ; a1 + mova [rsp+16* 7], m5 + pmaddwd m5, m6 ; a1' + paddd m0, m4 + mova m4, [rsp+16*12] + paddd m2, m5 + mova m5, [rsp+16*13] + mova [rsp+16* 8], m4 + pmaddwd m4, m6 ; b1 + mova [rsp+16* 9], m5 + pmaddwd m5, m6 ; b1' + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + paddd m1, m4 + paddd m3, m5 + HV_H_6TAP m6, m4, m5 + mova m5, [rsp+16*14] + punpcklwd m4, m5, m6 ; 45 + punpckhwd m5, m6 + mova [rsp+16*10], m4 + mova [rsp+16*11], m5 + pmaddwd m4, [rsp+16*5] ; a2 + pmaddwd m5, [rsp+16*5] ; a2' + paddd m0, m4 + movu m4, [srcq+ssq*0] + paddd m2, m5 + psrad m0, 10 + psrad m2, 10 + packssdw m0, m2 + HV_H_6TAP m4, m2, m5 + mova m2, [rsp+16*5] + punpcklwd m5, m6, m4 ; 56 + mova [rsp+16*14], m4 + punpckhwd m6, m4 + mova [rsp+16*12], m5 + pmaddwd m5, m2 ; b2 + mova [rsp+16*13], m6 + pmaddwd m6, m2 ; b2' + paddd m1, m5 + paddd m3, m6 + psrad m1, 10 + psrad m3, 10 + packssdw m1, m3 + packuswb m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg 
.hv_w8_loop + mov srcq, srcm + mov dstq, dstm + movzx hd, r6w + add srcq, 8 + add dstq, 8 + mov srcm, srcq + mov dstm, dstq +%else + movu m9, [srcq+nsq*2] + movu m11, [srcq+nsq*1] + lea r4, [srcq+ssq*2] + movu m13, [srcq+ssq*0] + movu m15, [srcq+ssq*1] + mov r7, dstq + movu m6, [r4 +ssq*0] + mova m5, [rsp+16*1] + mova m8, [rsp+16*2] + HV_H_6TAP m9, m0, m1, m2, m3, m4, m5, m8 + HV_H_6TAP m11, m0, m1, m2, m3, m4, m5, m8 + HV_H_6TAP m13, m0, m1, m2, m3, m4, m5, m8 + HV_H_6TAP m15, m0, m1, m2, m3, m4, m5, m8 + HV_H_6TAP m6, m0, m1, m2, m3, m4, m5, m8 + punpcklwd m8, m9, m11 ; 01 + punpckhwd m9, m11 + punpcklwd m10, m11, m13 ; 12 + punpckhwd m11, m13 + punpcklwd m12, m13, m15 ; 23 + punpckhwd m13, m15 + punpcklwd m14, m15, m6 ; 34 + punpckhwd m15, m6 +.hv_w8_loop: + mova m3, [rsp+16*3] + mova m4, [rsp+16*4] + pmaddwd m0, m8, m3 ; a0 + mova m8, m12 + pmaddwd m2, m9, m3 ; a0' + mova m9, m13 + pmaddwd m1, m10, m3 ; b0 + mova m10, m14 + pmaddwd m3, m11 ; b0' + mova m11, m15 + REPX {pmaddwd x, m4}, m12, m13, m14, m15 + paddd m0, m12 + paddd m2, m13 + paddd m1, m14 + paddd m3, m15 + movu m15, [r4+ssq*1] + lea r4, [r4+ssq*2] + HV_H_6TAP m15, m4, m5 + punpcklwd m12, m6, m15 + punpckhwd m13, m6, m15 + movu m6, [r4+ssq*0] + HV_H_6TAP m6, m4, m5 + mova m4, [rsp+16*5] + punpcklwd m14, m15, m6 + punpckhwd m15, m6 + pmaddwd m5, m12, m4 ; a2 + paddd m0, m5 + pmaddwd m5, m13, m4 ; a2' + paddd m2, m5 + pmaddwd m5, m14, m4 ; b2 + paddd m1, m5 + pmaddwd m4, m15 ; b2' + paddd m3, m4 + REPX {psrad x, 10}, m0, m2, m1, m3 + packssdw m0, m2 + packssdw m1, m3 + packuswb m0, m1 + movq [r7+dsq*0], m0 + movhps [r7+dsq*1], m0 + lea r7, [r7+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add srcq, 8 + add dstq, 8 + movzx hd, r6b +%endif + sub r6d, 1<<16 + jg .hv_w8_loop0 + RET + +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_8bpc +PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc +PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_8bpc +PUT_8TAP_FN sharp, SHARP, SHARP + cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h @@ -1613,60 +2232,23 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jnz .v tzcnt wd, wd movzx wd, word [base_reg+wq*2+table_offset(put,)] - add wq, base_reg -; put_bilin mangling jump - movifnidn dsq, dsmp movifnidn ssq, ssmp + add wq, base_reg + movifnidn dsq, dsmp %if WIN64 pop r8 %endif lea r6, [ssq*3] jmp wq -.h: -%if ARCH_X86_32 - test ssd, 0xf00 -%else - test myd, 0xf00 -%endif - jnz .hv - movifnidn ssq, ssmp - WIN64_SPILL_XMM 12 - cmp wd, 4 - jl .h_w2 - je .h_w4 - tzcnt wd, wd -%if ARCH_X86_64 - mova m10, [base+subpel_h_shufA] - mova m11, [base+subpel_h_shufB] - mova m9, [base+subpel_h_shufC] -%endif - shr mxd, 16 - sub srcq, 3 - movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)] - movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3] - mova m7, [base+pw_34] ; 2 + (8 << 2) - pshufd m5, m6, q0000 - pshufd m6, m6, q1111 - add wq, base_reg - jmp wq .h_w2: -%if ARCH_X86_32 - and mxd, 0x7f -%else - movzx mxd, mxb -%endif - dec srcq - mova m4, [base+subpel_h_shuf4] - movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] - mova m5, [base+pw_34] ; 2 + (8 << 2) - pshufd m3, m3, q0000 + mova m3, [base+subpel_h_shuf4] movifnidn dsq, dsmp .h_w2_loop: movq m0, [srcq+ssq*0] movhps m0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - pshufb m0, m4 - pmaddubsw m0, m3 + pshufb m0, m3 + pmaddubsw m0, m4 phaddw m0, m0 paddw m0, m5 ; pw34 psraw m0, 6 @@ -1685,20 +2267,21 @@ 
cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %else movzx mxd, mxb %endif + movd m4, [base_reg+mxq*8+subpel_filters-put_ssse3+2] dec srcq - movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] - mova m6, [base+subpel_h_shufA] - mova m5, [base+pw_34] ; 2 + (8 << 2) - pshufd m3, m3, q0000 + pshufd m4, m4, q0000 + cmp wd, 4 + jl .h_w2 + mova m3, [base+subpel_h_shufA] movifnidn dsq, dsmp .h_w4_loop: movq m0, [srcq+ssq*0] ; 1 movq m1, [srcq+ssq*1] ; 2 lea srcq, [srcq+ssq*2] - pshufb m0, m6 ; subpel_h_shufA - pshufb m1, m6 ; subpel_h_shufA - pmaddubsw m0, m3 ; subpel_filters - pmaddubsw m1, m3 ; subpel_filters + pshufb m0, m3 ; subpel_h_shufA + pshufb m1, m3 ; subpel_h_shufA + pmaddubsw m0, m4 ; subpel_filters + pmaddubsw m1, m4 ; subpel_filters phaddw m0, m1 paddw m0, m5 ; pw34 psraw m0, 6 @@ -1710,6 +2293,30 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 sub hd, 2 jg .h_w4_loop RET +.h: +%if ARCH_X86_32 + test ssd, 0xf00 +%else + test myd, 0xf00 +%endif + jnz .hv + movifnidn ssq, ssmp + mova m5, [base+pw_34] ; 2 + (8 << 2) + cmp wd, 4 + jle .h_w4 + WIN64_SPILL_XMM 12 +%if ARCH_X86_64 + mova m10, [base+subpel_h_shufA] + mova m11, [base+subpel_h_shufB] + mova m9, [base+subpel_h_shufC] +%endif + shr mxd, 16 + sub srcq, 3 + movq m7, [base_reg+mxq*8+subpel_filters-put_ssse3] + pshufd m6, m7, q0000 + pshufd m7, m7, q1111 + sub wd, 16 + jge .h_w16 %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] %if ARCH_X86_32 pshufb %2, %1, [base+subpel_h_shufB] @@ -1720,14 +2327,14 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pshufb %3, %1, m9 ; subpel_h_shufC pshufb %1, m10 ; subpel_h_shufA %endif - pmaddubsw %4, %2, m5 ; subpel +0 B0 - pmaddubsw %2, m6 ; subpel +4 B4 - pmaddubsw %3, m6 ; C4 - pmaddubsw %1, m5 ; A0 + pmaddubsw %4, %2, m6 ; subpel +0 B0 + pmaddubsw %2, m7 ; subpel +4 B4 + pmaddubsw %3, m7 ; C4 + pmaddubsw %1, m6 ; A0 paddw %3, %4 ; C4+B0 paddw %1, %2 ; A0+B4 phaddw %1, %3 - paddw %1, m7 ; pw34 + paddw %1, m5 ; pw34 psraw %1, 6 %endmacro .h_w8: @@ -1750,22 +2357,12 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 sub hd, 2 jg .h_w8 RET -.h_w128: - mov r4, -16*7 - jmp .h_w16_start -.h_w64: - mov r4, -16*3 - jmp .h_w16_start -.h_w32: - mov r4, -16*1 - jmp .h_w16_start .h_w16: - xor r4d, r4d -.h_w16_start: - sub srcq, r4 - sub dstq, r4 + add srcq, wq + add dstq, wq + neg wq .h_w16_loop_v: - mov r6, r4 + mov r6, wq .h_w16_loop_h: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] @@ -1795,11 +2392,8 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] %endif - tzcnt r6d, wd - movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)] punpcklwd m0, m0 mova m7, [base+pw_512] - add r6, base_reg %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] %define subpel1 [rsp+mmsize*1] @@ -1821,6 +2415,8 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 sub srcq, ssq mov ssq, [rstk+stack_offset+gprsize*4] mov dsq, [rstk+stack_offset+gprsize*2] + cmp wd, 2 + jne .v_w4 %else %define subpel0 m8 %define subpel1 m9 @@ -1832,8 +2428,10 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 + cmp wd, 4 + je .v_w4 + jg .v_w8 %endif - jmp r6 .v_w2: movd m1, [srcq+ssq*0] movd m0, [srcq+ssq*1] @@ -1891,11 +2489,6 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 RET .v_w4: %if ARCH_X86_32 -.v_w8: -.v_w16: -.v_w32: -.v_w64: -.v_w128: shl wd, 14 %if 
STACK_ALIGNMENT < 16 %define dstm [rsp+mmsize*4+gprsize] @@ -1970,24 +2563,19 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 RET %if ARCH_X86_64 .v_w8: -.v_w16: -.v_w32: -.v_w64: -.v_w128: - lea r6d, [wq*8-64] - mov r4, srcq - mov r7, dstq - lea r6d, [hq+r6*4] + shl wd, 5 + lea r6d, [hq+wq-256] .v_w8_loop0: movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] + lea r4, [srcq+ss3q] movq m3, [srcq+ssq*2] - add srcq, ss3q - movq m4, [srcq+ssq*0] - movq m5, [srcq+ssq*1] - movq m6, [srcq+ssq*2] - add srcq, ss3q - movq m0, [srcq+ssq*0] + movq m4, [r4 +ssq*0] + mov r7, dstq + movq m5, [r4 +ssq*1] + movq m6, [r4 +ssq*2] + add r4, ss3q + movq m0, [r4 +ssq*0] punpcklbw m1, m2 ; 01 punpcklbw m2, m3 ; 12 punpcklbw m3, m4 ; 23 @@ -1995,8 +2583,8 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 punpcklbw m5, m6 ; 45 punpcklbw m6, m0 ; 56 .v_w8_loop: - movq m13, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] + movq m13, [r4+ssq*1] + lea r4, [r4+ssq*2] pmaddubsw m14, m1, subpel0 ; a0 mova m1, m3 pmaddubsw m15, m2, subpel0 ; b0 @@ -2004,7 +2592,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pmaddubsw m3, subpel1 ; a1 mova m12, m0 pmaddubsw m4, subpel1 ; b1 - movq m0, [srcq+ssq*0] + movq m0, [r4+ssq*0] paddw m14, m3 paddw m15, m4 mova m3, m5 @@ -2024,16 +2612,14 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 pmulhrsw m14, m7 pmulhrsw m15, m7 packuswb m14, m15 - movq [dstq+dsq*0], m14 - movhps [dstq+dsq*1], m14 - lea dstq, [dstq+dsq*2] + movq [r7+dsq*0], m14 + movhps [r7+dsq*1], m14 + lea r7, [r7+dsq*2] sub hd, 2 jg .v_w8_loop - add r4, 8 - add r7, 8 + add srcq, 8 + add dstq, 8 movzx hd, r6b - mov srcq, r4 - mov dstq, r7 sub r6d, 1<<8 jg .v_w8_loop0 RET @@ -2625,193 +3211,6 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .hv_w8_loop0 RET -%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask - %if cpuflag(ssse3) - pshufb %1, %2 - %else - %if %5 == 1 - pcmpeqd %2, %2 - psrlq %2, 32 - %endif - psrldq %3, %1, 1 - pshufd %3, %3, q2301 - pand %1, %2 - pandn %4, %2, %3 - por %1, %4 - %endif -%endmacro - -%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask - %ifnidn %1, %2 - mova %1, %2 - %endif - PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 -%endmacro - -%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask - %if notcpuflag(ssse3) - psrlq %1, %2, 16 - %elifnidn %1, %2 - mova %1, %2 - %endif - PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 -%endmacro - -%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp] - %if cpuflag(ssse3) - palignr %1, %2, %3, %4 - %else - %if %0 == 4 - %assign %%i regnumof%+%1 + 1 - %define %%tmp m %+ %%i - %else - %define %%tmp %5 - %endif - psrldq %1, %3, %4 - pslldq %%tmp, %2, 16-%4 - por %1, %%tmp - %endif -%endmacro - -%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1 - %if cpuflag(ssse3) - phaddw %1, %2 - %elifnidn %1, %2 - %if %4 == 1 - mova %3, [base+pw_1] - %endif - pmaddwd %1, %3 - pmaddwd %2, %3 - packssdw %1, %2 - %else - %if %4 == 1 - pmaddwd %1, [base+pw_1] - %else - pmaddwd %1, %3 - %endif - packssdw %1, %1 - %endif -%endmacro - -%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift - %if cpuflag(ssse3) - pmulhrsw %1, %2, %3 - %else - paddw %1, %2, %3 - psraw %1, %4 - %endif -%endmacro - -%macro PMULHRSW_8192 3 ; dst, src1, src2 - PMULHRSW_POW2 %1, %2, %3, 2 -%endmacro - -%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2] - movd %1, [%2+0] - movd %3, [%2+1] - movd %4, [%2+2] - movd %5, [%2+3] - punpckldq %1, %3 - punpckldq %4, %5 - 
punpcklqdq %1, %4 -%endmacro - -%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc - %if cpuflag(ssse3) - movu m%1, [%2] - pshufb m2, m%1, m11 ; subpel_h_shufB - pshufb m3, m%1, m9 ; subpel_h_shufC - pshufb m%1, m10 ; subpel_h_shufA - %else - %if ARCH_X86_64 - SWAP m12, m5 - SWAP m13, m6 - SWAP m14, m7 - %define %%mx0 m%+%%i - %define %%mx1 m%+%%j - %assign %%i 0 - %rep 12 - movd %%mx0, [%2+%%i] - %assign %%i %%i+1 - %endrep - %assign %%i 0 - %rep 6 - %assign %%j %%i+1 - punpckldq %%mx0, %%mx1 - %assign %%i %%i+2 - %endrep - %assign %%i 0 - %rep 3 - %assign %%j %%i+2 - punpcklqdq %%mx0, %%mx1 - %assign %%i %%i+4 - %endrep - SWAP m%1, m0 - SWAP m2, m4 - SWAP m3, m8 - SWAP m5, m12 - SWAP m6, m13 - SWAP m7, m14 - %else - PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7 - PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7 - PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7 - SWAP m%1, m0 - %endif - %endif -%endmacro - -%macro PREP_8TAP_H 2 ; dst, src_memloc - PREP_8TAP_H_LOAD %1, %2 - %if ARCH_X86_64 && notcpuflag(ssse3) - SWAP m8, m1 - SWAP m9, m7 - %endif - %xdefine mX m%+%1 - %assign %%i regnumof%+mX - %define mX m%+%%i - mova m4, m2 - PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0 - PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4 - PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4 - PMADDUBSW mX, m5, m1, m7, 0 ; subpel +0 A0 - %undef mX - %if ARCH_X86_64 && notcpuflag(ssse3) - SWAP m1, m8 - SWAP m7, m9 - %endif - paddw m3, m4 - paddw m%1, m2 - PHADDW m%1, m3, m15, ARCH_X86_32 - %if ARCH_X86_64 || cpuflag(ssse3) - PMULHRSW_8192 m%1, m%1, m7 - %else - PMULHRSW_8192 m%1, m%1, [base+pw_2] - %endif -%endmacro - -%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2] - %if cpuflag(ssse3) - movu %1, [%2] - pshufb m2, %1, shufB - pshufb m3, %1, shufC - pshufb %1, shufA - %else - PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4 - PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4 - PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4 - %endif - mova m1, m2 - PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0 - PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4 - PMADDUBSW m2, subpelh1, %3, %4, 0 ; C4 - PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0 - paddw m1, m3 ; C0+B4 - paddw %1, m2 ; A0+C4 - PHADDW %1, m1, %3, 1 -%endmacro - -%macro PREP_8TAP 0 %if ARCH_X86_32 DECLARE_REG_TMP 1, 2 %elif WIN64 @@ -2820,23 +3219,629 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 DECLARE_REG_TMP 6, 7 %endif -FN prep_8tap, sharp, SHARP, SHARP -FN prep_8tap, sharp_smooth, SHARP, SMOOTH -FN prep_8tap, smooth_sharp, SMOOTH, SHARP -FN prep_8tap, smooth, SMOOTH, SMOOTH -FN prep_8tap, sharp_regular, SHARP, REGULAR -FN prep_8tap, regular_sharp, REGULAR, SHARP -FN prep_8tap, smooth_regular, SMOOTH, REGULAR -FN prep_8tap, regular_smooth, REGULAR, SMOOTH -FN prep_8tap, regular, REGULAR, REGULAR - %if ARCH_X86_32 %define base_reg r2 - %define base base_reg-prep%+SUFFIX + %define base base_reg-prep_ssse3 %else %define base_reg r7 %define base 0 %endif + +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, mx, my, ns + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + mov wd, wm + movifnidn srcd, srcm + movifnidn hd, hm + LEA base_reg, prep_ssse3 + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v +.prep: + tzcnt wd, wd + movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2] + 
pxor m4, m4 + add wq, base_reg + movifnidn ssq, ssmp + lea r6, [ssq*3] +%if WIN64 + pop r8 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + test myd, 0xf00 + jnz .hv +%if ARCH_X86_32 + %define ssq r6 + mov ssq, ssmp +%endif + cmp wd, 4 + jle mangle(private_prefix %+ _prep_8tap_8bpc %+ SUFFIX).h_w4 + WIN64_SPILL_XMM 11 + mova m5, [base+pw_8192] +%if ARCH_X86_64 + mova m8, [base+subpel_h_shufD] + mova m9, [base+subpel_h_shufE] + mova m10, [base+subpel_h_shufF] +%endif + shr mxd, 16 + sub srcq, 2 + movq m7, [base_reg-prep_ssse3+subpel_filters+1+mxq*8] + punpcklwd m7, m7 + pshufd m4, m7, q0000 + pshufd m6, m7, q1111 + pshufd m7, m7, q2222 + sub wd, 16 + jge .h_w16 +%macro PREP_6TAP_H 3 ; dst/src, tmp[1-2] +%if ARCH_X86_32 + pshufb %2, %1, [base+subpel_h_shufD] + pshufb %3, %1, [base+subpel_h_shufE] + pshufb %1, [base+subpel_h_shufF] +%else + pshufb %2, %1, m8 + pshufb %3, %1, m9 + pshufb %1, m10 +%endif + pmaddubsw %2, m4 + pmaddubsw %3, m6 + pmaddubsw %1, m7 + paddw %2, %3 + paddw %1, %2 + pmulhrsw %1, m5 +%endmacro +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + PREP_6TAP_H m0, m2, m3 + PREP_6TAP_H m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + add srcq, wq + neg wq +.h_w16_loop_v: + mov r5, wq +.h_w16_loop_h: + movu m0, [srcq+r5+8*0] + movu m1, [srcq+r5+8*1] + PREP_6TAP_H m0, m2, m3 + PREP_6TAP_H m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 32 + add r5, 16 + jle .h_w16_loop_h + add srcq, ssq + dec hd + jg .h_w16_loop_v + RET +.v: +%if ARCH_X86_32 + mov mxd, myd + and mxd, 0x7f +%else + WIN64_SPILL_XMM 9, 12 + movzx mxd, myb +%endif + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m7, [base_reg-prep_ssse3+subpel_filters+1+myq*8] + punpcklwd m7, m7 + pshufd m5, m7, q0000 + pshufd m6, m7, q1111 + pshufd m7, m7, q2222 +%if ARCH_X86_32 + %define m8 [base+pw_8192] + mov ssq, ssm + sub srcq, ssq + sub srcq, ssq +%else + mova m8, [base+pw_8192] + mov nsq, ssq + neg nsq + cmp wd, 4 + jg .v_w8 +%endif +.v_w4: +%if ARCH_X86_32 + lea r5d, [wq-4] + shl r5d, 14 + add r5d, hd + mov srcm, srcq +.v_w4_loop0: + movd m1, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +%else + movd m1, [srcq+nsq*2] + movd m3, [srcq+nsq*1] +%endif + movd m2, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m0, [srcq+ssq*0] + punpckldq m1, m3 ; 0 1 + punpckldq m3, m2 ; 1 2 + punpckldq m2, m4 ; 2 3 + punpckldq m4, m0 ; 3 4 + punpcklbw m1, m3 ; 01 12 + punpcklbw m2, m4 ; 23 34 +.v_w4_loop: + movd m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m4, m1, m5 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m6 ; a1 b1 + paddw m4, m2 + punpckldq m2, m0, m3 ; 4 5 + movd m0, [srcq+ssq*0] + punpckldq m3, m0 ; 5 6 + punpcklbw m2, m3 ; 67 78 + pmaddubsw m3, m2, m7 ; a2 b2 + paddw m4, m3 + pmulhrsw m4, m8 +%if ARCH_X86_32 + movq [tmpq+wq*0], m4 + movhps [tmpq+wq*2], m4 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w4_loop + mov srcq, srcm + mov tmpq, tmpm + movzx hd, r5w + add srcq, 4 + add tmpq, 8 + mov srcm, srcq + mov tmpm, tmpq + sub r5d, 1<<16 + jg .v_w4_loop0 +%else + mova [tmpq], m4 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop +%endif + RET +%if ARCH_X86_64 +.v_w8: + WIN64_PUSH_XMM 12 + lea r6d, [wq*4-32] + lea r6d, [r6*8+hq] +.v_w8_loop0: + movq m1, [srcq+nsq*2] + movq m2, [srcq+nsq*1] + lea r5, [srcq+ssq*2] + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] + mov r8, tmpq + movq m0, [r5 +ssq*0] + punpcklbw m1, m2 ; 01 + punpcklbw m2, m3 ; 12 + punpcklbw m3, m4 ; 23 + punpcklbw 
m4, m0 ; 34 +.v_w8_loop: + pmaddubsw m10, m1, m5 ; a0 + mova m1, m3 + pmaddubsw m11, m2, m5 ; b0 + mova m2, m4 + pmaddubsw m3, m6 ; a1 + pmaddubsw m4, m6 ; b1 + paddw m10, m3 + paddw m11, m4 + movq m4, [r5+ssq*1] + lea r5, [r5+ssq*2] + punpcklbw m3, m0, m4 ; 67 + movq m0, [r5+ssq*0] + punpcklbw m4, m0 ; 78 + pmaddubsw m9, m3, m7 ; a2 + paddw m10, m9 + pmaddubsw m9, m4, m7 ; b2 + paddw m11, m9 + pmulhrsw m10, m8 + pmulhrsw m11, m8 + mova [r8+wq*0], m10 + mova [r8+wq*2], m11 + lea r8, [r8+wq*4] + sub hd, 2 + jg .v_w8_loop + add srcq, 8 + add tmpq, 16 + movzx hd, r6b + sub r6d, 1<<8 + jg .v_w8_loop0 + RET +%endif ;ARCH_X86_64 +.hv: + RESET_STACK_STATE + cmp wd, 4 + jg .hv_w8 +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + movd m1, [base_reg-prep_ssse3+subpel_filters+2+mxq*8] +%if ARCH_X86_32 + mov mxd, myd + and mxd, 0x7f +%else + movzx mxd, myb +%endif + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg-prep_ssse3+subpel_filters+1+myq*8] +%if ARCH_X86_32 + mov ssq, ssmp +%define regs_used 6 + ALLOC_STACK -mmsize*4 +%define regs_used 7 + %define m8 [rsp+mmsize*0] + %define m9 [rsp+mmsize*1] + %define m10 [rsp+mmsize*2] + punpcklbw m0, m0 + sub srcq, ssq + psraw m0, 8 ; sign-extend + sub srcq, ssq + pshufd m2, m0, q0000 + mova m8, m2 + pshufd m2, m0, q1111 + mova m9, m2 + pshufd m2, m0, q2222 + mova m10, m2 + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + %define m11 [base+pw_8192] + %define m12 [base+subpel_h_shufA] + %define m13 [rsp+mmsize*3] + %define m14 [base+pd_32] + pshufd m1, m1, q0000 + mova m13, m1 +%else + WIN64_SPILL_XMM 15 + mov nsq, ssq + punpcklbw m0, m0 + neg nsq + psraw m0, 8 ; sign-extend + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + movq m3, [srcq+nsq*2] + movq m4, [srcq+nsq*1] + pshufd m13, m1, q0000 + mova m12, [base+subpel_h_shufA] + mova m11, [base+pw_8192] + mova m14, [base+pd_32] +%endif + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m2, [srcq+ssq*0] +%if ARCH_X86_32 + mova m5, m12 + mova m6, m13 + REPX {pshufb x, m5 }, m3, m4, m0, m1, m2 + mova m5, m11 + REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2 +%else + REPX {pshufb x, m12}, m3, m4, m0, m1, m2 + REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2 +%endif + phaddw m3, m0 ; 0 2 + phaddw m4, m1 ; 1 3 + phaddw m0, m2 ; 2 4 +%if ARCH_X86_32 + REPX {pmulhrsw x, m5 }, m3, m4, m0 +%else + REPX {pmulhrsw x, m11}, m3, m4, m0 +%endif + punpcklwd m1, m3, m4 ; 01 + punpckhwd m3, m4 ; 23 + punpcklwd m2, m4, m0 ; 12 + punpckhwd m4, m0 ; 34 +.hv_w4_loop: + movq m7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m6, [srcq+ssq*0] + pshufb m7, m12 + pshufb m6, m12 + pmaddubsw m7, m13 + pmaddubsw m6, m13 + pmaddwd m5, m8, m1 ; a0 + mova m1, m3 + phaddw m7, m6 ; 5 6 + pmaddwd m6, m8, m2 ; b0 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + pmulhrsw m7, m11 + paddd m5, m14 + paddd m6, m14 + paddd m5, m3 + paddd m6, m4 + shufpd m4, m0, m7, 0x01 ; 4 5 + mova m0, m7 + punpcklwd m3, m4, m7 ; 45 + punpckhwd m4, m7 ; 56 + pmaddwd m7, m10, m3 ; a2 + paddd m5, m7 + pmaddwd m7, m10, m4 ; b2 + paddd m6, m7 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + mova [tmpq], m5 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + RESET_STACK_STATE + shr mxd, 16 + sub srcq, 2 + movq m0, [base_reg-prep_ssse3+subpel_filters+1+mxq*8] +%if ARCH_X86_32 + mov mxd, myd + and mxd, 0x7f +%else + movzx mxd, myb +%endif + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m1, [base_reg-prep_ssse3+subpel_filters+1+myq*8] +%if 
ARCH_X86_32 + mov ssq, ssm +%assign regs_used 6 + ALLOC_STACK -mmsize*16 +%assign regs_used 7 + sub srcq, ssq + sub srcq, ssq +%if STACK_ALIGNMENT < 16 + %define srcm [esp+mmsize*15+gprsize*0] + %define tmpm [esp+mmsize*15+gprsize*1] + mov tmpm, tmpq +%endif + mov srcm, srcq +%else + ALLOC_STACK 16*6, 16 + mov nsq, ssq + neg nsq +%endif + mova m7, [base+pw_8192] + lea r5d, [wq-8] + punpcklwd m0, m0 + shl r5d, 13 + punpcklbw m1, m1 + add r5d, hd + psraw m1, 8 ; sign-extend + pshufd m2, m0, q0000 + mova [rsp+16*0], m2 + pshufd m2, m0, q1111 + mova [rsp+16*1], m2 + pshufd m0, m0, q2222 + mova [rsp+16*2], m0 + pshufd m2, m1, q0000 + mova [rsp+16*3], m2 + pshufd m2, m1, q1111 + mova [rsp+16*4], m2 + pshufd m1, m1, q2222 + mova [rsp+16*5], m1 +%macro PREP_HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \ + [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3] + pshufb %2, %1, %4 + pshufb %1, %5 + pmaddubsw %3, %2, %6 + shufps %2, %1, q2121 + pmaddubsw %1, %8 + pmaddubsw %2, %7 + paddw %1, %3 + paddw %1, %2 + pmulhrsw %1, m7 +%endmacro +.hv_w8_loop0: + mova m2, [base+subpel_h_shufD] + mova m3, [base+subpel_h_shufF] + mova m4, [rsp+16*0] +%if ARCH_X86_32 + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + PREP_HV_H_6TAP m0, m5, m6, m2, m3, m4 + PREP_HV_H_6TAP m1, m5, m6, m2, m3, m4 + movu m5, [srcq+ssq*0] + punpcklwd m6, m0, m1 ; 01 + punpckhwd m0, m1 + mova [rsp+16* 6], m6 + mova [rsp+16* 7], m0 + PREP_HV_H_6TAP m5, m0, m6, m2, m3, m4 + movu m0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m6, m1, m5 ; 12 + punpckhwd m1, m5 + mova [rsp+16* 8], m6 + mova [rsp+16* 9], m1 + PREP_HV_H_6TAP m0, m1, m6, m2, m3, m4 + movu m1, [srcq+ssq*0] + punpcklwd m6, m5, m0 ; 23 + punpckhwd m5, m0 + mova [rsp+16*10], m6 + mova [rsp+16*11], m5 + PREP_HV_H_6TAP m1, m5, m6, m2, m3, m4 + mova [rsp+16*14], m1 + punpcklwd m6, m0, m1 ; 34 + punpckhwd m0, m1 + mova [rsp+16*12], m6 + mova [rsp+16*13], m0 +.hv_w8_loop: + mova m3, [rsp+16* 3] + pmaddwd m0, m3, [rsp+16* 6] ; a0 + pmaddwd m2, m3, [rsp+16* 7] ; a0' + pmaddwd m1, m3, [rsp+16* 8] ; b0 + pmaddwd m3, [rsp+16* 9] ; b0' + mova m6, [rsp+16* 4] + mova m4, [rsp+16*10] + mova m5, [rsp+16*11] + mova [rsp+16* 6], m4 + pmaddwd m4, m6 ; a1 + mova [rsp+16* 7], m5 + pmaddwd m5, m6 ; a1' + paddd m0, m4 + mova m4, [rsp+16*12] + paddd m2, m5 + mova m5, [rsp+16*13] + mova [rsp+16* 8], m4 + pmaddwd m4, m6 ; b1 + mova [rsp+16* 9], m5 + pmaddwd m5, m6 ; b1' + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + paddd m1, m4 + paddd m3, m5 + PREP_HV_H_6TAP m6, m4, m5 + mova m4, [base+pd_32] + mova m5, [rsp+16*14] + REPX {paddd x, m4}, m0, m2, m1, m3 + punpcklwd m4, m5, m6 ; 45 + punpckhwd m5, m6 + mova [rsp+16*10], m4 + mova [rsp+16*11], m5 + pmaddwd m4, [rsp+16*5] ; a2 + pmaddwd m5, [rsp+16*5] ; a2' + paddd m0, m4 + movu m4, [srcq+ssq*0] + paddd m2, m5 + psrad m0, 6 + psrad m2, 6 + packssdw m0, m2 + PREP_HV_H_6TAP m4, m2, m5 + mova m2, [rsp+16*5] + punpcklwd m5, m6, m4 ; 56 + mova [rsp+16*14], m4 + punpckhwd m6, m4 + mova [rsp+16*12], m5 + pmaddwd m5, m2 ; b2 + mova [rsp+16*13], m6 + pmaddwd m6, m2 ; b2' + paddd m1, m5 + paddd m3, m6 + psrad m1, 6 + psrad m3, 6 + packssdw m1, m3 + mova [tmpq+wq*0], m0 + mova [tmpq+wq*2], m1 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w8_loop + mov srcq, srcm + mov tmpq, tmpm + movzx hd, r5w + add srcq, 8 + add tmpq, 16 + mov srcm, srcq + mov tmpm, tmpq +%else + movu m9, [srcq+nsq*2] + movu m11, [srcq+nsq*1] + lea r6, [srcq+ssq*2] + movu m13, [srcq+ssq*0] + movu m15, [srcq+ssq*1] + mov r8, tmpq 
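; In scalar terms the 6-tap H+V pair used in this path computes, per output
; sample (a sketch; the SIMD lane layout is omitted):
;   h[y] = (f1*src[y][x-2] + f2*src[y][x-1] + f3*src[y][x+0] +
;           f4*src[y][x+1] + f5*src[y][x+2] + f6*src[y][x+3] + 2) >> 2
;                                           ; pmulhrsw with pw_8192
;   tmp  = (g1*h[y-2] + g2*h[y-1] + g3*h[y+0] +
;           g4*h[y+1] + g5*h[y+2] + g6*h[y+3] + 32) >> 6
;                                           ; paddd pd_32, psrad 6
; f1..f6 and g1..g6 are the middle six coefficients of the selected subpel
; filters (hence the subpel_filters+1 loads); the regular/smooth filters
; routed to these 6-tap entry points have zero outer taps, so dropping them
; is lossless. pmulhrsw against 1<<(15-n) is a round-to-nearest shift by n,
; which is why pw_8192 (n=2) appears in the prep paths and pw_512 (n=6) in
; the put paths.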
+ movu m6, [r6 +ssq*0] + mova m5, [rsp+16*1] + mova m8, [rsp+16*2] + PREP_HV_H_6TAP m9, m0, m1, m2, m3, m4, m5, m8 + PREP_HV_H_6TAP m11, m0, m1, m2, m3, m4, m5, m8 + PREP_HV_H_6TAP m13, m0, m1, m2, m3, m4, m5, m8 + PREP_HV_H_6TAP m15, m0, m1, m2, m3, m4, m5, m8 + PREP_HV_H_6TAP m6, m0, m1, m2, m3, m4, m5, m8 + punpcklwd m8, m9, m11 ; 01 + punpckhwd m9, m11 + punpcklwd m10, m11, m13 ; 12 + punpckhwd m11, m13 + punpcklwd m12, m13, m15 ; 23 + punpckhwd m13, m15 + punpcklwd m14, m15, m6 ; 34 + punpckhwd m15, m6 +.hv_w8_loop: + mova m3, [rsp+16*3] + mova m4, [rsp+16*4] + mova m5, [base+pd_32] + pmaddwd m0, m8, m3 ; a0 + mova m8, m12 + pmaddwd m2, m9, m3 ; a0' + mova m9, m13 + pmaddwd m1, m10, m3 ; b0 + mova m10, m14 + pmaddwd m3, m11 ; b0' + mova m11, m15 + REPX {pmaddwd x, m4}, m12, m13, m14, m15 + REPX {paddd x, m5}, m0, m2, m1, m3 + paddd m0, m12 + paddd m2, m13 + paddd m1, m14 + paddd m3, m15 + movu m15, [r6+ssq*1] + lea r6, [r6+ssq*2] + PREP_HV_H_6TAP m15, m4, m5 + punpcklwd m12, m6, m15 + punpckhwd m13, m6, m15 + movu m6, [r6+ssq*0] + PREP_HV_H_6TAP m6, m4, m5 + mova m4, [rsp+16*5] + punpcklwd m14, m15, m6 + punpckhwd m15, m6 + pmaddwd m5, m12, m4 ; a2 + paddd m0, m5 + pmaddwd m5, m13, m4 ; a2' + paddd m2, m5 + pmaddwd m5, m14, m4 ; b2 + paddd m1, m5 + pmaddwd m4, m15 ; b2' + paddd m3, m4 + REPX {psrad x, 6}, m0, m2, m1, m3 + packssdw m0, m2 + packssdw m1, m3 + mova [r8+wq*0], m0 + mova [r8+wq*2], m1 + lea r8, [r8+wq*4] + sub hd, 2 + jg .hv_w8_loop + add srcq, 8 + add tmpq, 16 + movzx hd, r5b +%endif + sub r5d, 1<<16 + jg .hv_w8_loop0 + RET + +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_8bpc +PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc +PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_8bpc +PREP_8TAP_FN sharp, SHARP, SHARP + cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h @@ -2845,245 +3850,12 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mov wd, wm movifnidn srcd, srcm movifnidn hd, hm + LEA base_reg, prep_ssse3 test mxd, 0xf00 jnz .h test myd, 0xf00 - jnz .v - LEA base_reg, prep_ssse3 - tzcnt wd, wd - movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2] - pxor m4, m4 - add wq, base_reg - movifnidn strided, stridem - lea r6, [strideq*3] -%if WIN64 - pop r8 - pop r7 -%endif - jmp wq -.h: - LEA base_reg, prep%+SUFFIX - test myd, 0xf00 - jnz .hv -%if cpuflag(ssse3) - WIN64_SPILL_XMM 12 -%else - WIN64_SPILL_XMM 16 -%endif -%if ARCH_X86_32 - %define strideq r6 - mov strideq, stridem -%endif - cmp wd, 4 - je .h_w4 - tzcnt wd, wd -%if cpuflag(ssse3) - %if ARCH_X86_64 - mova m10, [base+subpel_h_shufA] - mova m11, [base+subpel_h_shufB] - mova m9, [base+subpel_h_shufC] - %else - %define m10 [base+subpel_h_shufA] - %define m11 [base+subpel_h_shufB] - %define m9 [base+subpel_h_shufC] - %endif -%endif - shr mxd, 16 - sub srcq, 3 - movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] - movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] -%if cpuflag(ssse3) - mova m7, [base+pw_8192] - pshufd m5, m6, q0000 - pshufd m6, m6, q1111 -%else - punpcklbw m6, m6 - psraw m6, 8 - %if ARCH_X86_64 - mova m7, [pw_2] - mova m15, [pw_1] - %else - %define m15 m4 - %endif - pshufd m5, m6, q1010 - punpckhqdq m6, m6 -%endif - add wq, base_reg - jmp wq -.h_w4: -%if ARCH_X86_32 - and mxd, 0x7f -%else - movzx mxd, mxb -%endif - dec srcq - movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] -%if cpuflag(ssse3) - 
mova m6, [base+pw_8192] - mova m5, [base+subpel_h_shufA] - pshufd m4, m4, q0000 -%else - mova m6, [base+pw_2] - %if ARCH_X86_64 - mova m14, [pw_1] - %else - %define m14 m7 - %endif - punpcklbw m4, m4 - psraw m4, 8 - punpcklqdq m4, m4 -%endif -%if ARCH_X86_64 - lea stride3q, [strideq*3] -%endif -.h_w4_loop: -%if cpuflag(ssse3) - movq m0, [srcq+strideq*0] ; 0 - movq m1, [srcq+strideq*1] ; 1 - %if ARCH_X86_32 - lea srcq, [srcq+strideq*2] - movq m2, [srcq+strideq*0] ; 2 - movq m3, [srcq+strideq*1] ; 3 - lea srcq, [srcq+strideq*2] - %else - movq m2, [srcq+strideq*2] ; 2 - movq m3, [srcq+stride3q ] ; 3 - lea srcq, [srcq+strideq*4] - %endif - pshufb m0, m5 - pshufb m1, m5 - pshufb m2, m5 - pshufb m3, m5 -%elif ARCH_X86_64 - movd m0, [srcq+strideq*0+0] - movd m12, [srcq+strideq*0+1] - movd m1, [srcq+strideq*1+0] - movd m5, [srcq+strideq*1+1] - movd m2, [srcq+strideq*2+0] - movd m13, [srcq+strideq*2+1] - movd m3, [srcq+stride3q +0] - movd m7, [srcq+stride3q +1] - punpckldq m0, m12 - punpckldq m1, m5 - punpckldq m2, m13 - punpckldq m3, m7 - movd m12, [srcq+strideq*0+2] - movd m8, [srcq+strideq*0+3] - movd m5, [srcq+strideq*1+2] - movd m9, [srcq+strideq*1+3] - movd m13, [srcq+strideq*2+2] - movd m10, [srcq+strideq*2+3] - movd m7, [srcq+stride3q +2] - movd m11, [srcq+stride3q +3] - lea srcq, [srcq+strideq*4] - punpckldq m12, m8 - punpckldq m5, m9 - punpckldq m13, m10 - punpckldq m7, m11 - punpcklqdq m0, m12 ; 0 - punpcklqdq m1, m5 ; 1 - punpcklqdq m2, m13 ; 2 - punpcklqdq m3, m7 ; 3 -%else - movd m0, [srcq+strideq*0+0] - movd m1, [srcq+strideq*0+1] - movd m2, [srcq+strideq*0+2] - movd m3, [srcq+strideq*0+3] - punpckldq m0, m1 - punpckldq m2, m3 - punpcklqdq m0, m2 ; 0 - movd m1, [srcq+strideq*1+0] - movd m2, [srcq+strideq*1+1] - movd m3, [srcq+strideq*1+2] - movd m7, [srcq+strideq*1+3] - lea srcq, [srcq+strideq*2] - punpckldq m1, m2 - punpckldq m3, m7 - punpcklqdq m1, m3 ; 1 - movd m2, [srcq+strideq*0+0] - movd m3, [srcq+strideq*0+1] - movd m7, [srcq+strideq*0+2] - movd m5, [srcq+strideq*0+3] - punpckldq m2, m3 - punpckldq m7, m5 - punpcklqdq m2, m7 ; 2 - movd m3, [srcq+strideq*1+0] - movd m7, [srcq+strideq*1+1] - punpckldq m3, m7 - movd m7, [srcq+strideq*1+2] - movd m5, [srcq+strideq*1+3] - lea srcq, [srcq+strideq*2] - punpckldq m7, m5 - punpcklqdq m3, m7 ; 3 -%endif - PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2 - PMADDUBSW m1, m4, m5, m7, 0 - PMADDUBSW m2, m4, m5, m7, 0 - PMADDUBSW m3, m4, m5, m7, 0 - PHADDW m0, m1, m14, ARCH_X86_32 - PHADDW m2, m3, m14, 0 - PMULHRSW_8192 m0, m0, m6 - PMULHRSW_8192 m2, m2, m6 - mova [tmpq+16*0], m0 - mova [tmpq+16*1], m2 - add tmpq, 32 - sub hd, 4 - jg .h_w4_loop - RET -.h_w8: -%if cpuflag(ssse3) - PREP_8TAP_H 0, srcq+strideq*0 - PREP_8TAP_H 1, srcq+strideq*1 - mova [tmpq+16*0], m0 - mova [tmpq+16*1], m1 - lea srcq, [srcq+strideq*2] - add tmpq, 32 - sub hd, 2 -%else - PREP_8TAP_H 0, srcq - mova [tmpq], m0 - add srcq, strideq - add tmpq, 16 - dec hd -%endif - jg .h_w8 - RET -.h_w16: - mov r3, -16*1 - jmp .h_start -.h_w32: - mov r3, -16*2 - jmp .h_start -.h_w64: - mov r3, -16*4 - jmp .h_start -.h_w128: - mov r3, -16*8 -.h_start: - sub srcq, r3 - mov r5, r3 -.h_loop: -%if cpuflag(ssse3) - PREP_8TAP_H 0, srcq+r3+8*0 - PREP_8TAP_H 1, srcq+r3+8*1 - mova [tmpq+16*0], m0 - mova [tmpq+16*1], m1 - add tmpq, 32 - add r3, 16 -%else - PREP_8TAP_H 0, srcq+r3 - mova [tmpq], m0 - add tmpq, 16 - add r3, 8 -%endif - jl .h_loop - add srcq, strideq - mov r3, r5 - dec hd - jg .h_loop - RET + jz mangle(private_prefix %+ _prep_6tap_8bpc_ssse3).prep .v: - LEA base_reg, prep%+SUFFIX 
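; Dispatch note: FN now takes an optional fifth argument (jmp_to), so the
; regular/smooth filter combinations become thin stubs that tail-jump into
; the new 6-tap functions, while any combination involving the sharp filter
; is routed to (or falls through into) the 8-tap code. Roughly, the
; expansion of
;   PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc
; is
;   cglobal prep_8tap_smooth_8bpc
;       mov t0d, FILTER_SMOOTH
;       mov t1d, t0d
;       jmp mangle(private_prefix %+ _prep_6tap_8bpc %+ SUFFIX)
; The unfiltered case is shared as well: prep_8tap_8bpc jumps into
; prep_6tap_8bpc's .prep label when neither mx nor my selects a filter.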
%if ARCH_X86_32 mov mxd, myd and mxd, 0x7f @@ -3094,26 +3866,17 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] -%if cpuflag(ssse3) + movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] mova m2, [base+pw_512] mova m7, [base+pw_8192] punpcklwd m0, m0 -%else - punpcklbw m0, m0 - psraw m0, 8 -%endif %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] %define subpel1 [rsp+mmsize*1] %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] %assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed - %if cpuflag(ssse3) ALLOC_STACK -mmsize*4 - %else - ALLOC_STACK -mmsize*5 - %endif %assign regs_used 7 mov strideq, [rstk+stack_offset+gprsize*3] pshufd m1, m0, q0000 @@ -3141,12 +3904,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 jns .v_w8 %endif .v_w4: -%if notcpuflag(ssse3) - pxor m6, m6 - %if ARCH_X86_64 - mova m7, [base+pw_2] - %endif -%endif %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize %define srcm [esp+stack_size+gprsize*1] @@ -3188,25 +3945,13 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 punpcklbw m2, m4 ; 23 34 punpcklbw m3, m5 ; 45 56 .v_w4_loop: -%if ARCH_X86_32 && notcpuflag(ssse3) - mova m7, subpel0 - %define subpel0 m7 -%endif mova m5, m1 - PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0 -%if ARCH_X86_32 && notcpuflag(ssse3) - mova m7, subpel1 - %define subpel1 m7 -%endif + pmaddubsw m5, subpel0 ; a0 b0 mova m1, m2 - PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1 + pmaddubsw m2, subpel1 ; a1 b1 paddw m5, m2 -%if ARCH_X86_32 && notcpuflag(ssse3) - mova m7, subpel2 - %define subpel2 m7 -%endif mova m2, m3 - PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2 + pmaddubsw m3, subpel2 ; a2 b2 movd m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] paddw m5, m3 @@ -3214,27 +3959,10 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 movd m0, [srcq+strideq*0] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m12, m0 - %else - mova [esp+mmsize*4], m0 - mova m7, subpel3 - %define subpel3 m7 - %endif -%endif mova m4, m3 - PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3 + pmaddubsw m4, subpel3 ; a3 b3 paddw m5, m4 -%if ARCH_X86_64 || cpuflag(ssse3) - %if notcpuflag(ssse3) - SWAP m0, m12 - %endif - PMULHRSW_8192 m5, m5, m7 -%else - mova m0, [esp+mmsize*4] - PMULHRSW_8192 m5, m5, [base+pw_2] -%endif + pmulhrsw m5, m7 movq [tmpq+wq*0], m5 movhps [tmpq+wq*2], m5 lea tmpq, [tmpq+wq*4] @@ -3277,7 +4005,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 .v_w8_loop: movq m13, [srcq+strideq*1] lea srcq, [srcq+strideq*2] -%if cpuflag(ssse3) pmaddubsw m14, m1, subpel0 ; a0 pmaddubsw m15, m2, subpel0 ; b0 mova m1, m3 @@ -3303,35 +4030,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 -%else - mova m14, m1 - PMADDUBSW m14, subpel0, m7, m12, 1 ; a0 - mova m15, m2 - PMADDUBSW m15, subpel0, m7, m12, 0 ; b0 - mova m1, m3 - PMADDUBSW m3, subpel1, m7, m12, 0 ; a1 - mova m2, m4 - PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 - paddw m14, m3 - mova m3, m5 - PMADDUBSW m5, subpel2, m7, m12, 0 ; a2 - paddw m15, m4 - mova m4, m6 - PMADDUBSW m6, subpel2, m7, m12, 0 ; b2 - paddw m15, m6 - punpcklbw m12, m0, m13 ; 67 - movq m0, [srcq+strideq*0] - punpcklbw m13, m0 ; 78 - paddw m14, m5 - mova m5, m12 - PMADDUBSW m12, subpel3, m7, m6, 0 ; a3 - paddw m14, m12 - mova m6, m13 - PMADDUBSW m13, subpel3, 
m7, m12, 0 ; b3 - paddw m15, m13 - PMULHRSW_8192 m14, m14, [base+pw_2] - PMULHRSW_8192 m15, m15, [base+pw_2] -%endif movu [tmpq+wq*0], m14 movu [tmpq+wq*2], m15 lea tmpq, [tmpq+wq*4] @@ -3350,19 +4048,132 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpel1 %undef subpel2 %undef subpel3 +.h_w4: + WIN64_SPILL_XMM 7 +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] + mova m5, [base+subpel_h_shufA] + mova m6, [base+pw_8192] + movifnidn r2, stridemp + pshufd m4, m4, q0000 + lea r3, [r2*3] +.h_w4_loop: + movq m0, [srcq+r2*0] + movq m1, [srcq+r2*1] + movq m2, [srcq+r2*2] + movq m3, [srcq+r3 ] + lea srcq, [srcq+r2*4] + REPX {pshufb x, m5}, m0, m1, m2, m3 + REPX {pmaddubsw x, m4}, m0, m1, m2, m3 + phaddw m0, m1 + phaddw m2, m3 + pmulhrsw m0, m6 + pmulhrsw m2, m6 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m2 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + cmp wd, 4 + je .h_w4 + WIN64_SPILL_XMM 12 +%if ARCH_X86_32 + %define strideq r6 + mov strideq, stridem +%endif + tzcnt wd, wd +%if ARCH_X86_64 + mova m10, [base+subpel_h_shufA] + mova m11, [base+subpel_h_shufB] + mova m9, [base+subpel_h_shufC] +%else + %define m10 [base+subpel_h_shufA] + %define m11 [base+subpel_h_shufB] + %define m9 [base+subpel_h_shufC] +%endif + shr mxd, 16 + sub srcq, 3 + movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] + movq m6, [base_reg+mxq*8+subpel_filters-prep_ssse3] + mova m7, [base+pw_8192] + pshufd m5, m6, q0000 + pshufd m6, m6, q1111 + add wq, base_reg + jmp wq +%macro PREP_8TAP_H 2 ; dst, src_memloc + movu m%1, [%2] + pshufb m2, m%1, m11 ; subpel_h_shufB + pshufb m3, m%1, m9 ; subpel_h_shufC + pshufb m%1, m10 ; subpel_h_shufA + mova m4, m2 + pmaddubsw m4, m5 ; subpel +0 B0 + pmaddubsw m2, m6 ; subpel +4 B4 + pmaddubsw m3, m6 ; subpel +4 C4 + pmaddubsw m%1, m5 ; subpel +0 A0 + paddw m3, m4 + paddw m%1, m2 + phaddw m%1, m3 + pmulhrsw m%1, m7 +%endmacro +.h_w8: + PREP_8TAP_H 0, srcq+strideq*0 + PREP_8TAP_H 1, srcq+strideq*1 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + lea srcq, [srcq+strideq*2] + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mov r3, -16*1 + jmp .h_start +.h_w32: + mov r3, -16*2 + jmp .h_start +.h_w64: + mov r3, -16*4 + jmp .h_start +.h_w128: + mov r3, -16*8 +.h_start: + sub srcq, r3 + mov r5, r3 +.h_loop: + PREP_8TAP_H 0, srcq+r3+8*0 + PREP_8TAP_H 1, srcq+r3+8*1 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 32 + add r3, 16 + jl .h_loop + add srcq, strideq + mov r3, r5 + dec hd + jg .h_loop + RET .hv: RESET_STACK_STATE cmp wd, 4 jg .hv_w8 and mxd, 0x7f - movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] + movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] %if ARCH_X86_32 mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd - movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] mov strideq, stridem %assign regs_used 6 ALLOC_STACK -mmsize*14 @@ -3388,12 +4199,8 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] - %if cpuflag(ssse3) + movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] ALLOC_STACK mmsize*14, 14 - %else - ALLOC_STACK mmsize*14, 16 - %endif lea stride3q, [strideq*3] sub srcq, stride3q dec srcq @@ -3403,11 +4210,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv3 m13 punpcklbw 
m0, m0 psraw m0, 8 - %if cpuflag(ssse3) mova m8, [base+pw_8192] - %else - mova m8, [base+pw_2] - %endif mova m9, [base+pd_32] pshufd m10, m0, q0000 pshufd m11, m0, q1111 @@ -3415,10 +4218,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 pshufd m13, m0, q3333 %endif pshufd m7, m1, q0000 -%if notcpuflag(ssse3) - punpcklbw m7, m7 - psraw m7, 8 -%endif %define hv4_line_0_0 4 %define hv4_line_0_1 5 %define hv4_line_0_2 6 @@ -3430,26 +4229,14 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define hv4_line_1_2 12 %define hv4_line_1_3 13 %if ARCH_X86_32 - %if cpuflag(ssse3) - %define w8192reg [base+pw_8192] - %else - %define w8192reg [base+pw_2] - %endif - %define d32reg [base+pd_32] + %define w8192reg [base+pw_8192] + %define d32reg [base+pd_32] %else - %define w8192reg m8 - %define d32reg m9 + %define w8192reg m8 + %define d32reg m9 %endif ; lower shuffle 0 1 2 3 4 -%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] -%else - %if ARCH_X86_64 - mova m15, [pw_1] - %else - %define m15 m1 - %endif -%endif movq m5, [srcq+strideq*0] ; 0 _ _ _ movhps m5, [srcq+strideq*1] ; 0 _ 1 _ %if ARCH_X86_32 @@ -3462,34 +4249,23 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 movhps m4, [srcq+stride3q ] ; 2 _ 3 _ lea srcq, [srcq+strideq*4] %endif - PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~ - PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ - PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters - PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters - PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 - PMULHRSW_8192 m2, m2, w8192reg + pshufb m2, m5, m6 ;H subpel_h_shuf4 0~1~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 2~3~ + pmaddubsw m2, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m2, m0 + pmulhrsw m2, w8192reg SAVELINE_W4 m2, 2, 0 ; upper shuffle 2 3 4 5 6 -%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] -%endif - PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~ - PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ - PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters - PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters - PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 - PMULHRSW_8192 m2, m2, w8192reg -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m14, m2 - %else - mova [esp+mmsize*4], m2 - %endif -%endif + pshufb m2, m5, m6 ;H subpel_h_shuf4 0~1~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 2~3~ + pmaddubsw m2, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m2, m0 ;H 0 1 2 3 + pmulhrsw m2, w8192reg ; lower shuffle -%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] -%endif movq m5, [srcq+strideq*0] ; 4 _ _ _ movhps m5, [srcq+strideq*1] ; 4 _ 5 _ %if ARCH_X86_32 @@ -3500,32 +4276,23 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 movq m4, [srcq+strideq*2] ; 6 _ _ _ add srcq, stride3q %endif - PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ - PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ - PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters - PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters - PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 - PMULHRSW_8192 m3, m3, w8192reg + pshufb m3, m5, m6 ;H subpel_h_shuf4 4~5~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 6~6~ + pmaddubsw m3, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m3, m0 ;H 4 5 6 7 + pmulhrsw m3, w8192reg SAVELINE_W4 m3, 3, 0 ; upper shuffle -%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] -%endif - PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H 
subpel_h_shuf4 4~5~ - PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ - PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters - PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters - PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 - PMULHRSW_8192 m3, m3, w8192reg -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m2, m14 - %else - mova m2, [esp+mmsize*4] - %endif -%endif + pshufb m3, m5, m6 ;H subpel_h_shuf4 4~5~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 6~6~ + pmaddubsw m3, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m3, m0 ;H 4 5 6 7 + pmulhrsw m3, w8192reg ;process high - PALIGNR m4, m3, m2, 4;V 1 2 3 4 + palignr m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 @@ -3537,7 +4304,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 ;process low RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 - PALIGNR m4, m3, m2, 4;V 1 2 3 4 + palignr m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 @@ -3551,34 +4318,17 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m14, m5 - %else - mova [esp+mmsize*4], m5 - %define m15 m3 - %endif -%endif -%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] -%endif movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ - PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ - PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters - PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 - PMULHRSW_8192 m4, m4, w8192reg - PALIGNR m3, m4, m0, 12, m5 ; 6787 + pshufb m4, m6 ; H subpel_h_shuf4 7~8~ + pmaddubsw m4, m7 ; H subpel_filters + phaddw m4, m4 ; H 7878 + pmulhrsw m4, w8192reg + palignr m3, m4, m0, 12 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m5, m14 - %else - mova m5, [esp+mmsize*4] - %endif -%endif paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m5, 6 @@ -3599,33 +4349,17 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m14, m5 - %else - mova [esp+0xA0], m5 - %endif -%endif -%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] -%endif movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ - PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ - PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters - PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 - PMULHRSW_8192 m4, m4, w8192reg - PALIGNR m3, m4, m0, 12, m5 ; 6787 + pshufb m4, m6 ; H subpel_h_shuf4 7~8~ + pmaddubsw m4, m7 ; H subpel_filters + phaddw m4, m4 ; H 7878 + pmulhrsw m4, w8192reg + palignr m3, m4, m0, 12 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m5, m14 - %else - mova m5, [esp+0xA0] - %endif -%endif paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m4, m5, 6 @@ -3667,13 +4401,13 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv3 [rsp+mmsize*10] %define accuv0 [rsp+mmsize*11] %define accuv1 [rsp+mmsize*12] - movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] + movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3] mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd - movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3] 
mov strideq, stridem %assign regs_used 6 ALLOC_STACK -mmsize*14 @@ -3685,15 +4419,8 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mov tmpm, tmpq mov stridem, strideq %endif - %if cpuflag(ssse3) pshufd m0, m1, q0000 pshufd m1, m1, q1111 - %else - punpcklbw m1, m1 - psraw m1, 8 - pshufd m0, m1, q1010 - punpckhqdq m1, m1 - %endif punpcklbw m5, m5 psraw m5, 8 pshufd m2, m5, q0000 @@ -3719,22 +4446,14 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv3 m15 %define accuv0 m8 %define accuv1 m9 - movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] + movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd - movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] - %if cpuflag(ssse3) + movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3] pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 - %else - punpcklbw m0, m0 - psraw m0, 8 - pshufd subpelh0, m0, q1010 - pshufd subpelh1, m0, q3232 - mova m7, [base+pw_2] - %endif punpcklbw m1, m1 psraw m1, 8 pshufd subpelv0, m1, q0000 @@ -3751,79 +4470,68 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shl r5d, 14 add r5d, hd .hv_w8_loop0: -%if cpuflag(ssse3) - %if ARCH_X86_64 +%if ARCH_X86_64 mova m7, [base+subpel_h_shufA] mova m8, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] - %define shufA m7 - %define shufB m8 - %define shufC m9 - %else - %define shufA [base+subpel_h_shufA] - %define shufB [base+subpel_h_shufB] - %define shufC [base+subpel_h_shufC] - %endif + %define shufA m7 + %define shufB m8 + %define shufC m9 +%else + %define shufA [base+subpel_h_shufA] + %define shufB [base+subpel_h_shufB] + %define shufC [base+subpel_h_shufC] %endif - PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 - PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 +%macro PREP_8TAP_HV 2 ; dst, src_memloc, tmp[1-2] + movu %1, [%2] + pshufb m2, %1, shufB + pshufb m3, %1, shufC + pshufb %1, shufA + mova m1, m2 + pmaddubsw m1, subpelh0 ; subpel +0 C0 + pmaddubsw m3, subpelh1 ; subpel +4 B4 + pmaddubsw m2, subpelh1 ; C4 + pmaddubsw %1, subpelh0 ; A0 + paddw m1, m3 ; C0+B4 + paddw %1, m2 ; A0+C4 + phaddw %1, m1 +%endmacro + PREP_8TAP_HV m4, srcq+strideq*0 + PREP_8TAP_HV m5, srcq+strideq*1 %if ARCH_X86_64 - PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 + PREP_8TAP_HV m6, srcq+strideq*2 add srcq, stride3q - PREP_8TAP_HV m0, srcq+strideq*0, m7, m9 + PREP_8TAP_HV m0, srcq+strideq*0 %else lea srcq, [srcq+strideq*2] - %if notcpuflag(ssse3) - mova [esp], m4 - %endif - PREP_8TAP_HV m6, srcq+strideq*0, m7, m4 - PREP_8TAP_HV m0, srcq+strideq*1, m7, m4 + PREP_8TAP_HV m6, srcq+strideq*0 + PREP_8TAP_HV m0, srcq+strideq*1 lea srcq, [srcq+strideq*2] %endif -%if cpuflag(ssse3) mova m7, [base+pw_8192] -%else - mova m7, [base+pw_2] - %if ARCH_X86_32 - mova m4, [esp] - %endif -%endif - PMULHRSW_8192 m4, m4, m7 - PMULHRSW_8192 m5, m5, m7 - PMULHRSW_8192 m6, m6, m7 - PMULHRSW_8192 m0, m0, m7 + REPX {pmulhrsw x, m7}, m4, m5, m6, m0 punpcklwd m1, m4, m5 ; 01 punpcklwd m2, m5, m6 ; 12 punpcklwd m3, m6, m0 ; 23 SAVELINE_W8 1, m1 SAVELINE_W8 2, m2 SAVELINE_W8 3, m3 -%if cpuflag(ssse3) mova m7, [base+subpel_h_shufA] -%endif %if ARCH_X86_64 - PREP_8TAP_HV m4, srcq+strideq*1, m8, m9 - PREP_8TAP_HV m5, srcq+strideq*2, m8, m9 + PREP_8TAP_HV m4, srcq+strideq*1 + PREP_8TAP_HV m5, srcq+strideq*2 add srcq, stride3q - PREP_8TAP_HV m6, srcq+strideq*0, m8, m9 + PREP_8TAP_HV m6, srcq+strideq*0 %else - %if notcpuflag(ssse3) - mova [esp+0x30], m0 - %endif - PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 - 
PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 + PREP_8TAP_HV m4, srcq+strideq*0 + PREP_8TAP_HV m5, srcq+strideq*1 lea srcq, [srcq+strideq*2] - PREP_8TAP_HV m6, srcq+strideq*0, m7, m0 + PREP_8TAP_HV m6, srcq+strideq*0 %endif -%if cpuflag(ssse3) - mova m7, [base+pw_8192] -%elif ARCH_X86_32 - mova m0, [esp+0x30] - mova m7, [base+pw_2] -%endif - PMULHRSW_8192 m1, m4, m7 - PMULHRSW_8192 m2, m5, m7 - PMULHRSW_8192 m3, m6, m7 + mova m3, [base+pw_8192] + pmulhrsw m1, m3, m4 + pmulhrsw m2, m3, m5 + pmulhrsw m3, m6 punpcklwd m4, m0, m1 ; 34 punpcklwd m5, m1, m2 ; 45 punpcklwd m6, m2, m3 ; 56 @@ -3866,25 +4574,19 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mova m7, [base+pd_32] paddd accuv0, m7 paddd accuv1, m7 - %if cpuflag(ssse3) mova m7, [base+subpel_h_shufB] mova m6, [base+subpel_h_shufC] mova m5, [base+subpel_h_shufA] - %define shufA m5 - %define shufB m7 - %define shufC m6 - %endif + %define shufA m5 + %define shufB m7 + %define shufC m6 %endif - PREP_8TAP_HV m0, srcq+strideq*1, m5, m6 + PREP_8TAP_HV m0, srcq+strideq*1 lea srcq, [srcq+strideq*2] - PREP_8TAP_HV m4, srcq+strideq*0, m5, m6 -%if cpuflag(ssse3) + PREP_8TAP_HV m4, srcq+strideq*0 mova m5, [base+pw_8192] -%else - mova m5, [base+pw_2] -%endif - PMULHRSW_8192 m0, m0, m5 - PMULHRSW_8192 m4, m4, m5 + pmulhrsw m0, m5 + pmulhrsw m4, m5 RESTORELINE_W8 6, m6 punpcklwd m5, m6, m0 ; 67 punpcklwd m6, m0, m4 ; 78 @@ -3925,7 +4627,6 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 sub r5d, 1<<16 jg .hv_w8_loop0 RET -%endmacro %macro movifprep 2 %if isprep @@ -7262,16 +7963,17 @@ DECLARE_REG_TMP 6, 8 %else DECLARE_REG_TMP 1, 2 %endif +%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, BILIN_SCALED_FN put -FN put_8tap_scaled, sharp, SHARP, SHARP -FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH -FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP -FN put_8tap_scaled, smooth, SMOOTH, SMOOTH -FN put_8tap_scaled, sharp_regular, SHARP, REGULAR -FN put_8tap_scaled, regular_sharp, REGULAR, SHARP -FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR -FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH -FN put_8tap_scaled, regular, REGULAR, REGULAR +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_8bpc +PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put %if WIN64 @@ -7281,16 +7983,17 @@ DECLARE_REG_TMP 6, 7 %else DECLARE_REG_TMP 1, 2 %endif +%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN prep -FN prep_8tap_scaled, sharp, SHARP, SHARP -FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH -FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP -FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH -FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR -FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP -FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR -FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH -FN prep_8tap_scaled, regular, REGULAR, REGULAR +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN 
smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_8bpc +PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED prep %if ARCH_X86_32 @@ -7369,7 +8072,7 @@ MC_8TAP_SCALED prep %define m15 m7 %define m11 m7 %endif - %if notcpuflag(ssse3) || ARCH_X86_32 + %if ARCH_X86_32 pxor m11, m11 %endif lea tmp1d, [myq+deltaq*4] @@ -7446,7 +8149,7 @@ MC_8TAP_SCALED prep %endif %endif -%macro WARP_AFFINE_8X8T 0 +%macro WARP_AFFINE_8X8 0 %if ARCH_X86_64 cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts %else @@ -7468,7 +8171,6 @@ cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts mova m14, [esp+0xE0] mova m15, [esp+0xF0] %endif -%if cpuflag(ssse3) psrad m12, 13 psrad m13, 13 psrad m14, 13 @@ -7478,22 +8180,6 @@ cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts mova m13, [PIC_sym(pw_8192)] pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7 pmulhrsw m14, m13 -%else - %if ARCH_X86_32 - %define m10 m0 - %endif - mova m10, [PIC_sym(pd_16384)] - paddd m12, m10 - paddd m13, m10 - paddd m14, m10 - paddd m15, m10 - psrad m12, 15 - psrad m13, 15 - psrad m14, 15 - psrad m15, 15 - packssdw m12, m13 - packssdw m14, m15 -%endif mova [tmpq+tsq*0], m12 mova [tmpq+tsq*2], m14 dec counterd @@ -7506,9 +8192,7 @@ cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2 lea tmpq, [tmpq+tsq*4] jmp .loop -%endmacro -%macro WARP_AFFINE_8X8 0 %if ARCH_X86_64 cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ @@ -7557,11 +8241,7 @@ cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \ lea dstq, [dstq+dsq*2] .start: %if notcpuflag(sse4) - %if cpuflag(ssse3) %define roundval pw_8192 - %else - %define roundval pd_262144 - %endif %if ARCH_X86_64 mova m10, [PIC_sym(roundval)] %else @@ -7584,18 +8264,10 @@ cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \ packusdw m12, m13 pavgw m12, m11 ; (x + (1 << 10)) >> 11 %else - %if cpuflag(ssse3) psrad m12, 17 psrad m13, 17 packssdw m12, m13 pmulhrsw m12, m10 - %else - paddd m12, m10 - paddd m13, m10 - psrad m12, 19 - psrad m13, 19 - packssdw m12, m13 - %endif %endif %if ARCH_X86_32 %define m14 m6 @@ -7609,18 +8281,10 @@ cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \ packusdw m14, m15 pavgw m14, m11 ; (x + (1 << 10)) >> 11 %else - %if cpuflag(ssse3) psrad m14, 17 psrad m15, 17 packssdw m14, m15 pmulhrsw m14, m10 - %else - paddd m14, m10 - paddd m15, m10 - psrad m14, 19 - psrad m15, 19 - packssdw m14, m15 - %endif %endif packuswb m12, m14 movq [dstq+dsq*0], m12 @@ -7670,17 +8334,12 @@ ALIGN function_align lea filterq, [PIC_sym(mc_warp_filter2)] %if ARCH_X86_64 mov myd, r6m - %if cpuflag(ssse3) pxor m11, m11 - %endif %endif call .h psrld m2, m0, 16 psrld m3, m1, 16 %if ARCH_X86_32 - %if notcpuflag(ssse3) - mova [esp+gprsize+0x00], m2 - %endif mova [esp+gprsize+0x10], m3 %endif call .h @@ -7694,9 +8353,6 @@ ALIGN function_align %if ARCH_X86_64 %define blendmask [rsp+gprsize+0x80] %else - %if notcpuflag(ssse3) - mova m2, [esp+gprsize+0x00] - %endif mova m3, [esp+gprsize+0x10] %define blendmask [esp+gprsize+0x120] %define m10 m7 @@ -7720,9 +8376,6 @@ ALIGN 
function_align mova [rsp+gprsize+0x30], m5 call .h %if ARCH_X86_32 - %if notcpuflag(ssse3) - mova m2, [esp+gprsize+0x00] - %endif mova m3, [esp+gprsize+0x10] %define m10 m5 %endif @@ -7882,7 +8535,6 @@ ALIGN function_align lea tmp2d, [mxq+alphaq*1] shr mxd, 10 shr tmp1d, 10 -%if cpuflag(ssse3) movq m14, [filterq+mxq *8] ; 2 X movq m9, [filterq+tmp1q*8] ; 6 X lea tmp1d, [tmp2q+alphaq*4] @@ -7901,95 +8553,6 @@ ALIGN function_align pmaddubsw m10, m9 phaddw m0, m15 phaddw m1, m10 -%else - %if ARCH_X86_32 - %define m11 m2 - %endif - pcmpeqw m0, m0 - psrlw m14, m0, 8 - psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15 - pand m14, m10 ; 00 02 04 06 08 10 12 14 - packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 - psrldq m9, m0, 4 - pshufd m0, m14, q0220 - pand m0, m9 - psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ - pslldq m15, m14, 12 - por m0, m15 ; shufA - psrlw m15, m0, 8 - psraw m11, m1, 8 - psllw m0, 8 - psllw m1, 8 - psrlw m0, 8 - psraw m1, 8 - pmullw m15, m11 - pmullw m0, m1 - paddw m0, m15 ; pmaddubsw m0, m1 - pshufd m15, m14, q0220 - pand m15, m9 - psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ - pslldq m1, m14, 12 - por m15, m1 ; shufC - pshufd m1, m14, q0220 - pand m1, m9 - psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ - pslldq m11, m14, 12 - por m1, m11 ; shufB - pshufd m10, m14, q0220 - pand m10, m9 - psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __ - pslldq m14, m14, 12 - por m10, m14 ; shufD - psrlw m9, m1, 8 - psraw m11, m8, 8 - psllw m1, 8 - psllw m8, 8 - psrlw m1, 8 - psraw m8, 8 - pmullw m9, m11 - pmullw m1, m8 - paddw m1, m9 ; pmaddubsw m1, m8 - movq m14, [filterq+mxq *8] ; 2 X - movq m9, [filterq+tmp1q*8] ; 6 X - lea tmp1d, [tmp2q+alphaq*4] - lea mxd, [tmp2q+betaq] ; mx += beta - shr tmp2d, 10 - shr tmp1d, 10 - movhps m14, [filterq+tmp2q*8] ; 2 3 - movhps m9, [filterq+tmp1q*8] ; 6 7 - psrlw m8, m15, 8 - psraw m11, m14, 8 - psllw m15, 8 - psllw m14, 8 - psrlw m15, 8 - psraw m14, 8 - pmullw m8, m11 - pmullw m15, m14 - paddw m15, m8 ; pmaddubsw m15, m14 - psrlw m8, m10, 8 - psraw m11, m9, 8 - psllw m10, 8 - psllw m9, 8 - psrlw m10, 8 - psraw m9, 8 - pmullw m8, m11 - pmullw m10, m9 - paddw m10, m8 ; pmaddubsw m10, m9 - pslld m8, m0, 16 - pslld m9, m1, 16 - pslld m14, m15, 16 - pslld m11, m10, 16 - paddw m0, m8 - paddw m1, m9 - paddw m15, m14 - paddw m10, m11 - psrad m0, 16 - psrad m1, 16 - psrad m15, 16 - psrad m10, 16 - packssdw m0, m15 ; phaddw m0, m15 - packssdw m1, m10 ; phaddw m1, m10 -%endif mova m14, [PIC_sym(pw_8192)] mova m9, [PIC_sym(pd_32768)] pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 @@ -9575,17 +10138,7 @@ cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \ RET INIT_XMM ssse3 -PREP_BILIN -PREP_8TAP WARP_AFFINE_8X8 -WARP_AFFINE_8X8T INIT_XMM sse4 WARP_AFFINE_8X8 -WARP_AFFINE_8X8T - -INIT_XMM sse2 -PREP_BILIN -PREP_8TAP -WARP_AFFINE_8X8 -WARP_AFFINE_8X8T diff --git a/third_party/dav1d/src/x86/refmvs.asm b/third_party/dav1d/src/x86/refmvs.asm index d95861fa1754..085c9b3ded51 100644 --- a/third_party/dav1d/src/x86/refmvs.asm +++ b/third_party/dav1d/src/x86/refmvs.asm @@ -92,6 +92,31 @@ JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32 JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32 +struc rf + .frm_hdr: resq 1 + .iw4: resd 1 + .ih4: resd 1 + .iw8: resd 1 + .ih8: resd 1 + .sbsz: resd 1 + .use_rf_mvs: resd 1 + .sign_bias: resb 7 + .mfmv_sign: resb 7 + .pocdiff: resb 7 + .mfmv_ref: resb 3 + .mfmv_ref2cur: resd 3 + .mfmv_ref2ref: resd 3*7 + .n_mfmvs: resd 1 + .n_blocks: 
resd 1 + .rp: resq 1 + .rp_ref: resq 1 + .rp_proj: resq 1 + .rp_stride: resq 1 + .r: resq 1 + .n_tile_threads: resd 1 + .n_frame_threads: resd 1 +endstruc + SECTION .text %macro movif32 2 @@ -341,16 +366,16 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \ stride, rp_proj, roff, troff, \ xendi, xstarti, iw8, ih8, dst xor r14d, r14d - cmp dword [rfq+212], 1 ; n_tile_threads - mov ih8d, [rfq+20] ; rf->ih8 - mov iw8d, [rfq+16] ; rf->iw8 + cmp dword [rfq+rf.n_tile_threads], 1 + mov ih8d, [rfq+rf.ih8] + mov iw8d, [rfq+rf.iw8] mov xstartd, xstartd mov xendd, xendd cmove tridxd, r14d lea xstartid, [xstartq-8] lea xendid, [xendq+8] - mov strideq, [rfq+184] - mov rp_projq, [rfq+176] + mov strideq, [rfq+rf.rp_stride] + mov rp_projq, [rfq+rf.rp_proj] cmp ih8d, yendd mov [rsp+0x30], strideq cmovs yendd, ih8d @@ -397,7 +422,7 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \ jg .init_xloop_start DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \ _, _, xendi, xstarti, stride5, _, n - mov r13d, [rfq+152] ; rf->n_mfmvs + mov r13d, [rfq+rf.n_mfmvs] test r13d, r13d jz .ret mov [rsp+0x0c], r13d @@ -418,14 +443,14 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \ DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \ ref, rp_ref, xendi, xstarti, _, _, n mov rfq, [rsp+0x48] - mov refd, [rfq+56+nq*4] ; ref2cur + mov refd, [rfq+rf.mfmv_ref2cur+nq*4] cmp refd, 0x80000000 je .next_n mov [rsp+0x40], refd mov offq, [rsp+0x00] ; ystart * stride * 5 - movzx refd, byte [rfq+53+nq] ; rf->mfmv_ref[n] + movzx refd, byte [rfq+rf.mfmv_ref+nq] lea refsignq, [refq-4] - mov rp_refq, [rfq+168] + mov rp_refq, [rfq+rf.rp_ref] movq m2, refsignq add offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset mov [rsp+0x14], nd @@ -452,8 +477,8 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \ test refd, refd jz .next_x_bad_ref mov rfq, [rsp+0x48] - lea r14d, [16+n7q+refq] - mov ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1] + lea ref2refd, [(rf.mfmv_ref2ref/4)+n7q+refq-1] + mov ref2refd, [rfq+ref2refq*4] ; rf->mfmv_ref2ref[n][b_ref-1] test ref2refd, ref2refd jz .next_x_bad_ref lea fracq, [mv_proj] diff --git a/third_party/dav1d/tests/dav1d_argon.bash b/third_party/dav1d/tests/dav1d_argon.bash index 954dad8d2dc4..ead3e6ed2dc1 100755 --- a/third_party/dav1d/tests/dav1d_argon.bash +++ b/third_party/dav1d/tests/dav1d_argon.bash @@ -131,7 +131,7 @@ else mapfile -t dirs < <(printf "${ARGON_DIR}/%s\n" "$@" | sort -u) fi -ver_info="dav1d $("$DAV1D" -v 2>&1) filmgrain=$FILMGRAIN cpumask=$CPUMASK" || error "Error! Can't run $DAV1D" +ver_info="dav1d $("$DAV1D" --filmgrain "$FILMGRAIN" --cpumask "$CPUMASK" --threads "$THREADS" -v 2>&1) filmgrain=$FILMGRAIN cpumask=$CPUMASK" || error "Error! Can't run $DAV1D" files=() for d in "${dirs[@]}"; do
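
A few notes on the hunks above, for cross-checking the asm against the C side; any helper names and constants introduced below are illustrative, not taken from the patch.

In the mc_sse.asm hunks, the old PMULHRSW_8192 wrapper (whose non-SSSE3 variant worked with a pw_2 constant instead of pw_8192) collapses into a plain pmulhrsw against pw_8192 now that the SSE2 instantiations of these prep/warp functions are gone, e.g. the pmulhrsw m0, m6 / pmulhrsw m2, m6 pair in .h_w4_loop with m6 = [base+pw_8192]. For reference, pmulhrsw with an 8192 multiplier is simply a round-to-nearest shift right by 2; a minimal C model (helper names are mine, and the INT16_MIN*INT16_MIN saturation corner case is ignored):

#include <assert.h>
#include <stdint.h>

/* One 16-bit lane of SSSE3 pmulhrsw: (a * b + 0x4000) >> 15 */
static int16_t pmulhrsw_lane(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
}

/* With b = 8192 (pw_8192 == 1 << 13) that is just a rounded shift by 2 */
static int16_t round_shift2(int16_t a)
{
    return (int16_t)((a + 2) >> 2);
}

int main(void)
{
    /* exhaustively confirm the identity for every 16-bit input */
    for (int a = -32768; a < 32768; a++)
        assert(pmulhrsw_lane((int16_t)a, 8192) == round_shift2((int16_t)a));
    return 0;
}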
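
The struc rf added to refmvs.asm replaces the hard-coded byte offsets that load_tmvs used when dereferencing its rf argument (e.g. [rfq+176] for rp_proj, [rfq+184] for rp_stride). The sketch below spells the same layout out as a C struct so the offsets can be checked with offsetof; the struct name and the C field types are assumptions read off the struc (only the byte widths come from the resb/resd/resq declarations), presumably mirroring the updated refmvs_frame in src/refmvs.h, and the layout math assumes the usual 64-bit ABI:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative C mirror of `struc rf`; widths follow the asm declarations,
 * the C types themselves are guesses. */
typedef struct rf_layout {
    void     *frm_hdr;            /* .frm_hdr         resq 1 */
    int32_t   iw4, ih4, iw8, ih8; /* .iw4 .. .ih8     resd 1 each */
    int32_t   sbsz;               /* .sbsz            resd 1 */
    int32_t   use_rf_mvs;         /* .use_rf_mvs      resd 1 */
    uint8_t   sign_bias[7];       /* .sign_bias       resb 7 */
    uint8_t   mfmv_sign[7];       /* .mfmv_sign       resb 7 */
    int8_t    pocdiff[7];         /* .pocdiff         resb 7 */
    uint8_t   mfmv_ref[3];        /* .mfmv_ref        resb 3 */
    int32_t   mfmv_ref2cur[3];    /* .mfmv_ref2cur    resd 3 */
    int32_t   mfmv_ref2ref[3][7]; /* .mfmv_ref2ref    resd 3*7 */
    int32_t   n_mfmvs;            /* .n_mfmvs         resd 1 */
    int32_t   n_blocks;           /* .n_blocks        resd 1 */
    void     *rp;                 /* .rp              resq 1 */
    void     *rp_ref;             /* .rp_ref          resq 1 */
    void     *rp_proj;            /* .rp_proj         resq 1 */
    ptrdiff_t rp_stride;          /* .rp_stride       resq 1 */
    void     *r;                  /* .r               resq 1 */
    int32_t   n_tile_threads;     /* .n_tile_threads  resd 1 */
    int32_t   n_frame_threads;    /* .n_frame_threads resd 1 */
} rf_layout;

int main(void)
{
    /* With 64-bit pointers this field order needs no implicit padding,
     * so the C offsets coincide with the packed NASM struc. */
    printf("rp_proj   %zu\n", offsetof(rf_layout, rp_proj));   /* 176 */
    printf("rp_stride %zu\n", offsetof(rf_layout, rp_stride)); /* 184 */
    return 0;
}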
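
The one load_tmvs offset whose computation changes shape is the rf->mfmv_ref2ref[n][b_ref-1] lookup: the old code baked the table's position into the dword constant 16 (lea r14d, [16+n7q+refq]), while the new code derives it as rf.mfmv_ref2ref/4 and subtracts 1 for the 1-based b_ref. With the field widths above, rf.mfmv_ref2ref works out to byte offset 68, so 68/4 + 7*n + ref - 1 is the same dword index 16 + 7*n + ref as before; the n7 register presumably holds 7*n, per the DEFINE_ARGS names and the original comment. A small self-checking sketch of that address math (the constant and helper names are mine):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Dword-index view of:
 *   lea ref2refd, [(rf.mfmv_ref2ref/4)+n7q+refq-1]
 *   mov ref2refd, [rfq+ref2refq*4]
 * MFMV_REF2REF_OFF is the byte offset implied by the struc above,
 * an assumption of this sketch rather than a constant from dav1d. */
enum { MFMV_REF2REF_OFF = 68 };

static int32_t load_ref2ref(const void *rf, int n, int ref)
{
    size_t idx = MFMV_REF2REF_OFF / 4 + 7 * (size_t)n + (size_t)ref - 1;
    int32_t v;
    memcpy(&v, (const uint8_t *)rf + idx * 4, sizeof(v));
    return v; /* == rf->mfmv_ref2ref[n][ref - 1] */
}

int main(void)
{
    uint8_t buf[256] = {0};
    int32_t x = 42;
    /* plant mfmv_ref2ref[1][2] and read it back through the asm-style index */
    memcpy(buf + MFMV_REF2REF_OFF + (1 * 7 + 2) * 4, &x, sizeof(x));
    printf("%d\n", load_ref2ref(buf, 1, 3)); /* b_ref = 3 -> [1][2] -> 42 */
    return 0;
}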