diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index ddd009b749..553dd49d4f 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -35,21 +35,30 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL }; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL }; DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL, 0x0100010001000100ULL, 0x0100010001000100ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL, + 0x0200020002000200ULL, 0x0200020002000200ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL, + 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL}; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL, + 0x0400040004000400ULL, 0x0400040004000400ULL}; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL, + 0x0800080008000800ULL, 0x0800080008000800ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL, + 0x1000100010001000ULL, 0x1000100010001000ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL, + 0x2000200020002000ULL, 0x2000200020002000ULL }; DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL }; @@ -63,6 +72,7 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x030 0x0303030303030303ULL, 0x0303030303030303ULL }; DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL }; diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index 0b3c8740c7..33dbb650ae 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -35,18 +35,20 @@ extern const xmm_reg ff_pw_9; extern const uint64_t ff_pw_15; extern const xmm_reg ff_pw_16; extern const xmm_reg ff_pw_18; -extern const uint64_t ff_pw_20; +extern const xmm_reg ff_pw_20; extern const xmm_reg ff_pw_32; extern const uint64_t ff_pw_42; extern const uint64_t ff_pw_53; extern const xmm_reg ff_pw_64; extern const uint64_t ff_pw_96; extern const uint64_t ff_pw_128; -extern const xmm_reg ff_pw_255; -extern const xmm_reg ff_pw_512; -extern const xmm_reg ff_pw_1024; -extern const xmm_reg ff_pw_2048; -extern const xmm_reg ff_pw_8192; +extern const ymm_reg ff_pw_255; +extern const ymm_reg ff_pw_512; +extern const ymm_reg ff_pw_1023; +extern const ymm_reg ff_pw_1024; +extern const ymm_reg ff_pw_2048; +extern const ymm_reg ff_pw_4096; +extern const ymm_reg ff_pw_8192; extern const ymm_reg ff_pw_m1; extern const ymm_reg ff_pb_0; @@ -54,7 +56,7 @@ extern const ymm_reg ff_pb_1; extern const ymm_reg ff_pb_2; extern const ymm_reg ff_pb_3; extern const xmm_reg ff_pb_80; -extern const xmm_reg ff_pb_F8; +extern const xmm_reg ff_pb_FE; extern const uint64_t ff_pb_FC; extern const xmm_reg ff_ps_neg; diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm index d8ace17ec9..ebf8a3f109 100644 --- a/libavcodec/x86/h264_deblock_10bit.asm +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -26,15 +26,13 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA - -pw_pixel_max: times 8 dw ((1 << 10)-1) - SECTION .text cextern pw_2 cextern pw_3 cextern pw_4 +cextern pw_1023 +%define pw_pixel_max pw_1023 ; out: %4 = |%1-%2|-%3 ; clobbers: %5 diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index 5c3acb1d38..cc115b0ff9 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -26,11 +26,13 @@ SECTION_RODATA -pw_pixel_max: times 8 dw ((1 << 10)-1) pd_32: times 4 dd 32 SECTION .text +cextern pw_1023 +%define pw_pixel_max pw_1023 + ;----------------------------------------------------------------------------- ; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride) ;----------------------------------------------------------------------------- diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm index b60a21037f..9aeb70242b 100644 --- a/libavcodec/x86/h264_intrapred_10bit.asm +++ b/libavcodec/x86/h264_intrapred_10bit.asm @@ -26,6 +26,8 @@ SECTION_RODATA +cextern pw_1023 +%define pw_pixel_max pw_1023 cextern pw_512 cextern pw_16 cextern pw_8 @@ -35,7 +37,6 @@ cextern pw_1 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 pw_m3: times 8 dw -3 -pw_pixel_max: times 8 dw ((1 << 10)-1) pd_17: times 4 dd 17 pd_16: times 4 dd 16 diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm index e7ce1b8b44..757c425898 100644 --- a/libavcodec/x86/h264_qpel_10bit.asm +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -26,12 +26,12 @@ SECTION_RODATA 32 +cextern pw_1023 +%define pw_pixel_max pw_1023 cextern pw_16 cextern pw_1 cextern pb_0 -pw_pixel_max: times 8 dw ((1 << 10)-1) - pad10: times 8 dw 10*1023 pad20: times 8 dw 20*1023 pad30: times 8 dw 30*1023 diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm index 5d9496224a..f924e55854 100644 --- a/libavcodec/x86/h264_weight_10bit.asm +++ b/libavcodec/x86/h264_weight_10bit.asm @@ -26,11 +26,12 @@ SECTION_RODATA 32 -pw_pixel_max: times 8 dw ((1 << 10)-1) sq_1: dq 1 dq 0 cextern pw_1 +cextern pw_1023 +%define pw_pixel_max pw_1023 SECTION .text diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm index f92cb2c0a5..48a597530b 100644 --- a/libavcodec/x86/hevc_deblock.asm +++ b/libavcodec/x86/hevc_deblock.asm @@ -26,8 +26,9 @@ SECTION_RODATA +cextern pw_1023 +%define pw_pixel_max_10 pw_1023 pw_pixel_max_12: times 8 dw ((1 << 12)-1) -pw_pixel_max_10: times 8 dw ((1 << 10)-1) pw_m2: times 8 dw -2 pd_1 : times 4 dd 1 diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm index 2b016f6f95..b16f3b48cf 100644 --- a/libavcodec/x86/hevc_mc.asm +++ b/libavcodec/x86/hevc_mc.asm @@ -21,14 +21,21 @@ %include "libavutil/x86/x86util.asm" SECTION_RODATA 32 -pw_8: times 16 dw (1 << 9) -pw_10: times 16 dw (1 << 11) -pw_12: times 16 dw (1 << 13) +cextern pw_255 +cextern pw_512 +cextern pw_2048 +cextern pw_8192 +cextern pw_1023 +cextern pw_1024 +cextern pw_4096 +%define pw_8 pw_512 +%define pw_10 pw_2048 +%define pw_12 pw_8192 +%define pw_bi_10 pw_1024 +%define pw_bi_12 pw_4096 +%define max_pixels_8 pw_255 +%define max_pixels_10 pw_1023 pw_bi_8: times 16 dw (1 << 8) -pw_bi_10: times 16 dw (1 << 10) -pw_bi_12: times 16 dw (1 << 12) -max_pixels_8: times 16 dw ((1 << 8)-1) -max_pixels_10: times 16 dw ((1 << 10)-1) max_pixels_12: times 16 dw ((1 << 12)-1) cextern pd_1 cextern pb_0 diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm index 24b88eebab..dc3e88a373 100644 --- a/libavcodec/x86/hevc_res_add.asm +++ b/libavcodec/x86/hevc_res_add.asm @@ -20,12 +20,12 @@ ; */ %include "libavutil/x86/x86util.asm" -SECTION_RODATA 32 -max_pixels_10: times 16 dw ((1 << 10)-1) - - SECTION .text +cextern pw_1023 +%define max_pixels_10 pw_1023 + + ;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file %macro TR_ADD_MMX_4_8 0 mova m2, [r1] diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index 3245de3891..751675fc5e 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -23,7 +23,8 @@ SECTION_RODATA -v210_enc_min_10: times 8 dw 0x4 +cextern pw_4 +%define v210_enc_min_10 pw_4 v210_enc_max_10: times 8 dw 0x3fb v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0 @@ -32,8 +33,10 @@ v210_enc_luma_shuf_10: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11 v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0 v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1 -v210_enc_min_8: times 16 db 0x1 -v210_enc_max_8: times 16 db 0xfe +cextern pb_1 +%define v210_enc_min_8 pb_1 +cextern pb_FE +%define v210_enc_max_8 pb_FE v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1 v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0 diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index 7ad4c3179b..ee5a6bf67a 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -36,11 +36,11 @@ vp3_idct_data: times 8 dw 64277 pb_7: times 8 db 0x07 pb_1F: times 8 db 0x1f pb_81: times 8 db 0x81 -pb_FE: times 8 db 0xFE cextern pb_1 cextern pb_3 cextern pb_80 +cextern pb_FE cextern pw_8