mirror of
https://github.com/xenia-project/FFmpeg.git
synced 2024-11-24 20:19:55 +00:00
x86: vc1dsp: Convert vc1_inv_trans_*_dc to NASM format
This commit is contained in:
parent
b62825a480
commit
838abfc1d7
@ -395,3 +395,101 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
|
||||
jnz .loop
|
||||
REP_RET
|
||||
%endif ; HAVE_MMX_INLINE
|
||||
|
||||
;-----------------------------------------------------------------------
; INV_TRANS_INIT
; Shared setup for the vc1_inv_trans_*_dc functions.
; In:   blockd    = final DC offset (already scaled by the caller)
;       linesized = row stride as passed in
; Out:  m0 = DC offset packed to bytes with unsigned saturation
;            (non-zero only when the offset is positive)
;       m1 = negated DC offset packed the same way
;            (non-zero only when the offset is negative)
;       linesize3q = 3 * linesizeq
; Note: DEFINE_ARGS renames the third argument register from "block" to
;       "linesize3", so blockd/blockq must not be used after this macro.
;       SPLATW and movsxdifnidn are defined outside this file section;
;       presumably they broadcast the low word and sign-extend the stride
;       where the register widths differ -- confirm against their sources.
;-----------------------------------------------------------------------
%macro INV_TRANS_INIT 0
    pxor                m1, m1              ; m1 = 0
    movd                m0, blockd          ; low dword of m0 = DC offset
    movsxdifnidn linesizeq, linesized
    SPLATW              m0, m0
    psubw               m1, m0              ; m1 = 0 - m0 (per word)
    packuswb            m0, m0              ; signed words -> saturated bytes
    packuswb            m1, m1

    DEFINE_ARGS dest, linesize, linesize3
    lea         linesize3q, [linesizeq*3]
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------
; INV_TRANS_PROCESS <suffix>
; Applies the DC offset prepared in m0/m1 to four consecutive rows.
; %1 selects the move flavour used for loads and stores (mov%1); the
; callers in this file pass "h" for the 4-wide and "a" for the 8-wide
; transforms.
; In:   destq      = first row
;       linesizeq  = row stride, linesize3q = 3 * linesizeq
;       m0 / m1    = byte vectors to add / subtract (saturating)
; Uses: m2-m5 as row temporaries
; All four rows are loaded before any row is stored.
;-----------------------------------------------------------------------
%macro INV_TRANS_PROCESS 1
    ; load rows 0..3
    mov%1       m2, [destq]
    mov%1       m3, [destq+linesizeq]
    mov%1       m4, [destq+linesizeq*2]
    mov%1       m5, [destq+linesize3q]

    ; row = sat_u8(sat_u8(row + m0) - m1)
    paddusb     m2, m0
    psubusb     m2, m1
    paddusb     m3, m0
    psubusb     m3, m1
    paddusb     m4, m0
    psubusb     m4, m1
    paddusb     m5, m0
    psubusb     m5, m1

    ; store rows 0..3
    mov%1       [destq], m2
    mov%1       [destq+linesizeq], m3
    mov%1       [destq+linesizeq*2], m4
    mov%1       [destq+linesize3q], m5
%endmacro
|
||||
|
||||
; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, int linesize, int16_t *block)
|
||||
;-----------------------------------------------------------------------
; void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
;                                     int16_t *block)
; DC-only inverse transform over four rows.
;   dc = block[0]
;   dc = (17 * dc +  4) >> 3
;   dc = (17 * dc + 64) >> 7
; The resulting offset is applied to dest by INV_TRANS_PROCESS.
; r3 is scratch; the block register is reused to hold dc.
;-----------------------------------------------------------------------
INIT_MMX mmxext
cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
    ; first stage: dc = (17 * dc + 4) >> 3
    movsx      blockd, WORD [blockq]        ; dc = block[0], sign-extended
    mov           r3d, blockd               ; r3 = dc
    shl        blockd, 4                    ; 16 * dc
    lea        blockd, [blockq+r3+4]        ; 16 * dc + dc + 4
    sar        blockd, 3

    ; second stage: dc = (17 * dc + 64) >> 7
    mov           r3d, blockd               ; r3 = dc
    shl        blockd, 4                    ; 16 * dc
    lea        blockd, [blockq+r3+64]       ; 16 * dc + dc + 64
    sar        blockd, 7

    INV_TRANS_INIT

    INV_TRANS_PROCESS h
    RET
|
||||
|
||||
;-----------------------------------------------------------------------
; void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
;                                     int16_t *block)
; DC-only inverse transform over eight rows (two groups of four).
;   dc = block[0]
;   dc = (17 * dc +  4) >> 3
;   dc = (12 * dc + 64) >> 7
; r3 is scratch; the block register is reused to hold dc.
;-----------------------------------------------------------------------
INIT_MMX mmxext
cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
    ; first stage: dc = (17 * dc + 4) >> 3
    movsx      blockd, WORD [blockq]        ; dc = block[0], sign-extended
    mov           r3d, blockd               ; r3 = dc
    shl        blockd, 4                    ; 16 * dc
    lea        blockd, [blockq+r3+4]        ; 16 * dc + dc + 4
    sar        blockd, 3

    ; second stage: dc = (12 * dc + 64) >> 7, built as 3 * (4 * dc) + 64
    shl        blockd, 2                    ; 4 * dc
    lea        blockd, [blockq+blockq*2+64] ; 12 * dc + 64
    sar        blockd, 7

    INV_TRANS_INIT

    ; rows 0-3
    INV_TRANS_PROCESS h
    ; rows 4-7
    lea         destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS h
    RET
|
||||
|
||||
;-----------------------------------------------------------------------
; void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
;                                     int16_t *block)
; DC-only inverse transform over four rows.
;   dc = block[0]
;   dc = ( 3 * dc +  1) >> 1
;   dc = (17 * dc + 64) >> 7
; r3 is scratch; the block register is reused to hold dc.
;-----------------------------------------------------------------------
INIT_MMX mmxext
cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
    ; first stage: dc = (3 * dc + 1) >> 1
    movsx      blockd, WORD [blockq]        ; dc = block[0], sign-extended
    lea        blockd, [blockq+blockq*2+1]  ; 3 * dc + 1
    sar        blockd, 1

    ; second stage: dc = (17 * dc + 64) >> 7
    mov           r3d, blockd               ; r3 = dc
    shl        blockd, 4                    ; 16 * dc
    lea        blockd, [blockq+r3+64]       ; 16 * dc + dc + 64
    sar        blockd, 7

    INV_TRANS_INIT

    INV_TRANS_PROCESS a
    RET
|
||||
|
||||
;-----------------------------------------------------------------------
; void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
;                                     int16_t *block)
; DC-only inverse transform over eight rows (two groups of four).
;   dc = block[0]
;   dc = (3 * dc +  1) >> 1
;   dc = (3 * dc + 16) >> 5
; No scratch register is needed; the block register is reused for dc.
;-----------------------------------------------------------------------
INIT_MMX mmxext
cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
    ; first stage: dc = (3 * dc + 1) >> 1
    movsx      blockd, WORD [blockq]        ; dc = block[0], sign-extended
    lea        blockd, [blockq+blockq*2+1]  ; 3 * dc + 1
    sar        blockd, 1

    ; second stage: dc = (3 * dc + 16) >> 5
    lea        blockd, [blockq+blockq*2+16] ; 3 * dc + 16
    sar        blockd, 5

    INV_TRANS_INIT

    ; rows 0-3
    INV_TRANS_PROCESS a
    ; rows 4-7
    lea         destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS a
    RET
|
||||
|
@ -92,6 +92,14 @@ void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
|
||||
int stride, int h, int x, int y);
|
||||
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
|
||||
int stride, int h, int x, int y);
|
||||
/* DC-only inverse transforms, one per block shape. Each takes the
 * destination pixels, the row stride and the coefficient block. */
void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest,
                                    int linesize, int16_t *block);
void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest,
                                    int linesize, int16_t *block);
void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest,
                                    int linesize, int16_t *block);
void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest,
                                    int linesize, int16_t *block);
|
||||
|
||||
|
||||
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
|
||||
@ -130,6 +138,11 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
|
||||
|
||||
dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmxext;
|
||||
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmxext;
|
||||
|
||||
dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmxext;
|
||||
dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmxext;
|
||||
dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmxext;
|
||||
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_mmxext;
|
||||
}
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
|
||||
|
@ -481,208 +481,6 @@ DECLARE_FUNCTION(3, 1)
|
||||
DECLARE_FUNCTION(3, 2)
|
||||
DECLARE_FUNCTION(3, 3)
|
||||
|
||||
/**
 * Apply the DC-only 4x4 inverse transform to a block of pixels.
 *
 * block[0] is scaled in two rounding stages; the resulting offset is added,
 * with unsigned saturation, to four bytes in each of four rows.
 *
 * @param dest     first byte of the first row to update
 * @param linesize distance in bytes between consecutive rows
 * @param block    coefficient block; only block[0] is read
 */
static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    uint8_t *const row0 = dest + 0*linesize;
    uint8_t *const row1 = dest + 1*linesize;
    uint8_t *const row2 = dest + 2*linesize;
    uint8_t *const row3 = dest + 3*linesize;
    int dc;

    dc = (17 * block[0] + 4) >> 3;
    dc = (17 * dc + 64) >> 7;

    /* mm0 = positive part of dc, mm1 = positive part of -dc, as bytes. */
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );

    /* Relies on mm0/mm1 still holding the values set up just above. */
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)row0),
         "+m"(*(uint32_t*)row1),
         "+m"(*(uint32_t*)row2),
         "+m"(*(uint32_t*)row3)
    );
}
|
||||
|
||||
/**
 * Apply the DC-only 4x8 inverse transform to a block of pixels.
 *
 * block[0] is scaled in two rounding stages; the resulting offset is added,
 * with unsigned saturation, to four bytes in each of eight rows. The rows
 * are handled as two groups of four.
 *
 * @param dest     first byte of the first row to update
 * @param linesize distance in bytes between consecutive rows
 * @param block    coefficient block; only block[0] is read
 */
static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int group;
    int dc;

    dc = (17 * block[0] + 4) >> 3;
    dc = (12 * dc + 64) >> 7;

    /* mm0 = positive part of dc, mm1 = positive part of -dc, as bytes. */
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );

    /* Each pass relies on mm0/mm1 still holding the values set up above. */
    for (group = 0; group < 2; group++) {
        __asm__ volatile(
            "movd %0, %%mm2 \n\t"
            "movd %1, %%mm3 \n\t"
            "movd %2, %%mm4 \n\t"
            "movd %3, %%mm5 \n\t"
            "paddusb %%mm0, %%mm2 \n\t"
            "paddusb %%mm0, %%mm3 \n\t"
            "paddusb %%mm0, %%mm4 \n\t"
            "paddusb %%mm0, %%mm5 \n\t"
            "psubusb %%mm1, %%mm2 \n\t"
            "psubusb %%mm1, %%mm3 \n\t"
            "psubusb %%mm1, %%mm4 \n\t"
            "psubusb %%mm1, %%mm5 \n\t"
            "movd %%mm2, %0 \n\t"
            "movd %%mm3, %1 \n\t"
            "movd %%mm4, %2 \n\t"
            "movd %%mm5, %3 \n\t"
            :"+m"(*(uint32_t*)(dest+0*linesize)),
             "+m"(*(uint32_t*)(dest+1*linesize)),
             "+m"(*(uint32_t*)(dest+2*linesize)),
             "+m"(*(uint32_t*)(dest+3*linesize))
        );
        dest += 4*linesize;
    }
}
|
||||
|
||||
/**
 * Apply the DC-only 8x4 inverse transform to a block of pixels.
 *
 * block[0] is scaled in two rounding stages; the resulting offset is added,
 * with unsigned saturation, to eight bytes in each of four rows.
 *
 * The memory operands are declared as 64-bit objects because every row is
 * read and written with movq (8 bytes); declaring them as 32-bit objects
 * would tell the compiler that only the first 4 bytes of each row change.
 *
 * @param dest     first byte of the first row to update
 * @param linesize distance in bytes between consecutive rows
 * @param block    coefficient block; only block[0] is read
 */
static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;

    /* mm0 = positive part of dc, mm1 = positive part of -dc, as bytes. */
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );

    /* Relies on mm0/mm1 still holding the values set up just above. */
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint64_t*)(dest+0*linesize)),
         "+m"(*(uint64_t*)(dest+1*linesize)),
         "+m"(*(uint64_t*)(dest+2*linesize)),
         "+m"(*(uint64_t*)(dest+3*linesize))
    );
}
|
||||
|
||||
/**
 * Apply the DC-only 8x8 inverse transform to a block of pixels.
 *
 * block[0] is scaled in two rounding stages; the resulting offset is added,
 * with unsigned saturation, to eight bytes in each of eight rows. The rows
 * are handled as two groups of four.
 *
 * The memory operands are declared as 64-bit objects because every row is
 * read and written with movq (8 bytes); declaring them as 32-bit objects
 * would tell the compiler that only the first 4 bytes of each row change.
 *
 * @param dest     first byte of the first row to update
 * @param linesize distance in bytes between consecutive rows
 * @param block    coefficient block; only block[0] is read
 */
static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    /* mm0 = positive part of dc, mm1 = positive part of -dc, as bytes. */
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );

    /* Rows 0-3; relies on mm0/mm1 set up just above. */
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint64_t*)(dest+0*linesize)),
         "+m"(*(uint64_t*)(dest+1*linesize)),
         "+m"(*(uint64_t*)(dest+2*linesize)),
         "+m"(*(uint64_t*)(dest+3*linesize))
    );

    dest += 4*linesize;

    /* Rows 4-7; mm0/mm1 are expected to be unchanged since the setup. */
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint64_t*)(dest+0*linesize)),
         "+m"(*(uint64_t*)(dest+1*linesize)),
         "+m"(*(uint64_t*)(dest+2*linesize)),
         "+m"(*(uint64_t*)(dest+3*linesize))
    );
}
|
||||
|
||||
/*
 * Fill one entry in each of the two <PREFIX>vc1_mspel_pixels_tab rows of
 * "dsp": row 1 gets <PREFIX>vc1_mspel_mc<MX><MY><SUFFIX> and row 0 gets the
 * matching ..._16<SUFFIX> function, both at index MX + 4 * MY.
 *
 * Expands to two statements with no trailing semicolon, so it must be used
 * as a full statement and never as the unbraced body of an if/else or loop.
 */
#define FN_ASSIGN(PREFIX, MX, MY, SUFFIX)                                    \
    dsp->PREFIX##vc1_mspel_pixels_tab[1][MX+4*MY] =                          \
        PREFIX##vc1_mspel_mc##MX##MY##SUFFIX;                                \
    dsp->PREFIX##vc1_mspel_pixels_tab[0][MX+4*MY] =                          \
        PREFIX##vc1_mspel_mc##MX##MY##_16##SUFFIX
|
||||
@ -729,10 +527,5 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
|
||||
FN_ASSIGN(avg_, 3, 1, _mmxext);
|
||||
FN_ASSIGN(avg_, 3, 2, _mmxext);
|
||||
FN_ASSIGN(avg_, 3, 3, _mmxext);
|
||||
|
||||
dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
|
||||
dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
|
||||
dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
|
||||
dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
|
||||
}
|
||||
#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
|
||||
|
Loading…
Reference in New Issue
Block a user