vp3: don't use calls to inline asm in yasm code.

Mixing yasm and inline asm is a bad idea: if either yasm or inline
asm is not supported by your toolchain, all of the asm stops working.
It is therefore better to use one or the other exclusively.

Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
This commit is contained in:
Ronald S. Bultje 2012-07-22 20:38:56 +00:00 committed by Derek Buitenhuis
parent 79195ce565
commit a1878a88a1

View File

@ -38,13 +38,11 @@ cextern pb_1
cextern pb_3
cextern pb_7
cextern pb_1F
cextern pb_80
cextern pb_81
cextern pw_8
cextern put_signed_pixels_clamped_mmx
cextern add_pixels_clamped_mmx
SECTION .text
; this is off by one or two for some cases when filter_limit is greater than 63
@ -523,56 +521,96 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%endmacro
; NOTE(review): this span looks like a rendered diff hunk whose +/- markers
; were stripped (the hunk header reads "-523,56 +521,96"). Two
; `%macro vp3_idct_funcs` headers and two copies of each cglobal appear, and
; the %if/%else/%endif nesting does not balance, so removed and added lines
; are interleaved and the text is not assemblable as shown.
;
; vp3_idct_funcs emits two functions per instruction set, vp3_idct_put_%1
; and vp3_idct_add_%1. Both begin by invoking VP3_IDCT_%1 on r2 (macro
; defined outside this view; presumably an in-place IDCT of the block at r2
; -- TODO confirm).
;
; Register use visible in the addressing below:
;   r0 = base address of the 8-byte rows being read/written
;   r1 = row step (extended with movsxdifnidn before use in addressing)
;   r2 = base address of the 16-bit values being packed/added
;   r3 = scratch (3*r1 in the put body, loop counter in the add body)
;
; Three-parameter header; matches the "mmx, 0, REGS" / "sse2, 9, REGS"
; invocations. %2 and %3 are forwarded to cglobal's last two operands.
%macro vp3_idct_funcs 3
cglobal vp3_idct_put_%1, 3, %3, %2
; One-parameter header; cglobal's last two operands are fixed at 4 and 9.
%macro vp3_idct_funcs 1
cglobal vp3_idct_put_%1, 3, 4, 9
VP3_IDCT_%1 r2
; Argument shuffle followed by a call/jmp into put_signed_pixels_clamped_mmx:
; r0 <- old r2, r1 <- old r0, r2 <- old r1 (via r3 on x86-64, via the
; r0m/r1m/r2m argument slots otherwise). Per the commit message, calls into
; inline-asm code are what this commit removes, so this sequence is
; presumably the pre-commit body.
%if ARCH_X86_64
mov r3, r2
mov r2, r1
mov r1, r0
mov r0, r3
%else
mov r0m, r2
mov r1m, r0
mov r2m, r1
%endif
%if WIN64
call put_signed_pixels_clamped_mmx
RET
%else
jmp put_signed_pixels_clamped_mmx
%endif
; Same pattern for the add variant (three-parameter cglobal form and the
; start of its x86-64 argument shuffle).
cglobal vp3_idct_add_%1, 3, %3, %2
VP3_IDCT_%1 r2
%if ARCH_X86_64
mov r3, r2
mov r2, r1
mov r1, r0
mov r0, r3
; Native "put" body: pack the 16-bit values at r2 to signed bytes, add
; pb_80 to every byte (pb_80 is declared cextern; presumably 0x80 in each
; byte, which would rebias signed bytes to unsigned -- TODO confirm), and
; store 8 bytes per row at r0, r0+r1, r0+2*r1, r0+r3.
movsxdifnidn r1, r1d
mova m4, [pb_80]
; r3 = 3*r1, so the fourth row of each group is addressable as [r0+r3].
lea r3, [r1*3]
; %%i is the byte offset into the data at r2; it advances by 64 per pass.
; The %rep count is 16/mmsize: two passes when mmsize == 8, one when 16.
%assign %%i 0
%rep 16/mmsize
mova m0, [r2+mmsize*0+%%i]
mova m1, [r2+mmsize*2+%%i]
mova m2, [r2+mmsize*4+%%i]
mova m3, [r2+mmsize*6+%%i]
; Each register is packed with the mmsize bytes that follow its own load,
; saturating words to signed bytes.
packsswb m0, [r2+mmsize*1+%%i]
packsswb m1, [r2+mmsize*3+%%i]
packsswb m2, [r2+mmsize*5+%%i]
packsswb m3, [r2+mmsize*7+%%i]
paddb m0, m4
paddb m1, m4
paddb m2, m4
paddb m3, m4
movq [r0 ], m0
; mmsize == 8: each of m0..m3 holds one 8-byte row.
%if mmsize == 8
movq [r0+r1 ], m1
movq [r0+r1*2], m2
movq [r0+r3 ], m3
%else
; NOTE(review): the three r*m stores below match the 32-bit argument
; shuffle seen earlier and look like leftover pre-commit lines that the
; diff rendering placed here.
mov r0m, r2
mov r1m, r0
mov r2m, r1
; mmsize == 16: each register holds two rows; movq stores the low half,
; movhps the high half.
movhps [r0+r1 ], m0
movq [r0+r1*2], m1
movhps [r0+r3 ], m1
%endif
; NOTE(review): `%if WIN64` / `call add_pixels_clamped_mmx` here, and the
; `%else` / `jmp add_pixels_clamped_mmx` after RET below, look like the
; pre-commit tail of the add variant interleaved by the diff rendering.
%if WIN64
call add_pixels_clamped_mmx
; Advance r0 by four rows after the first group of stores (only when
; %%i == 0, i.e. on the first pass).
%if %%i == 0
lea r0, [r0+r1*4]
%endif
; mmsize == 16: m2/m3 carry the remaining four rows, stored after the
; r0 advance above.
%if mmsize == 16
movq [r0 ], m2
movhps [r0+r1 ], m2
movq [r0+r1*2], m3
movhps [r0+r3 ], m3
%endif
%assign %%i %%i+64
%endrep
RET
%else
jmp add_pixels_clamped_mmx
; Native "add" body: four loop iterations, two rows per iteration.
cglobal vp3_idct_add_%1, 3, 4, 9
VP3_IDCT_%1 r2
; r3 = iteration count; m4 = 0, used to zero-extend bytes to words.
mov r3, 4
pxor m4, m4
movsxdifnidn r1, r1d
.loop:
; Load 8 bytes from each of the two rows at r0 and r0+r1.
movq m0, [r0]
movq m1, [r0+r1]
; With 8-byte registers the upper four bytes of each row need their own
; copy, because one unpack only widens four bytes.
%if mmsize == 8
mova m2, m0
mova m3, m1
%endif
punpcklbw m0, m4
punpcklbw m1, m4
%if mmsize == 8
punpckhbw m2, m4
punpckhbw m3, m4
%endif
; Saturating signed add of the 16-bit values at r2 (16 bytes per row).
paddsw m0, [r2+ 0]
paddsw m1, [r2+16]
%if mmsize == 8
paddsw m2, [r2+ 8]
paddsw m3, [r2+24]
; Pack back to bytes with unsigned saturation: one row per register.
packuswb m0, m2
packuswb m1, m3
%else ; mmsize == 16
; Both rows end up in m0: first row in the low half, second in the high.
packuswb m0, m1
%endif
movq [r0 ], m0
%if mmsize == 8
movq [r0+r1], m1
%else ; mmsize == 16
movhps [r0+r1], m0
%endif
; Step two rows forward in the destination and 32 bytes in the data at r2.
lea r0, [r0+r1*2]
add r2, 32
dec r3
jg .loop
RET
%endmacro
; Instantiate vp3_idct_funcs for MMX and SSE2.
; NOTE(review): both invocation styles appear for each instruction set,
; which is consistent with a diff rendering whose +/- markers were stripped.
; The three-argument calls fit the `%macro vp3_idct_funcs 3` header and the
; one-argument calls fit `%macro vp3_idct_funcs 1`.
;
; REGS is 4 on x86-64 and 3 otherwise; it is passed as the third macro
; argument of the three-argument calls and undefined again afterwards.
%if ARCH_X86_64
%define REGS 4
%else
%define REGS 3
%endif
INIT_MMX
vp3_idct_funcs mmx, 0, REGS
vp3_idct_funcs mmx
INIT_XMM
vp3_idct_funcs sse2, 9, REGS
%undef REGS
vp3_idct_funcs sse2
%macro DC_ADD 0
movq m2, [r0 ]