mirror of
https://github.com/xenia-project/FFmpeg.git
synced 2024-11-24 12:09:55 +00:00
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
Originally committed as revision 24029 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
ea28e81faa
commit
f2a30bd840
@ -63,12 +63,16 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
|
||||
|
||||
DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
|
||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
|
||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
|
||||
DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
|
||||
DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
|
||||
DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
|
||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
|
||||
DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
|
||||
DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
|
||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
|
||||
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
|
||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
|
||||
|
||||
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
|
||||
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
|
||||
|
@ -222,6 +222,13 @@ extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
||||
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
|
||||
extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
|
||||
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
|
||||
|
||||
extern void ff_vp8_v_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
|
||||
extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
|
||||
extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
|
||||
extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
|
||||
extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
|
||||
extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
|
||||
#endif
|
||||
|
||||
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
|
||||
@ -260,6 +267,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
||||
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
|
||||
c->put_vp8_epel_pixels_tab[1][0][0] =
|
||||
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
|
||||
|
||||
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
|
||||
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
|
||||
}
|
||||
|
||||
/* note that 4-tap width=16 functions are missing because w=16
|
||||
@ -272,6 +282,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
||||
VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
|
||||
VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
|
||||
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
|
||||
|
||||
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
|
||||
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
|
||||
}
|
||||
|
||||
if (mm_flags & FF_MM_SSE) {
|
||||
@ -284,6 +297,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
|
||||
VP8_MC_FUNC(1, 8, sse2);
|
||||
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
|
||||
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
|
||||
|
||||
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
|
||||
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
|
||||
}
|
||||
|
||||
if (mm_flags & FF_MM_SSSE3) {
|
||||
|
@ -146,8 +146,13 @@ pw_20091: times 4 dw 20091
|
||||
pw_17734: times 4 dw 17734
|
||||
|
||||
cextern pw_3
|
||||
cextern pb_3
|
||||
cextern pw_4
|
||||
cextern pb_4
|
||||
cextern pw_64
|
||||
cextern pb_80
|
||||
cextern pb_F8
|
||||
cextern pb_FE
|
||||
|
||||
SECTION .text
|
||||
|
||||
@ -1063,3 +1068,304 @@ cglobal vp8_luma_dc_wht_mmxext, 2,3
|
||||
add r0, 2*16*4
|
||||
SCATTER_WHT 3
|
||||
RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
; macro called with 7 mm register indexes as argument, and 4 regular registers
|
||||
;
|
||||
; first 4 mm registers will carry the transposed pixel data
|
||||
; the other three are scratchspace (one would be sufficient, but this allows
|
||||
; for more spreading/pipelining and thus faster execution on OOE CPUs)
|
||||
;
|
||||
; first two regular registers are buf+4*stride and buf+5*stride
|
||||
; third is -stride, fourth is +stride
|
||||
%macro READ_8x4_INTERLEAVED 11
; Loads eight 4-byte rows (A-H) and byte-interleaves them pairwise.
;   %1-%4  : mm register indexes receiving A/B, C/D, E/F and G/H interleaved
;   %5-%7  : mm register indexes used as scratch (clobbered)
;   %8, %9 : base pointers; rows A, C, D, E are addressed from %8 and
;            rows B, F, G, H from %9
;   %10    : offset used to reach rows before the base pointers
;            (the caller in this file passes -stride)
;   %11    : offset used to reach rows after the base pointers
;            (the caller in this file passes +stride)
|
||||
; interleave 8 (A-H) rows of 4 pixels each
|
||||
movd m%1, [%8+%10*4] ; A0-3
|
||||
movd m%5, [%9+%10*4] ; B0-3
|
||||
movd m%2, [%8+%10*2] ; C0-3
|
||||
movd m%6, [%8+%10] ; D0-3
|
||||
movd m%3, [%8] ; E0-3
|
||||
movd m%7, [%9] ; F0-3
|
||||
movd m%4, [%9+%11] ; G0-3
|
||||
punpcklbw m%1, m%5 ; A/B interleaved
|
||||
; m%5 (row B) has been consumed by the unpack above, so it is reused for row H
movd m%5, [%9+%11*2] ; H0-3
|
||||
punpcklbw m%2, m%6 ; C/D interleaved
|
||||
punpcklbw m%3, m%7 ; E/F interleaved
|
||||
punpcklbw m%4, m%5 ; G/H interleaved
|
||||
%endmacro
|
||||
|
||||
; macro called with 7 mm register indexes as argument, and 5 regular registers
|
||||
; first 11 mean the same as READ_8x4_INTERLEAVED above
|
||||
; fifth regular register is scratchspace to reach the bottom 8 rows, it
|
||||
; will be set to second regular register + 8*stride at the end
|
||||
%macro READ_16x4_INTERLEAVED 12
; Loads sixteen 4-byte rows (A-P) and byte-interleaves them so that each of
; the four output registers holds rows X/X+1 of the top half interleaved with
; the matching rows of the bottom half (A/B/I/J, C/D/K/L, E/F/M/N, G/H/O/P).
;   %1-%4  : xmm register indexes receiving the interleaved rows
;   %5-%7  : xmm register indexes used as scratch (clobbered)
;   %8, %9 : base pointers for the top 8 rows (rows A, C, D, E via %8,
;            rows B, F, G, H via %9)
;   %10    : offset used to reach rows before the base pointers
;            (the caller in this file passes -stride)
;   %11    : offset used to reach rows after the base pointers
;            (the caller in this file passes +stride)
;   %12    : scratch pointer for the bottom 8 rows; equals %8 + 9*%11 on exit
|
||||
; transpose 16 (A-P) rows of 4 pixels each
|
||||
; derive the bottom-half base from the macro arguments rather than from
; hard-coded r0/r2, so the macro works with whatever registers are passed
lea %12, [%8+8*%11]
|
||||
|
||||
; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
|
||||
movd m%1, [%8+%10*4] ; A0-3
|
||||
movd m%3, [%12+%10*4] ; I0-3
|
||||
movd m%2, [%8+%10*2] ; C0-3
|
||||
movd m%4, [%12+%10*2] ; K0-3
|
||||
movd m%6, [%8+%10] ; D0-3
|
||||
movd m%5, [%12+%10] ; L0-3
|
||||
movd m%7, [%12] ; M0-3
|
||||
; step %12 one row down so it mirrors %9 for the bottom half
add %12, %11
|
||||
punpcklbw m%1, m%3 ; A/I
|
||||
movd m%3, [%8] ; E0-3
|
||||
punpcklbw m%2, m%4 ; C/K
|
||||
punpcklbw m%6, m%5 ; D/L
|
||||
punpcklbw m%3, m%7 ; E/M
|
||||
punpcklbw m%2, m%6 ; C/D/K/L interleaved
|
||||
|
||||
; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
|
||||
movd m%5, [%9+%10*4] ; B0-3
|
||||
movd m%4, [%12+%10*4] ; J0-3
|
||||
movd m%7, [%9] ; F0-3
|
||||
movd m%6, [%12] ; N0-3
|
||||
punpcklbw m%5, m%4 ; B/J
|
||||
punpcklbw m%7, m%6 ; F/N
|
||||
punpcklbw m%1, m%5 ; A/B/I/J interleaved
|
||||
punpcklbw m%3, m%7 ; E/F/M/N interleaved
|
||||
movd m%4, [%9+%11] ; G0-3
|
||||
movd m%6, [%12+%11] ; O0-3
|
||||
movd m%5, [%9+%11*2] ; H0-3
|
||||
movd m%7, [%12+%11*2] ; P0-3
|
||||
punpcklbw m%4, m%6 ; G/O
|
||||
punpcklbw m%5, m%7 ; H/P
|
||||
punpcklbw m%4, m%5 ; G/H/O/P interleaved
|
||||
%endmacro
|
||||
|
||||
; write 4 mm registers of 2 dwords each
|
||||
; first four arguments are mm register indexes containing source data
|
||||
; last four are registers containing buf+4*stride, buf+5*stride,
|
||||
; -stride and +stride
|
||||
%macro WRITE_4x2D 8
; Stores the two dwords of each of four mm registers to eight different rows.
;   %1-%4  : mm register indexes holding the data; each is overwritten with
;            its own high dword by the punpckhdq below
;   %5, %6 : base pointers
;   %7     : offset used to reach rows before the base pointers
;            (the caller in this file passes -stride)
;   %8     : offset used to reach rows after the base pointers
;            (the caller in this file passes +stride)
|
||||
; write out (2 dwords per register)
|
||||
; low dwords first
movd [%5+%7*4], m%1
|
||||
movd [%5+%7*2], m%2
|
||||
movd [%5], m%3
|
||||
movd [%6+%8], m%4
|
||||
; move each register's high dword into its low dword so movd can store it
punpckhdq m%1, m%1
|
||||
punpckhdq m%2, m%2
|
||||
punpckhdq m%3, m%3
|
||||
punpckhdq m%4, m%4
|
||||
movd [%6+%7*4], m%1
|
||||
movd [%5+%7], m%2
|
||||
movd [%6], m%3
|
||||
movd [%6+%8*2], m%4
|
||||
%endmacro
|
||||
|
||||
; write 4 xmm registers of 4 dwords each
|
||||
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
|
||||
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
|
||||
; we add 1*stride to the third regular register in the process
|
||||
%macro WRITE_4x4D 9
; Stores the four dwords of each of four xmm registers to sixteen rows.
;   %1-%4     : xmm register indexes holding the data; destroyed, since each
;               is shifted right by one dword between stores
;   %5, %6, %7: base pointers; %7 is advanced by %9 inside this macro
;   %8        : offset used to reach rows before the base pointers
;               (the caller in this file passes -stride)
;   %9        : offset used to reach rows after the base pointers
;               (the caller in this file passes +stride)
|
||||
; write out (4 dwords per register), start with dwords zero
|
||||
movd [%5+%8*4], m%1
|
||||
movd [%5], m%2
|
||||
movd [%5+%9*4], m%3
|
||||
movd [%5+%9*8], m%4
|
||||
|
||||
; store dwords 1
|
||||
; psrldq by 4 bytes brings the next dword down to where movd reads it
psrldq m%1, 4
|
||||
psrldq m%2, 4
|
||||
psrldq m%3, 4
|
||||
psrldq m%4, 4
|
||||
movd [%6+%8*4], m%1
|
||||
movd [%6], m%2
|
||||
movd [%6+%9*4], m%3
|
||||
movd [%6+%9*8], m%4
|
||||
|
||||
; write dwords 2
|
||||
psrldq m%1, 4
|
||||
psrldq m%2, 4
|
||||
psrldq m%3, 4
|
||||
psrldq m%4, 4
|
||||
movd [%5+%8*2], m%1
|
||||
movd [%6+%9], m%2
|
||||
movd [%7+%8*2], m%3
|
||||
movd [%7+%9*2], m%4
|
||||
; advance %7 by one row so the same two offsets below address the next rows
add %7, %9
|
||||
|
||||
; store dwords 3
|
||||
psrldq m%1, 4
|
||||
psrldq m%2, 4
|
||||
psrldq m%3, 4
|
||||
psrldq m%4, 4
|
||||
movd [%5+%8], m%1
|
||||
movd [%6+%9*2], m%2
|
||||
movd [%7+%8*2], m%3
|
||||
movd [%7+%9*2], m%4
|
||||
%endmacro
|
||||
|
||||
%macro SIMPLE_LOOPFILTER 3
; Emits vp8_<%2>_loop_filter_simple_<%1>(uint8_t *dst, int stride, int flim).
;   %1 : instruction-set suffix of the emitted symbol (mmx, mmxext or sse2);
;        also selects how "flim" is broadcast
;   %2 : v or h, selects which loads/stores are emitted (rows read directly
;        for v; rows read, transposed and written back for h)
;   %3 : forwarded to cglobal after the argument count
; Register roles after setup:
;   r0 = dst (h: dst+4*stride-2), r1 = -stride, r2 = +stride,
;   r3 = loop counter (mmx/mmxext) or bottom-half pointer (sse2, h),
;   r4 = r0+stride (h only), r5 = saved rsp (h only),
;   m7 = flim replicated into every byte.
; The mmx/mmxext variants handle 8 pixels per iteration and loop twice;
; the sse2 variants handle 16 pixels in a single pass.
; NOTE(review): the sse2 variants write m6/m7 (xmm6/xmm7), which are
; callee-saved in the Win64 ABI - confirm that cglobal preserves them there.
|
||||
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
|
||||
%ifidn %2, h
|
||||
mov r5, rsp ; backup stack pointer
|
||||
and rsp, ~(mmsize-1) ; align stack
|
||||
%endif
|
||||
%if mmsize == 8 ; mmx/mmxext
|
||||
mov r3, 2
|
||||
%endif
|
||||
|
||||
; splat register with "flim"
|
||||
movd m7, r2
|
||||
punpcklbw m7, m7
|
||||
%if mmsize == 16 ; sse2
|
||||
punpcklwd m7, m7
|
||||
pshufd m7, m7, 0x0
|
||||
%elifidn %1, mmx
|
||||
punpcklwd m7, m7
|
||||
punpckldq m7, m7
|
||||
%else ; mmxext
|
||||
pshufw m7, m7, 0x0
|
||||
%endif
|
||||
|
||||
; set up indexes to address 4 rows
|
||||
; flim now lives in m7, so r2 is free to hold +stride
mov r2, r1
|
||||
neg r1
|
||||
%ifidn %2, h
|
||||
; start 4 rows down and 2 columns left, so each 4-byte row load
; covers p1, p0, q0, q1 around the vertical edge
lea r0, [r0+4*r2-2]
|
||||
sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1
|
||||
%endif
|
||||
|
||||
%if mmsize == 8 ; mmx / mmxext
|
||||
.next8px:
|
||||
%endif
|
||||
%ifidn %2, v
|
||||
; read 4 half/full rows of pixels
|
||||
mova m0, [r0+r1*2] ; p1
|
||||
mova m1, [r0+r1] ; p0
|
||||
mova m2, [r0] ; q0
|
||||
mova m3, [r0+r2] ; q1
|
||||
%else ; h
|
||||
lea r4, [r0+r2]
|
||||
|
||||
%if mmsize == 8 ; mmx/mmxext
|
||||
READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
|
||||
%else ; sse2
|
||||
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
|
||||
%endif
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
|
||||
; p1/q1 are not modified by the filter but m0/m3 are clobbered below,
; so keep copies for the transposed write-back
mova [rsp], m0 ; store p1
|
||||
mova [rsp+mmsize], m3 ; store q1
|
||||
%endif
|
||||
|
||||
; simple_limit
|
||||
mova m5, m2 ; m5=backup of q0
|
||||
mova m6, m1 ; m6=backup of p0
|
||||
psubusb m1, m2 ; p0-q0
|
||||
psubusb m2, m6 ; q0-p0
|
||||
por m1, m2 ; FFABS(p0-q0)
|
||||
paddusb m1, m1 ; m1=FFABS(p0-q0)*2
|
||||
|
||||
mova m4, m3
|
||||
mova m2, m0
|
||||
psubusb m3, m0 ; q1-p1
|
||||
psubusb m0, m4 ; p1-q1
|
||||
por m3, m0 ; FFABS(p1-q1)
|
||||
; xor with 0x80 maps unsigned pixels to signed bytes for the psubsb below
mova m0, [pb_80]
|
||||
pxor m2, m0
|
||||
pxor m4, m0
|
||||
psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
|
||||
; clear bit 0 of every byte first so the 64-bit shift cannot leak a bit
; into the neighbouring byte
pand m3, [pb_FE]
|
||||
psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
|
||||
paddusb m3, m1
|
||||
; saturating subtract leaves 0 exactly where the sum is <= flim
psubusb m3, m7
|
||||
pxor m1, m1
|
||||
pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
|
||||
|
||||
; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
|
||||
mova m4, m5
|
||||
pxor m5, m0
|
||||
pxor m0, m6
|
||||
psubsb m5, m0 ; q0-p0 (signed)
|
||||
paddsb m2, m5
|
||||
paddsb m2, m5
|
||||
paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
|
||||
pand m2, m3 ; apply filter mask (m3)
|
||||
|
||||
mova m3, [pb_F8]
|
||||
mova m1, m2
|
||||
paddsb m2, [pb_4] ; f1<<3=a+4
|
||||
paddsb m1, [pb_3] ; f2<<3=a+3
|
||||
; drop the low 3 bits of every byte so the 64-bit shifts by 3 below
; stay within each byte
pand m2, m3
|
||||
pand m1, m3 ; cache f2<<3
|
||||
|
||||
; the signed >>3 is done on magnitudes: negative and non-negative bytes
; are shifted separately and applied with opposite saturating operations
pxor m0, m0
|
||||
pxor m3, m3
|
||||
pcmpgtb m0, m2 ; which values are <0?
|
||||
psubb m3, m2 ; -f1<<3
|
||||
psrlq m2, 3 ; +f1
|
||||
psrlq m3, 3 ; -f1
|
||||
pand m3, m0
|
||||
pandn m0, m2
|
||||
psubusb m4, m0
|
||||
paddusb m4, m3 ; q0-f1
|
||||
|
||||
pxor m0, m0
|
||||
pxor m3, m3
|
||||
pcmpgtb m0, m1 ; which values are <0?
|
||||
psubb m3, m1 ; -f2<<3
|
||||
psrlq m1, 3 ; +f2
|
||||
psrlq m3, 3 ; -f2
|
||||
pand m3, m0
|
||||
pandn m0, m1
|
||||
paddusb m6, m0
|
||||
psubusb m6, m3 ; p0+f2
|
||||
|
||||
; store
|
||||
%ifidn %2, v
|
||||
mova [r0], m4
|
||||
mova [r0+r1], m6
|
||||
%else ; h
|
||||
; rebuild the p1, p0, q0, q1 order in m0-m3 for the transpose
mova m0, [rsp] ; p1
|
||||
SWAP 2, 4 ; m2 = filtered q0
|
||||
SWAP 1, 6 ; m1 = filtered p0
|
||||
mova m3, [rsp+mmsize] ; q1
|
||||
|
||||
TRANSPOSE4x4B 0, 1, 2, 3, 4
|
||||
%if mmsize == 16 ; sse2
|
||||
add r3, r1 ; change from r4+8*stride to r0+8*stride
|
||||
WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2
|
||||
%else ; mmx/mmxext
|
||||
WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if mmsize == 8 ; mmx/mmxext
|
||||
; next 8 pixels
|
||||
%ifidn %2, v
|
||||
add r0, 8 ; advance 8 cols = pixels
|
||||
%else ; h
|
||||
lea r0, [r0+r2*8] ; advance 8 rows = lines
|
||||
%endif
|
||||
dec r3
|
||||
jg .next8px
|
||||
%ifidn %2, v
|
||||
REP_RET
|
||||
%else ; h
|
||||
mov rsp, r5 ; restore stack pointer
|
||||
RET
|
||||
%endif
|
||||
%else ; sse2
|
||||
%ifidn %2, h
|
||||
mov rsp, r5 ; restore stack pointer
|
||||
%endif
|
||||
RET
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; Instantiate the simple loop filter for every instruction set and direction.
; The third argument is forwarded to cglobal by SIMPLE_LOOPFILTER:
;   - mmx/mmxext v: 4, since r3 is used as the 2-iteration loop counter
;   - sse2 v:       3, since only r0-r2 are used
;   - all h:        6, since r3 and r4 are used and r5 holds the saved rsp
INIT_MMX
|
||||
SIMPLE_LOOPFILTER mmx, v, 4
|
||||
SIMPLE_LOOPFILTER mmx, h, 6
|
||||
SIMPLE_LOOPFILTER mmxext, v, 4
|
||||
SIMPLE_LOOPFILTER mmxext, h, 6
|
||||
INIT_XMM
|
||||
SIMPLE_LOOPFILTER sse2, v, 3
|
||||
SIMPLE_LOOPFILTER sse2, h, 6
|
||||
|
@ -37,6 +37,14 @@
|
||||
SWAP %2, %4, %3
|
||||
%endmacro
|
||||
|
||||
%macro TRANSPOSE4x4B 5
; Byte-granularity counterpart of TRANSPOSE4x4W: two byte->word SBUTTERFLY
; steps followed by two word->dword steps, then a register renaming.
;   %1-%4 : register indexes holding the data, transformed in place
;   %5    : register index passed as the last SBUTTERFLY argument
;           (presumably a temporary - SBUTTERFLY is defined outside this view)
|
||||
SBUTTERFLY bw, %1, %2, %5
|
||||
SBUTTERFLY bw, %3, %4, %5
|
||||
SBUTTERFLY wd, %1, %3, %5
|
||||
SBUTTERFLY wd, %2, %4, %5
|
||||
; renames the two registers (no data movement is visible here) so the
; results end up in %1..%4 order - NOTE(review): confirm against SWAP
SWAP %2, %3
|
||||
%endmacro
|
||||
|
||||
%macro TRANSPOSE4x4W 5
|
||||
SBUTTERFLY wd, %1, %2, %5
|
||||
SBUTTERFLY wd, %3, %4, %5
|
||||
|
Loading…
Reference in New Issue
Block a user