float_dsp: add x86-optimized functions for vector_fmac_scalar()

This commit is contained in:
Justin Ruggles 2012-06-08 23:20:59 -04:00
parent cb5042d02c
commit 82b2df9790
3 changed files with 57 additions and 3 deletions

View File

@ -42,12 +42,12 @@ typedef struct AVFloatDSPContext {
* overlap exactly or not at all. * overlap exactly or not at all.
* *
* @param dst result vector * @param dst result vector
* constraints: 16-byte aligned * constraints: 32-byte aligned
* @param src input vector * @param src input vector
* constraints: 16-byte aligned * constraints: 32-byte aligned
* @param mul scalar value * @param mul scalar value
* @param len length of vector * @param len length of vector
* constraints: multiple of 4 * constraints: multiple of 16
*/ */
void (*vector_fmac_scalar)(float *dst, const float *src, float mul, void (*vector_fmac_scalar)(float *dst, const float *src, float mul,
int len); int len);

View File

@ -19,6 +19,7 @@
;****************************************************************************** ;******************************************************************************
%include "x86inc.asm" %include "x86inc.asm"
%include "x86util.asm"
SECTION .text SECTION .text
@ -53,3 +54,49 @@ VECTOR_FMUL
INIT_YMM avx INIT_YMM avx
VECTOR_FMUL VECTOR_FMUL
%endif %endif
;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;
; Multiply-accumulate a vector by a scalar: dst[i] += src[i] * mul.
;
; dst : read and written in place; stored with mova (aligned store).
; src : read only; used as a direct memory operand of mulps.
; mul : scalar multiplier, broadcast to every lane of m0 before the loop.
; len : number of floats. Each iteration processes 2*mmsize bytes
;       (mmsize/2 floats) and the loop body always runs at least once, so
;       len must be a non-zero multiple of mmsize/2.
;
; Where "mul" arrives depends on the ABI:
;   x86-32 : on the stack, reachable as mulm
;   UNIX64 : in xmm0, so it is not counted among the cglobal GPR arguments
;   WIN64  : in xmm2, because it is the third parameter
;------------------------------------------------------------------------------
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,3, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    ; The broadcast below operates on the literal xmm0 register. A
    ; "SWAP 0, 2" would only rename m0 and leave xmm0 holding garbage, so
    ; physically move the argument from xmm2 into xmm0 instead.
    movaps      xmm0, xmm2
%endif
    ; Replicate the low float of xmm0 into all four lanes.
    shufps      xmm0, xmm0, 0
%if cpuflag(avx)
    ; Duplicate the low 128 bits into the high half of the YMM register.
    vinsertf128 m0, m0, xmm0, 1
%endif
%endif
    ; lenq = byte offset of the last 2*mmsize-byte chunk; the loop walks
    ; backwards from the end of the buffers to offset 0.
    lea         lenq, [lend*4-2*mmsize]
.loop:
    mulps       m1, m0, [srcq+lenq       ]
    mulps       m2, m0, [srcq+lenq+mmsize]
    addps       m1, m1, [dstq+lenq       ]
    addps       m2, m2, [dstq+lenq+mmsize]
    mova        [dstq+lenq       ], m1
    mova        [dstq+lenq+mmsize], m2
    sub         lenq, 2*mmsize
    jge .loop                       ; continue while the offset is >= 0
%if mmsize == 32
    ; Clear the upper YMM halves before returning to non-AVX code.
    vzeroupper
    RET
%else
    REP_RET
%endif
%endmacro
; Instantiate the macro once per instruction set. The C init code visible in
; this commit declares the results as ff_vector_fmac_scalar_sse and
; ff_vector_fmac_scalar_avx.

; SSE build: XMM registers.
INIT_XMM sse
VECTOR_FMAC_SCALAR
; AVX build: YMM registers; assembled only when HAVE_AVX is set.
%if HAVE_AVX
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif

View File

@ -26,6 +26,11 @@ extern void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
extern void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1, extern void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
int len); int len);
extern void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
int len);
extern void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
int len);
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{ {
#if HAVE_YASM #if HAVE_YASM
@ -33,9 +38,11 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
fdsp->vector_fmul = ff_vector_fmul_sse; fdsp->vector_fmul = ff_vector_fmul_sse;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
} }
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
fdsp->vector_fmul = ff_vector_fmul_avx; fdsp->vector_fmul = ff_vector_fmul_avx;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
} }
#endif #endif
} }