mirror of
https://github.com/xenia-project/FFmpeg.git
synced 2024-11-28 14:01:27 +00:00
Implement an sse version of scalarproduct_float().
Originally committed as revision 21386 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
57835fc1ae
commit
3deb53849e
@ -2510,6 +2510,8 @@ void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, ui
|
||||
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
|
||||
|
||||
|
||||
/**
 * SSE scalar (dot) product of two float vectors, implemented in the yasm
 * source as scalarproduct_float_sse and installed below as
 * c->scalarproduct_float when HAVE_YASM is set.
 *
 * @param v1    first input vector; read with aligned 16-byte loads
 * @param v2    second input vector; read with aligned 16-byte loads
 * @param order number of floats to process (named "len" in the assembly);
 *              the assembly consumes 4 floats per iteration and always runs
 *              at least one iteration, so presumably this must be a positive
 *              multiple of 4 -- confirm against the callers
 * @return sum of v1[i] * v2[i] over the processed elements
 */
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
|
||||
|
||||
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
{
|
||||
mm_flags = mm_support();
|
||||
@ -2965,6 +2967,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
c->vector_clipf = vector_clipf_sse;
|
||||
c->float_to_int16 = float_to_int16_sse;
|
||||
c->float_to_int16_interleave = float_to_int16_interleave_sse;
|
||||
#if HAVE_YASM
|
||||
c->scalarproduct_float = ff_scalarproduct_float_sse;
|
||||
#endif
|
||||
}
|
||||
if(mm_flags & FF_MM_3DNOW)
|
||||
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
|
||||
|
@ -397,3 +397,27 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
|
||||
.unaligned:
|
||||
ADD_HFYU_LEFT_LOOP 0
|
||||
|
||||
|
||||
; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
;
; Returns the sum of v1[i] * v2[i] for i in [0, len).
;
; Requirements imposed by the code below:
;   - v1 and v2 must be 16-byte aligned (movaps / mulps with a memory operand).
;   - The loop handles 4 floats per iteration and is entered unconditionally,
;     so len must be a positive multiple of 4; any other value reads past the
;     end of the vectors.
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    ; len is a 32-bit int. On x86-64 the upper half of the argument register
    ; is not guaranteed to be defined, so scale it with a 32-bit operation:
    ; writing offsetd zero-extends into offsetq and discards any garbage.
    shl     offsetd, 2              ; offset = len * sizeof(float)
    ; Point v1/v2 at the end of their data and count a negative byte offset
    ; up towards zero; the loop condition is then just the sign flag.
    add     v1q, offsetq
    add     v2q, offsetq
    neg     offsetq
    xorps   xmm0, xmm0              ; four partial sums = 0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js      .loop                   ; continue while offset is still negative

    ; Horizontal add of the four partial sums in xmm0.
    movhlps xmm1, xmm0              ; xmm1[0..1] = xmm0[2..3]
    addps   xmm0, xmm1              ; xmm0[0] = s0+s2, xmm0[1] = s1+s3
    movss   xmm1, xmm0              ; xmm1[0] = s0+s2
    shufps  xmm0, xmm0, 1           ; xmm0[0] = s1+s3
    addss   xmm0, xmm1              ; xmm0[0] = s0+s1+s2+s3
%ifndef ARCH_X86_64
    ; 32-bit builds return floats in x87 st0: bounce the result through the
    ; first argument's stack slot, which is no longer needed.
    movd    r0m,  xmm0
    fld     dword r0m
%endif
    RET
|
||||
|
Loading…
Reference in New Issue
Block a user