FFmpeg/libavcodec/x86/lossless_audiodsp.asm
Christophe Gisquet 9630b3fc06 x86: lossless audio: SSE4 madd 32bits
The unique user so far is wmalossless 24bits. The few samples tested show an
order of 8, so more unrolling or an avx2 version do not make sense.

Timings: 68 -> 49 cycles

Reviewed-by: Paul B Mahol <onemda@gmail.com>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2016-05-07 23:28:48 +02:00

191 lines
4.7 KiB
NASM

;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
; int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
%if mmsize == 16
pshuflw m7, m7, 0
punpcklqdq m7, m7
%else
pshufw m7, m7, 0
%endif
pxor m6, m6
add v1q, orderq
add v2q, orderq
add v3q, orderq
neg orderq
.loop:
movu m0, [v2q + orderq]
movu m1, [v2q + orderq + mmsize]
mova m4, [v1q + orderq]
mova m5, [v1q + orderq + mmsize]
movu m2, [v3q + orderq]
movu m3, [v3q + orderq + mmsize]
pmaddwd m0, m4
pmaddwd m1, m5
pmullw m2, m7
pmullw m3, m7
paddd m6, m0
paddd m6, m1
paddw m2, m4
paddw m3, m5
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
add orderq, mmsize*2
jl .loop
HADDD m6, m0
movd eax, m6
RET
%endmacro
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
INIT_XMM sse4
; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
; int order, int mul)
cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
SPLATW m7, m7
pxor m6, m6
add v1q, orderq
lea v2q, [v2q + 2*orderq]
add v3q, orderq
neg orderq
.loop:
mova m3, [v1q + orderq]
movu m0, [v2q + 2*orderq]
pmovsxwd m4, m3
movu m1, [v2q + 2*orderq + mmsize]
movhlps m5, m3
movu m2, [v3q + orderq]
pmovsxwd m5, m5
pmullw m2, m7
pmulld m0, m4
pmulld m1, m5
paddw m2, m3
paddd m6, m0
paddd m6, m1
mova [v1q + orderq], m2
add orderq, 16
jl .loop
HADDD m6, m0
movd eax, m6
RET
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
sub orderq, mmsize*2
%if %1
mova m1, m4
mova m4, [v2q + orderq]
mova m0, [v2q + orderq + mmsize]
palignr m1, m0, %1
palignr m0, m4, %1
mova m3, m5
mova m5, [v3q + orderq]
mova m2, [v3q + orderq + mmsize]
palignr m3, m2, %1
palignr m2, m5, %1
%else
mova m0, [v2q + orderq]
mova m1, [v2q + orderq + mmsize]
mova m2, [v3q + orderq]
mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
mova m8, t0
mova m9, t1
%define t0 m8
%define t1 m9
%endif
pmaddwd m0, t0
pmaddwd m1, t1
pmullw m2, m7
pmullw m3, m7
paddw m2, t0
paddw m3, t1
paddd m6, m0
paddd m6, m1
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
jg .loop%1
%if %1
jmp .end
%endif
%endmacro
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
; int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
pshuflw m7, m7, 0
punpcklqdq m7, m7
pxor m6, m6
mov r4d, v2d
and r4d, 15
and v2q, ~15
and v3q, ~15
mova m4, [v2q + orderq]
mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
cmp r4d, 0
je .loop0
cmp r4d, 2
je .loop2
cmp r4d, 4
je .loop4
cmp r4d, 6
je .loop6
cmp r4d, 8
je .loop8
cmp r4d, 10
je .loop10
cmp r4d, 12
je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
HADDD m6, m0
movd eax, m6
RET