Merge remote-tracking branch 'qatar/master'

* qatar/master:
  wtv: Check the return value from gmtime
  x86: fft: convert sse inline asm to yasm
  x86: place some inline asm under #if HAVE_INLINE_ASM

Conflicts:
	libavcodec/x86/fft_sse.c
	libavformat/wtv.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Michael Niedermayer 2012-06-25 16:55:31 +02:00
commit a6ff8514a9
15 changed files with 184 additions and 124 deletions

View File

@ -43,7 +43,6 @@ YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o
YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
$(YASM-OBJS-FFT-yes)

View File

@ -27,6 +27,8 @@
#include "libavutil/internal.h"
#include "config.h"
#if HAVE_INLINE_ASM
#ifdef BROKEN_RELOCATIONS
#define TABLES_ARG , "r"(tables)
@ -220,4 +222,5 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
return val;
}
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */

View File

@ -29,6 +29,8 @@
#include "libavcodec/cavsdsp.h"
#include "dsputil_mmx.h"
#if HAVE_INLINE_ASM
/* in/out: mma=mma+mmb, mmb=mmb-mma */
#define SUMSUB_BA( a, b ) \
"paddw "#b", "#a" \n\t"\
@ -477,10 +479,14 @@ static void ff_cavsdsp_init_3dnow(CAVSDSPContext* c, AVCodecContext *avctx) {
c->cavs_idct8_add = cavs_idct8_add_mmx;
}
#endif /* HAVE_INLINE_ASM */
/**
 * Select the x86 CAVS DSP routines according to the CPU flags reported at
 * run time.
 *
 * The MMX2 and 3DNow! initialisers are only invoked when the build has
 * inline assembly support; the 3DNow! one runs after the MMX2 one.
 */
void ff_cavsdsp_init_mmx(CAVSDSPContext *c, AVCodecContext *avctx)
{
    const int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (cpu_flags & AV_CPU_FLAG_MMX2) {
        ff_cavsdsp_init_mmx2(c, avctx);
    }
    if (cpu_flags & AV_CPU_FLAG_3DNOW) {
        ff_cavsdsp_init_3dnow(c, avctx);
    }
#endif /* HAVE_INLINE_ASM */
}

View File

@ -24,6 +24,8 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/dnxhdenc.h"
#if HAVE_INLINE_ASM
static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
__asm__ volatile(
@ -50,10 +52,14 @@ static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int l
);
}
#endif /* HAVE_INLINE_ASM */
/**
 * Hook up the SSE2 get_pixels_8x4_sym implementation.
 *
 * The function pointer is replaced only when inline assembly is available,
 * the CPU reports SSE2 and the CID table describes 8-bit content; in every
 * other case the context is left untouched.
 */
void ff_dnxhd_init_mmx(DNXHDEncContext *ctx)
{
#if HAVE_INLINE_ASM
    if (!(av_get_cpu_flags() & AV_CPU_FLAG_SSE2))
        return;
    if (ctx->cid_table->bit_depth != 8)
        return;

    ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
#endif /* HAVE_INLINE_ASM */
}

View File

@ -47,6 +47,10 @@ struc FFTContext
.mdctbits: resd 1
.tcos: pointer 1
.tsin: pointer 1
.fftperm: pointer 1
.fftcalc: pointer 1
.imdctcalc:pointer 1
.imdcthalf:pointer 1
endstruc
%define M_SQRT1_2 0.70710678118654752440
@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0
%assign i 16
@ -533,6 +538,16 @@ DEFINE_ARGS z, w, n, o1, o3
rep ret
%endmacro
; FFT_DISPATCH suffix, reg
; Call the routine stored in the table dispatch_tab<suffix>.
;   %1  text appended to "dispatch_tab" to form the table name
;   %2  register name (the macro appends the q size suffix) holding the
;       selector; table entry number (selector - 2) is used
; The macro itself overwrites r2, and r3 as well in PIC builds.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
lea r2, [dispatch_tab%1]
mov r2, [r2 + (%2q-2)*gprsize] ; fetch entry (selector - 2), one GPR-sized slot each
%ifdef PIC
; $$ is the start address of the current section; it is added to the value
; loaded from the table before calling it.
; NOTE(review): presumably the table stores section-relative offsets in PIC
; builds -- confirm against the table definition.
lea r3, [$$]
add r2, r3
%endif
call r2
%endmacro ; FFT_DISPATCH
INIT_YMM avx
%if HAVE_AVX
@ -549,6 +564,14 @@ INIT_YMM avx
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0
; fft_calc (AVX build): load nbits from the context and call the matching
; entry of the interleaving dispatch table on the caller's buffer.
; NOTE(review): r0 is read as an FFTContext pointer and r1 is forwarded
; unchanged as the data buffer; confirm argument roles against the C prototype.
cglobal fft_calc, 2,5,8
mov r3d, [r0 + FFTContext.nbits]
; rearrange for the dispatched routine: r0 = buffer, r1 = nbits
mov r0, r1
mov r1, r3
FFT_DISPATCH _interleave %+ SUFFIX, r1
REP_RET
%endif
INIT_XMM sse
@ -566,6 +589,112 @@ INIT_XMM sse
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0
; fft_calc (SSE build): call the interleaving dispatch entry for this
; context's nbits, then, for small transforms only (nbits <= 4), interleave
; the two 16-byte halves of every 32-byte group of the buffer in place.
; NOTE(review): r0 is read as an FFTContext pointer and r1 is forwarded as the
; data buffer; confirm argument roles against the C prototype.
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    ; keep the buffer pointer and nbits across the dispatched call
    PUSH    r1
    PUSH    r3
    ; rearrange for the dispatched routine: r0 = buffer, r1 = nbits
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    ; restore in reverse push order: rcx = nbits, r4 = buffer pointer
    POP     rcx
    POP     r4
    cmp     rcx, 4
    jg      .end                    ; nothing more to do when nbits > 4
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl                  ; r2 = -(1 << (nbits + 3)), a negative byte offset
    sub     r4, r2                  ; r4 = buffer + (1 << (nbits + 3)); [r4 + r2] starts at the buffer
.loop:
    movaps  xmm0, [r4 + r2]
    movaps  xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]   ; interleave the low halves of both vectors
    unpckhps xmm1, [r4 + r2 + 16]   ; interleave the high halves of both vectors
    movaps  [r4 + r2], xmm0
    movaps  [r4 + r2 + 16], xmm1
    add     r2, 32
    jl      .loop                   ; continue while the offset is still negative
.end:
    REP_RET
cextern_naked memcpy
; fft_permute: copy the 8-byte elements of the buffer in r1 into the context's
; tmpbuf at the positions listed in revtab, then memcpy tmpbuf back over the
; buffer.
; NOTE(review): argument roles (r0 = FFTContext pointer, r1 = data buffer) are
; inferred from the field accesses below; confirm against the C prototype.
cglobal fft_permute, 2,7,1
mov r4, [r0 + FFTContext.revtab] ; r4 = revtab, read below as 16-bit entries
mov r5, [r0 + FFTContext.tmpbuf] ; r5 = tmpbuf, the scatter destination
mov ecx, [r0 + FFTContext.nbits] ; the variable shift below takes its count from cl
mov r2, 1
shl r2, cl ; r2 = 1 << nbits, the element count
xor r0, r0 ; r0 becomes the element index, starting at 0
; NOTE(review): presumably r1 aliases ecx on x86-32 and was overwritten by the
; nbits load above, hence the reload from its argument slot -- confirm.
%if ARCH_X86_32
mov r1, r1m
%endif
.loop:
movaps xmm0, [r1 + 8*r0] ; load two consecutive 8-byte elements
movzx r6, word [r4 + 2*r0] ; destination index for the first element
movzx r3, word [r4 + 2*r0 + 2] ; destination index for the second element
movlps [r5 + 8*r6], xmm0 ; low 8 bytes -> tmpbuf[revtab[i]]
movhps [r5 + 8*r3], xmm0 ; high 8 bytes -> tmpbuf[revtab[i + 1]]
add r0, 2
cmp r0, r2
jl .loop
shl r2, 3 ; element count -> byte count (8 bytes per element)
; Set up memcpy(buffer, tmpbuf, byte count): in r0/r1/r2 on x86-64, pushed on
; the stack (count, source, destination) on x86-32.
%if ARCH_X86_64
mov r0, r1
mov r1, r5
%else
push r2
push r5
push r1
%endif
; On x86-64 other than WIN64 memcpy is entered with a jump, so it returns
; straight to this function's caller; otherwise it is called, the three
; pushed arguments are dropped on x86-32, and the function returns normally.
%if ARCH_X86_64 && WIN64 == 0
jmp memcpy
%else
call memcpy
%if ARCH_X86_32
add esp, 12
%endif
REP_RET
%endif
; imdct_calc: call the context's imdcthalf routine with its output pointer
; advanced by mdctsize bytes, then fill the regions on either side of that
; result with element-reversed copies (one of them sign-flipped).
; NOTE(review): argument roles (r0 = FFTContext pointer, r1 = output,
; r2 = input) are inferred from the code below; confirm against the C prototype.
cglobal imdct_calc, 3,5,3
mov r3d, [r0 + FFTContext.mdctsize] ; r3 = mdctsize, used below as a byte count
mov r4, [r0 + FFTContext.imdcthalf] ; r4 = routine to call
add r1, r3 ; r1 = output + mdctsize bytes
; keep mdctsize and the advanced output pointer across the call
PUSH r3
PUSH r1
; x86-32: pass (r0, r1, r2) on the stack. Otherwise r0-r2 are left in place
; and only the stack pointer is moved by 8.
; NOTE(review): the 8-byte adjustment is presumably for stack alignment at
; the call -- confirm.
%if ARCH_X86_32
push r2
push r1
push r0
%else
sub rsp, 8
%endif
call r4
%if ARCH_X86_32
add esp, 12
%else
add rsp, 8
%endif
; restore in reverse push order
POP r1
POP r3
lea r0, [r1 + 2*r3] ; r0 = r1 + 2 * mdctsize bytes
mov r2, r3
sub r3, 16 ; r3 = mdctsize - 16, a descending byte offset
neg r2 ; r2 = -mdctsize, an ascending byte offset
movaps xmm2, [ps_m1m1m1m1] ; sign-bit mask for four floats
.loop:
movaps xmm0, [r1 + r3]
movaps xmm1, [r0 + r2]
shufps xmm0, xmm0, 0x1b ; reverse the order of the four elements
shufps xmm1, xmm1, 0x1b ; reverse the order of the four elements
xorps xmm0, xmm2 ; flip the sign bit of each element
movaps [r0 + r3], xmm1
movaps [r1 + r2], xmm0
sub r3, 16
add r2, 16
jl .loop ; flags come from the add: loop while r2 is still negative
REP_RET
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
@ -583,16 +712,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define SECTION_REL
%endif
; FFT_DISPATCH suffix, reg
; Call the routine stored in the table dispatch_tab<suffix>.
;   %1  text appended to "dispatch_tab" to form the table name
;   %2  register name (the macro appends the q size suffix) holding the
;       selector; table entry number (selector - 2) is used
; The macro itself overwrites r2, and r3 as well in PIC builds.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
lea r2, [dispatch_tab%1]
mov r2, [r2 + (%2q-2)*gprsize] ; fetch entry (selector - 2), one GPR-sized slot each
%ifdef PIC
; $$ is the start address of the current section; it is added to the value
; loaded from the table before calling it.
lea r3, [$$]
add r2, r3
%endif
call r2
%endmacro ; FFT_DISPATCH
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX

View File

@ -1,110 +0,0 @@
/*
* FFT/MDCT transform with SSE optimizations
* Copyright (c) 2008 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "fft.h"
#include "config.h"
DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
{ 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
#if HAVE_AVX
/* AVX entry point: forward the buffer and the context's nbits to the
 * interleaving AVX dispatcher. */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
#endif
/* SSE entry point: run the interleaving dispatcher, then, for transforms of
 * at most 16 points, interleave each 32-byte group of the buffer in place. */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    const int count = 1 << s->nbits;
    x86_reg offset;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    /* Larger transforms need no extra pass. */
    if (count > 16)
        return;

    /* Negative offset counted up to zero, relative to the end of the buffer. */
    offset = -8*count;
    __asm__ volatile(
        "1: \n"
        "movaps (%0,%1), %%xmm0 \n"
        "movaps %%xmm0, %%xmm1 \n"
        "unpcklps 16(%0,%1), %%xmm0 \n"
        "unpckhps 16(%0,%1), %%xmm1 \n"
        "movaps %%xmm0, (%0,%1) \n"
        "movaps %%xmm1, 16(%0,%1) \n"
        "add $32, %0 \n"
        "jl 1b \n"
        :"+r"(offset)
        :"r"(z+count)
        :"memory"
    );
}
/* Scatter the elements of z into s->tmp_buf at the positions given by
 * s->revtab, two elements per iteration, then copy tmp_buf back over z. */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    const int count = 1 << s->nbits;
    int idx = 0;

    while (idx < count) {
        /* One 16-byte load, split into two 8-byte stores. */
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[idx]]),
             "=m"(s->tmp_buf[s->revtab[idx+1]])
            :"m"(z[idx])
        );
        idx += 2;
    }
    memcpy(z, s->tmp_buf, count*sizeof(FFTComplex));
}
/* Compute the IMDCT output: s->imdct_half() writes its result starting at
 * output + n4, then the asm loop fills the regions on either side of it with
 * element-reversed copies, one of them with the sign bits flipped. */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
x86_reg j, k;
long n = s->mdct_size;
long n4 = n >> 2;
s->imdct_half(s, output + n4, input);
/* j and k are used by the asm as offsets added directly to the two base
 * addresses: j counts up from -n towards 0, k counts down from n-16.
 * NOTE(review): assumes FFTSample is 4 bytes wide, so that n bytes span
 * n4 samples -- confirm. */
j = -n;
k = n-16;
/* Operands: %0 = j, %1 = k, %2 = output+n4, %3 = output+n4*3. */
__asm__ volatile(
"movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n" /* sign-bit mask */
"1: \n"
"movaps (%2,%1), %%xmm0 \n"
"movaps (%3,%0), %%xmm1 \n"
"shufps $0x1b, %%xmm0, %%xmm0 \n" /* reverse the four elements */
"shufps $0x1b, %%xmm1, %%xmm1 \n" /* reverse the four elements */
"xorps %%xmm7, %%xmm0 \n" /* flip the sign bits */
"movaps %%xmm1, (%3,%1) \n"
"movaps %%xmm0, (%2,%0) \n"
"sub $16, %1 \n"
"add $16, %0 \n"
"jl 1b \n" /* flags come from the add: loop while j is negative */
:"+r"(j), "+r"(k)
:"r"(output+n4), "r"(output+n4*3)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
);
}

View File

@ -34,6 +34,8 @@
#include "libavcodec/cabac.h"
#include "cabac.h"
#if HAVE_INLINE_ASM
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
//as that would make optimization work hard)
#if HAVE_7REGS
@ -187,4 +189,5 @@ static int decode_significance_8x8_x86(CABACContext *c,
}
#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_H264_I386_H */

View File

@ -23,6 +23,8 @@
#include "libavutil/cpu.h"
#include "libavcodec/lpc.h"
#if HAVE_INLINE_ASM
static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
double *w_data)
{
@ -136,12 +138,16 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
}
}
#endif /* HAVE_INLINE_ASM */
/**
 * Install the SSE2 LPC routines when the CPU reports SSE2 (including the
 * "slow SSE2" flag) and the build has inline assembly support.
 */
av_cold void ff_lpc_init_x86(LPCContext *c)
{
    const int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    const int sse2_mask = AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW;

    if ((cpu_flags & sse2_mask) != 0) {
        c->lpc_compute_autocorr   = lpc_compute_autocorr_sse2;
        c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
    }
#endif /* HAVE_INLINE_ASM */
}

View File

@ -25,6 +25,8 @@
#include "config.h"
#include "libavutil/common.h"
#if HAVE_INLINE_ASM
#if ARCH_X86_32
#define MULL MULL
@ -118,4 +120,5 @@ static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
return a;
}
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_MATHOPS_H */

View File

@ -26,6 +26,8 @@
#include "libavcodec/dwt.h"
#include "dsputil_mmx.h"
#if HAVE_INLINE_ASM
static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
const int w2= (width+1)>>1;
const int w_l= (width>>1);
@ -871,8 +873,11 @@ static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_str
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
#endif /* HAVE_INLINE_ASM */
void ff_dwt_init_x86(DWTContext *c)
{
#if HAVE_INLINE_ASM
int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX) {
@ -893,4 +898,5 @@ void ff_dwt_init_x86(DWTContext *c)
c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
}
}
#endif /* HAVE_INLINE_ASM */
}

View File

@ -30,6 +30,8 @@
#include "dsputil_mmx.h"
#include "libavcodec/vc1dsp.h"
#if HAVE_INLINE_ASM
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@ -682,6 +684,8 @@ static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *bloc
);
}
#endif /* HAVE_INLINE_ASM */
#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
@ -730,6 +734,7 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
int mm_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM
if (mm_flags & AV_CPU_FLAG_MMX) {
dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
@ -791,6 +796,7 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
dsp->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd;
}
#endif /* HAVE_INLINE_ASM */
#define ASSIGN_LF(EXT) \
dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \

View File

@ -372,7 +372,11 @@ static int read_probe(AVProbeData *p)
static void filetime_to_iso8601(char *buf, int buf_size, int64_t value)
{
    /* value counts 100 ns ticks; convert to seconds and subtract the
     * 11644473600 s offset to obtain a time_t. */
    time_t t = (value / 10000000LL) - 11644473600LL;
    struct tm *tm = gmtime(&t);

    /* Without room for at least the terminator nothing can be written. */
    if (buf_size <= 0)
        return;

    /* gmtime() returns NULL when the time cannot be represented, and
     * strftime() returns 0 (leaving buf indeterminate) when the result does
     * not fit; hand back an empty string in both cases. The checked tm is
     * reused rather than calling gmtime() a second time. */
    if (!tm || !strftime(buf, buf_size, "%Y-%m-%d %H:%M:%S", tm))
        buf[0] = '\0';
}
/**
@ -381,7 +385,11 @@ static void filetime_to_iso8601(char *buf, int buf_size, int64_t value)
static void crazytime_to_iso8601(char *buf, int buf_size, int64_t value)
{
    /* value counts 100 ns ticks; convert to seconds and subtract
     * 719162 days' worth of seconds to obtain a time_t. */
    time_t t = (value / 10000000LL) - 719162LL*86400LL;
    struct tm *tm = gmtime(&t);

    /* Without room for at least the terminator nothing can be written. */
    if (buf_size <= 0)
        return;

    /* gmtime() returns NULL when the time cannot be represented, and
     * strftime() returns 0 (leaving buf indeterminate) when the result does
     * not fit; hand back an empty string in both cases. The checked tm is
     * reused rather than calling gmtime() a second time. */
    if (!tm || !strftime(buf, buf_size, "%Y-%m-%d %H:%M:%S", tm))
        buf[0] = '\0';
}
/**

View File

@ -111,7 +111,7 @@ struct AVDictionary {
/* math */
#if ARCH_X86
#if ARCH_X86 && HAVE_INLINE_ASM
#define MASK_ABS(mask, level)\
__asm__ volatile(\
"cltd \n\t"\

View File

@ -28,6 +28,8 @@
#include "config.h"
#include "libavutil/attributes.h"
#if HAVE_INLINE_ASM
#if !AV_GCC_VERSION_AT_LEAST(4,1)
#define av_bswap16 av_bswap16
static av_always_inline av_const unsigned av_bswap16(unsigned x)
@ -55,4 +57,5 @@ static inline uint64_t av_const av_bswap64(uint64_t x)
#endif
#endif /* !AV_GCC_VERSION_AT_LEAST(4,5) */
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_X86_BSWAP_H */

View File

@ -21,6 +21,7 @@
#ifndef AVUTIL_X86_INTMATH_H
#define AVUTIL_X86_INTMATH_H
#if HAVE_INLINE_ASM
#define FASTDIV(a,b) \
({\
int ret, dmy;\
@ -31,5 +32,6 @@
);\
ret;\
})
#endif
#endif /* AVUTIL_X86_INTMATH_H */