dsputil_mmx: put optimized gmc code back and avoid a VLA without losing features.

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Michael Niedermayer 2012-10-06 22:36:49 +02:00
parent 094a82c7de
commit e063ffbf4b

View File

@ -1929,10 +1929,15 @@ static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
#if HAVE_INLINE_ASM
static void gmc_mmx(uint8_t *dst, uint8_t *src,
/* Function type of the edge-emulation routine that gmc() receives as its
 * emu_edge_fn argument. gmc() invokes it as
 * (edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height), i.e. the
 * trailing w/h parameters here receive the caller's width/height. */
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
int linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h);
static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height)
int shift, int r, int width, int height,
emulated_edge_mc_func *emu_edge_fn)
{
const int w = 8;
const int ix = ox >> (16 + shift);
@ -1947,6 +1952,9 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
const uint64_t shift2 = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H 8U
uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
int x, y;
const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
@ -1957,9 +1965,8 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
(oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
// uses more than 16 bits of subpel mv (only at huge resolution)
|| (dxx | dxy | dyx | dyy) & 15 ||
(unsigned)ix >= width - w ||
(unsigned)iy >= height - h) {
|| (dxx | dxy | dyx | dyy) & 15
|| h > MAX_H || stride > MAX_STRIDE) {
// FIXME could still use mmx for some of the rows
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
shift, r, width, height);
@ -1967,6 +1974,11 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
}
src += ix + iy * stride;
if ((unsigned)ix >= width - w ||
(unsigned)iy >= height - h) {
emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
src = edge_buf;
}
__asm__ volatile (
"movd %0, %%mm6 \n\t"
@ -2045,6 +2057,36 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
}
}
#if HAVE_YASM
#if ARCH_X86_32
/* Forwards every argument unchanged to gmc(), supplying
 * emulated_edge_mc_mmx as the edge-emulation callback. */
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h,
                    int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h,
        ox, oy,
        dxx, dxy, dyx, dyy,
        shift, r, width, height,
        emulated_edge_mc_mmx);
}
#endif
/* Forwards every argument unchanged to gmc(), supplying
 * emulated_edge_mc_sse as the edge-emulation callback. */
static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h,
                    int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h,
        ox, oy,
        dxx, dxy, dyx, dyy,
        shift, r, width, height,
        emulated_edge_mc_sse);
}
#else
/* Variant compiled in the #else branch of HAVE_YASM: forwards every argument
 * unchanged to gmc(), supplying ff_emulated_edge_mc_8 as the edge-emulation
 * callback. */
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h,
                    int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h,
        ox, oy,
        dxx, dxy, dyx, dyy,
        shift, r, width, height,
        ff_emulated_edge_mc_8);
}
#endif
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h) \
{ \
@ -2545,7 +2587,9 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
}
#if ARCH_X86_32 || !HAVE_YASM
c->gmc = gmc_mmx;
#endif
c->add_bytes = add_bytes_mmx;
@ -2800,6 +2844,9 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
if (!high_bit_depth)
c->emulated_edge_mc = emulated_edge_mc_sse;
#if HAVE_INLINE_ASM
c->gmc = gmc_sse;
#endif
#endif /* HAVE_YASM */
}