From b0ef0ae77608a5e3d2ba68af503e8b1277a215d3 Mon Sep 17 00:00:00 2001 From: James Darnley Date: Sun, 10 Mar 2013 15:08:50 +0100 Subject: [PATCH] yadif: restore speed of the C filtering code Always use the special filter for the first and last 3 columns (only). Changes made in 64ed397 slowed the filter to just under 3/4 of what it was. This commit restores the speed while maintaining identical output. For reference, on my Athlon64: 1733222 decicycles in old 2358563 decicycles in new 1727558 decicycles in this Signed-off-by: Anton Khirnov --- libavfilter/vf_yadif.c | 70 ++++++++++++++++----------------- libavfilter/x86/vf_yadif_init.c | 12 ++---- libavfilter/yadif.h | 4 +- 3 files changed, 37 insertions(+), 49 deletions(-) diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c index faa487f7df..076ad41519 100644 --- a/libavfilter/vf_yadif.c +++ b/libavfilter/vf_yadif.c @@ -33,14 +33,17 @@ #include #define CHECK(j)\ - { int score = FFABS(cur[mrefs + off_left + (j)] - cur[prefs + off_left - (j)])\ + { int score = FFABS(cur[mrefs - 1 + (j)] - cur[prefs - 1 - (j)])\ + FFABS(cur[mrefs +(j)] - cur[prefs -(j)])\ - + FFABS(cur[mrefs + off_right + (j)] - cur[prefs + off_right - (j)]);\ + + FFABS(cur[mrefs + 1 + (j)] - cur[prefs + 1 - (j)]);\ if (score < spatial_score) {\ spatial_score= score;\ spatial_pred= (cur[mrefs +(j)] + cur[prefs -(j)])>>1;\ -#define FILTER(start, end) \ +/* The is_not_edge argument here controls when the code will enter a branch + * which reads up to and including x-3 and x+3. */ + +#define FILTER(start, end, is_not_edge) \ for (x = start; x < end; x++) { \ int c = cur[mrefs]; \ int d = (prev2[0] + next2[0])>>1; \ @@ -50,12 +53,10 @@ int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e) )>>1; \ int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \ int spatial_pred = (c+e) >> 1; \ - int off_right = (x < w - 1) ? 1 : -1;\ - int off_left = x ? -1 : 1;\ - int spatial_score = FFABS(cur[mrefs + off_left] - cur[prefs + off_left]) + FFABS(c-e) \ - + FFABS(cur[mrefs + off_right] - cur[prefs + off_right]) - 1; \ \ - if (x > 2 && x < w - 3) {\ + if (is_not_edge) {\ + int spatial_score = FFABS(cur[mrefs - 1] - cur[prefs - 1]) + FFABS(c-e) \ + + FFABS(cur[mrefs + 1] - cur[prefs + 1]) - 1; \ CHECK(-1) CHECK(-2) }} }} \ CHECK( 1) CHECK( 2) }} }} \ }\ @@ -96,12 +97,15 @@ static void filter_line_c(void *dst1, uint8_t *prev2 = parity ? prev : cur ; uint8_t *next2 = parity ? cur : next; - FILTER(0, w) + /* The function is called with the pointers already pointing to data[3] and + * with 6 subtracted from the width. This allows the FILTER macro to be + * called so that it processes all the pixels normally. A constant value of + * true for is_not_edge lets the compiler ignore the if statement. */ + FILTER(0, w, 1) } static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1, - int w, int prefs, int mrefs, int parity, int mode, - int l_edge) + int w, int prefs, int mrefs, int parity, int mode) { uint8_t *dst = dst1; uint8_t *prev = prev1; @@ -111,7 +115,9 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1, uint8_t *prev2 = parity ? prev : cur ; uint8_t *next2 = parity ? cur : next; - FILTER(0, l_edge) + /* Only edge pixels need to be processed here. A constant value of false + * for is_not_edge should let the compiler ignore the whole branch. */ + FILTER(0, 3, 0) dst = (uint8_t*)dst1 + w - 3; prev = (uint8_t*)prev1 + w - 3; @@ -120,7 +126,7 @@ static void filter_edges(void *dst1, void *prev1, void *cur1, void *next1, prev2 = (uint8_t*)(parity ? prev : cur); next2 = (uint8_t*)(parity ? cur : next); - FILTER(w - 3, w) + FILTER(w - 3, w, 0) } @@ -139,12 +145,11 @@ static void filter_line_c_16bit(void *dst1, mrefs /= 2; prefs /= 2; - FILTER(0, w) + FILTER(0, w, 1) } static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1, - int w, int prefs, int mrefs, int parity, int mode, - int l_edge) + int w, int prefs, int mrefs, int parity, int mode) { uint16_t *dst = dst1; uint16_t *prev = prev1; @@ -154,7 +159,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1, uint16_t *prev2 = parity ? prev : cur ; uint16_t *next2 = parity ? cur : next; - FILTER(0, l_edge) + FILTER(0, 3, 0) dst = (uint16_t*)dst1 + w - 3; prev = (uint16_t*)prev1 + w - 3; @@ -163,7 +168,7 @@ static void filter_edges_16bit(void *dst1, void *prev1, void *cur1, void *next1, prev2 = (uint16_t*)(parity ? prev : cur); next2 = (uint16_t*)(parity ? cur : next); - FILTER(w - 3, w) + FILTER(w - 3, w, 0) } static void filter(AVFilterContext *ctx, AVFrame *dstpic, @@ -177,7 +182,7 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, int h = dstpic->height; int refs = yadif->cur->linesize[i]; int df = (yadif->csp->comp[i].depth_minus1 + 8) / 8; - int l_edge, l_edge_pix; + int pix_3 = 3 * df; if (i == 1 || i == 2) { /* Why is this not part of the per-plane description thing? */ @@ -188,8 +193,6 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, /* filtering reads 3 pixels to the left/right; to avoid invalid reads, * we need to call the c variant which avoids this for border pixels */ - l_edge = yadif->req_align; - l_edge_pix = l_edge / df; for (y = 0; y < h; y++) { if ((y ^ parity) & 1) { @@ -198,22 +201,15 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, uint8_t *next = &yadif->next->data[i][y * refs]; uint8_t *dst = &dstpic->data[i][y * dstpic->linesize[i]]; int mode = y == 1 || y + 2 == h ? 2 : yadif->mode; - if (yadif->req_align) { - yadif->filter_line(dst + l_edge, prev + l_edge, cur + l_edge, - next + l_edge, w - l_edge_pix - 3, - y + 1 < h ? refs : -refs, - y ? -refs : refs, - parity ^ tff, mode); - yadif->filter_edges(dst, prev, cur, next, w, - y + 1 < h ? refs : -refs, - y ? -refs : refs, - parity ^ tff, mode, l_edge_pix); - } else { - yadif->filter_line(dst, prev, cur, next + l_edge, w, - y + 1 < h ? refs : -refs, - y ? -refs : refs, - parity ^ tff, mode); - } + yadif->filter_line(dst + pix_3, prev + pix_3, cur + pix_3, + next + pix_3, w - 6, + y + 1 < h ? refs : -refs, + y ? -refs : refs, + parity ^ tff, mode); + yadif->filter_edges(dst, prev, cur, next, w, + y + 1 < h ? refs : -refs, + y ? -refs : refs, + parity ^ tff, mode); } else { memcpy(&dstpic->data[i][y * dstpic->linesize[i]], &yadif->cur->data[i][y * refs], w * df); diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c index 99520a2ee8..5978a4fd42 100644 --- a/libavfilter/x86/vf_yadif_init.c +++ b/libavfilter/x86/vf_yadif_init.c @@ -42,18 +42,12 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif) int cpu_flags = av_get_cpu_flags(); #if ARCH_X86_32 - if (EXTERNAL_MMXEXT(cpu_flags)) { + if (EXTERNAL_MMXEXT(cpu_flags)) yadif->filter_line = ff_yadif_filter_line_mmxext; - yadif->req_align = 8; - } #endif /* ARCH_X86_32 */ - if (EXTERNAL_SSE2(cpu_flags)) { + if (EXTERNAL_SSE2(cpu_flags)) yadif->filter_line = ff_yadif_filter_line_sse2; - yadif->req_align = 16; - } - if (EXTERNAL_SSSE3(cpu_flags)) { + if (EXTERNAL_SSSE3(cpu_flags)) yadif->filter_line = ff_yadif_filter_line_ssse3; - yadif->req_align = 16; - } #endif /* HAVE_YASM */ } diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h index 665922d925..6936723c47 100644 --- a/libavfilter/yadif.h +++ b/libavfilter/yadif.h @@ -55,13 +55,11 @@ typedef struct YADIFContext { /** * Required alignment for filter_line */ - int req_align; void (*filter_line)(void *dst, void *prev, void *cur, void *next, int w, int prefs, int mrefs, int parity, int mode); void (*filter_edges)(void *dst, void *prev, void *cur, void *next, - int w, int prefs, int mrefs, int parity, int mode, - int l_edge); + int w, int prefs, int mrefs, int parity, int mode); const AVPixFmtDescriptor *csp; int eof;