diff --git a/libavfilter/unsharp.h b/libavfilter/unsharp.h
index caff986fc1..a60b30f31a 100644
--- a/libavfilter/unsharp.h
+++ b/libavfilter/unsharp.h
@@ -37,7 +37,8 @@ typedef struct UnsharpFilterParam {
     int steps_y;                             ///< vertical step count
     int scalebits;                           ///< bits to shift pixel
     int32_t halfscale;                       ///< amount to add to pixel
-    uint32_t *sc[MAX_MATRIX_SIZE - 1];       ///< finite state machine storage
+    uint32_t *sr;                            ///< finite state machine storage within a row
+    uint32_t **sc;                           ///< finite state machine storage across rows
 } UnsharpFilterParam;
 
 typedef struct UnsharpContext {
@@ -47,6 +48,7 @@ typedef struct UnsharpContext {
     UnsharpFilterParam luma;   ///< luma parameters (width, height, amount)
     UnsharpFilterParam chroma; ///< chroma parameters (width, height, amount)
     int hsub, vsub;
+    int nb_threads;            ///< number of slice jobs the sr/sc buffers are sized for
     int opencl;
     int (* apply_unsharp)(AVFilterContext *ctx, AVFrame *in, AVFrame *out);
 } UnsharpContext;
diff --git a/libavfilter/vf_unsharp.c b/libavfilter/vf_unsharp.c
index 41ccc56942..af05833a5d 100644
--- a/libavfilter/vf_unsharp.c
+++ b/libavfilter/vf_unsharp.c
@@ -47,15 +47,28 @@
 #include "libavutil/pixdesc.h"
 #include "unsharp.h"
 
-static void apply_unsharp( uint8_t *dst, int dst_stride,
-                           const uint8_t *src, int src_stride,
-                           int width, int height, UnsharpFilterParam *fp)
-{
-    uint32_t **sc = fp->sc;
-    uint32_t sr[MAX_MATRIX_SIZE - 1], tmp1, tmp2;
+/** Per-plane arguments passed to unsharp_slice() through execute(). */
+typedef struct ThreadData {
+    UnsharpFilterParam *fp;
+    uint8_t       *dst;
+    const uint8_t *src;
+    int dst_stride;
+    int src_stride;
+    int width;
+    int height;
+} ThreadData;
 
-    int32_t res;
-    int x, y, z;
+/**
+ * Filter the rows [height * jobnr / nb_jobs, height * (jobnr + 1) / nb_jobs)
+ * of the plane described by arg (a ThreadData). Every job works on its own
+ * region of fp->sr and fp->sc, selected by jobnr. Always returns 0.
+ */
+static int unsharp_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+    ThreadData *td = arg;
+    UnsharpFilterParam *fp = td->fp;
+    uint32_t **sc = fp->sc;
+    uint32_t *sr = fp->sr;
     const uint8_t *src2 = NULL;  //silence a warning
     const int amount = fp->amount;
     const int steps_x = fp->steps_x;
@@ -63,30 +76,54 @@ static void apply_unsharp( uint8_t *dst, int dst_stride,
     const int scalebits = fp->scalebits;
     const int32_t halfscale = fp->halfscale;
 
+    uint8_t *dst = td->dst;
+    const uint8_t *src = td->src;
+    const int dst_stride = td->dst_stride;
+    const int src_stride = td->src_stride;
+    const int width = td->width;
+    const int height = td->height;
+    const int sc_offset = jobnr * 2 * steps_y;
+    const int sr_offset = jobnr * (MAX_MATRIX_SIZE - 1);
+    const int slice_start = (height * jobnr) / nb_jobs;
+    const int slice_end = (height * (jobnr+1)) / nb_jobs;
+
+    int32_t res;
+    int x, y, z;
+    uint32_t tmp1, tmp2;
+
     if (!amount) {
-        av_image_copy_plane(dst, dst_stride, src, src_stride, width, height);
-        return;
+        av_image_copy_plane(dst + slice_start * dst_stride, dst_stride,
+                            src + slice_start * src_stride, src_stride,
+                            width, slice_end - slice_start);
+        return 0;
     }
 
     for (y = 0; y < 2 * steps_y; y++)
-        memset(sc[y], 0, sizeof(sc[y][0]) * (width + 2 * steps_x));
+        memset(sc[sc_offset + y], 0, sizeof(sc[y][0]) * (width + 2 * steps_x));
 
-    for (y = -steps_y; y < height + steps_y; y++) {
+    // if this is not the first tile, we start from (slice_start - steps_y),
+    // so we can get a smooth result at the slice boundary
+    if (slice_start > steps_y) {
+        src += (slice_start - steps_y) * src_stride;
+        dst += (slice_start - steps_y) * dst_stride;
+    }
+
+    for (y = -steps_y + slice_start; y < steps_y + slice_end; y++) {
         if (y < height)
             src2 = src;
 
-        memset(sr, 0, sizeof(sr[0]) * (2 * steps_x - 1));
+        memset(sr + sr_offset, 0, sizeof(sr[0]) * (2 * steps_x - 1));
         for (x = -steps_x; x < width + steps_x; x++) {
             tmp1 = x <= 0 ? src2[0] : x >= width ? src2[width-1] : src2[x];
             for (z = 0; z < steps_x * 2; z += 2) {
-                tmp2 = sr[z + 0] + tmp1; sr[z + 0] = tmp1;
-                tmp1 = sr[z + 1] + tmp2; sr[z + 1] = tmp2;
+                tmp2 = sr[sr_offset + z + 0] + tmp1; sr[sr_offset + z + 0] = tmp1;
+                tmp1 = sr[sr_offset + z + 1] + tmp2; sr[sr_offset + z + 1] = tmp2;
             }
             for (z = 0; z < steps_y * 2; z += 2) {
-                tmp2 = sc[z + 0][x + steps_x] + tmp1; sc[z + 0][x + steps_x] = tmp1;
-                tmp1 = sc[z + 1][x + steps_x] + tmp2; sc[z + 1][x + steps_x] = tmp2;
+                tmp2 = sc[sc_offset + z + 0][x + steps_x] + tmp1; sc[sc_offset + z + 0][x + steps_x] = tmp1;
+                tmp1 = sc[sc_offset + z + 1][x + steps_x] + tmp2; sc[sc_offset + z + 1][x + steps_x] = tmp2;
             }
-            if (x >= steps_x && y >= steps_y) {
+            if (x >= steps_x && y >= (steps_y + slice_start)) {
                 const uint8_t *srx = src - steps_y * src_stride + x - steps_x;
                 uint8_t *dsx = dst - steps_y * dst_stride + x - steps_x;
 
@@ -99,6 +136,7 @@ static void apply_unsharp( uint8_t *dst, int dst_stride,
             src += src_stride;
         }
     }
+    return 0;
 }
 
 static int apply_unsharp_c(AVFilterContext *ctx, AVFrame *in, AVFrame *out)
@@ -107,6 +145,8 @@ static int apply_unsharp_c(AVFilterContext *ctx, AVFrame *in, AVFrame *out)
     UnsharpContext *s = ctx->priv;
     int i, plane_w[3], plane_h[3];
     UnsharpFilterParam *fp[3];
+    ThreadData td;
+
     plane_w[0] = inlink->w;
     plane_w[1] = plane_w[2] = AV_CEIL_RSHIFT(inlink->w, s->hsub);
     plane_h[0] = inlink->h;
@@ -114,7 +154,14 @@ static int apply_unsharp_c(AVFilterContext *ctx, AVFrame *in, AVFrame *out)
     fp[0] = &s->luma;
     fp[1] = fp[2] = &s->chroma;
     for (i = 0; i < 3; i++) {
-        apply_unsharp(out->data[i], out->linesize[i], in->data[i], in->linesize[i], plane_w[i], plane_h[i], fp[i]);
+        td.fp = fp[i];
+        td.dst = out->data[i];
+        td.src = in->data[i];
+        td.width = plane_w[i];
+        td.height = plane_h[i];
+        td.dst_stride = out->linesize[i];
+        td.src_stride = in->linesize[i];
+        ctx->internal->execute(ctx, unsharp_slice, &td, NULL, FFMIN(plane_h[i], s->nb_threads));
     }
     return 0;
 }
@@ -163,6 +210,7 @@ static int query_formats(AVFilterContext *ctx)
 static int init_filter_param(AVFilterContext *ctx, UnsharpFilterParam *fp, const char *effect_type, int width)
 {
     int z;
+    UnsharpContext *s = ctx->priv;
     const char *effect = fp->amount == 0 ? "none" : fp->amount < 0 ? "blur" : "sharpen";
 
     if (!(fp->msize_x & fp->msize_y & 1)) {
@@ -175,7 +223,13 @@ static int init_filter_param(AVFilterContext *ctx, UnsharpFilterParam *fp, const
     av_log(ctx, AV_LOG_VERBOSE, "effect:%s type:%s msize_x:%d msize_y:%d amount:%0.2f\n",
            effect, effect_type, fp->msize_x, fp->msize_y, fp->amount / 65535.0);
 
-    for (z = 0; z < 2 * fp->steps_y; z++)
+    fp->sr = av_malloc_array((MAX_MATRIX_SIZE - 1) * s->nb_threads, sizeof(uint32_t));
+    // zero-initialized so free_filter_param() never sees a garbage row pointer
+    fp->sc = av_calloc(2 * fp->steps_y * s->nb_threads, sizeof(*fp->sc));
+    if (!fp->sr || !fp->sc)
+        return AVERROR(ENOMEM);
+
+    for (z = 0; z < 2 * fp->steps_y * s->nb_threads; z++)
         if (!(fp->sc[z] = av_malloc_array(width + 2 * fp->steps_x, sizeof(*(fp->sc[z])))))
             return AVERROR(ENOMEM);
 
@@ -192,6 +246,12 @@ static int config_props(AVFilterLink *link)
     s->hsub = desc->log2_chroma_w;
     s->vsub = desc->log2_chroma_h;
 
+    // ensure (height / nb_threads) >= 4 * steps_y, so that we don't have too
+    // much overlap between two threads; always keep at least one job so that
+    // frames shorter than 4 * steps_y are still filtered
+    s->nb_threads = FFMAX(1, FFMIN(ff_filter_get_nb_threads(link->dst),
+                                   link->h / (4 * s->luma.steps_y)));
+
     ret = init_filter_param(link->dst, &s->luma, "luma", link->w);
     if (ret < 0)
         return ret;
@@ -202,20 +262,25 @@ static int config_props(AVFilterLink *link)
     return 0;
 }
 
-static void free_filter_param(UnsharpFilterParam *fp)
+static void free_filter_param(UnsharpFilterParam *fp, int nb_threads)
 {
     int z;
 
-    for (z = 0; z < 2 * fp->steps_y; z++)
-        av_freep(&fp->sc[z]);
+    // fp->sc is NULL when init_filter_param() failed to allocate it
+    if (fp->sc) {
+        for (z = 0; z < 2 * fp->steps_y * nb_threads; z++)
+            av_freep(&fp->sc[z]);
+        av_freep(&fp->sc);
+    }
+    av_freep(&fp->sr);
 }
 
 static av_cold void uninit(AVFilterContext *ctx)
 {
     UnsharpContext *s = ctx->priv;
 
-    free_filter_param(&s->luma);
-    free_filter_param(&s->chroma);
+    free_filter_param(&s->luma, s->nb_threads);
+    free_filter_param(&s->chroma, s->nb_threads);
 }
 
 static int filter_frame(AVFilterLink *link, AVFrame *in)
@@ -294,5 +359,5 @@ AVFilter ff_vf_unsharp = {
     .query_formats = query_formats,
     .inputs        = avfilter_vf_unsharp_inputs,
     .outputs       = avfilter_vf_unsharp_outputs,
-    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,
+    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | AVFILTER_FLAG_SLICE_THREADS,
 };