From cc16da75c2f99d92f7a6461100f041352deb6d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Raulet?= Date: Tue, 5 Jul 2016 18:52:38 +0200 Subject: [PATCH] hevc: Add coefficient limiting to speed up IDCT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integrated into libav by Josh de Kock. Signed-off-by: Alexandra Hájková --- libavcodec/hevc.c | 12 +++++++++-- libavcodec/hevcdsp.h | 2 +- libavcodec/hevcdsp_template.c | 38 +++++++++++++++++++++++------------ 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c index d8c707b3c6..5d58b5212a 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c @@ -1218,8 +1218,16 @@ static void hls_residual_coding(HEVCContext *s, int x0, int y0, int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); if (max_xy == 0) s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); - else - s->hevcdsp.idct[log2_trafo_size - 2](coeffs); + else { + int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; + if (max_xy < 4) + col_limit = FFMIN(4, col_limit); + else if (max_xy < 8) + col_limit = FFMIN(8, col_limit); + else if (max_xy < 12) + col_limit = FFMIN(24, col_limit); + s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); + } } } s->hevcdsp.add_residual[log2_trafo_size - 2](dst, coeffs, stride); diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h index bbc4cb27ce..199e5a9064 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h @@ -46,7 +46,7 @@ typedef struct HEVCDSPContext { void (*dequant)(int16_t *coeffs); void (*transform_4x4_luma)(int16_t *coeffs); - void (*idct[4])(int16_t *coeffs); + void (*idct[4])(int16_t *coeffs, int col_limit); void (*idct_dc[4])(int16_t *coeffs); void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride, diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 81e3ea5d59..076b251344 100644 --- a/libavcodec/hevcdsp_template.c +++ 
b/libavcodec/hevcdsp_template.c @@ -137,7 +137,7 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs) #undef TR_4x4_LUMA -#define TR_4(dst, src, dstep, sstep, assign) \ +#define TR_4(dst, src, dstep, sstep, assign, end) \ do { \ const int e0 = transform[8 * 0][0] * src[0 * sstep] + \ transform[8 * 2][0] * src[2 * sstep]; \ @@ -154,15 +154,15 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs) assign(dst[3 * dstep], e0 - o0); \ } while (0) -#define TR_8(dst, src, dstep, sstep, assign) \ +#define TR_8(dst, src, dstep, sstep, assign, end) \ do { \ int i, j; \ int e_8[4]; \ int o_8[4] = { 0 }; \ for (i = 0; i < 4; i++) \ - for (j = 1; j < 8; j += 2) \ + for (j = 1; j < end; j += 2) \ o_8[i] += transform[4 * j][i] * src[j * sstep]; \ - TR_4(e_8, src, 1, 2 * sstep, SET); \ + TR_4(e_8, src, 1, 2 * sstep, SET, 4); \ \ for (i = 0; i < 4; i++) { \ assign(dst[i * dstep], e_8[i] + o_8[i]); \ @@ -170,15 +170,15 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs) } \ } while (0) -#define TR_16(dst, src, dstep, sstep, assign) \ +#define TR_16(dst, src, dstep, sstep, assign, end) \ do { \ int i, j; \ int e_16[8]; \ int o_16[8] = { 0 }; \ for (i = 0; i < 8; i++) \ - for (j = 1; j < 16; j += 2) \ + for (j = 1; j < end; j += 2) \ o_16[i] += transform[2 * j][i] * src[j * sstep]; \ - TR_8(e_16, src, 1, 2 * sstep, SET); \ + TR_8(e_16, src, 1, 2 * sstep, SET, 8); \ \ for (i = 0; i < 8; i++) { \ assign(dst[i * dstep], e_16[i] + o_16[i]); \ @@ -186,15 +186,15 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs) } \ } while (0) -#define TR_32(dst, src, dstep, sstep, assign) \ +#define TR_32(dst, src, dstep, sstep, assign, end) \ do { \ int i, j; \ int e_32[16]; \ int o_32[16] = { 0 }; \ for (i = 0; i < 16; i++) \ - for (j = 1; j < 32; j += 2) \ + for (j = 1; j < end; j += 2) \ o_32[i] += transform[j][i] * src[j * sstep]; \ - TR_16(e_32, src, 1, 2 * sstep, SET); \ + TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \ \ for (i = 0; i < 16; i++) { \ assign(dst[i * dstep], e_32[i] 
+ o_32[i]); \ @@ -202,23 +202,35 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs) } \ } while (0) +#define IDCT_VAR4(H) \ + int limit2 = FFMIN(col_limit + 4, H) +#define IDCT_VAR8(H) \ + int limit = FFMIN(col_limit, H); \ + int limit2 = FFMIN(col_limit + 4, H) +#define IDCT_VAR16(H) IDCT_VAR8(H) +#define IDCT_VAR32(H) IDCT_VAR8(H) + #define IDCT(H) \ -static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs) \ +static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ + int col_limit) \ { \ int i; \ int shift = 7; \ int add = 1 << (shift - 1); \ int16_t *src = coeffs; \ + IDCT_VAR ## H(H); \ \ for (i = 0; i < H; i++) { \ - TR_ ## H(src, src, H, H, SCALE); \ + TR_ ## H(src, src, H, H, SCALE, limit2); \ + if (limit2 < H && i%4 == 0 && !!i) \ + limit2 -= 4; \ src++; \ } \ \ shift = 20 - BIT_DEPTH; \ add = 1 << (shift - 1); \ for (i = 0; i < H; i++) { \ - TR_ ## H(coeffs, coeffs, 1, 1, SCALE); \ + TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ coeffs += H; \ } \ }