From d7e162d46b4a0fc03ca5161cdcac840152f048cb Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sat, 19 Jul 2014 14:50:11 +0200 Subject: [PATCH] hevcdsp: remove an unneeded variable in the loop filter beta0 and beta1 will always be the same within a CU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mickaël Raulet cherry picked from commit 4a23d824741a289c7d2d2f2871d1e2621b63fa1b Signed-off-by: Michael Niedermayer --- libavcodec/hevc_filter.c | 22 ++++----- libavcodec/hevcdsp.h | 8 ++-- libavcodec/hevcdsp_template.c | 9 ++-- libavcodec/x86/hevc_deblock.asm | 83 +++++++++++++-------------------- 4 files changed, 51 insertions(+), 71 deletions(-) diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c index 08880efb80..7b41dd47ed 100644 --- a/libavcodec/hevc_filter.c +++ b/libavcodec/hevc_filter.c @@ -340,7 +340,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) uint8_t *src; int x, y; int chroma; - int c_tc[2], beta[2], tc[2]; + int c_tc[2], tc[2], beta; uint8_t no_p[2] = { 0 }; uint8_t no_q[2] = { 0 }; @@ -381,13 +381,11 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; const int bs1 = s->vertical_bs[(x + (y + 4) * s->bs_width) >> 2]; if (bs0 || bs1) { - const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; - const int qp1 = (get_qPy(s, x - 1, y + 4) + get_qPy(s, x, y + 4) + 1) >> 1; + const int qp = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; - beta[0] = betatable[av_clip(qp0 + beta_offset, 0, MAX_QP)]; - beta[1] = betatable[av_clip(qp1 + beta_offset, 0, MAX_QP)]; - tc[0] = bs0 ? TC_CALC(qp0, bs0) : 0; - tc[1] = bs1 ? TC_CALC(qp1, bs1) : 0; + beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; + tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; + tc[1] = bs1 ? 
TC_CALC(qp, bs1) : 0; src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x - 1, y); @@ -447,16 +445,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; const int bs1 = s->horizontal_bs[((x + 4) + y * s->bs_width) >> 2]; if (bs0 || bs1) { - const int qp0 = (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1; - const int qp1 = (get_qPy(s, x + 4, y - 1) + get_qPy(s, x + 4, y) + 1) >> 1; + const int qp = (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1; tc_offset = x >= x0 ? cur_tc_offset : left_tc_offset; beta_offset = x >= x0 ? cur_beta_offset : left_beta_offset; - beta[0] = betatable[av_clip(qp0 + beta_offset, 0, MAX_QP)]; - beta[1] = betatable[av_clip(qp1 + beta_offset, 0, MAX_QP)]; - tc[0] = bs0 ? TC_CALC(qp0, bs0) : 0; - tc[1] = bs1 ? TC_CALC(qp1, bs1) : 0; + beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; + tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; + tc[1] = bs1 ? 
TC_CALC(qp, bs1) : 0; src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h index dfff7801f7..8564deb1a0 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h @@ -97,20 +97,20 @@ typedef struct HEVCDSPContext { int ox1, intptr_t mx, intptr_t my, int width); void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, - int *beta, int *tc, + int beta, int *tc, uint8_t *no_p, uint8_t *no_q); void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, - int *beta, int *tc, + int beta, int *tc, uint8_t *no_p, uint8_t *no_q); void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q); void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q); void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride, - int *beta, int *tc, + int beta, int *tc, uint8_t *no_p, uint8_t *no_q); void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride, - int *beta, int *tc, + int beta, int *tc, uint8_t *no_p, uint8_t *no_q); void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 7840ec7472..8ebe8d400f 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -1564,7 +1564,7 @@ static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, ptrdiff_t _xstride, ptrdiff_t _ystride, - int *_beta, int *_tc, + int beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q) { int d, j; @@ -1572,6 +1572,8 @@ static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, ptrdiff_t xstride = _xstride / sizeof(pixel); ptrdiff_t ystride = _ystride / sizeof(pixel); + beta <<= BIT_DEPTH - 8; + for (j = 0; j < 2; j++) { const int dp0 = abs(P2 - 2 * P1 + P0); const 
int dq0 = abs(Q2 - 2 * Q1 + Q0); @@ -1579,7 +1581,6 @@ static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); const int d0 = dp0 + dq0; const int d3 = dp3 + dq3; - const int beta = _beta[j] << (BIT_DEPTH - 8); const int tc = _tc[j] << (BIT_DEPTH - 8); const int no_p = _no_p[j]; const int no_q = _no_q[j]; @@ -1706,7 +1707,7 @@ static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, } static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, - int *beta, int *tc, uint8_t *no_p, + int beta, int *tc, uint8_t *no_p, uint8_t *no_q) { FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), @@ -1714,7 +1715,7 @@ static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, } static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, - int *beta, int *tc, uint8_t *no_p, + int beta, int *tc, uint8_t *no_p, uint8_t *no_q) { FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm index a5619b6113..395b20ee45 100644 --- a/libavcodec/x86/hevc_deblock.asm +++ b/libavcodec/x86/hevc_deblock.asm @@ -310,7 +310,7 @@ INIT_XMM sse2 %endmacro ALIGN 16 -; input in m0 ... m3 and tcs in tc (r2). Output in m1 and m2 +; input in m0 ... m3 and tcs in r2. Output in m1 and m2 %macro CHROMA_DEBLOCK_BODY 1 psubw m4, m2, m1; q0 - p0 psubw m5, m0, m3; p1 - q1 @@ -339,7 +339,7 @@ ALIGN 16 psubw m2, m5; q0 - delta0 %endmacro -; input in m0 ... m7, betas in r2 tcs in r3. Output in m1...m6 +; input in m0 ... m7, beta in r2 tcs in r3. 
Output in m1...m6 %macro LUMA_DEBLOCK_BODY 2 psllw m9, m2, 1; *2 psubw m10, m1, m9 @@ -352,20 +352,11 @@ ALIGN 16 ABS1 m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3 ;beta calculations - mov r11, [betaq]; %if %1 > 8 - shl r11, %1 - 8 + shl betaq, %1 - 8 %endif - movd m13, r11d; beta0 - add betaq, 4; - punpcklwd m13, m13 - mov r12, [betaq]; -%if %1 > 8 - shl r12, %1 - 8 -%endif - movd m14, r12d; beta1 - punpcklwd m14, m14 - pshufd m13, m14, 0; beta0, beta1 + movd m13, betaq + SPLATW m13, m13, 0 ;end beta calculations paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3 @@ -412,31 +403,31 @@ ALIGN 16 ; end calc for weak filter ; filtering mask - mov r2, r13 - shr r2, 3 - movd m15, r2d + mov r11, r13 + shr r11, 3 + movd m15, r11d and r13, 1 movd m11, r13d shufps m11, m15, 0 - shl r2, 1 - or r13, r2 + shl r11, 1 + or r13, r11 pcmpeqd m11, [pd_1]; filtering mask ;decide between strong and weak filtering ;tc25 calculations - mov r2d, [tcq]; + mov r11d, [tcq]; %if %1 > 8 - shl r2, %1 - 8 + shl r11, %1 - 8 %endif - movd m8, r2d; tc0 + movd m8, r11d; tc0 add tcq, 4; mov r3d, [tcq]; %if %1 > 8 shl r3, %1 - 8 %endif movd m9, r3d; tc1 - add r2d, r3d; tc0 + tc1 + add r11d, r3d; tc0 + tc1 jz .bypassluma punpcklwd m8, m8 punpcklwd m9, m9 @@ -460,8 +451,8 @@ ALIGN 16 psraw m13, 3; beta >> 3 pcmpgtw m13, m12; - movmskps r2, m13; - and r14, r2; strong mask , beta_2 and beta_3 comparisons + movmskps r11, m13; + and r14, r11; strong mask , beta_2 and beta_3 comparisons ;----beta_3 comparison end----- ;----tc25 comparison--- psubw m12, m3, m4; p0 - q0 @@ -471,24 +462,24 @@ ALIGN 16 pshuflw m12, m12, 0xf0 ;0b11110000; pcmpgtw m8, m12; tc25 comparisons - movmskps r2, m8; - and r14, r2; strong mask, beta_2, beta_3 and tc25 comparisons + movmskps r11, m8; + and r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons ;----tc25 comparison end--- - mov r2, r14; - shr r2, 1; - and r14, r2; strong mask, bits 2 and 0 + mov r11, r14; + shr r11, 1; + and r14, r11; strong mask, bits 2 and 0 pmullw m14, m9, [pw_m2]; -tc 
* 2 paddw m9, m9 and r14, 5; 0b101 - mov r2, r14; strong mask + mov r11, r14; strong mask shr r14, 2; movd m12, r14d; store to xmm for mask generation shl r14, 1 - and r2, 1 - movd m10, r2d; store to xmm for mask generation - or r14, r2; final strong mask, bits 1 and 0 + and r11, 1 + movd m10, r11d; store to xmm for mask generation + or r14, r11; final strong mask, bits 1 and 0 jz .weakfilter shufps m10, m12, 0 @@ -578,23 +569,18 @@ ALIGN 16 jz .store ; weak filtering mask - mov r2, r14 - shr r2, 1 - movd m12, r2d + mov r11, r14 + shr r11, 1 + movd m12, r11d and r14, 1 movd m11, r14d shufps m11, m12, 0 pcmpeqd m11, [pd_1]; filtering mask - mov r13, r11; beta0 + mov r13, betaq shr r13, 1; - add r11, r13 - shr r11, 3; ((beta0+(beta0>>1))>>3)) - - mov r13, r12; beta1 - shr r13, 1; - add r12, r13 - shr r12, 3; ((beta1+(beta1>>1))>>3)) + add betaq, r13 + shr betaq, 3; ((beta + (beta >> 1)) >> 3)) mova m13, [pw_8] psubw m12, m4, m3 ; q0 - p0 @@ -633,11 +619,8 @@ ALIGN 16 paddw m15, m2; p1' ;beta calculations - movd m10, r11d; beta0 - punpcklwd m10, m10 - movd m13, r12d; beta1 - punpcklwd m13, m13 - shufps m10, m13, 0; betax0, betax1 + movd m10, betaq + SPLATW m10, m10, 0 movd m13, r7d; 1dp0 + 1dp3 movd m8, r8d; 0dp0 + 0dp3