diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 596cfc5a76..651924196b 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -117,6 +117,7 @@ typedef struct { */ DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4]; DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16]; + DECLARE_ALIGNED(16, DCTELEM, block_dc)[16]; uint8_t intra4x4_pred_mode_mb[16]; int chroma_pred_mode; ///< 8x8c pred mode of the current macroblock @@ -864,22 +865,19 @@ static av_always_inline void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9]) { - LOCAL_ALIGNED_16(DCTELEM, dc,[16]); int i, x, y, luma_start = 0, luma_ctx = 3; int nnz_pred, nnz, nnz_total = 0; int segment = s->segment; if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) { - AV_ZERO128(dc); - AV_ZERO128(dc+8); nnz_pred = t_nnz[8] + l_nnz[8]; // decode DC values and do hadamard - nnz = decode_block_coeffs(c, dc, s->prob->token[1], 0, nnz_pred, + nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred, s->qmat[segment].luma_dc_qmul); l_nnz[8] = t_nnz[8] = !!nnz; nnz_total += nnz; - s->vp8dsp.vp8_luma_dc_wht(s->block, dc); + s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc); luma_start = 1; luma_ctx = 0; } diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c index f3f3fb6da0..5f5124803d 100644 --- a/libavcodec/vp8dsp.c +++ b/libavcodec/vp8dsp.c @@ -46,6 +46,10 @@ static void vp8_luma_dc_wht_c(DCTELEM block[4][4][16], DCTELEM dc[16]) t1 = dc[i*4+1] + dc[i*4+2]; t2 = dc[i*4+1] - dc[i*4+2]; t3 = dc[i*4+0] - dc[i*4+3] + 3; // rounding + dc[i*4+0] = 0; + dc[i*4+1] = 0; + dc[i*4+2] = 0; + dc[i*4+3] = 0; *block[i][0] = (t0 + t1) >> 3; *block[i][1] = (t3 + t2) >> 3; diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c index 4bf49364e7..aceec6a346 100644 --- a/libavcodec/x86/vp8dsp-init.c +++ b/libavcodec/x86/vp8dsp-init.c @@ -224,6 +224,7 @@ extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int str extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride); extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride); extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); +extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]); extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); @@ -335,6 +336,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) if (mm_flags & FF_MM_SSE) { c->vp8_idct_add = ff_vp8_idct_add_sse; + c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; c->put_vp8_epel_pixels_tab[0][0][0] = c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; } diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 4f430d80c8..6999e87b63 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -1186,12 +1186,23 @@ VP8_IDCT_ADD sse SWAP %1, %4, %3 %endmacro -INIT_MMX -cglobal vp8_luma_dc_wht_mmx, 2,3 +%macro VP8_DC_WHT 1 +cglobal vp8_luma_dc_wht_%1, 2,3 movq m0, [r1] movq m1, [r1+8] movq m2, [r1+16] movq m3, [r1+24] +%ifidn %1, sse + xorps xmm0, xmm0 + movaps [r1+ 0], xmm0 + movaps [r1+16], xmm0 +%else + pxor m4, m4 + movq [r1+ 0], m4 + movq [r1+ 8], m4 + movq [r1+16], m4 + movq [r1+24], m4 +%endif HADAMARD4_1D 0, 1, 2, 3 TRANSPOSE4x4W 0, 1, 2, 3, 4 paddw m0, [pw_3] @@ -1203,6 +1214,11 @@ cglobal vp8_luma_dc_wht_mmx, 2,3 SCATTER_WHT 0, 1, 0 SCATTER_WHT 2, 3, 2 RET +%endmacro + +INIT_MMX +VP8_DC_WHT mmx +VP8_DC_WHT sse ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter_simple_(uint8_t *dst, int stride, int flim);