H.264: switch to x264-style tracking of luma/chroma DC NNZ

Useful so that we don't have to run the hierarchical DC iDCT if there aren't
any coefficients.  Opens up some future opportunities for optimization as well.

Originally committed as revision 26337 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Jason Garrett-Glaser 2011-01-14 21:36:16 +00:00
parent 19fb234e4a
commit 5657d14094
4 changed files with 33 additions and 15 deletions

View File

@ -1203,6 +1203,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
} }
}else{ }else{
h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize); h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
if(h->non_zero_count_cache[ scan8[LUMA_DC_BLOCK_INDEX] ]){
if(is_h264){ if(is_h264){
if(!transform_bypass) if(!transform_bypass)
h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]); h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
@ -1214,6 +1215,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
} }
}else }else
ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale); ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
}
} }
if(h->deblocking_filter) if(h->deblocking_filter)
xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple); xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
@ -1281,8 +1283,10 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
} }
} }
}else{ }else{
chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]); if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ])
chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]); chroma_dc_dequant_idct_c(h->mb + 16*16 , h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ])
chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
if(is_h264){ if(is_h264){
h->h264dsp.h264_idct_add8(dest, block_offset, h->h264dsp.h264_idct_add8(dest, block_offset,
h->mb, uvlinesize, h->mb, uvlinesize,

View File

@ -39,8 +39,8 @@
#define interlaced_dct interlaced_dct_is_a_bad_name #define interlaced_dct interlaced_dct_is_a_bad_name
#define mb_intra mb_intra_is_not_initialized_see_mb_type #define mb_intra mb_intra_is_not_initialized_see_mb_type
#define LUMA_DC_BLOCK_INDEX 25 #define LUMA_DC_BLOCK_INDEX 24
#define CHROMA_DC_BLOCK_INDEX 26 #define CHROMA_DC_BLOCK_INDEX 25
#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
#define COEFF_TOKEN_VLC_BITS 8 #define COEFF_TOKEN_VLC_BITS 8
@ -722,8 +722,20 @@ o-o o-o
/ / / / / /
o-o o-o o-o o-o
*/ */
/* Scan8 organization:
 *   0 1 2 3 4 5 6 7
 * 0   u u y y y y y
 * 1 u U U y Y Y Y Y
 * 2 u U U y Y Y Y Y
 * 3   v v y Y Y Y Y
 * 4 v V V y Y Y Y Y
 * 5 v V V   DYDUDV
 * DY/DU/DV are for luma/chroma DC.
 */
//This table must be here because scan8[constant] must be known at compiletime //This table must be here because scan8[constant] must be known at compiletime
static const uint8_t scan8[16 + 2*4]={ static const uint8_t scan8[16 + 2*4 + 3]={
4+1*8, 5+1*8, 4+2*8, 5+2*8, 4+1*8, 5+1*8, 4+2*8, 5+2*8,
6+1*8, 7+1*8, 6+2*8, 7+2*8, 6+1*8, 7+1*8, 6+2*8, 7+2*8,
4+3*8, 5+3*8, 4+4*8, 5+4*8, 4+3*8, 5+3*8, 4+4*8, 5+4*8,
@ -732,6 +744,7 @@ static const uint8_t scan8[16 + 2*4]={
1+2*8, 2+2*8, 1+2*8, 2+2*8,
1+4*8, 2+4*8, 1+4*8, 2+4*8,
1+5*8, 2+5*8, 1+5*8, 2+5*8,
4+5*8, 5+5*8, 6+5*8
}; };
static av_always_inline uint32_t pack16to32(int a, int b){ static av_always_inline uint32_t pack16to32(int a, int b){

View File

@ -965,6 +965,7 @@ static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx,
nza = h->left_cbp&0x100; nza = h->left_cbp&0x100;
nzb = h-> top_cbp&0x100; nzb = h-> top_cbp&0x100;
} else { } else {
idx -= CHROMA_DC_BLOCK_INDEX;
nza = (h->left_cbp>>(6+idx))&0x01; nza = (h->left_cbp>>(6+idx))&0x01;
nzb = (h-> top_cbp>>(6+idx))&0x01; nzb = (h-> top_cbp>>(6+idx))&0x01;
} }
@ -1060,8 +1061,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
/* read coded block flag */ /* read coded block flag */
if( is_dc || cat != 5 ) { if( is_dc || cat != 5 ) {
if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) { if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
if( !is_dc ) h->non_zero_count_cache[scan8[n]] = 0;
h->non_zero_count_cache[scan8[n]] = 0;
#ifdef CABAC_ON_STACK #ifdef CABAC_ON_STACK
h->cabac.range = cc.range ; h->cabac.range = cc.range ;
@ -1112,7 +1112,8 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
if( cat == 0 ) if( cat == 0 )
h->cbp_table[h->mb_xy] |= 0x100; h->cbp_table[h->mb_xy] |= 0x100;
else else
h->cbp_table[h->mb_xy] |= 0x40 << n; h->cbp_table[h->mb_xy] |= 0x40 << (n - CHROMA_DC_BLOCK_INDEX);
h->non_zero_count_cache[scan8[n]] = coeff_count;
} else { } else {
if( cat == 5 ) if( cat == 5 )
fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
@ -1642,7 +1643,7 @@ decode_intra_mb:
//av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" ); //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
AV_ZERO128(h->mb_luma_dc+0); AV_ZERO128(h->mb_luma_dc+0);
AV_ZERO128(h->mb_luma_dc+8); AV_ZERO128(h->mb_luma_dc+8);
decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16); decode_cabac_residual_dc( h, h->mb_luma_dc, 0, LUMA_DC_BLOCK_INDEX, scan, 16);
if( cbp&15 ) { if( cbp&15 ) {
qmul = h->dequant4_coeff[0][s->qscale]; qmul = h->dequant4_coeff[0][s->qscale];
@ -1681,7 +1682,7 @@ decode_intra_mb:
int c; int c;
for( c = 0; c < 2; c++ ) { for( c = 0; c < 2; c++ ) {
//av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
decode_cabac_residual_dc(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, 4); decode_cabac_residual_dc(h, h->mb + 256 + 16*4*c, 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
} }
} }

View File

@ -371,7 +371,7 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
//FIXME put trailing_onex into the context //FIXME put trailing_onex into the context
if(n == CHROMA_DC_BLOCK_INDEX){ if(n >= CHROMA_DC_BLOCK_INDEX){
coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1); coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
total_coeff= coeff_token>>2; total_coeff= coeff_token>>2;
}else{ }else{
@ -383,9 +383,9 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
total_coeff= pred_non_zero_count(h, n); total_coeff= pred_non_zero_count(h, n);
coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2); coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
total_coeff= coeff_token>>2; total_coeff= coeff_token>>2;
h->non_zero_count_cache[ scan8[n] ]= total_coeff;
} }
} }
h->non_zero_count_cache[ scan8[n] ]= total_coeff;
//FIXME set last_non_zero? //FIXME set last_non_zero?
@ -482,14 +482,14 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
if(total_coeff == max_coeff) if(total_coeff == max_coeff)
zeros_left=0; zeros_left=0;
else{ else{
if(n == CHROMA_DC_BLOCK_INDEX) if(n >= CHROMA_DC_BLOCK_INDEX)
zeros_left= get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[ total_coeff ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1); zeros_left= get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[ total_coeff ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
else else
zeros_left= get_vlc2(gb, (total_zeros_vlc-1)[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1); zeros_left= get_vlc2(gb, (total_zeros_vlc-1)[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1);
} }
scantable += zeros_left + total_coeff - 1; scantable += zeros_left + total_coeff - 1;
if(n > 24){ if(n >= LUMA_DC_BLOCK_INDEX){
block[*scantable] = level[0]; block[*scantable] = level[0];
for(i=1;i<total_coeff && zeros_left > 0;i++) { for(i=1;i<total_coeff && zeros_left > 0;i++) {
if(zeros_left < 7) if(zeros_left < 7)
@ -988,7 +988,7 @@ decode_intra_mb:
if(cbp&0x30){ if(cbp&0x30){
for(chroma_idx=0; chroma_idx<2; chroma_idx++) for(chroma_idx=0; chroma_idx<2; chroma_idx++)
if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){ if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
return -1; return -1;
} }
} }