mirror of
https://gitee.com/openharmony/third_party_ffmpeg
synced 2024-11-23 11:19:55 +00:00
H.264: split luma dc idct out and implement MMX/SSE2 versions
About 2.5x the speed. NOTE: the way that the asm code handles large qmuls is a bit suboptimal. If x264-style dequant was used (separate shift and qmul values), it might be possible to get some extra speed. Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
6c18f1cda2
commit
19fb234e4a
@ -64,6 +64,10 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl
|
||||
void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
|
||||
void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
|
||||
|
||||
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul);
|
||||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
|
||||
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
|
||||
|
||||
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
|
||||
const float *win, float add_bias, int len);
|
||||
void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
|
||||
|
@ -246,46 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* IDCT transforms the 16 dc values and dequantizes them.
|
||||
* @param qp quantization parameter
|
||||
*/
|
||||
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
|
||||
#define stride 16
|
||||
int i;
|
||||
int temp[16]; //FIXME check if this is a good idea
|
||||
static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
|
||||
static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
|
||||
|
||||
//memset(block, 64, 2*256);
|
||||
//return;
|
||||
for(i=0; i<4; i++){
|
||||
const int offset= y_offset[i];
|
||||
const int z0= block[offset+stride*0] + block[offset+stride*4];
|
||||
const int z1= block[offset+stride*0] - block[offset+stride*4];
|
||||
const int z2= block[offset+stride*1] - block[offset+stride*5];
|
||||
const int z3= block[offset+stride*1] + block[offset+stride*5];
|
||||
|
||||
temp[4*i+0]= z0+z3;
|
||||
temp[4*i+1]= z1+z2;
|
||||
temp[4*i+2]= z1-z2;
|
||||
temp[4*i+3]= z0-z3;
|
||||
}
|
||||
|
||||
for(i=0; i<4; i++){
|
||||
const int offset= x_offset[i];
|
||||
const int z0= temp[4*0+i] + temp[4*2+i];
|
||||
const int z1= temp[4*0+i] - temp[4*2+i];
|
||||
const int z2= temp[4*1+i] - temp[4*3+i];
|
||||
const int z3= temp[4*1+i] + temp[4*3+i];
|
||||
|
||||
block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
|
||||
block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
|
||||
block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
|
||||
block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
/**
|
||||
* DCT transforms the 16 dc values.
|
||||
@ -1245,9 +1205,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
|
||||
h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
|
||||
if(is_h264){
|
||||
if(!transform_bypass)
|
||||
h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
|
||||
h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
|
||||
else{
|
||||
static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
|
||||
8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
|
||||
for(i = 0; i < 16; i++)
|
||||
h->mb[dc_mapping[i]] = h->mb_luma_dc[i];
|
||||
}
|
||||
}else
|
||||
ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
|
||||
ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
|
||||
}
|
||||
if(h->deblocking_filter)
|
||||
xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
|
||||
|
@ -406,6 +406,7 @@ typedef struct H264Context{
|
||||
GetBitContext *inter_gb_ptr;
|
||||
|
||||
DECLARE_ALIGNED(16, DCTELEM, mb)[16*24];
|
||||
DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16];
|
||||
DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
|
||||
|
||||
/**
|
||||
@ -600,10 +601,6 @@ typedef struct H264Context{
|
||||
|
||||
extern const uint8_t ff_h264_chroma_qp[52];
|
||||
|
||||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
|
||||
|
||||
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
|
||||
|
||||
/**
|
||||
* Decode SEI
|
||||
*/
|
||||
|
@ -1597,17 +1597,15 @@ decode_intra_mb:
|
||||
s->current_picture.mb_type[mb_xy]= mb_type;
|
||||
|
||||
if( cbp || IS_INTRA16x16( mb_type ) ) {
|
||||
const uint8_t *scan, *scan8x8, *dc_scan;
|
||||
const uint8_t *scan, *scan8x8;
|
||||
const uint32_t *qmul;
|
||||
|
||||
if(IS_INTERLACED(mb_type)){
|
||||
scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
|
||||
scan= s->qscale ? h->field_scan : h->field_scan_q0;
|
||||
dc_scan= luma_dc_field_scan;
|
||||
}else{
|
||||
scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
|
||||
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
|
||||
dc_scan= luma_dc_zigzag_scan;
|
||||
}
|
||||
|
||||
// decode_cabac_mb_dqp
|
||||
@ -1642,7 +1640,9 @@ decode_intra_mb:
|
||||
if( IS_INTRA16x16( mb_type ) ) {
|
||||
int i;
|
||||
//av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
|
||||
decode_cabac_residual_dc( h, h->mb, 0, 0, dc_scan, 16);
|
||||
AV_ZERO128(h->mb_luma_dc+0);
|
||||
AV_ZERO128(h->mb_luma_dc+8);
|
||||
decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16);
|
||||
|
||||
if( cbp&15 ) {
|
||||
qmul = h->dequant4_coeff[0][s->qscale];
|
||||
|
@ -911,16 +911,14 @@ decode_intra_mb:
|
||||
int i8x8, i4x4, chroma_idx;
|
||||
int dquant;
|
||||
GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
|
||||
const uint8_t *scan, *scan8x8, *dc_scan;
|
||||
const uint8_t *scan, *scan8x8;
|
||||
|
||||
if(IS_INTERLACED(mb_type)){
|
||||
scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
|
||||
scan= s->qscale ? h->field_scan : h->field_scan_q0;
|
||||
dc_scan= luma_dc_field_scan;
|
||||
}else{
|
||||
scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
|
||||
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
|
||||
dc_scan= luma_dc_zigzag_scan;
|
||||
}
|
||||
|
||||
dquant= get_se_golomb(&s->gb);
|
||||
@ -939,7 +937,9 @@ decode_intra_mb:
|
||||
h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
|
||||
h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
|
||||
if(IS_INTRA16x16(mb_type)){
|
||||
if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
|
||||
AV_ZERO128(h->mb_luma_dc+0);
|
||||
AV_ZERO128(h->mb_luma_dc+8);
|
||||
if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
|
||||
return -1; //FIXME continue if partitioned and other return -1 too
|
||||
}
|
||||
|
||||
|
@ -282,6 +282,7 @@ void ff_h264dsp_init(H264DSPContext *c)
|
||||
c->h264_idct8_add4 = ff_h264_idct8_add4_c;
|
||||
c->h264_idct_add8 = ff_h264_idct_add8_c;
|
||||
c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
|
||||
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c;
|
||||
|
||||
c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
|
||||
c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
|
||||
|
@ -65,11 +65,13 @@ typedef struct H264DSPContext{
|
||||
void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
|
||||
void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
|
||||
void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
|
||||
|
||||
void (*h264_dct)(DCTELEM block[4][4]);
|
||||
void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
||||
void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
||||
void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
||||
void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
|
||||
void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
|
||||
}H264DSPContext;
|
||||
|
||||
void ff_h264dsp_init(H264DSPContext *c);
|
||||
|
@ -216,3 +216,38 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block
|
||||
ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* IDCT transforms the 16 dc values and dequantizes them.
|
||||
* @param qp quantization parameter
|
||||
*/
|
||||
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){
|
||||
#define stride 16
|
||||
int i;
|
||||
int temp[16];
|
||||
static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride};
|
||||
|
||||
for(i=0; i<4; i++){
|
||||
const int z0= input[4*i+0] + input[4*i+1];
|
||||
const int z1= input[4*i+0] - input[4*i+1];
|
||||
const int z2= input[4*i+2] - input[4*i+3];
|
||||
const int z3= input[4*i+2] + input[4*i+3];
|
||||
|
||||
temp[4*i+0]= z0+z3;
|
||||
temp[4*i+1]= z0-z3;
|
||||
temp[4*i+2]= z1-z2;
|
||||
temp[4*i+3]= z1+z2;
|
||||
}
|
||||
|
||||
for(i=0; i<4; i++){
|
||||
const int offset= x_offset[i];
|
||||
const int z0= temp[4*0+i] + temp[4*2+i];
|
||||
const int z1= temp[4*0+i] - temp[4*2+i];
|
||||
const int z2= temp[4*1+i] - temp[4*3+i];
|
||||
const int z3= temp[4*1+i] + temp[4*3+i];
|
||||
|
||||
output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8));
|
||||
output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
|
||||
output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
|
||||
output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
|
||||
}
|
||||
}
|
||||
|
@ -126,21 +126,19 @@ static const uint32_t svq3_dequant_coeff[32] = {
|
||||
};
|
||||
|
||||
|
||||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
|
||||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp)
|
||||
{
|
||||
const int qmul = svq3_dequant_coeff[qp];
|
||||
#define stride 16
|
||||
int i;
|
||||
int temp[16];
|
||||
static const int x_offset[4] = {0, 1*stride, 4* stride, 5*stride};
|
||||
static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride};
|
||||
|
||||
for (i = 0; i < 4; i++){
|
||||
const int offset = y_offset[i];
|
||||
const int z0 = 13*(block[offset+stride*0] + block[offset+stride*4]);
|
||||
const int z1 = 13*(block[offset+stride*0] - block[offset+stride*4]);
|
||||
const int z2 = 7* block[offset+stride*1] - 17*block[offset+stride*5];
|
||||
const int z3 = 17* block[offset+stride*1] + 7*block[offset+stride*5];
|
||||
const int z0= 13*(input[4*i+0] + input[4*i+1]);
|
||||
const int z1= 13*(input[4*i+0] - input[4*i+1]);
|
||||
const int z2= 7* input[4*i+2] - 17*input[4*i+3];
|
||||
const int z3= 17* input[4*i+2] + 7*input[4*i+3];
|
||||
|
||||
temp[4*i+0] = z0+z3;
|
||||
temp[4*i+1] = z1+z2;
|
||||
@ -155,10 +153,10 @@ void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
|
||||
const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i];
|
||||
const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i];
|
||||
|
||||
block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
|
||||
block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
|
||||
block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
|
||||
block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
|
||||
output[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
|
||||
output[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
|
||||
output[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
|
||||
output[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
|
||||
}
|
||||
}
|
||||
#undef stride
|
||||
|
@ -41,6 +41,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
|
||||
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
|
||||
{0x8000000080000000ULL, 0x8000000080000000ULL};
|
||||
|
||||
DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
|
||||
/* 128-bit constant: 16-byte aligned like the other xmm_reg constants so that aligned SSE memory operands are safe */
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
|
||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
|
||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
|
||||
|
@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
|
||||
%endif
|
||||
|
||||
cextern pw_32
|
||||
cextern pw_1
|
||||
|
||||
SECTION .text
|
||||
|
||||
@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8
|
||||
add8_sse2_cycle 2, 0x21
|
||||
add8_sse2_cycle 3, 0x29
|
||||
RET
|
||||
|
||||
;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
|
||||
|
||||
; WALSH4_1D r0, r1, r2, r3, tmp
; Runs two SUMSUB_BADC stages over registers m%1..m%4 (m%5 is handed to
; SUMSUB_BADC as its fifth operand) and then renames registers with SWAP.
; NOTE(review): judging by the name this is one 1-D pass of the 4-point
; Walsh-Hadamard transform with m%5 as scratch; SUMSUB_BADC and SWAP are
; defined outside this view - confirm there.
%macro WALSH4_1D 5
|
||||
SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
|
||||
SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
|
||||
SWAP %1, %4, %3
|
||||
%endmacro
|
||||
|
||||
; DEQUANT_MMX a, b, shift
; Dequantizes, in place, the eight 16-bit values held in MMX registers %1 and %2.
; Every word is interleaved with a word loaded from pw_1 and multiplied-and-added
; (pmaddwd) against t3d broadcast to both dwords, so each 32-bit lane becomes
;   value * low16(t3d) + pw_1_word * high16(t3d)
; The lanes are then shifted right arithmetically by %3 and packed back to
; words with signed saturation. Clobbers m4, m5 and m7.
; NOTE(review): pw_1 is presumably the all-ones-words constant (0x0001 per word),
; which makes high16(t3d) a pure rounding term - confirm against its definition.
%macro DEQUANT_MMX 3
|
||||
mova m7, [pw_1]
|
||||
; split %1 into low (kept in %1) and high (m4) halves, each word paired with a pw_1 word
mova m4, %1
|
||||
punpcklwd %1, m7
|
||||
punpckhwd m4, m7
|
||||
; same split for %2 (high half goes to m5)
mova m5, %2
|
||||
punpcklwd %2, m7
|
||||
punpckhwd m5, m7
|
||||
; m7 = t3d replicated into both dwords
movd m7, t3d
|
||||
punpckldq m7, m7
|
||||
pmaddwd %1, m7
|
||||
pmaddwd %2, m7
|
||||
pmaddwd m4, m7
|
||||
pmaddwd m5, m7
|
||||
psrad %1, %3
|
||||
psrad %2, %3
|
||||
psrad m4, %3
|
||||
psrad m5, %3
|
||||
; repack the 32-bit results into 16-bit words (signed saturation)
packssdw %1, m4
|
||||
packssdw %2, m5
|
||||
%endmacro
|
||||
|
||||
; STORE_WORDS_MMX src, i0, i1, i2, i3
; Scatters the four 16-bit words of MMX register %1 (lowest word first) to
; [t2 + i*32] for i = %2, %3, %4, %5 respectively. Each store is 16 bits wide,
; so consecutive indices are 16 words apart. %1 is shifted in the process and
; t0/t1 are clobbered.
%macro STORE_WORDS_MMX 5
|
||||
; t0d = words 0 and 1, t1d = words 2 and 3
movd t0d, %1
|
||||
psrlq %1, 32
|
||||
movd t1d, %1
|
||||
mov [t2+%2*32], t0w
|
||||
mov [t2+%4*32], t1w
|
||||
; bring the upper word of each pair down into t0w/t1w
shr t0d, 16
|
||||
shr t1d, 16
|
||||
mov [t2+%3*32], t0w
|
||||
mov [t2+%5*32], t1w
|
||||
%endmacro
|
||||
|
||||
; DEQUANT_STORE_MMX shift
; Dequantizes m0..m3 two registers at a time with DEQUANT_MMX (using %1 as the
; shift operand) and scatters the 16 resulting words to the buffer addressed
; by t2 via STORE_WORDS_MMX.
%macro DEQUANT_STORE_MMX 1
|
||||
DEQUANT_MMX m0, m1, %1
|
||||
STORE_WORDS_MMX m0, 0, 1, 4, 5
|
||||
STORE_WORDS_MMX m1, 2, 3, 6, 7
|
||||
|
||||
DEQUANT_MMX m2, m3, %1
|
||||
STORE_WORDS_MMX m2, 8, 9, 12, 13
|
||||
STORE_WORDS_MMX m3, 10, 11, 14, 15
|
||||
%endmacro
|
||||
|
||||
; STORE_WORDS_SSE src, i0, i1, i2, i3, i4, i5, i6, i7
; Scatters the eight 16-bit words of XMM register %1 (lowest word first) to
; [t2 + i*32] for i = %2 .. %9 in order. %1 is shifted in the process and
; t0/t1 are clobbered.
%macro STORE_WORDS_SSE 9
|
||||
; t0d = words 0 and 1, t1d = words 2 and 3
movd t0d, %1
|
||||
psrldq %1, 4
|
||||
movd t1d, %1
|
||||
psrldq %1, 4
|
||||
mov [t2+%2*32], t0w
|
||||
mov [t2+%4*32], t1w
|
||||
shr t0d, 16
|
||||
shr t1d, 16
|
||||
mov [t2+%3*32], t0w
|
||||
mov [t2+%5*32], t1w
|
||||
; t0d = words 4 and 5, t1d = words 6 and 7
movd t0d, %1
|
||||
psrldq %1, 4
|
||||
movd t1d, %1
|
||||
mov [t2+%6*32], t0w
|
||||
mov [t2+%8*32], t1w
|
||||
shr t0d, 16
|
||||
shr t1d, 16
|
||||
mov [t2+%7*32], t0w
|
||||
mov [t2+%9*32], t1w
|
||||
%endmacro
|
||||
|
||||
; DEQUANT_STORE_SSE2 shift
; SSE2 form of the dequantize-and-store step. The four MMX registers m0..m3
; are copied into xmm0..xmm3, every word is interleaved with a word from pw_1,
; multiplied-and-added (pmaddwd) against t3d broadcast to all four dwords,
; shifted right arithmetically by %1, packed back to words with signed
; saturation and scattered to the buffer addressed by t2 with STORE_WORDS_SSE.
; Uses xmm0-xmm5.
%macro DEQUANT_STORE_SSE2 1
|
||||
; xmm4 = t3d in every dword, xmm5 = low 8 bytes of pw_1
movd xmm4, t3d
|
||||
movq xmm5, [pw_1]
|
||||
pshufd xmm4, xmm4, 0
|
||||
movq2dq xmm0, m0
|
||||
movq2dq xmm1, m1
|
||||
movq2dq xmm2, m2
|
||||
movq2dq xmm3, m3
|
||||
punpcklwd xmm0, xmm5
|
||||
punpcklwd xmm1, xmm5
|
||||
punpcklwd xmm2, xmm5
|
||||
punpcklwd xmm3, xmm5
|
||||
pmaddwd xmm0, xmm4
|
||||
pmaddwd xmm1, xmm4
|
||||
pmaddwd xmm2, xmm4
|
||||
pmaddwd xmm3, xmm4
|
||||
psrad xmm0, %1
|
||||
psrad xmm1, %1
|
||||
psrad xmm2, %1
|
||||
psrad xmm3, %1
|
||||
; xmm0 = results of m0 then m1, xmm2 = results of m2 then m3
packssdw xmm0, xmm1
|
||||
packssdw xmm2, xmm3
|
||||
STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7
|
||||
STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15
|
||||
%endmacro
|
||||
|
||||
; IDCT_DC_DEQUANT suffix, n
; Emits the function h264_luma_dc_dequant_idct_<suffix>. It loads 32 bytes
; (16 words) from [r1], applies WALSH4_1D, a 4x4 word transpose and WALSH4_1D
; again, then dequantizes and scatters the 16 results with the MMX or SSE2
; store macro depending on whether %1 is "mmx".
; %2 is forwarded as the last cglobal argument.
; NOTE(review): by the x86inc cglobal convention that is presumably the number
; of XMM registers used - confirm.
%macro IDCT_DC_DEQUANT 2
|
||||
cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
|
||||
movq m3, [r1+24]
|
||||
movq m2, [r1+16]
|
||||
movq m1, [r1+ 8]
|
||||
movq m0, [r1+ 0]
|
||||
WALSH4_1D 0,1,2,3,4
|
||||
TRANSPOSE4x4W 0,1,2,3,4
|
||||
WALSH4_1D 0,1,2,3,4
|
||||
|
||||
; shift, tmp, output, qmul
|
||||
%ifdef WIN64
|
||||
DECLARE_REG_TMP 0,3,1,2
|
||||
; we can't avoid this, because r0 is the shift register (ecx) on win64
|
||||
xchg r0, t2
|
||||
%elifdef ARCH_X86_64
|
||||
DECLARE_REG_TMP 3,1,0,2
|
||||
%else
|
||||
DECLARE_REG_TMP 1,3,0,2
|
||||
%endif
|
||||
|
||||
; fast path: qmul (t3d) fits in a signed 16-bit pmaddwd operand
cmp t3d, 32767
|
||||
jg .big_qmul
|
||||
; low word of t3d = qmul, high word = 128, i.e. the rounding term for the shift by 8
add t3d, 128 << 16
|
||||
%ifidn %1,mmx
|
||||
DEQUANT_STORE_MMX 8
|
||||
%else
|
||||
DEQUANT_STORE_SSE2 8
|
||||
%endif
|
||||
RET
|
||||
; slow path: pre-shift qmul together with the rounding term and shorten the
; final shift by the same amount
.big_qmul:
|
||||
; t0d = index of the highest set bit of qmul, clamped below to at most 7
bsr t0d, t3d
|
||||
add t3d, 128 << 16
|
||||
mov t1d, 7
|
||||
cmp t0d, t1d
|
||||
cmovg t0d, t1d
|
||||
; t1d = 8 - t0d becomes the shift count passed to the store macro
inc t1d
|
||||
shr t3d, t0b
|
||||
sub t1d, t0d
|
||||
%ifidn %1,mmx
|
||||
movd m6, t1d
|
||||
DEQUANT_STORE_MMX m6
|
||||
%else
|
||||
movd xmm6, t1d
|
||||
DEQUANT_STORE_SSE2 xmm6
|
||||
%endif
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; Both variants are instantiated under INIT_MMX: the transform part of
; IDCT_DC_DEQUANT works on m0..m4 in either case, and the SSE2 dequant macro
; names its xmm registers explicitly.
INIT_MMX
|
||||
IDCT_DC_DEQUANT mmx, 0
|
||||
IDCT_DC_DEQUANT sse2, 7
|
||||
|
@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM
|
||||
int stride, const uint8_t nnzc[6*8]);
|
||||
void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block,
|
||||
int stride, const uint8_t nnzc[6*8]);
|
||||
void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
|
||||
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
|
||||
|
||||
/***********************************/
|
||||
/* deblocking */
|
||||
@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
|
||||
c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
|
||||
c->h264_idct_add8 = ff_h264_idct_add8_mmx;
|
||||
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
|
||||
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;
|
||||
|
||||
if (mm_flags & AV_CPU_FLAG_MMX2) {
|
||||
c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
|
||||
@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
|
||||
if (mm_flags&AV_CPU_FLAG_SSE2) {
|
||||
c->h264_idct8_add = ff_h264_idct8_add_sse2;
|
||||
c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
|
||||
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
|
||||
|
||||
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
|
||||
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
|
||||
|
Loading…
Reference in New Issue
Block a user