Merge remote branch 'qatar/master'

* qatar/master: (23 commits) ac3enc: correct the flipped sign in the ac3_fixed encoder Eliminate pointless '#if 1' statements without matching '#else'. Add AVX FFT implementation. Increase alignment of av_malloc() as needed by AVX ASM. Update x86inc.asm from x264 to allow AVX emulation using SSE and MMX. mjpeg: Detect overreads in mjpeg_decode_scan() and error out. documentation: extend documentation for ffmpeg -aspect option APIChanges: update commit hashes for recent additions. lavc: deprecate FF_*_TYPE macros in favor of AV_PICTURE_TYPE_* enums aac: add headers needed for log2f() lavc: remove FF_API_MB_Q cruft lavc: remove FF_API_RATE_EMU cruft lavc: remove FF_API_HURRY_UP cruft pad: make the filter parametric vsrc_movie: add key_frame and pict_type. vsrc_movie: fix leak in request_frame() lavfi: add key_frame and pict_type to AVFilterBufferRefVideo. vsrc_buffer: add sample_aspect_ratio fields to arguments. lavfi: add fieldorder filter scale: make the filter parametric ... Conflicts: Changelog doc/filters.texi ffmpeg.c libavcodec/ac3dec.h libavcodec/dsputil.c libavfilter/avfilter.h libavfilter/vf_scale.c libavfilter/vf_yadif.c libavfilter/vsrc_buffer.c Merged-by: Michael Niedermayer <michaelni@gmx.at>
2024-11-23 19:30:05 +00:00 · 2011-04-27 03:51:04 +02:00 · 2011-04-27 03:51:04 +02:00 · d7e5aebae7
commit d7e5aebae7
parent 93c28a55fd 79ee8977c2
64 changed files with 807 additions and 424 deletions
--- a/2
+++ b/2
@ -10,7 +10,7 @@ version <next>:
 - libxvid aspect pickiness fixed
 - Frame multithreaded decoding
 - Lots of deprecated API cruft removed
-
+- fft and imdct optimizations for AVX (Sandy Bridge) processors
 version 0.7_beta1:
--- a/doc/APIchanges
+++ b/doc/APIchanges
@ -13,6 +13,17 @@ libavutil:   2011-04-18
 API changes, most recent first:
 2011-04-XX - bebe72f - lavu 51.1.0 - avutil.h
  Add AVPictureType enum and av_get_picture_type_char(), deprecate
  FF_*_TYPE defines and av_get_pict_type_char() defined in
  libavcodec/avcodec.h.
 2011-04-xx - 10d3940 - lavfi 2.3.0 - avfilter.h
  Add pict_type and key_frame fields to AVFilterBufferRefVideo.
 2011-04-xx - 7a11c82 - lavfi 2.2.0 - vsrc_buffer
  Add sample_aspect_ratio fields to vsrc_buffer arguments
 2011-04-21 - 94f7451 - lavc 53.1.0 - avcodec.h
  Add CODEC_CAP_SLICE_THREADS for codecs supporting sliced threading.
--- a/ffmpeg.c
+++ b/ffmpeg.c
@ -2908,6 +2908,10 @@ static void opt_frame_aspect_ratio(const char *arg)
        ffmpeg_exit(1);
    }
    frame_aspect_ratio = ar;
    x = vfilters ? strlen(vfilters) : 0;
    vfilters = av_realloc(vfilters, x+100);
    snprintf(vfilters+x, x+100, "%csetdar=%f\n", x?',':' ', ar);
 }
 static int opt_metadata(const char *opt, const char *arg)
--- a/ffserver.c
+++ b/ffserver.c
@ -2185,10 +2185,8 @@ static int open_input_stream(HTTPContext *c, const char *info)
        }
    }
 #if 1
    if (c->fmt_in->iformat->read_seek)
        av_seek_frame(c->fmt_in, -1, stream_pos, 0);
 #endif
    /* set the start time (needed for maxtime and RTP packet timing) */
    c->start_time = cur_time;
    c->first_pts = AV_NOPTS_VALUE;
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@ -223,9 +223,9 @@ typedef struct {
    float sf[120];                                  ///< scalefactors
    int sf_idx[128];                                ///< scalefactor indices (used by encoder)
    uint8_t zeroes[128];                            ///< band is not coded (used by encoder)
-    DECLARE_ALIGNED(16, float,   coeffs)[1024];     ///< coefficients for IMDCT
+    DECLARE_ALIGNED(32, float,   coeffs)[1024];     ///< coefficients for IMDCT
-    DECLARE_ALIGNED(16, float,   saved)[1024];      ///< overlap
+    DECLARE_ALIGNED(32, float,   saved)[1024];      ///< overlap
-    DECLARE_ALIGNED(16, float,   ret)[2048];        ///< PCM output
+    DECLARE_ALIGNED(32, float,   ret)[2048];        ///< PCM output
    DECLARE_ALIGNED(16, int16_t, ltp_state)[3072];  ///< time signal for LTP
    PredictorState predictor_state[MAX_PREDICTORS];
 } SingleChannelElement;
@ -272,7 +272,7 @@ typedef struct {
     * @defgroup temporary aligned temporary buffers (We do not want to have these on the stack.)
     * @{
     */
-    DECLARE_ALIGNED(16, float, buf_mdct)[1024];
+    DECLARE_ALIGNED(32, float, buf_mdct)[1024];
    /** @} */
    /**
@ -296,7 +296,7 @@ typedef struct {
    int sf_offset;                                    ///< offset into pow2sf_tab as appropriate for dsp.float_to_int16
    /** @} */
-    DECLARE_ALIGNED(16, float, temp)[128];
+    DECLARE_ALIGNED(32, float, temp)[128];
    enum OCStatus output_configured;
 } AACContext;
--- a/libavcodec/aaccoder.c
+++ b/libavcodec/aaccoder.c
@ -37,6 +37,7 @@
 #include "aac.h"
 #include "aacenc.h"
 #include "aactab.h"
 #include "libavutil/libm.h"
 /** bits needed to code codebook run value for long windows */
 static const uint8_t run_value_bits_long[64] = {
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@ -64,7 +64,7 @@ typedef struct AACEncContext {
    int last_frame;
    float lambda;
    DECLARE_ALIGNED(16, int,   qcoefs)[96];      ///< quantized coefficients
-    DECLARE_ALIGNED(16, float, scoefs)[1024];    ///< scaled coefficients
+    DECLARE_ALIGNED(32, float, scoefs)[1024];    ///< scaled coefficients
 } AACEncContext;
 #endif /* AVCODEC_AACENC_H */
--- a/libavcodec/aacsbr.c
+++ b/libavcodec/aacsbr.c
@ -32,6 +32,7 @@
 #include "aacsbrdata.h"
 #include "fft.h"
 #include "aacps.h"
 #include "libavutil/libm.h"
 #include <stdint.h>
 #include <float.h>
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@ -201,13 +201,13 @@ typedef struct {
 ///@}
 ///@defgroup arrays aligned arrays
-    DECLARE_ALIGNED(16, int,   fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];       ///< fixed-point transform coefficients
+    DECLARE_ALIGNED(16, int,   fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];       ///> fixed-point transform coefficients
-    DECLARE_ALIGNED(16, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
+    DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
-    DECLARE_ALIGNED(16, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
+    DECLARE_ALIGNED(32, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
-    DECLARE_ALIGNED(16, float, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
+    DECLARE_ALIGNED(32, float, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
-    DECLARE_ALIGNED(16, float, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
+    DECLARE_ALIGNED(32, float, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
-    DECLARE_ALIGNED(16, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
+    DECLARE_ALIGNED(32, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
-    DECLARE_ALIGNED(16, uint8_t, input_buffer)[AC3_FRAME_BUFFER_SIZE + FF_INPUT_BUFFER_PADDING_SIZE]; ///< temp buffer to prevent overread
+    DECLARE_ALIGNED(32, uint8_t, input_buffer)[AC3_FRAME_BUFFER_SIZE + FF_INPUT_BUFFER_PADDING_SIZE]; ///< temp buffer to prevent overread
 ///@}
 } AC3DecodeContext;
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@ -171,7 +171,7 @@ typedef struct AC3EncodeContext {
    uint8_t exp_strategy[AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< exponent strategies
-    DECLARE_ALIGNED(16, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
+    DECLARE_ALIGNED(32, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
 } AC3EncodeContext;
 typedef struct AC3Mant {
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@ -47,7 +47,7 @@ static av_cold void mdct_end(AC3MDCTContext *mdct)
 static av_cold int mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
                             int nbits)
 {
-    int ret = ff_mdct_init(&mdct->fft, nbits, 0, 1.0);
+    int ret = ff_mdct_init(&mdct->fft, nbits, 0, -1.0);
    mdct->window = ff_ac3_window;
    return ret;
 }
--- a/libavcodec/atrac1.c
+++ b/libavcodec/atrac1.c
@ -60,11 +60,11 @@ typedef struct {
    int                 log2_block_count[AT1_QMF_BANDS];    ///< log2 number of blocks in a band
    int                 num_bfus;                           ///< number of Block Floating Units
    float*              spectrum[2];
-    DECLARE_ALIGNED(16, float, spec1)[AT1_SU_SAMPLES];     ///< mdct buffer
+    DECLARE_ALIGNED(32, float, spec1)[AT1_SU_SAMPLES];     ///< mdct buffer
-    DECLARE_ALIGNED(16, float, spec2)[AT1_SU_SAMPLES];     ///< mdct buffer
+    DECLARE_ALIGNED(32, float, spec2)[AT1_SU_SAMPLES];     ///< mdct buffer
-    DECLARE_ALIGNED(16, float, fst_qmf_delay)[46];         ///< delay line for the 1st stacked QMF filter
+    DECLARE_ALIGNED(32, float, fst_qmf_delay)[46];         ///< delay line for the 1st stacked QMF filter
-    DECLARE_ALIGNED(16, float, snd_qmf_delay)[46];         ///< delay line for the 2nd stacked QMF filter
+    DECLARE_ALIGNED(32, float, snd_qmf_delay)[46];         ///< delay line for the 2nd stacked QMF filter
-    DECLARE_ALIGNED(16, float, last_qmf_delay)[256+23];    ///< delay line for the last stacked QMF filter
+    DECLARE_ALIGNED(32, float, last_qmf_delay)[256+23];    ///< delay line for the last stacked QMF filter
 } AT1SUCtx;
 /**
@ -72,13 +72,13 @@ typedef struct {
 */
 typedef struct {
    AT1SUCtx            SUs[AT1_MAX_CHANNELS];              ///< channel sound unit
-    DECLARE_ALIGNED(16, float, spec)[AT1_SU_SAMPLES];      ///< the mdct spectrum buffer
+    DECLARE_ALIGNED(32, float, spec)[AT1_SU_SAMPLES];      ///< the mdct spectrum buffer
-    DECLARE_ALIGNED(16, float,  low)[256];
+    DECLARE_ALIGNED(32, float,  low)[256];
-    DECLARE_ALIGNED(16, float,  mid)[256];
+    DECLARE_ALIGNED(32, float,  mid)[256];
-    DECLARE_ALIGNED(16, float, high)[512];
+    DECLARE_ALIGNED(32, float, high)[512];
    float*              bands[3];
-    DECLARE_ALIGNED(16, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
+    DECLARE_ALIGNED(32, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
    FFTContext          mdct_ctx[3];
    int                 channels;
    DSPContext          dsp;
--- a/libavcodec/atrac3.c
+++ b/libavcodec/atrac3.c
@ -74,8 +74,8 @@ typedef struct {
    int               gcBlkSwitch;
    gain_block        gainBlock[2];
-    DECLARE_ALIGNED(16, float, spectrum)[1024];
+    DECLARE_ALIGNED(32, float, spectrum)[1024];
-    DECLARE_ALIGNED(16, float, IMDCT_buf)[1024];
+    DECLARE_ALIGNED(32, float, IMDCT_buf)[1024];
    float             delayBuf1[46]; ///<qmf delay buffers
    float             delayBuf2[46];
@ -122,7 +122,7 @@ typedef struct {
    FFTContext          mdct_ctx;
 } ATRAC3Context;
-static DECLARE_ALIGNED(16, float,mdct_window)[512];
+static DECLARE_ALIGNED(32, float, mdct_window)[512];
 static VLC              spectral_coeff_tab[7];
 static float            gain_tab1[16];
 static float            gain_tab2[31];
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@ -766,7 +766,7 @@ typedef struct AVPanScan{
     * - encoding: Set by libavcodec. for coded_picture (and set by user for input).\
     * - decoding: Set by libavcodec.\
     */\
-    int pict_type;\
+    enum AVPictureType pict_type;\
 \
    /**\
     * presentation timestamp in time_base units (time when frame should be shown to user)\
@ -1016,14 +1016,16 @@ typedef struct AVPanScan{
 #define FF_BUFFER_TYPE_SHARED   4 ///< Buffer from somewhere else; don't deallocate image (data/base), all other tables are not shared.
 #define FF_BUFFER_TYPE_COPY     8 ///< Just a (modified) copy of some other buffer, don't deallocate anything.
-
+#if FF_API_OLD_FF_PICT_TYPES
-#define FF_I_TYPE  1 ///< Intra
+/* DEPRECATED, directly use the AV_PICTURE_TYPE_* enum values */
-#define FF_P_TYPE  2 ///< Predicted
+#define FF_I_TYPE  AV_PICTURE_TYPE_I  ///< Intra
-#define FF_B_TYPE  3 ///< Bi-dir predicted
+#define FF_P_TYPE  AV_PICTURE_TYPE_P  ///< Predicted
-#define FF_S_TYPE  4 ///< S(GMC)-VOP MPEG4
+#define FF_B_TYPE  AV_PICTURE_TYPE_B  ///< Bi-dir predicted
-#define FF_SI_TYPE 5 ///< Switching Intra
+#define FF_S_TYPE  AV_PICTURE_TYPE_S  ///< S(GMC)-VOP MPEG4
-#define FF_SP_TYPE 6 ///< Switching Predicted
+#define FF_SI_TYPE AV_PICTURE_TYPE_SI ///< Switching Intra
-#define FF_BI_TYPE 7
+#define FF_SP_TYPE AV_PICTURE_TYPE_SP ///< Switching Predicted
 #define FF_BI_TYPE AV_PICTURE_TYPE_BI
 #endif
 #define FF_BUFFER_HINTS_VALID    0x01 // Buffer hints value is meaningful (if 0 ignore).
 #define FF_BUFFER_HINTS_READABLE 0x02 // Codec will read from buffer.
@ -1215,16 +1217,6 @@ typedef struct AVCodecContext {
     */
    enum PixelFormat pix_fmt;
 #if FF_API_RATE_EMU
    /**
     * Frame rate emulation. If not zero, the lower layer (i.e. format handler)
     * has to read frames at native frame rate.
     * - encoding: Set by user.
     * - decoding: unused
     */
    attribute_deprecated int rate_emu;
 #endif
    /**
     * If non NULL, 'draw_horiz_band' is called by the libavcodec
     * decoder to draw a horizontal band. It improves cache usage. Not
@ -1326,16 +1318,6 @@ typedef struct AVCodecContext {
    int b_frame_strategy;
 #if FF_API_HURRY_UP
    /**
     * hurry up amount
     * - encoding: unused
     * - decoding: Set by user. 1-> Skip B-frames, 2-> Skip IDCT/dequant too, 5-> Skip everything except header
     * @deprecated Deprecated in favor of skip_idct and skip_frame.
     */
    attribute_deprecated int hurry_up;
 #endif
    struct AVCodec *codec;
    void *priv_data;
@ -1800,22 +1782,6 @@ typedef struct AVCodecContext {
     */
    uint64_t error[4];
 #if FF_API_MB_Q
    /**
     * minimum MB quantizer
     * - encoding: unused
     * - decoding: unused
     */
    attribute_deprecated int mb_qmin;
    /**
     * maximum MB quantizer
     * - encoding: unused
     * - decoding: unused
     */
    attribute_deprecated int mb_qmax;
 #endif
    /**
     * motion estimation comparison function
     * - encoding: Set by user.
@ -3866,13 +3832,17 @@ void avcodec_default_free_buffers(AVCodecContext *s);
 /* misc useful functions */
 #if FF_API_OLD_FF_PICT_TYPES
 /**
 * Return a single letter to describe the given picture type pict_type.
 *
 * @param[in] pict_type the picture type
 * @return A single character representing the picture type.
 * @deprecated Use av_get_picture_type_char() instead.
 */
 attribute_deprecated
 char av_get_pict_type_char(int pict_type);
 #endif
 /**
 * Return codec bits per sample.
--- a/libavcodec/binkaudio.c
+++ b/libavcodec/binkaudio.c
@ -55,7 +55,7 @@ typedef struct {
    int num_bands;
    unsigned int *bands;
    float root;
-    DECLARE_ALIGNED(16, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
+    DECLARE_ALIGNED(32, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
    DECLARE_ALIGNED(16, short, previous)[BINK_BLOCK_MAX_SIZE / 16];  ///< coeffs from previous audio block
    float *coeffs_ptr[MAX_CHANNELS]; ///< pointers to the coeffs arrays for float_to_int16_interleave
    union {
--- a/libavcodec/cook.c
+++ b/libavcodec/cook.c
@ -153,7 +153,7 @@ typedef struct cook {
    /* data buffers */
    uint8_t*            decoded_bytes_buffer;
-    DECLARE_ALIGNED(16, float,mono_mdct_output)[2048];
+    DECLARE_ALIGNED(32, float, mono_mdct_output)[2048];
    float               decode_buffer_1[1024];
    float               decode_buffer_2[1024];
    float               decode_buffer_0[1060]; /* static allocation for joint decode */
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@ -321,16 +321,16 @@ typedef struct {
    /* Subband samples history (for ADPCM) */
    float subband_samples_hist[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][4];
-    DECLARE_ALIGNED(16, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
+    DECLARE_ALIGNED(32, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
-    DECLARE_ALIGNED(16, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
+    DECLARE_ALIGNED(32, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
    int hist_index[DCA_PRIM_CHANNELS_MAX];
-    DECLARE_ALIGNED(16, float, raXin)[32];
+    DECLARE_ALIGNED(32, float, raXin)[32];
    int output;                 ///< type of output
    float scale_bias;           ///< output scale
-    DECLARE_ALIGNED(16, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
+    DECLARE_ALIGNED(32, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
-    DECLARE_ALIGNED(16, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
+    DECLARE_ALIGNED(32, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
    const float *samples_chanptr[DCA_PRIM_CHANNELS_MAX+1];
    uint8_t dca_buffer[DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE + DCA_BUFFER_PADDING_SIZE];
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@ -312,18 +312,16 @@ static void dct_error(const char *name, int is_idct,
    }
    for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
 #if 1 // dump systematic errors
    for(i=0; i<64; i++){
        if(i%8==0) printf("\n");
        printf("%7d ", (int)sysErr[i]);
    }
    printf("\n");
 #endif
    printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
           is_idct ? "IDCT" : "DCT",
           name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
-#if 1 //Speed test
+
    /* speed test */
    for(i=0;i<64;i++)
        block1[i] = 0;
@ -376,7 +374,6 @@ static void dct_error(const char *name, int is_idct,
    printf("%s %s: %0.1f kdct/s\n",
           is_idct ? "IDCT" : "DCT",
           name, (double)it1 * 1000.0 / (double)ti1);
 #endif
 }
 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
--- a/libavcodec/error_resilience.c
+++ b/libavcodec/error_resilience.c
@ -786,7 +786,6 @@ void ff_er_frame_end(MpegEncContext *s){
        }
    }
 #if 1
    /* handle overlapping slices */
    for(error_type=1; error_type<=3; error_type++){
        int end_ok=0;
@ -807,8 +806,7 @@ void ff_er_frame_end(MpegEncContext *s){
                end_ok=0;
        }
    }
-#endif
+
 #if 1
    /* handle slices with partitions of different length */
    if(s->partitioned_frame){
        int end_ok=0;
@ -829,7 +827,7 @@ void ff_er_frame_end(MpegEncContext *s){
                end_ok=0;
        }
    }
-#endif
+
    /* handle missing slices */
    if(s->error_recognition>=4){
        int end_ok=1;
@ -853,7 +851,6 @@ void ff_er_frame_end(MpegEncContext *s){
        }
    }
 #if 1
    /* backward mark errors */
    distance=9999999;
    for(error_type=1; error_type<=3; error_type++){
@ -878,7 +875,6 @@ void ff_er_frame_end(MpegEncContext *s){
                distance= 9999999;
        }
    }
 #endif
    /* forward mark errors */
    error=0;
@ -893,7 +889,7 @@ void ff_er_frame_end(MpegEncContext *s){
            s->error_status_table[mb_xy]|= error;
        }
    }
-#if 1
+
    /* handle not partitioned case */
    if(!s->partitioned_frame){
        for(i=0; i<s->mb_num; i++){
@ -904,7 +900,6 @@ void ff_er_frame_end(MpegEncContext *s){
            s->error_status_table[mb_xy]= error;
        }
    }
 #endif
    dc_error= ac_error= mv_error=0;
    for(i=0; i<s->mb_num; i++){
@ -1065,16 +1060,15 @@ void ff_er_frame_end(MpegEncContext *s){
            s->dc_val[2][mb_x + mb_y*s->mb_stride]= (dcv+4)>>3;
        }
    }
-#if 1
+
    /* guess DC for damaged blocks */
    guess_dc(s, s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride, 1);
    guess_dc(s, s->dc_val[1], s->mb_width  , s->mb_height  , s->mb_stride, 0);
    guess_dc(s, s->dc_val[2], s->mb_width  , s->mb_height  , s->mb_stride, 0);
-#endif
+
    /* filter luma DC */
    filter181(s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride);
 #if 1
    /* render DC only intra */
    for(mb_y=0; mb_y<s->mb_height; mb_y++){
        for(mb_x=0; mb_x<s->mb_width; mb_x++){
@ -1094,7 +1088,6 @@ void ff_er_frame_end(MpegEncContext *s){
            put_dc(s, dest_y, dest_cb, dest_cr, mb_x, mb_y);
        }
    }
 #endif
    if(s->avctx->error_concealment&FF_EC_DEBLOCK){
        /* filter horizontal block boundaries */
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@ -93,6 +93,44 @@ av_cold void ff_init_ff_cos_tabs(int index)
 #endif
 }
 /*
  * 16-entry permutation of the offsets 0..15, consumed by fft_perm_avx():
  * for a 16-element group that is_second_half_of_fft32() flags, the value
  * stored in revtab for element k of the group is (group base + avx_tab[k]).
  */
 static const int avx_tab[] = {
    0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15
 };
 /*
  * Narrow index i inside an n-point range by repeatedly descending into the
  * sub-range that contains it -- the first half (n/2 points), the third
  * quarter (n/4 points) or the last quarter (n/4 points) -- until the range
  * is at most 32 points wide.  Returns non-zero when the remaining index is
  * 16 or above, zero otherwise.
  */
 static int is_second_half_of_fft32(int i, int n)
 {
    while (n > 32) {
        if (i < n/2) {
            /* first half: index unchanged, range halves */
            n /= 2;
        } else if (i < 3*n/4) {
            /* third quarter: rebase the index, range shrinks to a quarter */
            i -= n/2;
            n /= 4;
        } else {
            /* last quarter: rebase the index, range shrinks to a quarter */
            i -= 3*n/4;
            n /= 4;
        }
    }
    return i >= 16;
 }
 /*
  * Fill s->revtab with the permutation selected by FF_FFT_PERM_AVX.
  *
  * The 1 << s->nbits entries are processed in groups of 16.  For every
  * element the destination slot in revtab is derived from
  * split_radix_permutation(); the value stored there depends on the group:
  *  - groups flagged by is_second_half_of_fft32() store base + avx_tab[k];
  *  - all other groups keep the upper bits of the index and rearrange its
  *    three low bits.
  */
 static av_cold void fft_perm_avx(FFTContext *s)
 {
    int base, k;
    int n = 1 << s->nbits;

    for (base = 0; base < n; base += 16) {
        /* the group kind is the same for all 16 elements, so test it once */
        int second_half = is_second_half_of_fft32(base, n);

        for (k = 0; k < 16; k++) {
            int src = base + k;
            int dst;

            if (second_half)
                dst = base + avx_tab[k];
            else
                dst = (src & ~7) | ((src >> 1) & 3) | ((src << 2) & 4);

            s->revtab[-split_radix_permutation(src, n, s->inverse) & (n - 1)] = dst;
        }
    }
 }
 av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 {
    int i, j, n;
@ -132,11 +170,16 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
    for(j=4; j<=nbits; j++) {
        ff_init_ff_cos_tabs(j);
    }
-    for(i=0; i<n; i++) {
+
-        int j = i;
+    if (s->fft_permutation == FF_FFT_PERM_AVX) {
-        if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
+        fft_perm_avx(s);
-            j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
+    } else {
-        s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
+        for(i=0; i<n; i++) {
            int j = i;
            if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
                j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
        }
    }
    return 0;
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@ -85,6 +85,7 @@ struct FFTContext {
    int fft_permutation;
 #define FF_FFT_PERM_DEFAULT   0
 #define FF_FFT_PERM_SWAP_LSBS 1
 #define FF_FFT_PERM_AVX       2
    int mdct_permutation;
 #define FF_MDCT_PERM_NONE       0
 #define FF_MDCT_PERM_INTERLEAVE 1
@ -97,7 +98,7 @@ struct FFTContext {
 #endif
 #define COSTABLE(size) \
-    COSTABLE_CONST DECLARE_ALIGNED(16, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
+    COSTABLE_CONST DECLARE_ALIGNED(32, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
 extern COSTABLE(16);
 extern COSTABLE(32);
--- a/libavcodec/h261dec.c
+++ b/libavcodec/h261dec.c
@ -599,10 +599,6 @@ retry:
    s->current_picture.pict_type= s->pict_type;
    s->current_picture.key_frame= s->pict_type == FF_I_TYPE;
 #if FF_API_HURRY_UP
    /* skip everything if we are in a hurry>=5 */
    if(avctx->hurry_up>=5) return get_consumed_bytes(s, buf_size);
 #endif
    if(  (avctx->skip_frame >= AVDISCARD_NONREF && s->pict_type==FF_B_TYPE)
       ||(avctx->skip_frame >= AVDISCARD_NONKEY && s->pict_type!=FF_I_TYPE)
       || avctx->skip_frame >= AVDISCARD_ALL)
--- a/libavcodec/h263dec.c
+++ b/libavcodec/h263dec.c
@ -612,18 +612,10 @@ retry:
    /* skip B-frames if we don't have reference frames */
    if(s->last_picture_ptr==NULL && (s->pict_type==FF_B_TYPE || s->dropable)) return get_consumed_bytes(s, buf_size);
 #if FF_API_HURRY_UP
    /* skip b frames if we are in a hurry */
    if(avctx->hurry_up && s->pict_type==FF_B_TYPE) return get_consumed_bytes(s, buf_size);
 #endif
    if(   (avctx->skip_frame >= AVDISCARD_NONREF && s->pict_type==FF_B_TYPE)
       || (avctx->skip_frame >= AVDISCARD_NONKEY && s->pict_type!=FF_I_TYPE)
       ||  avctx->skip_frame >= AVDISCARD_ALL)
        return get_consumed_bytes(s, buf_size);
 #if FF_API_HURRY_UP
    /* skip everything if we are in a hurry>=5 */
    if(avctx->hurry_up>=5) return get_consumed_bytes(s, buf_size);
 #endif
    if(s->next_p_frame_damaged){
        if(s->pict_type==FF_B_TYPE)
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@ -2966,11 +2966,7 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
        buf_index += consumed;
        //FIXME do not discard SEI id
-        if(
+        if(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0)
 #if FF_API_HURRY_UP
           (s->hurry_up == 1 && h->nal_ref_idc  == 0) ||
 #endif
           (avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
            continue;
      again:
@ -3007,9 +3003,6 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
            }
            if(hx->redundant_pic_count==0
 #if FF_API_HURRY_UP
               && hx->s.hurry_up < 5
 #endif
               && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
               && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
               && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
@ -3047,9 +3040,6 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
            if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
               && s->context_initialized
 #if FF_API_HURRY_UP
               && s->hurry_up < 5
 #endif
               && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
               && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
               && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
@ -3186,11 +3176,7 @@ static int decode_frame(AVCodecContext *avctx,
    }
    if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
-        if (avctx->skip_frame >= AVDISCARD_NONREF
+        if (avctx->skip_frame >= AVDISCARD_NONREF)
 #if FF_API_HURRY_UP
                || s->hurry_up
 #endif
           )
            return 0;
        av_log(avctx, AV_LOG_ERROR, "no frame!\n");
        return -1;
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@ -1007,7 +1007,6 @@ static void fill_decode_caches(H264Context *h, int mb_type){
    }
    }
 #if 1
    if(IS_INTER(mb_type) || (IS_DIRECT(mb_type) && h->direct_spatial_mv_pred)){
        int list;
        for(list=0; list<h->list_count; list++){
@ -1182,7 +1181,6 @@ static void fill_decode_caches(H264Context *h, int mb_type){
            }
        }
    }
 #endif
        h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 }
--- a/libavcodec/imc.c
+++ b/libavcodec/imc.c
@ -88,7 +88,7 @@ typedef struct {
    DSPContext dsp;
    FFTContext fft;
-    DECLARE_ALIGNED(16, FFTComplex, samples)[COEFFS/2];
+    DECLARE_ALIGNED(32, FFTComplex, samples)[COEFFS/2];
    float *out_samples;
 } IMCContext;
--- a/libavcodec/motion_est_template.c
+++ b/libavcodec/motion_est_template.c
@ -158,7 +158,6 @@ static int hpel_motion_search(MpegEncContext * s,
        const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)]
                     + (mv_penalty[bx   - pred_x] + mv_penalty[by+2 - pred_y])*c->penalty_factor;
 #if 1
        int key;
        int map_generation= c->map_generation;
 #ifndef NDEBUG
@ -172,7 +171,6 @@ static int hpel_motion_search(MpegEncContext * s,
        assert(map[(index+1)&(ME_MAP_SIZE-1)] == key);
        key= ((my)<<ME_MAP_MV_BITS) + (mx-1) + map_generation;
        assert(map[(index-1)&(ME_MAP_SIZE-1)] == key);
 #endif
        if(t<=b){
            CHECK_HALF_MV(0, 1, mx  ,my-1)
            if(l<=r){
--- a/libavcodec/mpeg12.c
+++ b/libavcodec/mpeg12.c
@ -2476,18 +2476,10 @@ static int decode_chunks(AVCodecContext *avctx,
                /* Skip P-frames if we do not have a reference frame or we have an invalid header. */
                    if(s2->pict_type==FF_P_TYPE && !s->sync) break;
                }
 #if FF_API_HURRY_UP
                /* Skip B-frames if we are in a hurry. */
                if(avctx->hurry_up && s2->pict_type==FF_B_TYPE) break;
 #endif
                if(  (avctx->skip_frame >= AVDISCARD_NONREF && s2->pict_type==FF_B_TYPE)
                    ||(avctx->skip_frame >= AVDISCARD_NONKEY && s2->pict_type!=FF_I_TYPE)
                    || avctx->skip_frame >= AVDISCARD_ALL)
                    break;
 #if FF_API_HURRY_UP
                /* Skip everything if we are in a hurry>=5. */
                if(avctx->hurry_up>=5) break;
 #endif
                if (!s->mpeg_enc_ctx_allocated) break;
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@ -1131,9 +1131,6 @@ int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx)
        }
    }
 #if FF_API_HURRY_UP
    s->hurry_up= s->avctx->hurry_up;
 #endif
    s->error_recognition= avctx->error_recognition;
    /* set dequantizer, we can't do it during init as it might change for mpeg4
@ -2125,9 +2122,6 @@ void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM block[12][64],
            }
            /* skip dequant / idct if we are really late ;) */
 #if FF_API_HURRY_UP
            if(s->hurry_up>1) goto skip_idct;
 #endif
            if(s->avctx->skip_idct){
                if(  (s->avctx->skip_idct >= AVDISCARD_NONREF && s->pict_type == FF_B_TYPE)
                   ||(s->avctx->skip_idct >= AVDISCARD_NONKEY && s->pict_type != FF_I_TYPE)
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@ -391,11 +391,6 @@ typedef struct MpegEncContext {
    int no_rounding;  /**< apply no rounding to motion compensation (MPEG4, msmpeg4, ...)
                        for b-frames rounding mode is always 0 */
 #if FF_API_HURRY_UP
    int hurry_up;     /**< when set to 1 during decoding, b frames will be skipped
                         when set to 2 idct/dequant will be skipped too */
 #endif
    /* macroblock layer */
    int mb_x, mb_y;
    int mb_skip_run;
--- a/libavcodec/msmpeg4.c
+++ b/libavcodec/msmpeg4.c
@ -985,10 +985,9 @@ void ff_msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n)
            if(level<=MAX_LEVEL && run<=MAX_RUN){
                s->ac_stats[s->mb_intra][n>3][level][run][last]++;
            }
-#if 0
+
-else
+            s->ac_stats[s->mb_intra][n > 3][40][63][0]++; //esc3 like
-    s->ac_stats[s->mb_intra][n>3][40][63][0]++; //esc3 like
+
 #endif
            code = get_rl_index(rl, last, run, level);
            put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
            if (code == rl->n) {
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@ -47,7 +47,7 @@
 typedef struct NellyMoserDecodeContext {
    AVCodecContext* avctx;
-    DECLARE_ALIGNED(16, float,float_buf)[NELLY_SAMPLES];
+    DECLARE_ALIGNED(32, float, float_buf)[NELLY_SAMPLES];
    float           state[128];
    AVLFG           random_state;
    GetBitContext   gb;
@ -55,7 +55,7 @@ typedef struct NellyMoserDecodeContext {
    DSPContext      dsp;
    FFTContext      imdct_ctx;
    FmtConvertContext fmt_conv;
-    DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
+    DECLARE_ALIGNED(32, float, imdct_out)[NELLY_BUF_LEN * 2];
 } NellyMoserDecodeContext;
 static void overlap_and_window(NellyMoserDecodeContext *s, float *state, float *audio, float *a_in)
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@ -55,9 +55,9 @@ typedef struct NellyMoserEncodeContext {
    int             have_saved;
    DSPContext      dsp;
    FFTContext      mdct_ctx;
-    DECLARE_ALIGNED(16, float, mdct_out)[NELLY_SAMPLES];
+    DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
-    DECLARE_ALIGNED(16, float, in_buff)[NELLY_SAMPLES];
+    DECLARE_ALIGNED(32, float, in_buff)[NELLY_SAMPLES];
-    DECLARE_ALIGNED(16, float, buf)[2][3 * NELLY_BUF_LEN];     ///< sample buffer
+    DECLARE_ALIGNED(32, float, buf)[2][3 * NELLY_BUF_LEN];     ///< sample buffer
    float           (*opt )[NELLY_BANDS];
    uint8_t         (*path)[NELLY_BANDS];
 } NellyMoserEncodeContext;
--- a/libavcodec/options.c
+++ b/libavcodec/options.c
@ -105,9 +105,6 @@ static const AVOption options[]={
 {"extradata_size", NULL, OFFSET(extradata_size), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 {"time_base", NULL, OFFSET(time_base), FF_OPT_TYPE_RATIONAL, DEFAULT, INT_MIN, INT_MAX},
 {"g", "set the group of picture size", OFFSET(gop_size), FF_OPT_TYPE_INT, 12, INT_MIN, INT_MAX, V|E},
 #if FF_API_RATE_EMU
 {"rate_emu", "frame rate emulation", OFFSET(rate_emu), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 #endif
 {"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 {"ac", "set number of audio channels", OFFSET(channels), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 {"cutoff", "set cutoff bandwidth", OFFSET(cutoff), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, A|E},
@ -124,9 +121,6 @@ static const AVOption options[]={
 {"rc_strategy", "ratecontrol method", OFFSET(rc_strategy), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
 {"b_strategy", "strategy to choose between I/P/B-frames", OFFSET(b_frame_strategy), FF_OPT_TYPE_INT, 0, INT_MIN, INT_MAX, V|E},
 {"wpredp", "weighted prediction analysis method", OFFSET(weighted_p_pred), FF_OPT_TYPE_INT, 0, INT_MIN, INT_MAX, V|E},
 #if FF_API_HURRY_UP
 {"hurry_up", "deprecated, use skip_idct/skip_frame instead", OFFSET(hurry_up), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|D},
 #endif
 {"ps", "rtp payload size in bytes", OFFSET(rtp_payload_size), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
 {"mv_bits", NULL, OFFSET(mv_bits), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 {"header_bits", NULL, OFFSET(header_bits), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
@ -253,10 +247,6 @@ static const AVOption options[]={
 {"pf", "forward predicted MVs of P-frames", 0, FF_OPT_TYPE_CONST, FF_DEBUG_VIS_MV_P_FOR, INT_MIN, INT_MAX, V|D, "debug_mv"},
 {"bf", "forward predicted MVs of B-frames", 0, FF_OPT_TYPE_CONST, FF_DEBUG_VIS_MV_B_FOR, INT_MIN, INT_MAX, V|D, "debug_mv"},
 {"bb", "backward predicted MVs of B-frames", 0, FF_OPT_TYPE_CONST, FF_DEBUG_VIS_MV_B_BACK, INT_MIN, INT_MAX, V|D, "debug_mv"},
 #if FF_API_MB_Q
 {"mb_qmin", "obsolete, use qmin", OFFSET(mb_qmin), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
 {"mb_qmax", "obsolete, use qmax", OFFSET(mb_qmax), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
 #endif
 {"cmp", "full pel me compare function", OFFSET(me_cmp), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"subcmp", "sub pel me compare function", OFFSET(me_sub_cmp), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"mbcmp", "macroblock compare function", OFFSET(mb_cmp), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E, "cmp_func"},
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@ -380,9 +380,6 @@ static void update_context_from_user(AVCodecContext *dst, AVCodecContext *src)
    dst->release_buffer = src->release_buffer;
    dst->opaque   = src->opaque;
 #if FF_API_HURRY_UP
    dst->hurry_up = src->hurry_up;
 #endif
    dst->dsp_mask = src->dsp_mask;
    dst->debug    = src->debug;
    dst->debug_mv = src->debug_mv;
--- a/libavcodec/qdm2.c
+++ b/libavcodec/qdm2.c
@ -120,7 +120,7 @@ typedef struct {
 } FFTCoefficient;
 typedef struct {
-    DECLARE_ALIGNED(16, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
+    DECLARE_ALIGNED(32, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
 } QDM2FFT;
 /**
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@ -1454,19 +1454,10 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
    }
    if((!s->last_picture_ptr || !s->last_picture_ptr->data[0]) && si.type == FF_B_TYPE)
        return -1;
 #if FF_API_HURRY_UP
    /* skip b frames if we are in a hurry */
    if(avctx->hurry_up && si.type==FF_B_TYPE) return buf_size;
 #endif
    if(   (avctx->skip_frame >= AVDISCARD_NONREF && si.type==FF_B_TYPE)
       || (avctx->skip_frame >= AVDISCARD_NONKEY && si.type!=FF_I_TYPE)
       ||  avctx->skip_frame >= AVDISCARD_ALL)
        return buf_size;
 #if FF_API_HURRY_UP
    /* skip everything if we are in a hurry>=5 */
    if(avctx->hurry_up>=5)
        return buf_size;
 #endif
    for(i=0; i<slice_count; i++){
        int offset= get_slice_offset(avctx, slices_hdr, i);
--- a/libavcodec/sh4/qpel.c
+++ b/libavcodec/sh4/qpel.c
@ -897,7 +897,6 @@ QPEL_MC(0, avg_       , _       , op_avg)
 #undef op_put
 #undef op_put_no_rnd
 #if 1
 #define H264_LOWPASS(OPNAME, OP, OP2) \
 static inline void OPNAME ## h264_qpel_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
@ -1298,7 +1297,6 @@ H264_MC(avg_, 16)
 #undef op_put
 #undef op2_avg
 #undef op2_put
 #endif
 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
--- a/libavcodec/snow.c
+++ b/libavcodec/snow.c
@ -3293,10 +3293,8 @@ static void iterative_me(SnowContext *s){
                }
                best_rd= ref_rd;
                *block= ref_b;
 #if 1
                check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
                //FIXME RD style color selection
 #endif
                if(!same_block(block, &backup)){
                    if(tb ) tb ->type &= ~BLOCK_OPT;
                    if(lb ) lb ->type &= ~BLOCK_OPT;
--- a/libavcodec/svq1dec.c
+++ b/libavcodec/svq1dec.c
@ -684,9 +684,6 @@ static int svq1_decode_frame(AVCodecContext *avctx,
  //this should be removed after libavcodec can handle more flexible picture types & ordering
  if(s->pict_type==FF_B_TYPE && s->last_picture_ptr==NULL) return buf_size;
 #if FF_API_HURRY_UP
  if(avctx->hurry_up && s->pict_type==FF_B_TYPE) return buf_size;
 #endif
  if(  (avctx->skip_frame >= AVDISCARD_NONREF && s->pict_type==FF_B_TYPE)
     ||(avctx->skip_frame >= AVDISCARD_NONKEY && s->pict_type!=FF_I_TYPE)
     || avctx->skip_frame >= AVDISCARD_ALL)
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@ -952,14 +952,6 @@ static int svq3_decode_frame(AVCodecContext *avctx,
    /* Skip B-frames if we do not have reference frames. */
    if (s->last_picture_ptr == NULL && s->pict_type == FF_B_TYPE)
        return 0;
 #if FF_API_HURRY_UP
    /* Skip B-frames if we are in a hurry. */
    if (avctx->hurry_up && s->pict_type == FF_B_TYPE)
        return 0;
    /* Skip everything if we are in a hurry >= 5. */
    if (avctx->hurry_up >= 5)
        return 0;
 #endif
    if (  (avctx->skip_frame >= AVDISCARD_NONREF && s->pict_type == FF_B_TYPE)
        ||(avctx->skip_frame >= AVDISCARD_NONKEY && s->pict_type != FF_I_TYPE)
        || avctx->skip_frame >= AVDISCARD_ALL)
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@ -1105,18 +1105,11 @@ void avcodec_default_free_buffers(AVCodecContext *s){
    s->internal_buffer_count=0;
 }
 #if FF_API_OLD_FF_PICT_TYPES
 char av_get_pict_type_char(int pict_type){
-    switch(pict_type){
+    return av_get_picture_type_char(pict_type);
    case FF_I_TYPE: return 'I';
    case FF_P_TYPE: return 'P';
    case FF_B_TYPE: return 'B';
    case FF_S_TYPE: return 'S';
    case FF_SI_TYPE:return 'i';
    case FF_SP_TYPE:return 'p';
    case FF_BI_TYPE:return 'b';
    default:        return '?';
    }
 }
 #endif
 int av_get_bits_per_sample(enum CodecID codec_id){
    switch(codec_id){
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@ -3519,21 +3519,11 @@ static int vc1_decode_frame(AVCodecContext *avctx,
    if(s->last_picture_ptr==NULL && (s->pict_type==FF_B_TYPE || s->dropable)){
        goto err;
    }
 #if FF_API_HURRY_UP
    /* skip b frames if we are in a hurry */
    if(avctx->hurry_up && s->pict_type==FF_B_TYPE) return -1;//buf_size;
 #endif
    if(   (avctx->skip_frame >= AVDISCARD_NONREF && s->pict_type==FF_B_TYPE)
       || (avctx->skip_frame >= AVDISCARD_NONKEY && s->pict_type!=FF_I_TYPE)
       ||  avctx->skip_frame >= AVDISCARD_ALL) {
        goto end;
    }
 #if FF_API_HURRY_UP
    /* skip everything if we are in a hurry>=5 */
    if(avctx->hurry_up>=5) {
        goto err;
    }
 #endif
    if(s->next_p_frame_damaged){
        if(s->pict_type==FF_B_TYPE)
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@ -22,7 +22,7 @@
 #define LIBAVCODEC_VERSION_MAJOR 53
 #define LIBAVCODEC_VERSION_MINOR  1
-#define LIBAVCODEC_VERSION_MICRO  0
+#define LIBAVCODEC_VERSION_MICRO  1
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
                                               LIBAVCODEC_VERSION_MINOR, \
@ -47,15 +47,6 @@
 #ifndef FF_API_OLD_AUDIOCONVERT
 #define FF_API_OLD_AUDIOCONVERT (LIBAVCODEC_VERSION_MAJOR < 54)
 #endif
 #ifndef FF_API_HURRY_UP
 #define FF_API_HURRY_UP         (LIBAVCODEC_VERSION_MAJOR < 53)
 #endif
 #ifndef FF_API_RATE_EMU
 #define FF_API_RATE_EMU         (LIBAVCODEC_VERSION_MAJOR < 53)
 #endif
 #ifndef FF_API_MB_Q
 #define FF_API_MB_Q             (LIBAVCODEC_VERSION_MAJOR < 53)
 #endif
 #ifndef FF_API_ANTIALIAS_ALGO
 #define FF_API_ANTIALIAS_ALGO   (LIBAVCODEC_VERSION_MAJOR < 54)
 #endif
@ -68,5 +59,8 @@
 #ifndef FF_API_THREAD_INIT
 #define FF_API_THREAD_INIT      (LIBAVCODEC_VERSION_MAJOR < 54)
 #endif
 #ifndef FF_API_OLD_FF_PICT_TYPES
 #define FF_API_OLD_FF_PICT_TYPES (LIBAVCODEC_VERSION_MAJOR < 54)
 #endif
 #endif /* AVCODEC_VERSION_H */
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@ -113,15 +113,15 @@ typedef struct WMACodecContext {
    uint8_t ms_stereo;                      ///< true if mid/side stereo mode
    uint8_t channel_coded[MAX_CHANNELS];    ///< true if channel is coded
    int exponents_bsize[MAX_CHANNELS];      ///< log2 ratio frame/exp. length
-    DECLARE_ALIGNED(16, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
+    DECLARE_ALIGNED(32, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
    float max_exponent[MAX_CHANNELS];
    WMACoef coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE];
-    DECLARE_ALIGNED(16, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
+    DECLARE_ALIGNED(32, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
-    DECLARE_ALIGNED(16, FFTSample, output)[BLOCK_MAX_SIZE * 2];
+    DECLARE_ALIGNED(32, FFTSample, output)[BLOCK_MAX_SIZE * 2];
    FFTContext mdct_ctx[BLOCK_NB_SIZES];
    float *windows[BLOCK_NB_SIZES];
    /* output buffer for one frame and the last for IMDCT windowing */
-    DECLARE_ALIGNED(16, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
+    DECLARE_ALIGNED(32, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
    /* last frame info */
    uint8_t last_superframe[MAX_CODED_SUPERFRAME_SIZE + 4]; /* padding added */
    int last_bitoffset;
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@ -145,7 +145,7 @@ typedef struct {
    uint8_t  table_idx;                               ///< index in sf_offsets for the scale factor reference block
    float*   coeffs;                                  ///< pointer to the subframe decode buffer
    uint16_t num_vec_coeffs;                          ///< number of vector coded coefficients
-    DECLARE_ALIGNED(16, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
+    DECLARE_ALIGNED(32, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
 } WMAProChannelCtx;
 /**
@ -170,7 +170,7 @@ typedef struct WMAProDecodeCtx {
                      FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
    PutBitContext    pb;                            ///< context for filling the frame_data buffer
    FFTContext       mdct_ctx[WMAPRO_BLOCK_SIZES];  ///< MDCT context per block size
-    DECLARE_ALIGNED(16, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
+    DECLARE_ALIGNED(32, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
    float*           windows[WMAPRO_BLOCK_SIZES];   ///< windows for the different block sizes
    /* frame size dependent frame information (set during initialization) */
--- a/libavcodec/wmavoice.c
+++ b/libavcodec/wmavoice.c
@ -275,11 +275,11 @@ typedef struct {
                                  ///< by postfilter
    float denoise_filter_cache[MAX_FRAMESIZE];
    int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
-    DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80];
+    DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
                                  ///< aligned buffer for LPC tilting
-    DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80];
+    DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
                                  ///< aligned buffer for denoise coefficients
-    DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
+    DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
                                  ///< aligned buffer for postfilter speech
                                  ///< synthesis
    /**
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@ -25,7 +25,14 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
 {
 #if HAVE_YASM
    int has_vectors = av_get_cpu_flags();
-    if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
+    if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) {
        /* AVX for SB */
        s->imdct_calc      = ff_imdct_calc_sse;
        s->imdct_half      = ff_imdct_half_avx;
        s->fft_permute     = ff_fft_permute_sse;
        s->fft_calc        = ff_fft_calc_avx;
        s->fft_permutation = FF_FFT_PERM_AVX;
    } else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
        /* SSE for P3/P4/K8 */
        s->imdct_calc  = ff_imdct_calc_sse;
        s->imdct_half  = ff_imdct_half_sse;
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@ -22,6 +22,7 @@
 #include "libavcodec/fft.h"
 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
@ -32,6 +33,7 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
 #endif
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@ -1,6 +1,7 @@
 ;******************************************************************************
 ;* FFT transform with SSE/3DNow optimizations
 ;* Copyright (c) 2008 Loren Merritt
 ;* Copyright (c) 2011 Vitor Sessak
 ;*
 ;* This algorithm (though not any of the implementation details) is
 ;* based on libdjbfft by D. J. Bernstein.
@ -49,9 +50,21 @@ endstruc
 SECTION_RODATA
 %define M_SQRT1_2 0.70710678118654752440
-ps_root2: times 4 dd M_SQRT1_2
+%define M_COS_PI_1_8 0.923879532511287
-ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+%define M_COS_PI_3_8 0.38268343236509
-ps_p1p1m1p1: dd 0, 0, 1<<31, 0
+
 align 32
 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
 ps_root2: times 8 dd M_SQRT1_2
 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
 ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
 perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
 ps_m1p1: dd 1<<31, 0
 %assign i 16
@ -96,51 +109,80 @@ section .text align=16
    SWAP     %3, %6
 %endmacro
-; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
+;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
-; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
+;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
-%macro T4_SSE 3
+;      %3, %4, %5 tmp
-    mova     %3, %1
+; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
-    addps    %1, %2       ; {t1,t2,t6,t5}
+;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
-    subps    %3, %2       ; {t3,t4,-t8,t7}
+%macro T8_AVX 5
-    xorps    %3, [ps_p1p1m1p1]
+    vsubps     %5, %1, %2       ; v  = %1 - %2
-    mova     %2, %1
+    vaddps     %3, %1, %2       ; w  = %1 + %2
-    shufps   %1, %3, 0x44 ; {t1,t2,t3,t4}
+    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
-    shufps   %2, %3, 0xbe ; {t6,t5,t7,t8}
+    vpermilps  %2, %2, [perm1]
-    mova     %3, %1
+    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
-    addps    %1, %2       ; {r0,i0,r1,i1}
+    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
-    subps    %3, %2       ; {r2,i2,r3,i3}
+    vsubps     %4, %5, %1       ; s = r - q
-    mova     %2, %1
+    vaddps     %1, %5, %1       ; u = r + q
-    shufps   %1, %3, 0x88 ; {r0,r1,r2,r3}
+    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
-    shufps   %2, %3, 0xdd ; {i0,i1,i2,i3}
+    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
 %endmacro
 ; In SSE mode do one fft4 transforms
 ; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
 ;
 ; In AVX mode do two fft4 transforms
 ; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
 ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
 ; %1, %2: input/output registers, %3: temporary (written before it is read).
 ; Every instruction uses the three-operand "dst, src1, src2" form so the same
 ; body serves both register widths described above.
 ; NOTE(review): presumably x86inc rewrites these to two-operand SSE when not
 ; assembling for AVX - confirm against x86inc.asm.
 %macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
 %endmacro
 ; In SSE mode do one FFT8
 ; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
 ;
 ; In AVX mode do two FFT8
 ; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
 ;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
 ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
 ;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
 %macro T8_SSE 6
-    mova     %6, %3
+    addps    %6, %3, %4       ; {t1,t2,t3,t4}
-    subps    %3, %4       ; {r5,i5,r7,i7}
+    subps    %3, %3, %4       ; {r5,i5,r7,i7}
-    addps    %6, %4       ; {t1,t2,t3,t4}
+    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
-    mova     %4, %3
+    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
-    shufps   %4, %4, 0xb1 ; {i5,r5,i7,r7}
+    mulps    %4, %4, [ps_root2]
-    mulps    %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
+    addps    %3, %3, %4       ; {t8,t7,ta,t9}
-    mulps    %4, [ps_root2]
+    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
-    addps    %3, %4       ; {t8,t7,ta,t9}
+    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
-    mova     %4, %6
+    subps    %3, %6, %4       ; {t6,t5,tc,tb}
-    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
+    addps    %6, %6, %4       ; {t1,t2,t9,ta}
-    shufps   %4, %3, 0x9c ; {t1,t4,t7,ta}
+    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
-    mova     %3, %6
+    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
-    addps    %6, %4       ; {t1,t2,t9,ta}
+    subps    %3, %1, %6       ; {r4,r5,r6,r7}
-    subps    %3, %4       ; {t6,t5,tc,tb}
+    addps    %1, %1, %6       ; {r0,r1,r2,r3}
-    mova     %4, %6
+    subps    %4, %2, %5       ; {i4,i5,i6,i7}
-    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
+    addps    %2, %2, %5       ; {i0,i1,i2,i3}
    shufps   %4, %3, 0x8d ; {t2,ta,t6,tc}
    mova     %3, %1
    mova     %5, %2
    addps    %1, %6       ; {r0,r1,r2,r3}
    addps    %2, %4       ; {i0,i1,i2,i3}
    subps    %3, %6       ; {r4,r5,r6,r7}
    subps    %5, %4       ; {i4,i5,i6,i7}
    SWAP     %4, %5
 %endmacro
 ; scheduled for cpu-bound sizes
@ -148,52 +190,44 @@ section .text align=16
 IF%1 mova    m4, Z(4)
 IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m2, m4
    mova     m1, %3 ; wim
-    mova     m3, m5
+    mulps    m2, m4, m0 ; r2*wre
    mulps    m2, m0 ; r2*wre
 IF%1 mova    m6, Z2(6)
-    mulps    m3, m1 ; i2*wim
+    mulps    m3, m5, m1 ; i2*wim
 IF%1 mova    m7, Z2(7)
-    mulps    m4, m1 ; r2*wim
+    mulps    m4, m4, m1 ; r2*wim
-    mulps    m5, m0 ; i2*wre
+    mulps    m5, m5, m0 ; i2*wre
-    addps    m2, m3 ; r2*wre + i2*wim
+    addps    m2, m2, m3 ; r2*wre + i2*wim
-    mova     m3, m1
+    mulps    m3, m1, m7 ; i3*wim
-    mulps    m1, m6 ; r3*wim
+    subps    m5, m5, m4 ; i2*wre - r2*wim
-    subps    m5, m4 ; i2*wre - r2*wim
+    mulps    m1, m1, m6 ; r3*wim
-    mova     m4, m0
+    mulps    m4, m0, m6 ; r3*wre
-    mulps    m3, m7 ; i3*wim
+    mulps    m0, m0, m7 ; i3*wre
-    mulps    m4, m6 ; r3*wre
+    subps    m4, m4, m3 ; r3*wre - i3*wim
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
-    addps    m0, m1 ; i3*wre + r3*wim
+    addps    m0, m0, m1 ; i3*wre + r3*wim
-    mova     m1, m4
+    subps    m1, m4, m2 ; t3
-    addps    m4, m2 ; t5
+    addps    m4, m4, m2 ; t5
-    subps    m1, m2 ; t3
+    subps    m3, m3, m4 ; r2
-    subps    m3, m4 ; r2
+    addps    m4, m4, Z(0) ; r0
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
-    mova     m3, m5
+    subps    m3, m5, m0 ; t4
-    subps    m5, m0 ; t4
+    subps    m4, m6, m3 ; r3
-    mova     m4, m6
+    addps    m3, m3, m6 ; r1
-    subps    m6, m5 ; r3
+    mova  Z2(6), m4
-    addps    m5, m4 ; r1
+    mova   Z(2), m3
    mova  Z2(6), m6
    mova   Z(2), m5
    mova     m2, Z(3)
-    addps    m3, m0 ; t6
+    addps    m3, m5, m0 ; t6
-    subps    m2, m1 ; i3
+    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
-    addps    m1, Z(3) ; i1
+    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
-    mova     m4, m7
+    subps    m4, m7, m3 ; i2
-    subps    m7, m3 ; i2
+    addps    m3, m3, m7 ; i0
-    addps    m3, m4 ; i0
+    mova   Z(5), m4
    mova   Z(5), m7
    mova   Z(1), m3
 %endmacro
@ -201,77 +235,55 @@ IF%1 mova    m7, Z2(7)
 %macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m2, m4
    mova     m0, [wq] ; wre
    mova     m3, m5
    mova     m1, [wq+o1q] ; wim
-    mulps    m2, m0 ; r2*wre
+    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
-    mulps    m3, m1 ; i2*wim
+    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
-    mulps    m4, m1 ; r2*wim
+    mulps    m4, m4, m1 ; r2*wim
-    mulps    m5, m0 ; i2*wre
+    mulps    m5, m5, m0 ; i2*wre
-    addps    m2, m3 ; r2*wre + i2*wim
+    addps    m2, m2, m3 ; r2*wre + i2*wim
-    mova     m3, m1
+    mulps    m3, m1, m7 ; i3*wim
-    mulps    m1, m6 ; r3*wim
+    mulps    m1, m1, m6 ; r3*wim
-    subps    m5, m4 ; i2*wre - r2*wim
+    subps    m5, m5, m4 ; i2*wre - r2*wim
-    mova     m4, m0
+    mulps    m4, m0, m6 ; r3*wre
-    mulps    m3, m7 ; i3*wim
+    mulps    m0, m0, m7 ; i3*wre
-    mulps    m4, m6 ; r3*wre
+    subps    m4, m4, m3 ; r3*wre - i3*wim
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
-    addps    m0, m1 ; i3*wre + r3*wim
+    addps    m0, m0, m1 ; i3*wre + r3*wim
-    mova     m1, m4
+    subps    m1, m4, m2 ; t3
-    addps    m4, m2 ; t5
+    addps    m4, m4, m2 ; t5
-    subps    m1, m2 ; t3
+    subps    m3, m3, m4 ; r2
-    subps    m3, m4 ; r2
+    addps    m4, m4, Z(0) ; r0
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
-    mova     m3, m5
+    subps    m3, m5, m0 ; t4
-    subps    m5, m0 ; t4
+    subps    m4, m6, m3 ; r3
-    mova     m4, m6
+    addps    m3, m3, m6 ; r1
-    subps    m6, m5 ; r3
+IF%1 mova Z2(6), m4
-    addps    m5, m4 ; r1
+IF%1 mova  Z(2), m3
 IF%1 mova Z2(6), m6
 IF%1 mova  Z(2), m5
    mova     m2, Z(3)
-    addps    m3, m0 ; t6
+    addps    m5, m5, m0 ; t6
-    subps    m2, m1 ; i3
+    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
-    addps    m1, Z(3) ; i1
+    addps    m1, m1, Z(3) ; i1
 IF%1 mova Z2(7), m2
 IF%1 mova  Z(3), m1
-    mova     m4, m7
+    subps    m6, m7, m5 ; i2
-    subps    m7, m3 ; i2
+    addps    m5, m5, m7 ; i0
-    addps    m3, m4 ; i0
+IF%1 mova  Z(5), m6
-IF%1 mova  Z(5), m7
+IF%1 mova  Z(1), m5
 IF%1 mova  Z(1), m3
 %if %1==0
-    mova     m4, m5 ; r1
+    INTERL m1, m3, m7, Z, 2
-    mova     m0, m6 ; r3
+    INTERL m2, m4, m0, Z2, 6
-    unpcklps m5, m1
+
    unpckhps m4, m1
    unpcklps m6, m2
    unpckhps m0, m2
    mova     m1, Z(0)
    mova     m2, Z(4)
-    mova   Z(2), m5
+
-    mova   Z(3), m4
+    INTERL m5, m1, m3, Z, 0
-    mova  Z2(6), m6
+    INTERL m6, m2, m7, Z, 4
    mova  Z2(7), m0
    mova     m5, m1 ; r0
    mova     m4, m2 ; r2
    unpcklps m1, m3
    unpckhps m5, m3
    unpcklps m2, m7
    unpckhps m4, m7
    mova   Z(0), m1
    mova   Z(1), m5
    mova   Z(4), m2
    mova   Z(5), m4
 %endif
 %endmacro
@ -281,13 +293,106 @@ IF%1 mova  Z(1), m3
    punpckhdq %3, %2
 %endmacro
 INIT_XMM
 %define mova movaps
 %define Z(x) [r0+mmsize*x]
 %define Z2(x) [r0+mmsize*x]
 %define ZH(x) [r0+mmsize*x+mmsize/2]
 INIT_YMM
 align 16
 ; Loads the vectors at Z(0) and Z(1), runs T8_AVX on them and stores the
 ; two results back to Z(0) and Z(1).
 ; m2, m3 and m4 are passed as the macro's temporaries and are clobbered.
 fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret
 align 16
 ; Processes the four vectors Z(0)..Z(3) and writes all four back.
 ;   - Z(2)/Z(3) go through T4_SSE (two fft4 transforms in AVX mode),
 ;   - Z(0)/Z(1) go through T8_AVX,
 ;   - the T4 outputs are multiplied by the ps_cos16_1 / ps_cos16_2 tables
 ;     and combined with the T8 outputs by add/sub.
 ; m0-m7 are all used; nothing is preserved.
 fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7
    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7
    ; m7 = m3*cos16_2 + m2*cos16_1, m3 = m3*cos16_1 - m2*cos16_2
    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    ; regroup the 128-bit halves of m7/m3 before the final add/sub stage
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    ; combine with the T8_AVX results held in m0/m1
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    ; store each result as two 128-bit halves: the Z(x) slot takes a half of
    ; m0/m5, the ZH(x) slot (Z(x) + mmsize/2) the matching half of m1/m3
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret
 align 16
 ; Calls fft16_avx for Z(0)..Z(3), then runs T4_SSE and T8_SSE over
 ; Z(4)..Z(7), regroups the 128-bit halves of those results into m4-m7 and
 ; finishes with PASS_SMALL using the cos_32 table.
 fft32_avx:
    call fft16_avx
    mova m0, Z(4)
    mova m1, Z(5)
    T4_SSE      m0, m1, m4
    mova m2, Z(6)
    mova m3, Z(7)
    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
    ; 0x20 pairs the low halves of the two sources, 0x31 pairs the high halves
    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31
    ; NOTE(review): first argument 0 presumably disables PASS_SMALL's
    ; IF%1-guarded loads so it consumes m4-m7 as prepared above - confirm
    ; against the IF0/IF1 definitions.
    PASS_SMALL 0, [cos_32], [cos_32+32]
    ret
 ; Runs fft32_avx, then walks the buffer at r0 rewriting each pair of
 ; vectors Z(0)/Z(1) as the element-wise interleave of the two.
 ; r2d starts at 32 and drops by mmsize/4 per iteration; r0 advances by
 ; mmsize*2 per iteration and is NOT restored before returning.
 ; NOTE(review): mmsize is presumably 32 under INIT_YMM, giving four
 ; iterations - confirm against x86inc.asm.
 fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
 .deint_loop:
    mova     m2, Z(0)
    mova     m3, Z(1)
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    ; write back as 128-bit halves: low halves first, then high halves
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret
 INIT_XMM
 %define movdqa  movaps
 align 16
 fft4_avx:
 fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
@ -406,6 +511,8 @@ FFT48_3DN _3dn
 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
 %define Z2(x) [zq + o3q + mmsize*(x&1)]
 %define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
 %define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
 %macro DECL_PASS 2+ ; name, payload
 align 16
@ -423,8 +530,34 @@ DEFINE_ARGS z, w, n, o1, o3
    rep ret
 %endmacro
 INIT_YMM
 ; Interleaves the elements of two registers and stores the result to memory.
 ;   %1, %2: source registers (%2 is overwritten with the low unpack)
 ;   %3:     scratch register, receives the high unpack
 ;   %4:     name of the addressing macro (the call sites pass Z or Z2)
 ;   %5:     slot index passed to that addressing macro
 ; "%4 %+ H" pastes an H suffix onto the macro name, selecting ZH / Z2H,
 ; i.e. the upper mmsize/2 bytes of the slot.
 %macro INTERL_AVX 5
    vunpckhps      %3, %2, %1
    vunpcklps      %2, %2, %1
    vextractf128   %4(%5), %2, 0
    vextractf128  %4 %+ H(%5), %3, 0
    vextractf128   %4(%5 + 1), %2, 1
    vextractf128  %4 %+ H(%5 + 1), %3, 1
 %endmacro
 %define INTERL INTERL_AVX
 DECL_PASS pass_avx, PASS_BIG 1
 DECL_PASS pass_interleave_avx, PASS_BIG 0
 INIT_XMM
-%define mova movaps
+
 ; Interleaves the elements of two registers and stores the result to memory.
 ;   %1:     second source register (read only)
 ;   %2:     first source register, overwritten with the low unpack
 ;   %3:     scratch register, receives a copy of %2 and then the high unpack
 ;   %4:     name of the addressing macro (the call sites pass Z or Z2)
 ;   %5:     slot index; results go to slots %5 and %5+1
 %macro INTERL_SSE 5
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5), %2
    mova  %4(%5+1), %3
 %endmacro
 %define INTERL INTERL_SSE
 DECL_PASS pass_sse, PASS_BIG 1
 DECL_PASS pass_interleave_sse, PASS_BIG 0
@ -457,9 +590,12 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0
 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
-%if %1==5
+%if %1>=5
 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
 %endif
 %if %1>=6
 %xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
 %endif
 %assign n 1<<%1
 %rep 17-%1
@ -492,9 +628,14 @@ section .text
 ; The others pass args in registers and don't spill anything.
 cglobal fft_dispatch%3%2, 2,5,8, z, nbits
    FFT_DISPATCH %3%2, nbits
 %ifidn %2, _avx
    vzeroupper
 %endif
    RET
 %endmacro ; DECL_FFT
 DECL_FFT 6, _avx
 DECL_FFT 6, _avx, _interleave
 DECL_FFT 5, _sse
 DECL_FFT 5, _sse, _interleave
 DECL_FFT 4, _3dn
@ -533,21 +674,53 @@ INIT_XMM
 %endmacro
 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
-    movaps   xmm6, [%4+%1*2]
+    mulps      m6, %3, [%5+%1]
-    movaps   %2,   [%4+%1*2+0x10]
+    mulps      m7, %2, [%5+%1]
-    movaps   %3,   xmm6
+    mulps      %2, %2, [%6+%1]
-    movaps   xmm7, %2
+    mulps      %3, %3, [%6+%1]
-    mulps    xmm6, [%5+%1]
+    subps      %2, %2, m6
-    mulps    %2,   [%6+%1]
+    addps      %3, %3, m7
-    mulps    %3,   [%6+%1]
+%endmacro
-    mulps    xmm7, [%5+%1]
+
-    subps    %2,   xmm6
+%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
-    addps    %3,   xmm7
+.post:
    vmovaps      ymm1,   [%3+%1*2]
    vmovaps      ymm0,   [%3+%1*2+0x20]
    vmovaps      ymm3,   [%3+%2*2]
    vmovaps      ymm2,   [%3+%2*2+0x20]
    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3
    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1
    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub      %2,   0x20
    add      %1,   0x20
    jl       .post
 %endmacro
 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
 .post:
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
@ -566,7 +739,8 @@ INIT_XMM
    jl       .post
 %endmacro
-cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
+%macro DECL_IMDCT 2
 cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
 %ifdef ARCH_X86_64
 %define rrevtab r10
 %define rtcos   r11
@ -641,7 +815,7 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]
-    FFT_DISPATCH _sse, r1
+    FFT_DISPATCH %1, r1
    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
@ -653,14 +827,24 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
    mov  rtsin, [esp+4]
 %endif
    neg  r0
-    mov  r1, -16
+    mov  r1, -mmsize
    sub  r1, r0
-    POSROTATESHUF r0, r1, r6, rtcos, rtsin
+    %2 r0, r1, r6, rtcos, rtsin
 %ifdef ARCH_X86_64
    pop  r14
    pop  r13
    pop  r12
 %else
    add esp, 12
 %endif
 %ifidn avx_enabled, 1
    vzeroupper
 %endif
    RET
 %endmacro
 DECL_IMDCT _sse, POSROTATESHUF
 INIT_YMM
 DECL_IMDCT _avx, POSROTATESHUF_AVX
--- a/libavcodec/x86/fft_sse.c
+++ b/libavcodec/x86/fft_sse.c
@ -28,6 +28,12 @@ DECLARE_ASM_CONST(16, int, ff_m1m1m1m1)[4] =
 void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
 void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
 void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
 /**
  * AVX entry point for the FFT: hands z and s->nbits over to
  * ff_fft_dispatch_interleave_avx().
  */
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
 {
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
 }
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
 {
@ -77,7 +83,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
    long n = s->mdct_size;
    long n4 = n >> 2;
-    ff_imdct_half_sse(s, output+n4, input);
+    s->imdct_half(s, output + n4, input);
    j = -n;
    k = n-16;
--- a/libavcodec/x86/x86inc.asm
+++ b/libavcodec/x86/x86inc.asm
@ -1,10 +1,11 @@
 ;*****************************************************************************
 ;* x86inc.asm
 ;*****************************************************************************
-;* Copyright (C) 2005-2008 x264 project
+;* Copyright (C) 2005-2011 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
 ;*          Jason Garrett-Glaser <darkshikari@gmail.com>
 ;*
 ;* Permission to use, copy, modify, and/or distribute this software for any
 ;* purpose with or without fee is hereby granted, provided that the above
@ -499,6 +500,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %endmacro
 %macro INIT_MMX 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
@ -520,6 +522,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %endmacro
 %macro INIT_XMM 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %define num_mmregs 8
@ -538,6 +541,31 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
    %endrep
 %endmacro
 ; Select 128-bit AVX mode: start from the INIT_XMM setup, then turn
 ; avx_enabled on so that RUN_AVX_INSTR emits the v-prefixed form for
 ; 16-byte registers. PALIGNR is aliased to PALIGNR_SSSE3 (defined elsewhere).
 ; RESET_MM_PERMUTATION is pointed back at this macro; presumably so that a
 ; permutation reset re-establishes AVX mode rather than plain XMM -- confirm.
 %macro INIT_AVX 0
    INIT_XMM
    ; must come after INIT_XMM, which sets avx_enabled to 0
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
 %endmacro
 ; Select 256-bit AVX mode: avx_enabled is set, mmsize becomes 32 and the
 ; generic mova/movu names map to vmovaps/vmovups. num_mmregs is 8, or 16 when
 ; ARCH_X86_64 is defined. The final loop runs CAT_XDEFINE (defined elsewhere)
 ; once per register index; presumably this binds m<i> to ymm<i> and nymm<i>
 ; to <i> -- confirm against CAT_XDEFINE.
 %macro INIT_YMM 0
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM
    %define mmsize 32
    %define num_mmregs 8
    %ifdef ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, ymm %+ %%i
    CAT_XDEFINE nymm, %%i, %%i
    %assign %%i %%i+1
    %endrep
 %endmacro
 INIT_MMX
 ; I often want to use macros that permute their arguments. e.g. there's no
@ -645,3 +673,222 @@ INIT_MMX
        sub %1, %2
    %endif
 %endmacro
 ;=============================================================================
 ; AVX abstraction layer
 ;=============================================================================
 ; Register-width table consulted through the "sizeof" prefix (see the
 ; sizeof tests in RUN_AVX_INSTR). For i = 0..15 this records 16 for xmm<i>
 ; and 32 for ymm<i>; 8 is recorded for mm<i> only while i < 8.
 ; CAT_XDEFINE is defined elsewhere; presumably it defines the symbol made by
 ; concatenating its first two arguments -- confirm.
 %assign i 0
 %rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
 %assign i i+1
 %endrep
 ; drop the loop counter again once the table is built
 %undef i
 ; Emit one instruction written in AVX operand style (separate destination),
 ; as a real v-prefixed instruction when possible, otherwise as a register
 ; copy followed by the ordinary destructive form.
 ;%1 == instruction
 ;%2 == 1 if float, 0 if int
 ;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
 ;%4 == number of operands given
 ;%5+: operands
 %macro RUN_AVX_INSTR 6-7+
    ; 32-byte (ymm) destination: always emit the v-prefixed form
    %if sizeof%5==32
        v%1 %5, %6, %7
    %else
        ; choose the register-copy instruction for the emulation path:
        ; movq for 8-byte registers, else movaps (float) or movdqa (int)
        %if sizeof%5==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif
        ; enough operands were given for the separate-destination form
        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    ; emulate: copy the first source into the destination,
                    ; then apply the destructive form
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                ; destination already equals the first source: no copy needed
                %1 %5, %7
            %endif
        ; otherwise the caller already used the destructive form: pass it through
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
 %endmacro
 ; Declare a macro named after the instruction itself, so that later uses of
 ; that mnemonic go through RUN_AVX_INSTR. The generated macro takes 2 to 8
 ; parameters; the defaults fill parameters 3-5 with the sentinel "fnord" and
 ; parameters 6-8 with this macro's own three arguments. The number of
 ; operands actually written is found by testing which of parameters 3-5
 ; still equals the sentinel, and is passed on as RUN_AVX_INSTR's fourth
 ; argument.
 ;%1 == instruction
 ;%2 == 1 if float, 0 if int
 ;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
 %macro AVX_INSTR 3
    %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
 %endmacro
 AVX_INSTR addpd, 1, 0
 AVX_INSTR addps, 1, 0
 AVX_INSTR addsd, 1, 0
 AVX_INSTR addss, 1, 0
 AVX_INSTR addsubpd, 1, 0
 AVX_INSTR addsubps, 1, 0
 AVX_INSTR andpd, 1, 0
 AVX_INSTR andps, 1, 0
 AVX_INSTR andnpd, 1, 0
 AVX_INSTR andnps, 1, 0
 AVX_INSTR blendpd, 1, 0
 AVX_INSTR blendps, 1, 0
 AVX_INSTR blendvpd, 1, 0
 AVX_INSTR blendvps, 1, 0
 AVX_INSTR cmppd, 1, 0
 AVX_INSTR cmpps, 1, 0
 AVX_INSTR cmpsd, 1, 0
 AVX_INSTR cmpss, 1, 0
 AVX_INSTR divpd, 1, 0
 AVX_INSTR divps, 1, 0
 AVX_INSTR divsd, 1, 0
 AVX_INSTR divss, 1, 0
 AVX_INSTR dppd, 1, 0
 AVX_INSTR dpps, 1, 0
 AVX_INSTR haddpd, 1, 0
 AVX_INSTR haddps, 1, 0
 AVX_INSTR hsubpd, 1, 0
 AVX_INSTR hsubps, 1, 0
 AVX_INSTR maxpd, 1, 0
 AVX_INSTR maxps, 1, 0
 AVX_INSTR maxsd, 1, 0
 AVX_INSTR maxss, 1, 0
 AVX_INSTR minpd, 1, 0
 AVX_INSTR minps, 1, 0
 AVX_INSTR minsd, 1, 0
 AVX_INSTR minss, 1, 0
 AVX_INSTR mpsadbw, 0, 1
 AVX_INSTR mulpd, 1, 0
 AVX_INSTR mulps, 1, 0
 AVX_INSTR mulsd, 1, 0
 AVX_INSTR mulss, 1, 0
 AVX_INSTR orpd, 1, 0
 AVX_INSTR orps, 1, 0
 AVX_INSTR packsswb, 0, 0
 AVX_INSTR packssdw, 0, 0
 AVX_INSTR packuswb, 0, 0
 AVX_INSTR packusdw, 0, 0
 AVX_INSTR paddb, 0, 0
 AVX_INSTR paddw, 0, 0
 AVX_INSTR paddd, 0, 0
 AVX_INSTR paddq, 0, 0
 AVX_INSTR paddsb, 0, 0
 AVX_INSTR paddsw, 0, 0
 AVX_INSTR paddusb, 0, 0
 AVX_INSTR paddusw, 0, 0
 AVX_INSTR palignr, 0, 1
 AVX_INSTR pand, 0, 0
 AVX_INSTR pandn, 0, 0
 AVX_INSTR pavgb, 0, 0
 AVX_INSTR pavgw, 0, 0
 AVX_INSTR pblendvb, 0, 0
 AVX_INSTR pblendw, 0, 1
 AVX_INSTR pcmpestri, 0, 0
 AVX_INSTR pcmpestrm, 0, 0
 AVX_INSTR pcmpistri, 0, 0
 AVX_INSTR pcmpistrm, 0, 0
 AVX_INSTR pcmpeqb, 0, 0
 AVX_INSTR pcmpeqw, 0, 0
 AVX_INSTR pcmpeqd, 0, 0
 AVX_INSTR pcmpeqq, 0, 0
 AVX_INSTR pcmpgtb, 0, 0
 AVX_INSTR pcmpgtw, 0, 0
 AVX_INSTR pcmpgtd, 0, 0
 AVX_INSTR pcmpgtq, 0, 0
 AVX_INSTR phaddw, 0, 0
 AVX_INSTR phaddd, 0, 0
 AVX_INSTR phaddsw, 0, 0
 AVX_INSTR phsubw, 0, 0
 AVX_INSTR phsubd, 0, 0
 AVX_INSTR phsubsw, 0, 0
 AVX_INSTR pmaddwd, 0, 0
 AVX_INSTR pmaddubsw, 0, 0
 AVX_INSTR pmaxsb, 0, 0
 AVX_INSTR pmaxsw, 0, 0
 AVX_INSTR pmaxsd, 0, 0
 AVX_INSTR pmaxub, 0, 0
 AVX_INSTR pmaxuw, 0, 0
 AVX_INSTR pmaxud, 0, 0
 AVX_INSTR pminsb, 0, 0
 AVX_INSTR pminsw, 0, 0
 AVX_INSTR pminsd, 0, 0
 AVX_INSTR pminub, 0, 0
 AVX_INSTR pminuw, 0, 0
 AVX_INSTR pminud, 0, 0
 AVX_INSTR pmulhuw, 0, 0
 AVX_INSTR pmulhrsw, 0, 0
 AVX_INSTR pmulhw, 0, 0
 AVX_INSTR pmullw, 0, 0
 AVX_INSTR pmulld, 0, 0
 AVX_INSTR pmuludq, 0, 0
 AVX_INSTR pmuldq, 0, 0
 AVX_INSTR por, 0, 0
 AVX_INSTR psadbw, 0, 0
 AVX_INSTR pshufb, 0, 0
 AVX_INSTR psignb, 0, 0
 AVX_INSTR psignw, 0, 0
 AVX_INSTR psignd, 0, 0
 AVX_INSTR psllw, 0, 0
 AVX_INSTR pslld, 0, 0
 AVX_INSTR psllq, 0, 0
 AVX_INSTR pslldq, 0, 0
 AVX_INSTR psraw, 0, 0
 AVX_INSTR psrad, 0, 0
 AVX_INSTR psrlw, 0, 0
 AVX_INSTR psrld, 0, 0
 AVX_INSTR psrlq, 0, 0
 AVX_INSTR psrldq, 0, 0
 AVX_INSTR psubb, 0, 0
 AVX_INSTR psubw, 0, 0
 AVX_INSTR psubd, 0, 0
 AVX_INSTR psubq, 0, 0
 AVX_INSTR psubsb, 0, 0
 AVX_INSTR psubsw, 0, 0
 AVX_INSTR psubusb, 0, 0
 AVX_INSTR psubusw, 0, 0
 AVX_INSTR punpckhbw, 0, 0
 AVX_INSTR punpckhwd, 0, 0
 AVX_INSTR punpckhdq, 0, 0
 AVX_INSTR punpckhqdq, 0, 0
 AVX_INSTR punpcklbw, 0, 0
 AVX_INSTR punpcklwd, 0, 0
 AVX_INSTR punpckldq, 0, 0
 AVX_INSTR punpcklqdq, 0, 0
 AVX_INSTR pxor, 0, 0
 ; shufps works on packed single-precision floats and is an SSE1 instruction,
 ; so it is flagged as float: the non-AVX emulation in RUN_AVX_INSTR then
 ; copies the source register with movaps instead of the SSE2-only movdqa.
 AVX_INSTR shufps, 1, 1
 AVX_INSTR subpd, 1, 0
 AVX_INSTR subps, 1, 0
 AVX_INSTR subsd, 1, 0
 AVX_INSTR subss, 1, 0
 AVX_INSTR unpckhpd, 1, 0
 AVX_INSTR unpckhps, 1, 0
 AVX_INSTR unpcklpd, 1, 0
 AVX_INSTR unpcklps, 1, 0
 AVX_INSTR xorpd, 1, 0
 AVX_INSTR xorps, 1, 0
 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN
 AVX_INSTR pfadd, 1, 0
 AVX_INSTR pfsub, 1, 0
 AVX_INSTR pfmul, 1, 0
--- a/libavfilter/avfilter.h
+++ b/libavfilter/avfilter.h
@ -26,8 +26,8 @@
 #include "libavutil/samplefmt.h"
 #define LIBAVFILTER_VERSION_MAJOR  2
-#define LIBAVFILTER_VERSION_MINOR  0
+#define LIBAVFILTER_VERSION_MINOR  3
-#define LIBAVFILTER_VERSION_MICRO  0
+#define LIBAVFILTER_VERSION_MICRO  1
 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
                                               LIBAVFILTER_VERSION_MINOR, \
@ -115,7 +115,7 @@ typedef struct AVFilterBufferRefVideoProps {
    AVRational pixel_aspect;    ///< pixel aspect ratio
    int interlaced;             ///< is frame interlaced
    int top_field_first;        ///< field order
-    int pict_type;              ///< Picture type of the frame
+    enum AVPictureType pict_type; ///< picture type of the frame
    int key_frame;              ///< 1 -> keyframe, 0-> not
 } AVFilterBufferRefVideoProps;
--- a/libavfilter/vf_scale.c
+++ b/libavfilter/vf_scale.c
@ -75,8 +75,8 @@ typedef struct {
    int input_is_pal;           ///< set to 1 if the input format is paletted
    int interlaced;
-    char w_expr[256];             ///< width  expression string
+    char w_expr[256];           ///< width  expression string
-    char h_expr[256];             ///< height expression string
+    char h_expr[256];           ///< height expression string
 } ScaleContext;
 static av_cold int init(AVFilterContext *ctx, const char *args, void *opaque)
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@ -1,7 +1,6 @@
 /*
 * Copyright (C) 2006-2010 Michael Niedermayer <michaelni@gmx.at>
- *               2010 James Darnley <james.darnley@gmail.com>
+ *               2010      James Darnley <james.darnley@gmail.com>
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
--- a/libavfilter/vsrc_buffer.c
+++ b/libavfilter/vsrc_buffer.c
@ -131,7 +131,7 @@ static av_cold int init(AVFilterContext *ctx, const char *args, void *opaque)
        (n = sscanf(args, "%d:%d:%127[^:]:%d:%d:%d:%d", &c->w, &c->h, pix_fmt_str,
                    &c->time_base.num, &c->time_base.den,
                    &c->pixel_aspect.num, &c->pixel_aspect.den)) != 7) {
-        av_log(ctx, AV_LOG_ERROR, "Expected 7 arguments, but %d found in '%s'\n", n, args);
+        av_log(ctx, AV_LOG_ERROR, "Expected 7 arguments, but only %d found in '%s'\n", n, args);
        return AVERROR(EINVAL);
    }
    if ((c->pix_fmt = av_get_pix_fmt(pix_fmt_str)) == PIX_FMT_NONE) {
--- a/libavformat/nutenc.c
+++ b/libavformat/nutenc.c
@ -175,7 +175,6 @@ static void build_frame_code(AVFormatContext *s){
        }
        key_frame= intra_only;
 #if 1
        if(is_audio){
            int frame_bytes= codec->frame_size*(int64_t)codec->bit_rate / (8*codec->sample_rate);
            int pts;
@ -199,7 +198,6 @@ static void build_frame_code(AVFormatContext *s){
            ft->pts_delta=1;
            start2++;
        }
 #endif
        if(codec->has_b_frames){
            pred_count=5;
--- a/libavutil/avutil.h
+++ b/libavutil/avutil.h
@ -40,7 +40,7 @@
 #define AV_VERSION(a, b, c) AV_VERSION_DOT(a, b, c)
 #define LIBAVUTIL_VERSION_MAJOR 51
-#define LIBAVUTIL_VERSION_MINOR  0
+#define LIBAVUTIL_VERSION_MINOR  1
 #define LIBAVUTIL_VERSION_MICRO  0
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
@ -97,6 +97,25 @@ enum AVMediaType {
 #define AV_TIME_BASE            1000000
 #define AV_TIME_BASE_Q          (AVRational){1, AV_TIME_BASE}
 /**
  * Picture types. Values start at 1, so 0 names no picture type.
  */
 enum AVPictureType {
    AV_PICTURE_TYPE_I  = 1, ///< Intra
    AV_PICTURE_TYPE_P  = 2, ///< Predicted
    AV_PICTURE_TYPE_B  = 3, ///< Bi-dir predicted
    AV_PICTURE_TYPE_S  = 4, ///< S(GMC)-VOP MPEG4
    AV_PICTURE_TYPE_SI = 5, ///< Switching Intra
    AV_PICTURE_TYPE_SP = 6, ///< Switching Predicted
    AV_PICTURE_TYPE_BI = 7, ///< BI type
 };
 /**
 * Return a single letter to describe the given picture type
 * pict_type.
 *
 * @param[in] pict_type the picture type
 * @return a single character representing the picture type,
 *         '?' if pict_type is unknown
 */
 char av_get_picture_type_char(enum AVPictureType pict_type);
 #include "common.h"
 #include "error.h"
 #include "mathematics.h"
--- a/libavutil/mem.c
+++ b/libavutil/mem.c
@ -69,21 +69,21 @@ void *av_malloc(size_t size)
 #endif
    /* let's disallow possible ambiguous cases */
-    if(size > (INT_MAX-16) )
+    if(size > (INT_MAX-32) )
        return NULL;
 #if CONFIG_MEMALIGN_HACK
-    ptr = malloc(size+16);
+    ptr = malloc(size+32);
    if(!ptr)
        return ptr;
-    diff= ((-(long)ptr - 1)&15) + 1;
+    diff= ((-(long)ptr - 1)&31) + 1;
    ptr = (char*)ptr + diff;
    ((char*)ptr)[-1]= diff;
 #elif HAVE_POSIX_MEMALIGN
-    if (posix_memalign(&ptr,16,size))
+    if (posix_memalign(&ptr,32,size))
        ptr = NULL;
 #elif HAVE_MEMALIGN
-    ptr = memalign(16,size);
+    ptr = memalign(32,size);
    /* Why 64?
       Indeed, we should align it:
         on 4 for 386
@ -93,10 +93,8 @@ void *av_malloc(size_t size)
       Because L1 and L2 caches are aligned on those values.
       But I don't want to code such logic here!
     */
-     /* Why 16?
+     /* Why 32?
-        Because some CPUs need alignment, for example SSE2 on P4, & most RISC CPUs
+        For AVX ASM. SSE / NEON needs only 16.
        On CPUs that need alignment a misaligned access
        will just trigger an exception and the unaligned load will be done in the
        exception handler or it will just segfault (SSE2 on P4).
        Why not larger? Because I did not see a difference in benchmarks ...
     */
     /* benchmarks with P3
--- a/libavutil/pca.c
+++ b/libavutil/pca.c
@ -218,7 +218,6 @@ int main(void){
        printf("\n");
    }
 #if 1
    for(i=0; i<LEN; i++){
        double v[LEN];
        double error=0;
@ -233,7 +232,7 @@ int main(void){
        printf("%f ", error);
    }
    printf("\n");
-#endif
+
    for(i=0; i<LEN; i++){
        for(j=0; j<LEN; j++){
            printf("%9.6f ", eigenvector[i + j*LEN]);
--- a/libavutil/utils.c
+++ b/libavutil/utils.c
@ -39,3 +39,17 @@ const char *avutil_license(void)
 #define LICENSE_PREFIX "libavutil license: "
    return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 }
 char av_get_picture_type_char(enum AVPictureType pict_type)
 {
    switch (pict_type) {
    case AV_PICTURE_TYPE_I:  return 'I';
    case AV_PICTURE_TYPE_P:  return 'P';
    case AV_PICTURE_TYPE_B:  return 'B';
    case AV_PICTURE_TYPE_S:  return 'S';
    case AV_PICTURE_TYPE_SI: return 'i';
    case AV_PICTURE_TYPE_SP: return 'p';
    case AV_PICTURE_TYPE_BI: return 'b';
    default:                 return '?';
    }
 }
--- a/tests/ref/acodec/ac3_fixed
+++ b/tests/ref/acodec/ac3_fixed
@ -1,2 +1,2 @@
-5ddb6d25dd117db29627f9d286153a7a *./tests/data/acodec/ac3.rm
+0f14801e166819dd4a58981aea36e08b *./tests/data/acodec/ac3.rm
 98751 ./tests/data/acodec/ac3.rm
--- a/tests/ref/lavf/rm
+++ b/tests/ref/lavf/rm
@ -1,2 +1,2 @@
-a1c71456f21d5459d2824d75bbdcc80c *./tests/data/lavf/lavf.rm
+2e3d6b1944c6cd2cf14e13055aecf82a *./tests/data/lavf/lavf.rm
 346706 ./tests/data/lavf/lavf.rm
--- a/tests/ref/seek/ac3_rm
+++ b/tests/ref/seek/ac3_rm
@ -11,7 +11,8 @@ ret:-1         st:-1 flags:1  ts: 1.470835
 ret:-1         st: 0 flags:0  ts: 0.365000
 ret: 0         st: 0 flags:1  ts:-0.741000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
-ret:-1         st:-1 flags:0  ts: 2.153336
+ret: 0         st:-1 flags:0  ts: 2.153336
 ret: 0         st: 0 flags:1 dts: 2.159000 pts: 2.159000 pos:  35567 size:   556
 ret:-1         st:-1 flags:1  ts: 1.047503
 ret: 0         st: 0 flags:0  ts:-0.058000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
`@ -1,2 +1,2 @@`
	`5ddb6d25dd117db29627f9d286153a7a *./tests/data/acodec/ac3.rm`	`0f14801e166819dd4a58981aea36e08b *./tests/data/acodec/ac3.rm`
	`98751 ./tests/data/acodec/ac3.rm`	`98751 ./tests/data/acodec/ac3.rm`
`@ -1,2 +1,2 @@`
	`a1c71456f21d5459d2824d75bbdcc80c *./tests/data/lavf/lavf.rm`	`2e3d6b1944c6cd2cf14e13055aecf82a *./tests/data/lavf/lavf.rm`
	`346706 ./tests/data/lavf/lavf.rm`	`346706 ./tests/data/lavf/lavf.rm`