dsputil: Split off HuffYUV decoding bits into their own context

Also shorten HuffYUV context member names to avoid clutter.
2024-11-23 11:19:55 +00:00 · 2014-01-07 12:23:13 +01:00 · 2014-01-07 12:23:13 +01:00 · 0d439fbede
commit 0d439fbede
parent 888dcd8675
23 changed files with 578 additions and 378 deletions
--- a/7
+++ b/7
@ -1541,6 +1541,7 @@ CONFIG_EXTRA="
    h264qpel
    hpeldsp
    huffman
+    huffyuvdsp
    intrax8
    lgplv3
    lpc
@ -1771,7 +1772,7 @@ h263p_encoder_select="h263_encoder"
 h264_decoder_select="cabac golomb h264chroma h264dsp h264pred h264qpel videodsp"
 h264_decoder_suggest="error_resilience"
 hevc_decoder_select="cabac dsputil golomb videodsp"
-huffyuv_decoder_select="dsputil"
+huffyuv_decoder_select="dsputil huffyuvdsp"
 huffyuv_encoder_select="dsputil huffman"
 iac_decoder_select="imc_decoder"
 imc_decoder_select="dsputil fft mdct sinewin"
@ -1780,7 +1781,7 @@ interplay_video_decoder_select="hpeldsp"
 jpegls_decoder_select="golomb mjpeg_decoder"
 jpegls_encoder_select="golomb"
 jv_decoder_select="dsputil"
-lagarith_decoder_select="dsputil"
+lagarith_decoder_select="huffyuvdsp"
 ljpeg_encoder_select="aandcttables mpegvideoenc"
 loco_decoder_select="golomb"
 mdec_decoder_select="dsputil error_resilience mpegvideo"
@ -1857,7 +1858,7 @@ tscc_decoder_deps="zlib"
 twinvq_decoder_select="mdct lsp sinewin"
 utvideo_decoder_select="dsputil"
 utvideo_encoder_select="dsputil huffman"
-vble_decoder_select="dsputil"
+vble_decoder_select="huffyuvdsp"
 vc1_decoder_select="error_resilience h263_decoder h264chroma h264qpel intrax8"
 vc1image_decoder_select="vc1_decoder"
 vorbis_decoder_select="mdct"
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@ -46,6 +46,7 @@ OBJS-$(CONFIG_H264PRED)                += h264pred.o
 OBJS-$(CONFIG_H264QPEL)                += h264qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += hpeldsp.o
 OBJS-$(CONFIG_HUFFMAN)                 += huffman.o
+OBJS-$(CONFIG_HUFFYUVDSP)              += huffyuvdsp.o
 OBJS-$(CONFIG_INTRAX8)                 += intrax8.o intrax8dsp.o
 OBJS-$(CONFIG_LIBXVID)                 += libxvid_rc.o
 OBJS-$(CONFIG_LPC)                     += lpc.o
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@ -1663,19 +1663,6 @@ void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
    }
 }

-static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
-{
-    long i;
-
-    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
-        long a = *(long *) (src + i);
-        long b = *(long *) (dst + i);
-        *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
-    }
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
-}
-
 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
 {
    long i;
@ -1704,26 +1691,6 @@ static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
        dst[i + 0] = src1[i + 0] - src2[i + 0];
 }

-static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
-                                         const uint8_t *diff, int w,
-                                         int *left, int *left_top)
-{
-    int i;
-    uint8_t l, lt;
-
-    l  = *left;
-    lt = *left_top;
-
-    for (i = 0; i < w; i++) {
-        l      = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF) + diff[i];
-        lt     = src1[i];
-        dst[i] = l;
-    }
-
-    *left     = l;
-    *left_top = lt;
-}
-
 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
@ -1745,66 +1712,6 @@ static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
    *left_top = lt;
 }

-static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
-                                      int w, int acc)
-{
-    int i;
-
-    for (i = 0; i < w - 1; i++) {
-        acc   += src[i];
-        dst[i] = acc;
-        i++;
-        acc   += src[i];
-        dst[i] = acc;
-    }
-
-    for (; i < w; i++) {
-        acc   += src[i];
-        dst[i] = acc;
-    }
-
-    return acc;
-}
-
-#if HAVE_BIGENDIAN
-#define B 3
-#define G 2
-#define R 1
-#define A 0
-#else
-#define B 0
-#define G 1
-#define R 2
-#define A 3
-#endif
-static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
-                                             int w, int *red, int *green,
-                                             int *blue, int *alpha)
-{
-    int i, r = *red, g = *green, b = *blue, a = *alpha;
-
-    for (i = 0; i < w; i++) {
-        b += src[4 * i + B];
-        g += src[4 * i + G];
-        r += src[4 * i + R];
-        a += src[4 * i + A];
-
-        dst[4 * i + B] = b;
-        dst[4 * i + G] = g;
-        dst[4 * i + R] = r;
-        dst[4 * i + A] = a;
-    }
-
-    *red   = r;
-    *green = g;
-    *blue  = b;
-    *alpha = a;
-}
-#undef B
-#undef G
-#undef R
-#undef A
-
 #define BUTTERFLY2(o1, o2, i1, i2)              \
    o1 = (i1) + (i2);                           \
    o2 = (i1) - (i2);
@ -2578,11 +2485,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

-    c->add_bytes                      = add_bytes_c;
-    c->add_hfyu_median_prediction     = add_hfyu_median_prediction_c;
-    c->add_hfyu_left_prediction       = add_hfyu_left_prediction_c;
-    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
-
    c->diff_bytes                 = diff_bytes_c;
    c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;

--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@ -186,9 +186,6 @@ typedef struct DSPContext {
    me_cmp_func pix_abs[2][4];

    /* HuffYUV specific */
-    void (*add_bytes)(uint8_t *dst /* align 16 */,
-                      uint8_t *src /* align 16 */,
-                      int w);
    void (*diff_bytes)(uint8_t *dst /* align 16 */,
                       uint8_t *src1 /* align 16 */,
                       uint8_t *src2 /* align 1 */,
@ -200,14 +197,7 @@ typedef struct DSPContext {
    void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1,
                                       const uint8_t *src2, int w,
                                       int *left, int *left_top);
-    void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top,
-                                       const uint8_t *diff, int w,
-                                       int *left, int *left_top);
-    int (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src,
-                                    int w, int left);
-    void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src,
-                                           int w, int *red, int *green,
-                                           int *blue, int *alpha);
+
    void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
    void (*bswap16_buf)(uint16_t *dst, const uint16_t *src, int len);

--- a/libavcodec/huffyuv.c
+++ b/libavcodec/huffyuv.c
@ -33,6 +33,7 @@
 #include "libavutil/mem.h"

 #include "avcodec.h"
+#include "dsputil.h"
 #include "huffyuv.h"

 int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table)
--- a/libavcodec/huffyuv.h
+++ b/libavcodec/huffyuv.h
@ -34,6 +34,7 @@
 #include "avcodec.h"
 #include "dsputil.h"
 #include "get_bits.h"
+#include "huffyuvdsp.h"
 #include "put_bits.h"

 #define VLC_BITS 11
@ -81,6 +82,7 @@ typedef struct HYuvContext {
    uint8_t *bitstream_buffer;
    unsigned int bitstream_buffer_size;
    DSPContext dsp;
+    HuffYUVDSPContext hdsp;
 } HYuvContext;

 void ff_huffyuv_common_init(AVCodecContext *s);
--- a/libavcodec/huffyuvdec.c
+++ b/libavcodec/huffyuvdec.c
@ -31,6 +31,7 @@
 #include "avcodec.h"
 #include "get_bits.h"
 #include "huffyuv.h"
+#include "huffyuvdsp.h"
 #include "thread.h"

 #define classic_shift_luma_table_size 42
@ -239,6 +240,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
    HYuvContext *s = avctx->priv_data;

    ff_huffyuv_common_init(avctx);
+    ff_huffyuvdsp_init(&s->hdsp);
    memset(s->vlc, 0, 3 * sizeof(VLC));

    s->interlaced = s->height > 288;
@ -542,10 +544,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
            case LEFT:
            case PLANE:
                decode_422_bitstream(s, width-2);
-                lefty = s->dsp.add_hfyu_left_prediction(p->data[0] + 2, s->temp[0], width-2, lefty);
+                lefty = s->hdsp.add_hfyu_left_pred(p->data[0] + 2, s->temp[0], width - 2, lefty);
                if (!(s->flags&CODEC_FLAG_GRAY)) {
-                    leftu = s->dsp.add_hfyu_left_prediction(p->data[1] + 1, s->temp[1], width2 - 1, leftu);
-                    leftv = s->dsp.add_hfyu_left_prediction(p->data[2] + 1, s->temp[2], width2 - 1, leftv);
+                    leftu = s->hdsp.add_hfyu_left_pred(p->data[1] + 1, s->temp[1], width2 - 1, leftu);
+                    leftv = s->hdsp.add_hfyu_left_pred(p->data[2] + 1, s->temp[2], width2 - 1, leftv);
                }

                for (cy = y = 1; y < s->height; y++, cy++) {
@ -556,10 +558,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,

                        ydst = p->data[0] + p->linesize[0] * y;

-                        lefty = s->dsp.add_hfyu_left_prediction(ydst, s->temp[0], width, lefty);
+                        lefty = s->hdsp.add_hfyu_left_pred(ydst, s->temp[0], width, lefty);
                        if (s->predictor == PLANE) {
                            if (y > s->interlaced)
-                                s->dsp.add_bytes(ydst, ydst - fake_ystride, width);
+                                s->hdsp.add_bytes(ydst, ydst - fake_ystride, width);
                        }
                        y++;
                        if (y >= s->height) break;
@ -572,17 +574,17 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                    vdst = p->data[2] + p->linesize[2]*cy;

                    decode_422_bitstream(s, width);
-                    lefty = s->dsp.add_hfyu_left_prediction(ydst, s->temp[0], width, lefty);
+                    lefty = s->hdsp.add_hfyu_left_pred(ydst, s->temp[0], width, lefty);
                    if (!(s->flags & CODEC_FLAG_GRAY)) {
-                        leftu= s->dsp.add_hfyu_left_prediction(udst, s->temp[1], width2, leftu);
-                        leftv= s->dsp.add_hfyu_left_prediction(vdst, s->temp[2], width2, leftv);
+                        leftu = s->hdsp.add_hfyu_left_pred(udst, s->temp[1], width2, leftu);
+                        leftv = s->hdsp.add_hfyu_left_pred(vdst, s->temp[2], width2, leftv);
                    }
                    if (s->predictor == PLANE) {
                        if (cy > s->interlaced) {
-                            s->dsp.add_bytes(ydst, ydst - fake_ystride, width);
+                            s->hdsp.add_bytes(ydst, ydst - fake_ystride, width);
                            if (!(s->flags & CODEC_FLAG_GRAY)) {
-                                s->dsp.add_bytes(udst, udst - fake_ustride, width2);
-                                s->dsp.add_bytes(vdst, vdst - fake_vstride, width2);
+                                s->hdsp.add_bytes(udst, udst - fake_ustride, width2);
+                                s->hdsp.add_bytes(vdst, vdst - fake_vstride, width2);
                            }
                        }
                    }
@ -593,10 +595,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
            case MEDIAN:
                /* first line except first 2 pixels is left predicted */
                decode_422_bitstream(s, width - 2);
-                lefty= s->dsp.add_hfyu_left_prediction(p->data[0] + 2, s->temp[0], width - 2, lefty);
+                lefty = s->hdsp.add_hfyu_left_pred(p->data[0] + 2, s->temp[0], width - 2, lefty);
                if (!(s->flags & CODEC_FLAG_GRAY)) {
-                    leftu = s->dsp.add_hfyu_left_prediction(p->data[1] + 1, s->temp[1], width2 - 1, leftu);
-                    leftv = s->dsp.add_hfyu_left_prediction(p->data[2] + 1, s->temp[2], width2 - 1, leftv);
+                    leftu = s->hdsp.add_hfyu_left_pred(p->data[1] + 1, s->temp[1], width2 - 1, leftu);
+                    leftv = s->hdsp.add_hfyu_left_pred(p->data[2] + 1, s->temp[2], width2 - 1, leftv);
                }

                cy = y = 1;
@ -604,31 +606,31 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                /* second line is left predicted for interlaced case */
                if (s->interlaced) {
                    decode_422_bitstream(s, width);
-                    lefty = s->dsp.add_hfyu_left_prediction(p->data[0] + p->linesize[0], s->temp[0], width, lefty);
+                    lefty = s->hdsp.add_hfyu_left_pred(p->data[0] + p->linesize[0], s->temp[0], width, lefty);
                    if (!(s->flags & CODEC_FLAG_GRAY)) {
-                        leftu = s->dsp.add_hfyu_left_prediction(p->data[1] + p->linesize[2], s->temp[1], width2, leftu);
-                        leftv = s->dsp.add_hfyu_left_prediction(p->data[2] + p->linesize[1], s->temp[2], width2, leftv);
+                        leftu = s->hdsp.add_hfyu_left_pred(p->data[1] + p->linesize[2], s->temp[1], width2, leftu);
+                        leftv = s->hdsp.add_hfyu_left_pred(p->data[2] + p->linesize[1], s->temp[2], width2, leftv);
                    }
                    y++; cy++;
                }

                /* next 4 pixels are left predicted too */
                decode_422_bitstream(s, 4);
-                lefty = s->dsp.add_hfyu_left_prediction(p->data[0] + fake_ystride, s->temp[0], 4, lefty);
+                lefty = s->hdsp.add_hfyu_left_pred(p->data[0] + fake_ystride, s->temp[0], 4, lefty);
                if (!(s->flags&CODEC_FLAG_GRAY)) {
-                    leftu = s->dsp.add_hfyu_left_prediction(p->data[1] + fake_ustride, s->temp[1], 2, leftu);
-                    leftv = s->dsp.add_hfyu_left_prediction(p->data[2] + fake_vstride, s->temp[2], 2, leftv);
+                    leftu = s->hdsp.add_hfyu_left_pred(p->data[1] + fake_ustride, s->temp[1], 2, leftu);
+                    leftv = s->hdsp.add_hfyu_left_pred(p->data[2] + fake_vstride, s->temp[2], 2, leftv);
                }

                /* next line except the first 4 pixels is median predicted */
                lefttopy = p->data[0][3];
                decode_422_bitstream(s, width - 4);
-                s->dsp.add_hfyu_median_prediction(p->data[0] + fake_ystride+4, p->data[0]+4, s->temp[0], width-4, &lefty, &lefttopy);
+                s->hdsp.add_hfyu_median_pred(p->data[0] + fake_ystride + 4, p->data[0] + 4, s->temp[0], width - 4, &lefty, &lefttopy);
                if (!(s->flags&CODEC_FLAG_GRAY)) {
                    lefttopu = p->data[1][1];
                    lefttopv = p->data[2][1];
-                    s->dsp.add_hfyu_median_prediction(p->data[1] + fake_ustride+2, p->data[1] + 2, s->temp[1], width2 - 2, &leftu, &lefttopu);
-                    s->dsp.add_hfyu_median_prediction(p->data[2] + fake_vstride+2, p->data[2] + 2, s->temp[2], width2 - 2, &leftv, &lefttopv);
+                    s->hdsp.add_hfyu_median_pred(p->data[1] + fake_ustride + 2, p->data[1] + 2, s->temp[1], width2 - 2, &leftu, &lefttopu);
+                    s->hdsp.add_hfyu_median_pred(p->data[2] + fake_vstride + 2, p->data[2] + 2, s->temp[2], width2 - 2, &leftv, &lefttopv);
                }
                y++; cy++;

@ -639,7 +641,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                        while (2 * cy > y) {
                            decode_gray_bitstream(s, width);
                            ydst = p->data[0] + p->linesize[0] * y;
-                            s->dsp.add_hfyu_median_prediction(ydst, ydst - fake_ystride, s->temp[0], width, &lefty, &lefttopy);
+                            s->hdsp.add_hfyu_median_pred(ydst, ydst - fake_ystride, s->temp[0], width, &lefty, &lefttopy);
                            y++;
                        }
                        if (y >= height) break;
@ -652,10 +654,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                    udst = p->data[1] + p->linesize[1] * cy;
                    vdst = p->data[2] + p->linesize[2] * cy;

-                    s->dsp.add_hfyu_median_prediction(ydst, ydst - fake_ystride, s->temp[0], width, &lefty, &lefttopy);
+                    s->hdsp.add_hfyu_median_pred(ydst, ydst - fake_ystride, s->temp[0], width, &lefty, &lefttopy);
                    if (!(s->flags & CODEC_FLAG_GRAY)) {
-                        s->dsp.add_hfyu_median_prediction(udst, udst - fake_ustride, s->temp[1], width2, &leftu, &lefttopu);
-                        s->dsp.add_hfyu_median_prediction(vdst, vdst - fake_vstride, s->temp[2], width2, &leftv, &lefttopv);
+                        s->hdsp.add_hfyu_median_pred(udst, udst - fake_ustride, s->temp[1], width2, &leftu, &lefttopu);
+                        s->hdsp.add_hfyu_median_pred(vdst, vdst - fake_vstride, s->temp[2], width2, &leftv, &lefttopv);
                    }
                }

@ -686,19 +688,19 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
            case LEFT:
            case PLANE:
                decode_bgr_bitstream(s, width - 1);
-                s->dsp.add_hfyu_left_prediction_bgr32(p->data[0] + last_line+4, s->temp[0], width - 1, &leftr, &leftg, &leftb, &lefta);
+                s->hdsp.add_hfyu_left_pred_bgr32(p->data[0] + last_line + 4, s->temp[0], width - 1, &leftr, &leftg, &leftb, &lefta);

                for (y = s->height - 2; y >= 0; y--) { //Yes it is stored upside down.
                    decode_bgr_bitstream(s, width);

-                    s->dsp.add_hfyu_left_prediction_bgr32(p->data[0] + p->linesize[0]*y, s->temp[0], width, &leftr, &leftg, &leftb, &lefta);
+                    s->hdsp.add_hfyu_left_pred_bgr32(p->data[0] + p->linesize[0] * y, s->temp[0], width, &leftr, &leftg, &leftb, &lefta);
                    if (s->predictor == PLANE) {
                        if (s->bitstream_bpp != 32) lefta = 0;
                        if ((y & s->interlaced) == 0 &&
                            y < s->height - 1 - s->interlaced) {
-                            s->dsp.add_bytes(p->data[0] + p->linesize[0] * y,
-                                             p->data[0] + p->linesize[0] * y +
-                                             fake_ystride, fake_ystride);
+                            s->hdsp.add_bytes(p->data[0] + p->linesize[0] * y,
+                                              p->data[0] + p->linesize[0] * y +
+                                              fake_ystride, fake_ystride);
                        }
                    }
                }
--- a/libavcodec/huffyuvdsp.c
+++ b/libavcodec/huffyuvdsp.c
@ -0,0 +1,132 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "mathops.h"
+#include "huffyuvdsp.h"
+
+/* Byte-replicated masks as wide as unsigned long: 0x7f resp. 0x80 repeated in
+ * every byte (e.g. 0x7f7f7f7f when unsigned long is 32 bits wide). */
+#define pb_7f (~0UL / 255 * 0x7f)
+#define pb_80 (~0UL / 255 * 0x80)
+
+/**
+ * Add w bytes of src to dst in place; every byte sum wraps modulo 256.
+ *
+ * The bulk of the data is handled one long at a time: the low seven bits of
+ * each byte are summed with the top bit masked off, so no carry can spill
+ * into the neighbouring byte, and the top bits are merged back in with XOR.
+ * Whatever does not fill a whole long is added byte by byte.
+ */
+static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
+{
+    const int word_end = w - (int) sizeof(long);
+    long pos = 0;
+
+    while (pos <= word_end) {
+        long *out = (long *) (dst + pos);
+        long  s   = *(long *) (src + pos);
+        long  d   = *out;
+
+        *out = ((s & pb_7f) + (d & pb_7f)) ^ ((s ^ d) & pb_80);
+        pos += sizeof(long);
+    }
+    while (pos < w) {
+        dst[pos] += src[pos];
+        pos++;
+    }
+}
+
+/**
+ * Rebuild one row of w samples from prediction residuals.
+ *
+ * For every position the predictor is mid_pred() of the sample to the left,
+ * the sample above (src1) and the 8-bit-wrapped gradient left + above -
+ * above-left; the residual from diff is then added modulo 256.
+ *
+ * @param dst      reconstructed row
+ * @param src1     row above the one being reconstructed
+ * @param diff     residuals for this row
+ * @param w        number of samples
+ * @param left     in: sample left of the first position; out: last sample
+ *                 written (both reduced to 8 bits)
+ * @param left_top in: sample above-left of the first position; out: last
+ *                 sample of src1 that was consumed
+ */
+static void add_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
+                                   const uint8_t *diff, int w,
+                                   int *left, int *left_top)
+{
+    uint8_t cur      = *left;
+    uint8_t above_lt = *left_top;
+    int x;
+
+    for (x = 0; x < w; x++) {
+        const uint8_t above    = src1[x];
+        const int     gradient = (cur + above - above_lt) & 0xFF;
+
+        cur      = mid_pred(cur, above, gradient) + diff[x];
+        above_lt = above;
+        dst[x]   = cur;
+    }
+
+    *left     = cur;
+    *left_top = above_lt;
+}
+
+/**
+ * Undo left prediction: dst[i] becomes the running sum of acc and
+ * src[0..i], stored modulo 256.
+ *
+ * Each sample is read before the matching output is written, so dst and
+ * src may point to the same buffer.
+ *
+ * @param dst output row
+ * @param src residuals
+ * @param w   number of samples
+ * @param acc value to the left of the first sample
+ * @return the accumulator after the last sample (not reduced to 8 bits)
+ */
+static int add_hfyu_left_pred_c(uint8_t *dst, const uint8_t *src, int w,
+                                int acc)
+{
+    int pos = 0;
+
+    /* Two samples per pass while a full pair is left. */
+    while (pos < w - 1) {
+        acc         += src[pos];
+        dst[pos]     = acc;
+        acc         += src[pos + 1];
+        dst[pos + 1] = acc;
+        pos         += 2;
+    }
+
+    /* Odd trailing sample, if any. */
+    if (pos < w) {
+        acc     += src[pos];
+        dst[pos] = acc;
+    }
+
+    return acc;
+}
+
+/* Byte offset of each channel inside a 4-byte pixel, by host endianness. */
+#if HAVE_BIGENDIAN
+#define B 3
+#define G 2
+#define R 1
+#define A 0
+#else
+#define B 0
+#define G 1
+#define R 2
+#define A 3
+#endif
+/**
+ * Undo left prediction on w 4-byte pixels, one running sum per channel.
+ *
+ * @param dst   output pixels (4 bytes each)
+ * @param src   residual pixels (4 bytes each)
+ * @param w     number of pixels
+ * @param red   in: red value left of the first pixel; out: final red sum
+ * @param green same for the green channel
+ * @param blue  same for the blue channel
+ * @param alpha same for the alpha channel
+ *
+ * The sums handed back are the raw accumulators; only the bytes stored in
+ * dst are reduced modulo 256.
+ */
+static void add_hfyu_left_pred_bgr32_c(uint8_t *dst, const uint8_t *src,
+                                       int w, int *red, int *green,
+                                       int *blue, int *alpha)
+{
+    int sum_b = *blue;
+    int sum_g = *green;
+    int sum_r = *red;
+    int sum_a = *alpha;
+    int px;
+
+    for (px = 0; px < w; px++) {
+        const uint8_t *in  = src + 4 * px;
+        uint8_t       *out = dst + 4 * px;
+
+        sum_b += in[B];
+        sum_g += in[G];
+        sum_r += in[R];
+        sum_a += in[A];
+
+        out[B] = sum_b;
+        out[G] = sum_g;
+        out[R] = sum_r;
+        out[A] = sum_a;
+    }
+
+    *red   = sum_r;
+    *green = sum_g;
+    *blue  = sum_b;
+    *alpha = sum_a;
+}
+#undef B
+#undef G
+#undef R
+#undef A
+
+/**
+ * Fill a HuffYUVDSPContext with the portable C routines, then give the
+ * architecture-specific initializers the chance to replace individual
+ * entries.
+ *
+ * @param c context whose function pointers are set
+ */
+av_cold void ff_huffyuvdsp_init(HuffYUVDSPContext *c)
+{
+    c->add_bytes                = add_bytes_c;
+    c->add_hfyu_median_pred     = add_hfyu_median_pred_c;
+    c->add_hfyu_left_pred       = add_hfyu_left_pred_c;
+    c->add_hfyu_left_pred_bgr32 = add_hfyu_left_pred_bgr32_c;
+
+    /* ff_huffyuvdsp_init_ppc() is declared in huffyuvdsp.h and built from
+     * ppc/huffyuvdsp_altivec.c; without this call the AltiVec add_bytes
+     * that used to be installed through dsputil would never be used.
+     * NOTE(review): the PPC initializer shows no run-time CPU check --
+     * confirm that is acceptable for run-time-detection builds. */
+    if (ARCH_PPC)
+        ff_huffyuvdsp_init_ppc(c);
+    if (ARCH_X86)
+        ff_huffyuvdsp_init_x86(c);
+}
--- a/libavcodec/huffyuvdsp.h
+++ b/libavcodec/huffyuvdsp.h
@ -0,0 +1,41 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HUFFYUVDSP_H
+#define AVCODEC_HUFFYUVDSP_H
+
+#include <stdint.h>
+
+/**
+ * Table of the reconstruction primitives shared by the HuffYUV-style
+ * decoders; the entries are set up by ff_huffyuvdsp_init().
+ */
+typedef struct HuffYUVDSPContext {
+    /** Add w bytes of src to dst in place, each byte modulo 256. */
+    void (*add_bytes)(uint8_t *dst /* align 16 */,
+                      uint8_t *src /* align 16 */,
+                      int w);
+
+    /**
+     * Rebuild a row of w samples from the residuals in diff, predicting
+     * each sample from its left neighbour, the row above (top) and the
+     * above-left sample. *left and *left_top carry the predictor state in
+     * and out of the call.
+     */
+    void (*add_hfyu_median_pred)(uint8_t *dst,
+                                 const uint8_t *top,
+                                 const uint8_t *diff,
+                                 int w,
+                                 int *left,
+                                 int *left_top);
+
+    /**
+     * Write the running sum of left and src[0..i] to dst[i] for w samples
+     * and return the final accumulator.
+     */
+    int (*add_hfyu_left_pred)(uint8_t *dst,
+                              const uint8_t *src,
+                              int w,
+                              int left);
+
+    /**
+     * Same running sum, kept separately per channel, over w pixels of
+     * 4 bytes each; the four accumulators are read from and written back
+     * through red, green, blue and alpha.
+     */
+    void (*add_hfyu_left_pred_bgr32)(uint8_t *dst,
+                                     const uint8_t *src,
+                                     int w,
+                                     int *red,
+                                     int *green,
+                                     int *blue,
+                                     int *alpha);
+} HuffYUVDSPContext;
+
+void ff_huffyuvdsp_init(HuffYUVDSPContext *c);
+void ff_huffyuvdsp_init_ppc(HuffYUVDSPContext *c);
+void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c);
+
+#endif /* AVCODEC_HUFFYUVDSP_H */
--- a/libavcodec/lagarith.c
+++ b/libavcodec/lagarith.c
@ -30,7 +30,7 @@
 #include "avcodec.h"
 #include "get_bits.h"
 #include "mathops.h"
-#include "dsputil.h"
+#include "huffyuvdsp.h"
 #include "lagarithrac.h"
 #include "thread.h"

@ -50,7 +50,7 @@ enum LagarithFrameType {

 typedef struct LagarithContext {
    AVCodecContext *avctx;
-    DSPContext dsp;
+    HuffYUVDSPContext hdsp;
    int zeros;                  /**< number of consecutive zero bytes encountered */
    int zeros_rem;              /**< number of zero bytes remaining to output */
    uint8_t *rgb_planes;
@ -225,7 +225,7 @@ static void add_lag_median_prediction(uint8_t *dst, uint8_t *src1,
                                      uint8_t *diff, int w, int *left,
                                      int *left_top)
 {
-    /* This is almost identical to add_hfyu_median_prediction in dsputil.h.
+    /* This is almost identical to add_hfyu_median_pred in huffyuvdsp.h.
     * However the &0xFF on the gradient predictor yealds incorrect output
     * for lagarith.
     */
@ -253,8 +253,7 @@ static void lag_pred_line(LagarithContext *l, uint8_t *buf,
    if (!line) {
        int i, align_width = (width - 1) & ~31;
        /* Left prediction only for first line */
-        L = l->dsp.add_hfyu_left_prediction(buf + 1, buf + 1,
-                                            align_width, buf[0]);
+        L = l->hdsp.add_hfyu_left_pred(buf + 1, buf + 1, align_width, buf[0]);
        for (i = align_width + 1; i < width; i++)
            buf[i] += buf[i - 1];
    } else {
@ -289,7 +288,7 @@ static void lag_pred_line_yuy2(LagarithContext *l, uint8_t *buf,
        }

        align_width = (width - 1) & ~31;
-        l->dsp.add_hfyu_left_prediction(buf + 1, buf + 1, align_width, buf[0]);
+        l->hdsp.add_hfyu_left_pred(buf + 1, buf + 1, align_width, buf[0]);

        for (i = align_width + 1; i < width; i++)
            buf[i] += buf[i - 1];
@ -314,8 +313,7 @@ static void lag_pred_line_yuy2(LagarithContext *l, uint8_t *buf,
    } else {
        TL = buf[width - (2 * stride) - 1];
        L  = buf[width - stride - 1];
-        l->dsp.add_hfyu_median_prediction(buf, buf - stride, buf, width,
-                                          &L, &TL);
+        l->hdsp.add_hfyu_median_pred(buf, buf - stride, buf, width, &L, &TL);
    }
 }

@ -682,7 +680,7 @@ static av_cold int lag_decode_init(AVCodecContext *avctx)
    LagarithContext *l = avctx->priv_data;
    l->avctx = avctx;

-    ff_dsputil_init(&l->dsp, avctx);
+    ff_huffyuvdsp_init(&l->hdsp);

    return 0;
 }
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@ -6,6 +6,7 @@ OBJS-$(CONFIG_H264CHROMA)              += ppc/h264chroma_init.o
 OBJS-$(CONFIG_H264DSP)                 += ppc/h264dsp.o
 OBJS-$(CONFIG_H264QPEL)                += ppc/h264qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += ppc/hpeldsp_altivec.o
+OBJS-$(CONFIG_HUFFYUVDSP)              += ppc/huffyuvdsp_altivec.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += ppc/mpegaudiodsp_altivec.o
 OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o
 OBJS-$(CONFIG_VIDEODSP)                += ppc/videodsp_ppc.o
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@ -571,23 +571,6 @@ static void clear_block_altivec(int16_t *block)
    vec_st(zero_s16v, 112, block);
 }

-static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
-{
-    register int i;
-    register vector unsigned char vdst, vsrc;
-
-    /* dst and src are 16 bytes-aligned (guaranteed). */
-    for (i = 0; i + 15 < w; i += 16) {
-        vdst = vec_ld(i, (unsigned char *) dst);
-        vsrc = vec_ld(i, (unsigned char *) src);
-        vdst = vec_add(vsrc, vdst);
-        vec_st(vdst, i, (unsigned char *) dst);
-    }
-    /* If w is not a multiple of 16. */
-    for (; i < w; i++)
-        dst[i] = src[i];
-}
-
 static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                     uint8_t *src, int stride, int h)
 {
@ -945,7 +928,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx,
    c->pix_sum   = pix_sum_altivec;

    c->diff_pixels = diff_pixels_altivec;
-    c->add_bytes   = add_bytes_altivec;

    if (!high_bit_depth) {
        c->get_pixels = get_pixels_altivec;
--- a/libavcodec/ppc/huffyuvdsp_altivec.c
+++ b/libavcodec/ppc/huffyuvdsp_altivec.c
@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/huffyuvdsp.h"
+
+#if HAVE_ALTIVEC
+/**
+ * Add w bytes of src to dst in place, 16 bytes per AltiVec operation.
+ *
+ * @param dst destination, also the first addend; loaded and stored with
+ *            aligned vector accesses
+ * @param src second addend; loaded with aligned vector accesses
+ * @param w   number of bytes to process
+ */
+static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
+{
+    register int i;
+    register vector unsigned char vdst, vsrc;
+
+    /* dst and src are 16 bytes-aligned (guaranteed). */
+    for (i = 0; i + 15 < w; i += 16) {
+        vdst = vec_ld(i, (unsigned char *) dst);
+        vsrc = vec_ld(i, (unsigned char *) src);
+        vdst = vec_add(vsrc, vdst);
+        vec_st(vdst, i, (unsigned char *) dst);
+    }
+    /* If w is not a multiple of 16: the leftover bytes have to be added to
+     * dst like the vector part does, not overwritten with src. */
+    for (; i < w; i++)
+        dst[i] += src[i];
+}
+#endif /* HAVE_ALTIVEC */
+
+/**
+ * Install the PPC-specific routines into c.
+ *
+ * When the build has AltiVec enabled (HAVE_ALTIVEC), add_bytes is replaced
+ * by the AltiVec version; otherwise c is left untouched.
+ * NOTE(review): no run-time CPU capability check is visible here -- confirm
+ * whether one is needed for builds that detect AltiVec at run time.
+ *
+ * @param c context whose function pointers may be overridden
+ */
+av_cold void ff_huffyuvdsp_init_ppc(HuffYUVDSPContext *c)
+{
+#if HAVE_ALTIVEC
+    c->add_bytes = add_bytes_altivec;
+#endif /* HAVE_ALTIVEC */
+}
--- a/libavcodec/vble.c
+++ b/libavcodec/vble.c
@ -27,14 +27,14 @@
 #define BITSTREAM_READER_LE

 #include "avcodec.h"
-#include "dsputil.h"
 #include "get_bits.h"
+#include "huffyuvdsp.h"
 #include "internal.h"
 #include "mathops.h"

 typedef struct {
    AVCodecContext *avctx;
-    DSPContext dsp;
+    HuffYUVDSPContext hdsp;

    int            size;
    uint8_t        *val; /* First holds the lengths of vlc symbols and then their values */
@ -100,8 +100,8 @@ static void vble_restore_plane(VBLEContext *ctx, AVFrame *pic,
        if (i) {
            left = 0;
            left_top = dst[-stride];
-            ctx->dsp.add_hfyu_median_prediction(dst, dst-stride, val,
-                                                width, &left, &left_top);
+            ctx->hdsp.add_hfyu_median_pred(dst, dst - stride, val,
+                                           width, &left, &left_top);
        } else {
            dst[0] = val[0];
            for (j = 1; j < width; j++)
@ -178,7 +178,7 @@ static av_cold int vble_decode_init(AVCodecContext *avctx)

    /* Stash for later use */
    ctx->avctx = avctx;
-    ff_dsputil_init(&ctx->dsp, avctx);
+    ff_huffyuvdsp_init(&ctx->hdsp);

    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
    avctx->bits_per_raw_sample = 8;
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@ -3,8 +3,7 @@ OBJS                                   += x86/constants.o               \

 OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
-OBJS-$(CONFIG_DSPUTIL)                 += x86/dsputil_init.o            \
-                                          x86/dsputil_x86.o
+OBJS-$(CONFIG_DSPUTIL)                 += x86/dsputil_init.o
 OBJS-$(CONFIG_ENCODERS)                += x86/dsputilenc_mmx.o          \
                                          x86/fdct.o                    \
                                          x86/motion_est.o
@ -15,6 +14,7 @@ OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
 OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
 OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
+OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o
@ -48,6 +48,7 @@ MMX-OBJS-$(CONFIG_DSPUTIL)             += x86/dsputil_mmx.o             \
                                          x86/simple_idct.o
 MMX-OBJS-$(CONFIG_HPELDSP)             += x86/fpel_mmx.o                \
                                          x86/hpeldsp_mmx.o
+MMX-OBJS-$(CONFIG_HUFFYUVDSP)          += x86/huffyuvdsp_mmx.o

 MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o

@ -80,6 +81,7 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
                                          x86/qpel.o
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                          x86/hpeldsp.o
+YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@ -22,11 +22,6 @@
 %include "libavutil/x86/x86util.asm"

 SECTION_RODATA
-pb_f: times 16 db 15
-pb_zzzzzzzz77777777: times 8 db -1
-pb_7: times 8 db 7
-pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
-pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

 SECTION_TEXT
@ -203,141 +198,6 @@ SCALARPRODUCT_LOOP 0
    RET


-; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
-;                                           const uint8_t *diff, int w,
-;                                           int *left, int *left_top)
-INIT_MMX mmxext
-cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
-    movq    mm0, [topq]
-    movq    mm2, mm0
-    movd    mm4, [left_topq]
-    psllq   mm2, 8
-    movq    mm1, mm0
-    por     mm4, mm2
-    movd    mm3, [leftq]
-    psubb   mm0, mm4 ; t-tl
-    add    dstq, wq
-    add    topq, wq
-    add   diffq, wq
-    neg      wq
-    jmp .skip
-.loop:
-    movq    mm4, [topq+wq]
-    movq    mm0, mm4
-    psllq   mm4, 8
-    por     mm4, mm1
-    movq    mm1, mm0 ; t
-    psubb   mm0, mm4 ; t-tl
-.skip:
-    movq    mm2, [diffq+wq]
-%assign i 0
-%rep 8
-    movq    mm4, mm0
-    paddb   mm4, mm3 ; t-tl+l
-    movq    mm5, mm3
-    pmaxub  mm3, mm1
-    pminub  mm5, mm1
-    pminub  mm3, mm4
-    pmaxub  mm3, mm5 ; median
-    paddb   mm3, mm2 ; +residual
-%if i==0
-    movq    mm7, mm3
-    psllq   mm7, 56
-%else
-    movq    mm6, mm3
-    psrlq   mm7, 8
-    psllq   mm6, 56
-    por     mm7, mm6
-%endif
-%if i<7
-    psrlq   mm0, 8
-    psrlq   mm1, 8
-    psrlq   mm2, 8
-%endif
-%assign i i+1
-%endrep
-    movq [dstq+wq], mm7
-    add      wq, 8
-    jl .loop
-    movzx   r2d, byte [dstq-1]
-    mov [leftq], r2d
-    movzx   r2d, byte [topq-1]
-    mov [left_topq], r2d
-    RET
-
-
-%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
-    add     srcq, wq
-    add     dstq, wq
-    neg     wq
-%%.loop:
-%if %2
-    mova    m1, [srcq+wq]
-%else
-    movu    m1, [srcq+wq]
-%endif
-    mova    m2, m1
-    psllw   m1, 8
-    paddb   m1, m2
-    mova    m2, m1
-    pshufb  m1, m3
-    paddb   m1, m2
-    pshufb  m0, m5
-    mova    m2, m1
-    pshufb  m1, m4
-    paddb   m1, m2
-%if mmsize == 16
-    mova    m2, m1
-    pshufb  m1, m6
-    paddb   m1, m2
-%endif
-    paddb   m0, m1
-%if %1
-    mova    [dstq+wq], m0
-%else
-    movq    [dstq+wq], m0
-    movhps  [dstq+wq+8], m0
-%endif
-    add     wq, mmsize
-    jl %%.loop
-    mov     eax, mmsize-1
-    sub     eax, wd
-    movd    m1, eax
-    pshufb  m0, m1
-    movd    eax, m0
-    RET
-%endmacro
-
-; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src,
-;                                 int w, int left)
-INIT_MMX ssse3
-cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
-.skip_prologue:
-    mova    m5, [pb_7]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    psllq   m0, 56
-    ADD_HFYU_LEFT_LOOP 1, 1
-
-INIT_XMM sse4
-cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
-    mova    m5, [pb_f]
-    mova    m6, [pb_zzzzzzzz77777777]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    pslldq  m0, 15
-    test    srcq, 15
-    jnz .src_unaligned
-    test    dstq, 15
-    jnz .dst_unaligned
-    ADD_HFYU_LEFT_LOOP 1, 1
-.dst_unaligned:
-    ADD_HFYU_LEFT_LOOP 0, 1
-.src_unaligned:
-    ADD_HFYU_LEFT_LOOP 0, 0
-
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
 ;                           int32_t max, unsigned int len)
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@ -20,7 +20,6 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/internal.h"
-#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
@ -90,14 +89,6 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

-void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
-                                          const uint8_t *diff, int w,
-                                          int *left, int *left_top);
-int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
-                                      int w, int left);
-int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
-                                     int w, int left);
-
 void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
                              int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
@ -549,8 +540,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
    }

    c->gmc = ff_gmc_mmx;
-
-    c->add_bytes = ff_add_bytes_mmx;
 #endif /* HAVE_MMX_INLINE */

 #if HAVE_MMX_EXTERNAL
@ -578,10 +567,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );

-    /* slower than cmov version on AMD */
-    if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
-        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
-
    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
 #endif /* HAVE_MMXEXT_EXTERNAL */
@ -636,10 +621,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                       int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_SSSE3_EXTERNAL
-    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
-    if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
-        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
-
    if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
@ -659,11 +640,6 @@ av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
 {
    int cpu_flags = av_get_cpu_flags();

-#if HAVE_7REGS && HAVE_INLINE_ASM
-    if (cpu_flags & AV_CPU_FLAG_CMOV)
-        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_cmov;
-#endif
-
    if (X86_MMX(cpu_flags))
        dsputil_init_mmx(c, avctx, cpu_flags, high_bit_depth);

--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@ -222,32 +222,6 @@ void ff_clear_blocks_sse(int16_t *blocks)
        : "%"REG_a);
 }

-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
-    x86_reg i = 0;
-
-    __asm__ volatile (
-        "jmp          2f                \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %0), %%mm0         \n\t"
-        "movq   (%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, (%2, %0)      \n\t"
-        "movq  8(%1, %0), %%mm0         \n\t"
-        "movq  8(%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, 8(%2, %0)     \n\t"
-        "add         $16, %0            \n\t"
-        "2:                             \n\t"
-        "cmp          %3, %0            \n\t"
-        "js           1b                \n\t"
-        : "+r" (i)
-        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
-}
-
 /* Draw the edges of width 'w' of an image of size width, height
 * this MMX version can only handle w == 8 || w == 16. */
 void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@ -43,12 +43,6 @@ void ff_clear_block_sse(int16_t *block);
 void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);

-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
-
-void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
-                                        const uint8_t *diff, int w,
-                                        int *left, int *left_top);
-
 void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                       int w, int h, int sides);

--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@ -0,0 +1,165 @@
+;******************************************************************************
+;* SIMD-optimized HuffYUV functions
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pb_f: times 16 db 15
+pb_zzzzzzzz77777777: times 8 db -1
+pb_7: times 8 db 7
+pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
+pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+
+SECTION_TEXT
+
+; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+;                                     const uint8_t *diff, int w,
+;                                     int *left, int *left_top)
+;
+; For every byte position computes (all byte arithmetic wraps modulo 256)
+;     dst = median(l, t, l + t - tl) + diff
+; where l is the previously produced output byte (initially *left), t is the
+; byte of top at the same position and tl is the byte of top one position
+; earlier (initially *left_top).
+;
+; Eight bytes are loaded per loop iteration, but because each output feeds the
+; next one as "l", the %rep 8 block produces them one at a time in the low
+; byte of mm3 and collects them in mm7.
+; On exit *left receives dst[w-1] and *left_top receives top[w-1].
+;
+; The loop always works in whole groups of 8 bytes and runs at least once, so
+; when w is not a multiple of 8 it reads and writes past w - presumably the
+; callers provide suitably padded buffers; verify against callers.
+; NOTE(review): w is declared int but used as the full-width wq without an
+; explicit sign extension - confirm this is safe on 64-bit ABIs.
+INIT_MMX mmxext
+cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
+    movq    mm0, [topq]
+    movq    mm2, mm0
+    movd    mm4, [left_topq]
+    psllq   mm2, 8
+    movq    mm1, mm0
+    ; mm4 = top shifted up by one byte with *left_top in byte 0, i.e. "tl"
+    ; (assumes *left_top fits in one byte - TODO confirm)
+    por     mm4, mm2
+    movd    mm3, [leftq]
+    psubb   mm0, mm4 ; t-tl
+    ; point the three buffers at their ends and count wq up from -w to 0
+    add    dstq, wq
+    add    topq, wq
+    add   diffq, wq
+    neg      wq
+    jmp .skip
+.loop:
+    movq    mm4, [topq+wq]
+    movq    mm0, mm4
+    psllq   mm4, 8
+    ; byte 0 of mm1 still holds the last top byte of the previous group
+    por     mm4, mm1
+    movq    mm1, mm0 ; t
+    psubb   mm0, mm4 ; t-tl
+.skip:
+    movq    mm2, [diffq+wq]
+%assign i 0
+%rep 8
+    ; only the low byte of each register is meaningful in this step
+    movq    mm4, mm0
+    paddb   mm4, mm3 ; t-tl+l
+    movq    mm5, mm3
+    pmaxub  mm3, mm1
+    pminub  mm5, mm1
+    pminub  mm3, mm4
+    pmaxub  mm3, mm5 ; median
+    paddb   mm3, mm2 ; +residual
+    ; append the new byte at the top of mm7; after eight steps the first
+    ; result has been shifted down to byte 0
+%if i==0
+    movq    mm7, mm3
+    psllq   mm7, 56
+%else
+    movq    mm6, mm3
+    psrlq   mm7, 8
+    psllq   mm6, 56
+    por     mm7, mm6
+%endif
+    ; move the next input byte into the low position
+%if i<7
+    psrlq   mm0, 8
+    psrlq   mm1, 8
+    psrlq   mm2, 8
+%endif
+%assign i i+1
+%endrep
+    movq [dstq+wq], mm7
+    add      wq, 8
+    jl .loop
+    ; r2 (the diff pointer) is no longer needed and is reused as scratch;
+    ; dstq/topq were advanced by w, so [-1] is the last byte of the row
+    movzx   r2d, byte [dstq-1]
+    mov [leftq], r2d
+    movzx   r2d, byte [topq-1]
+    mov [left_topq], r2d
+    RET
+
+
+; Loop body shared by the add_hfyu_left_pred variants: writes to dst the
+; running byte sum (modulo 256) of src, continuing from the previous "left"
+; value, mmsize bytes per iteration.
+;
+; Expected on entry (set up by the users of this macro):
+;   m0 = previous left value in its top byte
+;   m3, m4 (and m6 when mmsize == 16) = pshufb masks for the prefix-sum steps
+;   m5 = pshufb mask that broadcasts the top byte to every byte
+; Each expansion ends in RET, so code placed after it is not fallen into.
+%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+    ; point both buffers at their ends and count wq up from -w
+    add     srcq, wq
+    add     dstq, wq
+    neg     wq
+%%.loop:
+%if %2
+    mova    m1, [srcq+wq]
+%else
+    movu    m1, [srcq+wq]
+%endif
+    ; step 1: add each even byte to the odd byte above it (sums of 2)
+    mova    m2, m1
+    psllw   m1, 8
+    paddb   m1, m2
+    ; step 2: add byte 1/5/9/13 to the two bytes above it (sums of 4)
+    mova    m2, m1
+    pshufb  m1, m3
+    paddb   m1, m2
+    ; broadcast the last byte of the previous output to all bytes of m0
+    pshufb  m0, m5
+    ; step 3: add byte 3/11 to the four bytes above it (sums of 8)
+    mova    m2, m1
+    pshufb  m1, m4
+    paddb   m1, m2
+%if mmsize == 16
+    ; step 4 (16-byte registers only): add byte 7 to the upper eight bytes
+    mova    m2, m1
+    pshufb  m1, m6
+    paddb   m1, m2
+%endif
+    ; add the carried-in left value to every prefix sum
+    paddb   m0, m1
+%if %1
+    mova    [dstq+wq], m0
+%else
+    ; unaligned destination: store the two 8-byte halves separately
+    movq    [dstq+wq], m0
+    movhps  [dstq+wq+8], m0
+%endif
+    add     wq, mmsize
+    jl %%.loop
+    ; wq is now the number of bytes processed beyond w; select the last
+    ; in-range output byte and return it in the low byte of eax
+    mov     eax, mmsize-1
+    sub     eax, wd
+    movd    m1, eax
+    pshufb  m0, m1
+    movd    eax, m0
+    RET
+%endmacro
+
+; int ff_add_hfyu_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
+;
+; SSSE3 variant operating on 8-byte MMX registers (INIT_MMX): loads the
+; shuffle masks, places the low byte of "left" in the top byte of m0 and
+; runs ADD_HFYU_LEFT_LOOP, which also performs the return.
+; Only dst, src and w are loaded into registers; left is read through leftm.
+INIT_MMX ssse3
+cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
+.skip_prologue:
+    mova    m5, [pb_7]                  ; broadcast byte 7 (last byte)
+    mova    m4, [pb_zzzz3333zzzzbbbb]
+    mova    m3, [pb_zz11zz55zz99zzdd]
+    movd    m0, leftm
+    psllq   m0, 56                      ; left -> top byte of m0
+    ADD_HFYU_LEFT_LOOP 1, 1
+
+; 16-byte (XMM) variant of add_hfyu_left_pred, same prototype as the SSSE3
+; version. Picks one of three expansions of ADD_HFYU_LEFT_LOOP depending on
+; the 16-byte alignment of src and dst; each expansion ends in RET.
+INIT_XMM sse4
+cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
+    mova    m5, [pb_f]                  ; broadcast byte 15 (last byte)
+    ; pb_zzzzzzzz77777777 itself is only 8 bytes of -1; this 16-byte load
+    ; picks up the 8 bytes of pb_7 that directly follow it in SECTION_RODATA
+    mova    m6, [pb_zzzzzzzz77777777]
+    mova    m4, [pb_zzzz3333zzzzbbbb]
+    mova    m3, [pb_zz11zz55zz99zzdd]
+    movd    m0, leftm
+    pslldq  m0, 15                      ; left -> top byte of m0
+    test    srcq, 15
+    jnz .src_unaligned
+    test    dstq, 15
+    jnz .dst_unaligned
+    ; both pointers 16-byte aligned
+    ADD_HFYU_LEFT_LOOP 1, 1
+.dst_unaligned:
+    ; src aligned, dst not
+    ADD_HFYU_LEFT_LOOP 0, 1
+.src_unaligned:
+    ; src not aligned: unaligned loads and split stores regardless of dst
+    ADD_HFYU_LEFT_LOOP 0, 0
--- a/libavcodec/x86/huffyuvdsp.h
+++ b/libavcodec/x86/huffyuvdsp.h
@ -0,0 +1,30 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Prototypes of the x86 inline-assembly HuffYUV helpers that are defined in
+ * huffyuvdsp_mmx.c. */
+#ifndef AVCODEC_X86_HUFFYUVDSP_H
+#define AVCODEC_X86_HUFFYUVDSP_H
+
+#include <stdint.h>
+
+/**
+ * Add w bytes of src onto dst in place (dst[i] += src[i]).
+ * Defined only when HAVE_INLINE_ASM is set.
+ */
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+
+/**
+ * CMOV-based HuffYUV median prediction; presumably computes the same result
+ * as the other add_hfyu_median_pred implementations - verify against the
+ * definition. Defined only when HAVE_INLINE_ASM and HAVE_7REGS are set.
+ */
+void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
+                                  const uint8_t *diff, int w,
+                                  int *left, int *left_top);
+
+#endif /* AVCODEC_X86_HUFFYUVDSP_H */
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@ -0,0 +1,63 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/huffyuvdsp.h"
+
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+
+void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
+                                  const uint8_t *diff, int w,
+                                  int *left, int *left_top);
+void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
+                                    const uint8_t *diff, int w,
+                                    int *left, int *left_top);
+
+int  ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
+                                 int w, int left);
+int  ff_add_hfyu_left_pred_sse4(uint8_t *dst, const uint8_t *src,
+                                int w, int left);
+
+/**
+ * Select x86 implementations for a HuffYUV DSP context according to the CPU
+ * flags reported by av_get_cpu_flags().
+ *
+ * The assignments are ordered: a later assignment to the same function
+ * pointer replaces an earlier one, so the order below encodes the preference
+ * between implementations.
+ *
+ * @param c context whose function pointers are updated
+ */
+av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    /* The cmov version is defined only under these build conditions; it is
+     * installed first so the MMXEXT version below can replace it. */
+#if HAVE_7REGS && HAVE_INLINE_ASM
+    if (cpu_flags & AV_CPU_FLAG_CMOV)
+        c->add_hfyu_median_pred = ff_add_hfyu_median_pred_cmov;
+#endif
+
+    if (INLINE_MMX(cpu_flags))
+        c->add_bytes = ff_add_bytes_mmx;
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        /* slower than cmov version on AMD */
+        if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
+            c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        /* Default to the 8-byte SSSE3 version; switch to the 16-byte one
+         * when the SSE4 flag is also set. */
+        c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
+        if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
+            c->add_hfyu_left_pred = ff_add_hfyu_left_pred_sse4;
+    }
+}
--- a/libavcodec/x86/huffyuvdsp_mmx.c
+++ b/libavcodec/x86/huffyuvdsp_mmx.c
@ -20,14 +20,14 @@

 #include "config.h"
 #include "libavutil/x86/asm.h"
-#include "dsputil_x86.h"
+#include "huffyuvdsp.h"

 #if HAVE_INLINE_ASM

 #if HAVE_7REGS
-void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
-                                        const uint8_t *diff, int w,
-                                        int *left, int *left_top)
+void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
+                                  const uint8_t *diff, int w,
+                                  int *left, int *left_top)
 {
    x86_reg w2 = -w;
    x86_reg x;
@ -62,4 +62,30 @@ void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
 }
 #endif

+/**
+ * Add the bytes of src onto dst in place: dst[i] += src[i] for 0 <= i < w,
+ * each byte wrapping modulo 256.
+ *
+ * The inline assembly handles 16 bytes per iteration with two 8-byte MMX
+ * adds; the C loop afterwards handles whatever is left.
+ *
+ * NOTE(review): the asm statement declares no clobbers although it uses
+ * mm0/mm1 and stores through dst - confirm this is acceptable for the
+ * supported compilers.
+ *
+ * @param dst destination buffer, also the first addend
+ * @param src buffer whose bytes are added to dst
+ * @param w   number of bytes to process
+ */
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
+{
+    x86_reg i = 0;
+
+    /* Operands: %0 = i (byte offset), %1 = src, %2 = dst, %3 = w - 15.
+     * Control enters at label 2 and loops back to label 1 while
+     * i - (w - 15) is negative, i.e. while 16 more bytes are available. */
+    __asm__ volatile (
+        "jmp          2f                \n\t"
+        "1:                             \n\t"
+        "movq   (%1, %0), %%mm0         \n\t"
+        "movq   (%2, %0), %%mm1         \n\t"
+        "paddb     %%mm0, %%mm1         \n\t"
+        "movq      %%mm1, (%2, %0)      \n\t"
+        "movq  8(%1, %0), %%mm0         \n\t"
+        "movq  8(%2, %0), %%mm1         \n\t"
+        "paddb     %%mm0, %%mm1         \n\t"
+        "movq      %%mm1, 8(%2, %0)     \n\t"
+        "add         $16, %0            \n\t"
+        "2:                             \n\t"
+        "cmp          %3, %0            \n\t"
+        "js           1b                \n\t"
+        : "+r" (i)
+        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
+
+    /* Scalar tail: the remaining (fewer than 16) bytes. */
+    for (; i < w; i++)
+        dst[i + 0] += src[i + 0];
+}
+
 #endif /* HAVE_INLINE_ASM */