From 054013a0fc6f2b52c60cee3e051be8cc7f82cef3 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Sun, 29 Dec 2013 02:32:16 +0100
Subject: [PATCH] dsputil: Move APE-specific bits into apedsp

---
 libavcodec/apedec.c                |  33 +++++-
 libavcodec/apedsp.h                |  44 ++++++++
 libavcodec/arm/Makefile            |   2 +
 libavcodec/arm/apedsp_init_arm.c   |  38 +++++++
 libavcodec/arm/apedsp_neon.S       |  62 +++++++++++
 libavcodec/arm/dsputil_init_neon.c |   5 -
 libavcodec/arm/int_neon.S          |  40 -------
 libavcodec/dsputil.c               |  15 ---
 libavcodec/dsputil.h               |  10 --
 libavcodec/ppc/Makefile            |   1 +
 libavcodec/ppc/apedsp_altivec.c    |  77 +++++++++++++
 libavcodec/ppc/int_altivec.c       |  42 --------
 libavcodec/x86/Makefile            |   2 +
 libavcodec/x86/apedsp.asm          | 167 +++++++++++++++++++++++++++++
 libavcodec/x86/apedsp_init.c       |  47 ++++++++
 libavcodec/x86/dsputil.asm         | 137 -----------------------
 libavcodec/x86/dsputil_init.c      |  13 ---
 17 files changed, 468 insertions(+), 267 deletions(-)
 create mode 100644 libavcodec/apedsp.h
 create mode 100644 libavcodec/arm/apedsp_init_arm.c
 create mode 100644 libavcodec/arm/apedsp_neon.S
 create mode 100644 libavcodec/ppc/apedsp_altivec.c
 create mode 100644 libavcodec/x86/apedsp.asm
 create mode 100644 libavcodec/x86/apedsp_init.c

diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index fb41918265..6329295c9a 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -25,6 +25,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
+#include "apedsp.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "bytestream.h"
@@ -136,6 +137,7 @@ typedef struct APEContext {
     AVClass *class;                          ///< class for AVOptions
     AVCodecContext *avctx;
     DSPContext dsp;
+    APEDSPContext adsp;
     int channels;
     int samples;                             ///< samples left to decode in current frame
     int bps;
@@ -195,8 +197,6 @@ static void predictor_decode_stereo_3930(APEContext *ctx, int count);
 static void predictor_decode_mono_3950(APEContext *ctx, int count);
 static void predictor_decode_stereo_3950(APEContext *ctx, int count);
 
-// TODO: dsputilize
-
 static av_cold int ape_decode_close(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
@@ -212,6 +212,19 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul)
+{
+    int res = 0;
+
+    while (order--) {
+        res   += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
 static av_cold int ape_decode_init(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
@@ -292,6 +305,15 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
         s->predictor_decode_stereo = predictor_decode_stereo_3950;
     }
 
+    s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
+
+    if (ARCH_ARM)
+        ff_apedsp_init_arm(&s->adsp);
+    if (ARCH_PPC)
+        ff_apedsp_init_ppc(&s->adsp);
+    if (ARCH_X86)
+        ff_apedsp_init_x86(&s->adsp);
+
     ff_dsputil_init(&s->dsp, avctx);
     avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
 
@@ -1263,9 +1285,10 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
 
     while (count--) {
         /* round fixedpoint scalar product */
-        res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order,
-                                                    f->adaptcoeffs - order,
-                                                    order, APESIGN(*data));
+        res = ctx->adsp.scalarproduct_and_madd_int16(f->coeffs,
+                                                     f->delay - order,
+                                                     f->adaptcoeffs - order,
+                                                     order, APESIGN(*data));
         res = (res + (1 << (fracbits - 1))) >> fracbits;
         res += *data;
         *data++ = res;
diff --git a/libavcodec/apedsp.h b/libavcodec/apedsp.h
new file mode 100644
index 0000000000..64e2749679
--- /dev/null
+++ b/libavcodec/apedsp.h
@@ -0,0 +1,44 @@
+/*
+ * Monkey's Audio lossless audio decoder
+ * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
+ *  based upon libdemac from Dave Chapman.
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_APEDSP_H
+#define AVCODEC_APEDSP_H
+
+#include <stdint.h>
+
+typedef struct APEDSPContext {
+    /**
+     * Calculate scalar product of v1 and v2,
+     * and v1[i] += v3[i] * mul
+     * @param len length of vectors, should be multiple of 16
+     */
+    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
+                                            const int16_t *v2,
+                                            const int16_t *v3,
+                                            int len, int mul);
+} APEDSPContext;
+
+void ff_apedsp_init_arm(APEDSPContext *c);
+void ff_apedsp_init_ppc(APEDSPContext *c);
+void ff_apedsp_init_x86(APEDSPContext *c);
+
+#endif /* AVCODEC_APEDSP_H */
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 9d5b6aab5b..13025af9c1 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -24,6 +24,7 @@ OBJS-$(CONFIG_VP3DSP)                  += arm/vp3dsp_init_arm.o
 
 OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o       \
                                           arm/sbrdsp_init_arm.o
+OBJS-$(CONFIG_APE_DECODER)             += arm/apedsp_init_arm.o
 OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o
 OBJS-$(CONFIG_FLAC_DECODER)            += arm/flacdsp_init_arm.o        \
                                           arm/flacdsp_arm.o
@@ -97,6 +98,7 @@ NEON-OBJS-$(CONFIG_VP3DSP)             += arm/vp3dsp_neon.o
 
 NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
                                           arm/sbrdsp_neon.o
+NEON-OBJS-$(CONFIG_APE_DECODER)        += arm/apedsp_neon.o
 NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
                                           arm/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
diff --git a/libavcodec/arm/apedsp_init_arm.c b/libavcodec/arm/apedsp_init_arm.c
new file mode 100644
index 0000000000..47ea034359
--- /dev/null
+++ b/libavcodec/arm/apedsp_init_arm.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3, int len, int mul);
+
+av_cold void ff_apedsp_init_arm(APEDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
+    }
+}
diff --git a/libavcodec/arm/apedsp_neon.S b/libavcodec/arm/apedsp_neon.S
new file mode 100644
index 0000000000..7cfbf43c6d
--- /dev/null
+++ b/libavcodec/arm/apedsp_neon.S
@@ -0,0 +1,62 @@
+/*
+ * ARM NEON optimised integer operations
+ * Copyright (c) 2009 Kostya Shishkov
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
+function ff_scalarproduct_and_madd_int16_neon, export=1
+        vld1.16         {d28[],d29[]}, [sp]
+        vmov.i16        q0,  #0
+        vmov.i16        q1,  #0
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+        mov             r12, r0
+
+1:      vld1.16         {d16-d17}, [r0,:128]!
+        vld1.16         {d18-d19}, [r1]!
+        vld1.16         {d20-d21}, [r2]!
+        vld1.16         {d22-d23}, [r0,:128]!
+        vld1.16         {d24-d25}, [r1]!
+        vld1.16         {d26-d27}, [r2]!
+        vmul.s16        q10, q10,  q14
+        vmul.s16        q13, q13,  q14
+        vmlal.s16       q0,  d16,  d18
+        vmlal.s16       q1,  d17,  d19
+        vadd.s16        q10, q8,   q10
+        vadd.s16        q13, q11,  q13
+        vmlal.s16       q2,  d22,  d24
+        vmlal.s16       q3,  d23,  d25
+        vst1.16         {q10},     [r12,:128]!
+        subs            r3,  r3,   #16
+        vst1.16         {q13},     [r12,:128]!
+        bne             1b
+
+        vpadd.s32       d16, d0,   d1
+        vpadd.s32       d17, d2,   d3
+        vpadd.s32       d18, d4,   d5
+        vpadd.s32       d19, d6,   d7
+        vpadd.s32       d0,  d16,  d17
+        vpadd.s32       d1,  d18,  d19
+        vpadd.s32       d2,  d0,   d1
+        vpaddl.s32      d3,  d2
+        vmov.32         r0,  d3[0]
+        bx              lr
+endfunc
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 16e052dddd..c9bdaa5a78 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -44,9 +44,6 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
 
 int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
 
-int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3, int len, int mul);
-
 av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
@@ -73,6 +70,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
     c->vector_clip_int32 = ff_vector_clip_int32_neon;
 
     c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
-
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
 }
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 3d2faffa48..42f37392e1 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -48,43 +48,3 @@ function ff_scalarproduct_int16_neon, export=1
         vmov.32         r0,  d3[0]
         bx              lr
 endfunc
-
-@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
-function ff_scalarproduct_and_madd_int16_neon, export=1
-        vld1.16         {d28[],d29[]}, [sp]
-        vmov.i16        q0,  #0
-        vmov.i16        q1,  #0
-        vmov.i16        q2,  #0
-        vmov.i16        q3,  #0
-        mov             r12, r0
-
-1:      vld1.16         {d16-d17}, [r0,:128]!
-        vld1.16         {d18-d19}, [r1]!
-        vld1.16         {d20-d21}, [r2]!
-        vld1.16         {d22-d23}, [r0,:128]!
-        vld1.16         {d24-d25}, [r1]!
-        vld1.16         {d26-d27}, [r2]!
-        vmul.s16        q10, q10,  q14
-        vmul.s16        q13, q13,  q14
-        vmlal.s16       q0,  d16,  d18
-        vmlal.s16       q1,  d17,  d19
-        vadd.s16        q10, q8,   q10
-        vadd.s16        q13, q11,  q13
-        vmlal.s16       q2,  d22,  d24
-        vmlal.s16       q3,  d23,  d25
-        vst1.16         {q10},     [r12,:128]!
-        subs            r3,  r3,   #16
-        vst1.16         {q13},     [r12,:128]!
-        bne             1b
-
-        vpadd.s32       d16, d0,   d1
-        vpadd.s32       d17, d2,   d3
-        vpadd.s32       d18, d4,   d5
-        vpadd.s32       d19, d6,   d7
-        vpadd.s32       d0,  d16,  d17
-        vpadd.s32       d1,  d18,  d19
-        vpadd.s32       d2,  d0,   d1
-        vpaddl.s32      d3,  d2
-        vmov.32         r0,  d3[0]
-        bx              lr
-endfunc
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 11447c01e8..6b846588fa 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2069,19 +2069,6 @@ static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
     return res;
 }
 
-static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul)
-{
-    int res = 0;
-
-    while (order--) {
-        res   += *v1 * *v2++;
-        *v1++ += mul * *v3++;
-    }
-    return res;
-}
-
 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                 int32_t max, unsigned int len)
 {
@@ -2294,8 +2281,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
     c->try_8x8basis = try_8x8basis_c;
     c->add_8x8basis = add_8x8basis_c;
 
-    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-
     c->scalarproduct_int16 = scalarproduct_int16_c;
     c->vector_clip_int32   = vector_clip_int32_c;
     c->vector_clipf        = vector_clipf_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index d261f7e702..471988bddd 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -255,16 +255,6 @@ typedef struct DSPContext {
      */
     int32_t (*scalarproduct_int16)(const int16_t *v1,
                                    const int16_t *v2 /* align 16 */, int len);
-    /* ape functions */
-    /**
-     * Calculate scalar product of v1 and v2,
-     * and v1[i] += v3[i] * mul
-     * @param len length of vectors, should be multiple of 16
-     */
-    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
-                                            const int16_t *v2,
-                                            const int16_t *v3,
-                                            int len, int mul);
 
     /**
      * Clip each element in an array of int32_t to a given minimum and
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index ec0674c817..b78d4be8ae 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o
 OBJS-$(CONFIG_VIDEODSP)                += ppc/videodsp_ppc.o
 OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
 
+OBJS-$(CONFIG_APE_DECODER)             += ppc/apedsp_altivec.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += ppc/svq1enc_altivec.o
 OBJS-$(CONFIG_VC1_DECODER)             += ppc/vc1dsp_altivec.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += ppc/vorbisdsp_altivec.o
diff --git a/libavcodec/ppc/apedsp_altivec.c b/libavcodec/ppc/apedsp_altivec.c
new file mode 100644
index 0000000000..de9df45c6c
--- /dev/null
+++ b/libavcodec/ppc/apedsp_altivec.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavcodec/apedsp.h"
+
+#if HAVE_ALTIVEC
+static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
+                                                    const int16_t *v2,
+                                                    const int16_t *v3,
+                                                    int order, int mul)
+{
+    LOAD_ZERO;
+    vec_s16 *pv1 = (vec_s16 *) v1;
+    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
+    register vec_s16 t0, t1, i0, i1, i4;
+    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
+    register vec_s32 res = zero_s32v;
+    register vec_u8 align = vec_lvsl(0, v2);
+    int32_t ires;
+
+    order >>= 4;
+    do {
+        i1     = vec_ld(16, v2);
+        t0     = vec_perm(i2, i1, align);
+        i2     = vec_ld(32, v2);
+        t1     = vec_perm(i1, i2, align);
+        i0     = pv1[0];
+        i1     = pv1[1];
+        res    = vec_msum(t0, i0, res);
+        res    = vec_msum(t1, i1, res);
+        i4     = vec_ld(16, v3);
+        t0     = vec_perm(i3, i4, align);
+        i3     = vec_ld(32, v3);
+        t1     = vec_perm(i4, i3, align);
+        pv1[0] = vec_mladd(t0, muls, i0);
+        pv1[1] = vec_mladd(t1, muls, i1);
+        pv1   += 2;
+        v2    += 16;
+        v3    += 16;
+    } while (--order);
+    res = vec_splat(vec_sums(res, zero_s32v), 3);
+    vec_ste(res, 0, &ires);
+
+    return ires;
+}
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
+{
+#if HAVE_ALTIVEC
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
+#endif /* HAVE_ALTIVEC */
+}
diff --git a/libavcodec/ppc/int_altivec.c b/libavcodec/ppc/int_altivec.c
index fa3cb66095..d76d34a5b1 100644
--- a/libavcodec/ppc/int_altivec.c
+++ b/libavcodec/ppc/int_altivec.c
@@ -56,49 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
     return ires;
 }
 
-static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
-                                                    const int16_t *v2,
-                                                    const int16_t *v3,
-                                                    int order, int mul)
-{
-    LOAD_ZERO;
-    vec_s16 *pv1 = (vec_s16 *) v1;
-    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
-    register vec_s16 t0, t1, i0, i1, i4;
-    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
-    register vec_s32 res = zero_s32v;
-    register vec_u8 align = vec_lvsl(0, v2);
-    int32_t ires;
-
-    order >>= 4;
-    do {
-        i1     = vec_ld(16, v2);
-        t0     = vec_perm(i2, i1, align);
-        i2     = vec_ld(32, v2);
-        t1     = vec_perm(i1, i2, align);
-        i0     = pv1[0];
-        i1     = pv1[1];
-        res    = vec_msum(t0, i0, res);
-        res    = vec_msum(t1, i1, res);
-        i4     = vec_ld(16, v3);
-        t0     = vec_perm(i3, i4, align);
-        i3     = vec_ld(32, v3);
-        t1     = vec_perm(i4, i3, align);
-        pv1[0] = vec_mladd(t0, muls, i0);
-        pv1[1] = vec_mladd(t1, muls, i1);
-        pv1   += 2;
-        v2    += 16;
-        v3    += 16;
-    } while (--order);
-    res = vec_splat(vec_sums(res, zero_s32v), 3);
-    vec_ste(res, 0, &ires);
-
-    return ires;
-}
-
 av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
 {
     c->scalarproduct_int16 = scalarproduct_int16_altivec;
-
-    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
 }
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 8830a22a8f..10242269c2 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
+OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
@@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/apedsp.asm
new file mode 100644
index 0000000000..d721ebda6b
--- /dev/null
+++ b/libavcodec/x86/apedsp.asm
@@ -0,0 +1,167 @@
+;******************************************************************************
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%macro SCALARPRODUCT 0
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw  m7, m7, 0
+%endif
+    pxor    m6, m6
+    add v1q, orderq
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq
+.loop:
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    mova    m4, [v1q + orderq]
+    mova    m5, [v1q + orderq + mmsize]
+    movu    m2, [v3q + orderq]
+    movu    m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddd   m6, m0
+    paddd   m6, m1
+    paddw   m2, m4
+    paddw   m3, m5
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    add     orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw  m0, m6, 0x4e
+%endif
+    paddd   m6, m0
+    movd   eax, m6
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub     orderq, mmsize*2
+%if %1
+    mova    m1, m4
+    mova    m4, [v2q + orderq]
+    mova    m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova    m3, m5
+    mova    m5, [v3q + orderq]
+    mova    m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova    m0, [v2q + orderq]
+    mova    m1, [v2q + orderq + mmsize]
+    mova    m2, [v3q + orderq]
+    mova    m3, [v3q + orderq + mmsize]
+%endif
+    %define t0  [v1q + orderq]
+    %define t1  [v1q + orderq + mmsize]
+%if ARCH_X86_64
+    mova    m8, t0
+    mova    m9, t1
+    %define t0  m8
+    %define t1  m9
+%endif
+    pmaddwd m0, t0
+    pmaddwd m1, t1
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddw   m2, t0
+    paddw   m3, t1
+    paddd   m6, m0
+    paddd   m6, m1
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
+
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor    m6, m6
+    mov    r4d, v2d
+    and    r4d, 15
+    and    v2q, ~15
+    and    v3q, ~15
+    mova    m4, [v2q + orderq]
+    mova    m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp    r4d, 0
+    je .loop0
+    cmp    r4d, 2
+    je .loop2
+    cmp    r4d, 4
+    je .loop4
+    cmp    r4d, 6
+    je .loop6
+    cmp    r4d, 8
+    je .loop8
+    cmp    r4d, 10
+    je .loop10
+    cmp    r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd   m6, m0
+    movd   eax, m6
+    RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/apedsp_init.c
new file mode 100644
index 0000000000..f692c2b9b6
--- /dev/null
+++ b/libavcodec/x86/apedsp_init.c
@@ -0,0 +1,47 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+                                               const int16_t *v3,
+                                               int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3,
+                                             int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul);
+
+av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMXEXT(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+
+    if (EXTERNAL_SSSE3(cpu_flags) &&
+        !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+}
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 684f09b7fc..b5d6d3cc65 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m0
     movd   eax, m2
     RET
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
-    shl orderq, 1
-    movd    m7, mulm
-%if mmsize == 16
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-%else
-    pshufw  m7, m7, 0
-%endif
-    pxor    m6, m6
-    add v1q, orderq
-    add v2q, orderq
-    add v3q, orderq
-    neg orderq
-.loop:
-    movu    m0, [v2q + orderq]
-    movu    m1, [v2q + orderq + mmsize]
-    mova    m4, [v1q + orderq]
-    mova    m5, [v1q + orderq + mmsize]
-    movu    m2, [v3q + orderq]
-    movu    m3, [v3q + orderq + mmsize]
-    pmaddwd m0, m4
-    pmaddwd m1, m5
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddd   m6, m0
-    paddd   m6, m1
-    paddw   m2, m4
-    paddw   m3, m5
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    add     orderq, mmsize*2
-    jl .loop
-%if mmsize == 16
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-%else
-    pshufw  m0, m6, 0x4e
-%endif
-    paddd   m6, m0
-    movd   eax, m6
-    RET
 %endmacro
 
 INIT_MMX mmxext
@@ -106,97 +60,6 @@ SCALARPRODUCT
 INIT_XMM sse2
 SCALARPRODUCT
 
-%macro SCALARPRODUCT_LOOP 1
-align 16
-.loop%1:
-    sub     orderq, mmsize*2
-%if %1
-    mova    m1, m4
-    mova    m4, [v2q + orderq]
-    mova    m0, [v2q + orderq + mmsize]
-    palignr m1, m0, %1
-    palignr m0, m4, %1
-    mova    m3, m5
-    mova    m5, [v3q + orderq]
-    mova    m2, [v3q + orderq + mmsize]
-    palignr m3, m2, %1
-    palignr m2, m5, %1
-%else
-    mova    m0, [v2q + orderq]
-    mova    m1, [v2q + orderq + mmsize]
-    mova    m2, [v3q + orderq]
-    mova    m3, [v3q + orderq + mmsize]
-%endif
-    %define t0  [v1q + orderq]
-    %define t1  [v1q + orderq + mmsize]
-%if ARCH_X86_64
-    mova    m8, t0
-    mova    m9, t1
-    %define t0  m8
-    %define t1  m9
-%endif
-    pmaddwd m0, t0
-    pmaddwd m1, t1
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddw   m2, t0
-    paddw   m3, t1
-    paddd   m6, m0
-    paddd   m6, m1
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    jg .loop%1
-%if %1
-    jmp .end
-%endif
-%endmacro
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-INIT_XMM ssse3
-cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
-    shl orderq, 1
-    movd    m7, mulm
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-    pxor    m6, m6
-    mov    r4d, v2d
-    and    r4d, 15
-    and    v2q, ~15
-    and    v3q, ~15
-    mova    m4, [v2q + orderq]
-    mova    m5, [v3q + orderq]
-    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
-    cmp    r4d, 0
-    je .loop0
-    cmp    r4d, 2
-    je .loop2
-    cmp    r4d, 4
-    je .loop4
-    cmp    r4d, 6
-    je .loop6
-    cmp    r4d, 8
-    je .loop8
-    cmp    r4d, 10
-    je .loop10
-    cmp    r4d, 12
-    je .loop12
-SCALARPRODUCT_LOOP 14
-SCALARPRODUCT_LOOP 12
-SCALARPRODUCT_LOOP 10
-SCALARPRODUCT_LOOP 8
-SCALARPRODUCT_LOOP 6
-SCALARPRODUCT_LOOP 4
-SCALARPRODUCT_LOOP 2
-SCALARPRODUCT_LOOP 0
-.end:
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-    paddd   m6, m0
-    movd   eax, m6
-    RET
-
 
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 10fa166db4..9b0788ff73 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -76,15 +76,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                       int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                     int order);
-int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
-                                               const int16_t *v3,
-                                               int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3,
-                                             int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul);
 
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
@@ -568,7 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
 
     c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }
 
@@ -607,7 +597,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 
 #if HAVE_SSE2_EXTERNAL
     c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
     if (cpu_flags & AV_CPU_FLAG_ATOM) {
         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
     } else {
@@ -621,8 +610,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                        int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_SSSE3_EXTERNAL
-    if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
-        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     c->bswap_buf = ff_bswap32_buf_ssse3;
 #endif /* HAVE_SSSE3_EXTERNAL */
 }