vp9mc/x86: sse2 MC assembly.

Also a slight change to the ssse3 code, which prevents a theoretical
overflow in the sharp filter.

Signed-off-by: Anton Khirnov <anton@khirnov.net>
This commit is contained in:
Ronald S. Bultje 2014-12-14 20:13:24 -05:00 committed by Anton Khirnov
parent 67922b4ee4
commit 9790b44a89
2 changed files with 319 additions and 101 deletions

View File

@ -51,39 +51,41 @@ fpel_func(avg, 32, avx2);
fpel_func(avg, 64, avx2);
#undef fpel_func
/*
 * mc_func: prototype for one assembly 8-tap 1-D filter function named
 * ff_vp9_<avg>_8tap_1d_<dir>_<sz>_<opt>.
 * In this diff listing the macro appears twice: the shorter parameter list
 * is the removed version, the longer one its replacement. The added
 * type/f_sz parameters give the element type and inner row length of the
 * filter table row passed as the last argument (previously fixed to
 * int8_t and 32).
 */
#define mc_func(avg, sz, dir, opt) \
#define mc_func(avg, sz, dir, opt, type, f_sz) \
void \
ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t dst_stride, \
ptrdiff_t src_stride, \
int h, \
const int8_t (*filter)[32])
const type (*filter)[f_sz])
/*
 * mc_funcs: emits the four prototypes (put/avg x h/v) for one block width
 * and instruction set. As above, the first definition is the removed one
 * and the second forwards the new type/f_sz parameters.
 */
#define mc_funcs(sz, opt) \
mc_func(put, sz, h, opt); \
mc_func(avg, sz, h, opt); \
mc_func(put, sz, v, opt); \
mc_func(avg, sz, v, opt)
#define mc_funcs(sz, opt, type, f_sz) \
mc_func(put, sz, h, opt, type, f_sz); \
mc_func(avg, sz, h, opt, type, f_sz); \
mc_func(put, sz, v, opt, type, f_sz); \
mc_func(avg, sz, v, opt, type, f_sz)
/*
 * Prototype instantiations. The two-argument calls are the removed lines;
 * the four-argument calls replace them and add the mmxext (width 4) and
 * sse2 (width 8) variants, which take int16_t[8] filter rows instead of the
 * int8_t[32] rows used by ssse3/avx2.
 */
mc_funcs(4, ssse3);
mc_funcs(8, ssse3);
mc_funcs(4, mmxext, int16_t, 8);
mc_funcs(8, sse2, int16_t, 8);
mc_funcs(4, ssse3, int8_t, 32);
mc_funcs(8, ssse3, int8_t, 32);
#if ARCH_X86_64
mc_funcs(16, ssse3);
mc_funcs(32, avx2);
mc_funcs(16, ssse3, int8_t, 32);
mc_funcs(32, avx2, int8_t, 32);
#endif
#undef mc_funcs
#undef mc_func
#define mc_rep_func(avg, sz, hsz, dir, opt) \
#define mc_rep_func(avg, sz, hsz, dir, opt, type, f_sz) \
static av_always_inline void \
ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t dst_stride, \
ptrdiff_t src_stride, \
int h, \
const int8_t (*filter)[32]) \
const type (*filter)[f_sz]) \
{ \
ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst, src, \
dst_stride, \
@ -97,27 +99,31 @@ ff_vp9_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
h, filter); \
}
/*
 * mc_rep_funcs: instantiates mc_rep_func for put/avg x h/v, i.e. defines the
 * four sz-wide filter functions in terms of the existing hsz-wide ones.
 * In this diff listing the first definition (three parameters) is the
 * removed one; the second forwards the filter element type and row length.
 */
#define mc_rep_funcs(sz, hsz, opt) \
mc_rep_func(put, sz, hsz, h, opt); \
mc_rep_func(avg, sz, hsz, h, opt); \
mc_rep_func(put, sz, hsz, v, opt); \
mc_rep_func(avg, sz, hsz, v, opt)
#define mc_rep_funcs(sz, hsz, opt, type, f_sz) \
mc_rep_func(put, sz, hsz, h, opt, type, f_sz); \
mc_rep_func(avg, sz, hsz, h, opt, type, f_sz); \
mc_rep_func(put, sz, hsz, v, opt, type, f_sz); \
mc_rep_func(avg, sz, hsz, v, opt, type, f_sz)
/*
 * sse2 only has a native 8-wide function (see the mc_funcs instantiations),
 * so 16, 32 and 64 are all built by repetition, unconditionally.
 * ssse3 builds 16 from 8 only on x86-32; on x86-64 a native 16-wide ssse3
 * prototype is declared instead. Calls with three arguments are the removed
 * lines of the diff.
 */
mc_rep_funcs(16, 8, sse2, int16_t, 8);
#if ARCH_X86_32
mc_rep_funcs(16, 8, ssse3);
mc_rep_funcs(16, 8, ssse3, int8_t, 32);
#endif
mc_rep_funcs(32, 16, ssse3);
mc_rep_funcs(64, 32, ssse3);
mc_rep_funcs(32, 16, sse2, int16_t, 8);
mc_rep_funcs(32, 16, ssse3, int8_t, 32);
mc_rep_funcs(64, 32, sse2, int16_t, 8);
mc_rep_funcs(64, 32, ssse3, int8_t, 32);
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
mc_rep_funcs(64, 32, avx2);
mc_rep_funcs(64, 32, avx2, int8_t, 32);
#endif
#undef mc_rep_funcs
#undef mc_rep_func
/*
 * Filter coefficient tables defined in the assembly file.
 * Both are indexed as [filter type][subpel position - 1]; the wrappers
 * below pass ff_filters_<x>[f][mx - 1] or [my - 1].
 * ssse3 layout: 4 rows of 32 bytes, each row one pair of adjacent taps
 * repeated 16 times. sse2 layout: 8 rows of 8 words, each row a single tap
 * repeated 8 times.
 */
extern const int8_t ff_filters_ssse3[3][15][4][32];
extern const int16_t ff_filters_sse2[3][15][8][8];
#define filter_8tap_2d_fn(op, sz, f, fname, align, opt) \
#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, opt) \
static void \
op ## _8tap_ ## fname ## _ ## sz ## hv_ ## opt(uint8_t *dst, \
const uint8_t *src, \
@ -129,39 +135,42 @@ op ## _8tap_ ## fname ## _ ## sz ## hv_ ## opt(uint8_t *dst,
ff_vp9_put_8tap_1d_h_ ## sz ## _ ## opt(temp, src - 3 * src_stride, \
64, src_stride, \
h + 7, \
ff_filters_ssse3[f][mx - 1]); \
ff_filters_ ## f_opt[f][mx - 1]); \
ff_vp9_ ## op ## _8tap_1d_v_ ## sz ## _ ## opt(dst, temp + 3 * 64, \
dst_stride, 64, \
h, \
ff_filters_ssse3[f][my - 1]); \
ff_filters_ ## f_opt[f][my - 1]); \
}
/*
 * filters_8tap_2d_fn: instantiates the 2-D (hv) wrapper for the regular,
 * sharp and smooth filter types at one block width.
 * In this diff listing the first definition is the removed one; the
 * replacement adds f_opt, the suffix of the coefficient table
 * (ff_filters_<f_opt>) that the generated wrapper indexes.
 */
#define filters_8tap_2d_fn(op, sz, align, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular, align, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp, align, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, align, opt)
#define filters_8tap_2d_fn(op, sz, align, opt, f_opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, opt) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, opt)
/*
 * filters_8tap_2d_fn2: instantiates the above for all five block widths.
 * The replacement version takes two instruction-set names: opt4 is used for
 * the 4-wide functions and opt8 for widths 8 to 64, so that the mmxext
 * 4-wide functions can be paired with the sse2 ones.
 */
#define filters_8tap_2d_fn2(op, align, opt) \
filters_8tap_2d_fn(op, 64, align, opt) \
filters_8tap_2d_fn(op, 32, align, opt) \
filters_8tap_2d_fn(op, 16, align, opt) \
filters_8tap_2d_fn(op, 8, align, opt) \
filters_8tap_2d_fn(op, 4, align, opt)
#define filters_8tap_2d_fn2(op, align, opt4, opt8, f_opt) \
filters_8tap_2d_fn(op, 64, align, opt8, f_opt) \
filters_8tap_2d_fn(op, 32, align, opt8, f_opt) \
filters_8tap_2d_fn(op, 16, align, opt8, f_opt) \
filters_8tap_2d_fn(op, 8, align, opt8, f_opt) \
filters_8tap_2d_fn(op, 4, align, opt4, f_opt)
/*
 * Instantiations. Calls with the shorter argument lists are the removed
 * lines. The mmxext/sse2 set reads ff_filters_sse2; the ssse3 set and the
 * avx2 functions both read ff_filters_ssse3.
 */
filters_8tap_2d_fn2(put, 16, ssse3)
filters_8tap_2d_fn2(avg, 16, ssse3)
filters_8tap_2d_fn2(put, 16, mmxext, sse2, sse2)
filters_8tap_2d_fn2(avg, 16, mmxext, sse2, sse2)
filters_8tap_2d_fn2(put, 16, ssse3, ssse3, ssse3)
filters_8tap_2d_fn2(avg, 16, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
filters_8tap_2d_fn(put, 64, 32, avx2)
filters_8tap_2d_fn(put, 32, 32, avx2)
filters_8tap_2d_fn(avg, 64, 32, avx2)
filters_8tap_2d_fn(avg, 32, 32, avx2)
filters_8tap_2d_fn(put, 64, 32, avx2, ssse3)
filters_8tap_2d_fn(put, 32, 32, avx2, ssse3)
filters_8tap_2d_fn(avg, 64, 32, avx2, ssse3)
filters_8tap_2d_fn(avg, 32, 32, avx2, ssse3)
#endif
#undef filters_8tap_2d_fn2
#undef filters_8tap_2d_fn
#undef filter_8tap_2d_fn
#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar, opt) \
#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, opt) \
static void \
op ## _8tap_ ## fname ## _ ## sz ## dir ## _ ## opt(uint8_t *dst, \
const uint8_t *src, \
@ -173,32 +182,34 @@ op ## _8tap_ ## fname ## _ ## sz ## dir ## _ ## opt(uint8_t *dst, \
ff_vp9_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(dst, src, \
dst_stride, \
src_stride, h,\
ff_filters_ssse3[f][dvar - 1]); \
ff_filters_ ## f_opt[f][dvar - 1]); \
}
/*
 * filters_8tap_1d_fn: instantiates the 1-D wrapper for the regular, sharp
 * and smooth filter types for one width and direction. dvar names the
 * subpel variable (mx or my) used to index the coefficient table.
 * In this diff listing the first definition is the removed one; the
 * replacement adds f_opt, the suffix of the table (ff_filters_<f_opt>).
 */
#define filters_8tap_1d_fn(op, sz, dir, dvar, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar, opt)
#define filters_8tap_1d_fn(op, sz, dir, dvar, opt, f_opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, opt) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, opt)
/* filters_8tap_1d_fn2: both directions (h uses mx, v uses my) for one width. */
#define filters_8tap_1d_fn2(op, sz, opt) \
filters_8tap_1d_fn(op, sz, h, mx, opt) \
filters_8tap_1d_fn(op, sz, v, my, opt)
#define filters_8tap_1d_fn2(op, sz, opt, f_opt) \
filters_8tap_1d_fn(op, sz, h, mx, opt, f_opt) \
filters_8tap_1d_fn(op, sz, v, my, opt, f_opt)
/*
 * filters_8tap_1d_fn3: all five widths. In the replacement version opt4 is
 * the instruction set for the 4-wide functions and opt8 the one for widths
 * 8 to 64.
 */
#define filters_8tap_1d_fn3(op, opt) \
filters_8tap_1d_fn2(op, 64, opt) \
filters_8tap_1d_fn2(op, 32, opt) \
filters_8tap_1d_fn2(op, 16, opt) \
filters_8tap_1d_fn2(op, 8, opt) \
filters_8tap_1d_fn2(op, 4, opt)
#define filters_8tap_1d_fn3(op, opt4, opt8, f_opt) \
filters_8tap_1d_fn2(op, 64, opt8, f_opt) \
filters_8tap_1d_fn2(op, 32, opt8, f_opt) \
filters_8tap_1d_fn2(op, 16, opt8, f_opt) \
filters_8tap_1d_fn2(op, 8, opt8, f_opt) \
filters_8tap_1d_fn2(op, 4, opt4, f_opt)
/*
 * Instantiations. Calls with the shorter argument lists are the removed
 * lines. mmxext (width 4) and sse2 (widths 8-64) read ff_filters_sse2;
 * ssse3 and avx2 read ff_filters_ssse3.
 */
filters_8tap_1d_fn3(put, ssse3)
filters_8tap_1d_fn3(avg, ssse3)
filters_8tap_1d_fn3(put, mmxext, sse2, sse2)
filters_8tap_1d_fn3(avg, mmxext, sse2, sse2)
filters_8tap_1d_fn3(put, ssse3, ssse3, ssse3)
filters_8tap_1d_fn3(avg, ssse3, ssse3, ssse3)
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
filters_8tap_1d_fn2(put, 64, avx2)
filters_8tap_1d_fn2(put, 32, avx2)
filters_8tap_1d_fn2(avg, 64, avx2)
filters_8tap_1d_fn2(avg, 32, avx2)
filters_8tap_1d_fn2(put, 64, avx2, ssse3)
filters_8tap_1d_fn2(put, 32, avx2, ssse3)
filters_8tap_1d_fn2(avg, 64, avx2, ssse3)
filters_8tap_1d_fn2(avg, 32, avx2, ssse3)
#endif
#undef filters_8tap_1d_fn
@ -225,20 +236,23 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \
dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv] = type ## _8tap_sharp_ ## sz ## dir ## _ ## opt
#define init_subpel2_32_64(idx, idxh, idxv, dir, type, opt) \
init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \
init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt)
#define init_subpel2(idx1, idx2, sz, type, opt) \
init_subpel1(idx1, idx2, 1, 1, sz, hv, type, opt); \
init_subpel1(idx1, idx2, 0, 1, sz, v, type, opt); \
init_subpel1(idx1, idx2, 1, 0, sz, h, type, opt)
#define init_subpel2(idx, idxh, idxv, dir, type, opt) \
init_subpel2_32_64(idx, idxh, idxv, dir, type, opt); \
init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \
init_subpel1(3, idx, idxh, idxv, 8, dir, type, opt); \
init_subpel1(4, idx, idxh, idxv, 4, dir, type, opt)
#define init_subpel3_32_64(idx, type, opt) \
init_subpel2(0, idx, 64, type, opt); \
init_subpel2(1, idx, 32, type, opt)
#define init_subpel3(idx, type, opt) \
init_subpel2(idx, 1, 1, hv, type, opt); \
init_subpel2(idx, 0, 1, v, type, opt); \
init_subpel2(idx, 1, 0, h, type, opt)
#define init_subpel3_8to64(idx, type, opt) \
init_subpel3_32_64(idx, type, opt); \
init_subpel2(2, idx, 16, type, opt); \
init_subpel2(3, idx, 8, type, opt)
#define init_subpel3(idx, type, opt) \
init_subpel3_8to64(idx, type, opt); \
init_subpel2(4, idx, 4, type, opt)
if (EXTERNAL_MMX(cpu_flags)) {
init_fpel(4, 0, 4, put, mmx);
@ -246,6 +260,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
init_subpel2(4, 0, 4, put, mmxext);
init_subpel2(4, 1, 4, avg, mmxext);
init_fpel(4, 1, 4, avg, mmxext);
init_fpel(3, 1, 8, avg, mmxext);
}
@ -257,6 +273,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
}
if (EXTERNAL_SSE2(cpu_flags)) {
init_subpel3_8to64(0, put, sse2);
init_subpel3_8to64(1, avg, sse2);
init_fpel(2, 1, 16, avg, sse2);
init_fpel(1, 1, 32, avg, sse2);
init_fpel(0, 1, 64, avg, sse2);
@ -277,12 +295,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_fpel(0, 1, 64, avg, avx2);
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
init_subpel2_32_64(0, 1, 1, hv, put, avx2);
init_subpel2_32_64(0, 0, 1, v, put, avx2);
init_subpel2_32_64(0, 1, 0, h, put, avx2);
init_subpel2_32_64(1, 1, 1, hv, avg, avx2);
init_subpel2_32_64(1, 0, 1, v, avg, avx2);
init_subpel2_32_64(1, 1, 0, h, avg, avx2);
init_subpel3_32_64(0, put, avx2);
init_subpel3_32_64(1, avg, avx2);
#endif /* ARCH_X86_64 && HAVE_AVX2_EXTERNAL */
}

View File

@ -25,15 +25,28 @@
SECTION_RODATA 32
cextern pw_256
cextern pw_64
%macro F8_TAPS 8
; F8_SSSE3_TAPS tap0, ..., tap7
; Emits one 8-tap filter in the ssse3 table layout: four 32-byte rows, each
; holding one pair of adjacent taps as signed bytes, repeated 16 times.
; This matches int8_t ff_filters_ssse3[3][15][4][32]; the ssse3 filter
; functions feed these rows to pmaddubsw.
%macro F8_SSSE3_TAPS 8
times 16 db %1, %2
times 16 db %3, %4
times 16 db %5, %6
times 16 db %7, %8
%endmacro
; int8_t ff_filters_ssse3[3][15][4][32]
const filters_ssse3 ; smooth
; F8_SSE2_TAPS tap0, ..., tap7
; Emits one 8-tap filter in the sse2 table layout: eight 16-byte rows, each
; holding a single tap as a 16-bit word repeated 8 times, so that row k sits
; at byte offset 16*k. This matches int16_t ff_filters_sse2[3][15][8][8];
; the sse2/mmxext filter functions multiply by these rows with pmullw.
%macro F8_SSE2_TAPS 8
times 8 dw %1
times 8 dw %2
times 8 dw %3
times 8 dw %4
times 8 dw %5
times 8 dw %6
times 8 dw %7
times 8 dw %8
%endmacro
%macro FILTER 1
const filters_%1 ; smooth
F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0
F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0
F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0
@ -81,9 +94,102 @@ const filters_ssse3 ; smooth
F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3
F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2
F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1
%endmacro
%define F8_TAPS F8_SSSE3_TAPS
; int8_t ff_filters_ssse3[3][15][4][32]
FILTER ssse3
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
FILTER sse2
SECTION .text
; filter_sse2_h_fn put|avg
; Defines vp9_<put|avg>_8tap_1d_h_<px>: an 8-tap horizontal filter using only
; 16-bit multiplies (pmullw), producing px = mmsize/2 output pixels per row
; (4 with MMX registers, 8 with XMM registers).
; Arguments: dst, src, dstride, sstride, h (row count), filtery (pointer to
; eight 16-byte rows of broadcast 16-bit taps, see F8_SSE2_TAPS).
; For "avg" the filtered result is averaged with the bytes already in dst.
%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 15, dst, src, dstride, sstride, h, filtery
; m5 = 0, used to zero-extend source bytes to words
pxor m5, m5
; m6 = rounding bias added before the >>7 (pw_64 is external; presumably 64
; in every word lane)
mova m6, [pw_64]
; m7 = tap 0
mova m7, [filteryq+ 0]
; With 16 XMM registers available (x86-64, non-MMX) taps 1-7 are kept in
; m8-m14; otherwise they are read from memory inside the loop.
%if ARCH_X86_64 && mmsize > 8
mova m8, [filteryq+ 16]
mova m9, [filteryq+ 32]
mova m10, [filteryq+ 48]
mova m11, [filteryq+ 64]
mova m12, [filteryq+ 80]
mova m13, [filteryq+ 96]
mova m14, [filteryq+112]
%endif
.loop:
; Load the source at byte offsets -3..+1 (inputs for taps 0-4)
movh m0, [srcq-3]
movh m1, [srcq-2]
movh m2, [srcq-1]
movh m3, [srcq+0]
movh m4, [srcq+1]
; bytes -> words
punpcklbw m0, m5
punpcklbw m1, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m4, m5
; multiply by taps 0-4
pmullw m0, m7
%if ARCH_X86_64 && mmsize > 8
pmullw m1, m8
pmullw m2, m9
pmullw m3, m10
pmullw m4, m11
%else
pmullw m1, [filteryq+ 16]
pmullw m2, [filteryq+ 32]
pmullw m3, [filteryq+ 48]
pmullw m4, [filteryq+ 64]
%endif
; Two partial sums: m0 = taps 0+1+4, m2 = taps 2+3
paddw m0, m1
paddw m2, m3
paddw m0, m4
; Load the source at byte offsets +2..+4 (inputs for taps 5-7); src can then
; advance to the next row
movh m1, [srcq+2]
movh m3, [srcq+3]
movh m4, [srcq+4]
add srcq, sstrideq
punpcklbw m1, m5
punpcklbw m3, m5
punpcklbw m4, m5
%if ARCH_X86_64 && mmsize > 8
pmullw m1, m12
pmullw m3, m13
pmullw m4, m14
%else
pmullw m1, [filteryq+ 80]
pmullw m3, [filteryq+ 96]
pmullw m4, [filteryq+112]
%endif
; m0 = taps 0+1+4+5 + bias, m2 = taps 2+3+6+7; only the final combination of
; the two partial sums uses a saturating add
paddw m0, m1
paddw m3, m4
paddw m0, m6
paddw m2, m3
paddsw m0, m2
; scale back down by 7 bits
psraw m0, 7
%ifidn %1, avg
; fetch the existing destination pixels for averaging
movh m1, [dstq]
%endif
; words -> bytes with unsigned saturation
packuswb m0, m0
%ifidn %1, avg
pavgb m0, m1
%endif
movh [dstq], m0
add dstq, dstrideq
; one row done; loop while the remaining row count is > 0
dec hd
jg .loop
RET
%endmacro
; 4-pixel-wide versions using MMX registers
INIT_MMX mmxext
filter_sse2_h_fn put
filter_sse2_h_fn avg
; 8-pixel-wide versions using XMM registers
INIT_XMM sse2
filter_sse2_h_fn put
filter_sse2_h_fn avg
%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filtery
@ -118,9 +224,9 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filt
pmaddubsw m4, [filteryq+64]
pmaddubsw m1, [filteryq+96]
%endif
paddw m0, m2
paddw m4, m1
paddsw m0, m4
paddw m0, m4
paddw m2, m1
paddsw m0, m2
pmulhrsw m0, m6
%ifidn %1, avg
movh m1, [dstq]
@ -175,12 +281,12 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, src, dstride, sstride, h, filt
pmaddubsw m5, m10
pmaddubsw m6, m11
pmaddubsw m7, m11
paddw m0, m2
paddw m1, m3
paddw m4, m6
paddw m5, m7
paddsw m0, m4
paddsw m1, m5
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
paddsw m0, m2
paddsw m1, m3
pmulhrsw m0, m13
pmulhrsw m1, m13
packuswb m0, m1
@ -206,6 +312,104 @@ filter_hx2_fn avg
%endif ; ARCH_X86_64
; filter_sse2_v_fn put|avg
; Defines vp9_<put|avg>_8tap_1d_v_<px>: an 8-tap vertical filter using only
; 16-bit multiplies (pmullw), producing px = mmsize/2 output pixels per row
; (4 with MMX registers, 8 with XMM registers).
; Arguments: dst, src, dstride, sstride, h (row count), filtery (pointer to
; eight 16-byte rows of broadcast 16-bit taps, see F8_SSE2_TAPS).
; For "avg" the filtered result is averaged with the bytes already in dst.
%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 15, dst, src, dstride, sstride, h, filtery, src4, sstride3
%else
; x86-32: only the first four arguments are loaded into registers; the filter
; pointer is fetched from the sixth argument slot and the row counter "hd" is
; aliased to the fifth argument slot (r4mp) instead of a register.
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 15, dst, src, dstride, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%define hd r4mp
%endif
; m5 = 0, used to zero-extend source bytes to words
pxor m5, m5
; m6 = rounding bias added before the >>7 (pw_64 is external; presumably 64
; in every word lane)
mova m6, [pw_64]
; sstride3 = 3 * sstride
; src4 = src + 1 row  (base for the inputs of taps 4-7)
; src  = src - 3 rows (base for the inputs of taps 0-3)
lea sstride3q, [sstrideq*3]
lea src4q, [srcq+sstrideq]
sub srcq, sstride3q
; m7 = tap 0
mova m7, [filteryq+ 0]
; With 16 XMM registers available (x86-64, non-MMX) taps 1-7 are kept in
; m8-m14; otherwise they are read from memory inside the loop.
%if ARCH_X86_64 && mmsize > 8
mova m8, [filteryq+ 16]
mova m9, [filteryq+ 32]
mova m10, [filteryq+ 48]
mova m11, [filteryq+ 64]
mova m12, [filteryq+ 80]
mova m13, [filteryq+ 96]
mova m14, [filteryq+112]
%endif
.loop:
; FIXME maybe reuse loads from previous rows, or just
; more generally unroll this to prevent multiple loads of
; the same data?
; Load rows 0-3 relative to src and row 0 relative to src4 (inputs for
; taps 0-4); src is advanced to the next row once its loads are issued
movh m0, [srcq]
movh m1, [srcq+sstrideq]
movh m2, [srcq+sstrideq*2]
movh m3, [srcq+sstride3q]
add srcq, sstrideq
movh m4, [src4q]
; bytes -> words
punpcklbw m0, m5
punpcklbw m1, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m4, m5
; multiply by taps 0-4
pmullw m0, m7
%if ARCH_X86_64 && mmsize > 8
pmullw m1, m8
pmullw m2, m9
pmullw m3, m10
pmullw m4, m11
%else
pmullw m1, [filteryq+ 16]
pmullw m2, [filteryq+ 32]
pmullw m3, [filteryq+ 48]
pmullw m4, [filteryq+ 64]
%endif
; Two partial sums: m0 = taps 0+1+4, m2 = taps 2+3
paddw m0, m1
paddw m2, m3
paddw m0, m4
; Load rows 1-3 relative to src4 (inputs for taps 5-7), then advance src4
movh m1, [src4q+sstrideq]
movh m3, [src4q+sstrideq*2]
movh m4, [src4q+sstride3q]
add src4q, sstrideq
punpcklbw m1, m5
punpcklbw m3, m5
punpcklbw m4, m5
%if ARCH_X86_64 && mmsize > 8
pmullw m1, m12
pmullw m3, m13
pmullw m4, m14
%else
pmullw m1, [filteryq+ 80]
pmullw m3, [filteryq+ 96]
pmullw m4, [filteryq+112]
%endif
; m0 = taps 0+1+4+5 + bias, m2 = taps 2+3+6+7; only the final combination of
; the two partial sums uses a saturating add
paddw m0, m1
paddw m3, m4
paddw m0, m6
paddw m2, m3
paddsw m0, m2
; scale back down by 7 bits
psraw m0, 7
%ifidn %1, avg
; fetch the existing destination pixels for averaging
movh m1, [dstq]
%endif
; words -> bytes with unsigned saturation
packuswb m0, m0
%ifidn %1, avg
pavgb m0, m1
%endif
movh [dstq], m0
add dstq, dstrideq
; one row done; loop while the remaining row count is > 0
dec hd
jg .loop
RET
%endmacro
; 4-pixel-wide versions using MMX registers
INIT_MMX mmxext
filter_sse2_v_fn put
filter_sse2_v_fn avg
; 8-pixel-wide versions using XMM registers
INIT_XMM sse2
filter_sse2_v_fn put
filter_sse2_v_fn avg
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
@ -252,9 +456,9 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, src, dstride, sstride, filtery
pmaddubsw m4, [filteryq+64]
pmaddubsw m1, [filteryq+96]
%endif
paddw m0, m2
paddw m4, m1
paddsw m0, m4
paddw m0, m4
paddw m2, m1
paddsw m0, m2
pmulhrsw m0, m6
%ifidn %1, avg
movh m1, [dstq]
@ -317,12 +521,12 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, src, dstride, sstride, h, filt
pmaddubsw m5, m10
pmaddubsw m6, m11
pmaddubsw m7, m11
paddw m0, m2
paddw m1, m3
paddw m4, m6
paddw m5, m7
paddsw m0, m4
paddsw m1, m5
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
paddsw m0, m2
paddsw m1, m3
pmulhrsw m0, m13
pmulhrsw m1, m13
packuswb m0, m1