avfilter/vf_v360: x86 SIMD for interpolations

This commit is contained in:
Paul B Mahol 2019-09-03 18:54:44 +02:00
parent f0d8005ec5
commit 058bbf48c6
5 changed files with 394 additions and 155 deletions

113
libavfilter/v360.h Normal file
View File

@ -0,0 +1,113 @@
/*
* Copyright (c) 2019 Eugene Lyapustin
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVFILTER_V360_H
#define AVFILTER_V360_H
#include "avfilter.h"
enum Projections {
EQUIRECTANGULAR,
CUBEMAP_3_2,
CUBEMAP_6_1,
EQUIANGULAR,
FLAT,
DUAL_FISHEYE,
BARREL,
CUBEMAP_1_6,
NB_PROJECTIONS,
};
enum InterpMethod {
NEAREST,
BILINEAR,
BICUBIC,
LANCZOS,
NB_INTERP_METHODS,
};
enum Faces {
TOP_LEFT,
TOP_MIDDLE,
TOP_RIGHT,
BOTTOM_LEFT,
BOTTOM_MIDDLE,
BOTTOM_RIGHT,
NB_FACES,
};
enum Direction {
RIGHT, ///< Axis +X
LEFT, ///< Axis -X
UP, ///< Axis +Y
DOWN, ///< Axis -Y
FRONT, ///< Axis -Z
BACK, ///< Axis +Z
NB_DIRECTIONS,
};
enum Rotation {
ROT_0,
ROT_90,
ROT_180,
ROT_270,
NB_ROTATIONS,
};
typedef struct V360Context {
const AVClass *class;
int in, out;
int interp;
int width, height;
char* in_forder;
char* out_forder;
char* in_frot;
char* out_frot;
int in_cubemap_face_order[6];
int out_cubemap_direction_order[6];
int in_cubemap_face_rotation[6];
int out_cubemap_face_rotation[6];
float in_pad, out_pad;
float yaw, pitch, roll;
int h_flip, v_flip, d_flip;
float h_fov, v_fov;
float flat_range[3];
int planewidth[4], planeheight[4];
int inplanewidth[4], inplaneheight[4];
int nb_planes;
uint16_t *u[4], *v[4];
int16_t *ker[4];
int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
void (*remap_line)(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
const uint16_t *u, const uint16_t *v, const int16_t *ker);
} V360Context;
void ff_v360_init(V360Context *s, int depth);
void ff_v360_init_x86(V360Context *s, int depth);
#endif /* AVFILTER_V360_H */

View File

@ -41,88 +41,7 @@
#include "formats.h"
#include "internal.h"
#include "video.h"
enum Projections {
EQUIRECTANGULAR,
CUBEMAP_3_2,
CUBEMAP_6_1,
EQUIANGULAR,
FLAT,
DUAL_FISHEYE,
BARREL,
CUBEMAP_1_6,
NB_PROJECTIONS,
};
enum InterpMethod {
NEAREST,
BILINEAR,
BICUBIC,
LANCZOS,
NB_INTERP_METHODS,
};
enum Faces {
TOP_LEFT,
TOP_MIDDLE,
TOP_RIGHT,
BOTTOM_LEFT,
BOTTOM_MIDDLE,
BOTTOM_RIGHT,
NB_FACES,
};
enum Direction {
RIGHT, ///< Axis +X
LEFT, ///< Axis -X
UP, ///< Axis +Y
DOWN, ///< Axis -Y
FRONT, ///< Axis -Z
BACK, ///< Axis +Z
NB_DIRECTIONS,
};
enum Rotation {
ROT_0,
ROT_90,
ROT_180,
ROT_270,
NB_ROTATIONS,
};
typedef struct V360Context {
const AVClass *class;
int in, out;
int interp;
int width, height;
char* in_forder;
char* out_forder;
char* in_frot;
char* out_frot;
int in_cubemap_face_order[6];
int out_cubemap_direction_order[6];
int in_cubemap_face_rotation[6];
int out_cubemap_face_rotation[6];
float in_pad, out_pad;
float yaw, pitch, roll;
int h_flip, v_flip, d_flip;
float h_fov, v_fov;
float flat_range[3];
int planewidth[4], planeheight[4];
int inplanewidth[4], inplaneheight[4];
int nb_planes;
uint16_t *u[4], *v[4];
int16_t *ker[4];
int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
} V360Context;
#include "v360.h"
typedef struct ThreadData {
AVFrame *in;
@ -251,47 +170,22 @@ static int query_formats(AVFilterContext *ctx)
return ff_set_common_formats(ctx, fmts_list);
}
/**
* Generate no-interpolation remapping function with a given pixel depth.
*
* @param bits number of bits per pixel
* @param div number of bytes per pixel
*/
#define DEFINE_REMAP1(bits, div) \
static int remap1_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \
{ \
ThreadData *td = (ThreadData*)arg; \
const V360Context *s = ctx->priv; \
const AVFrame *in = td->in; \
AVFrame *out = td->out; \
\
int plane, x, y; \
\
for (plane = 0; plane < s->nb_planes; plane++) { \
const int in_linesize = in->linesize[plane] / div; \
const int out_linesize = out->linesize[plane] / div; \
const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane]; \
uint##bits##_t *dst = (uint##bits##_t *)out->data[plane]; \
const int width = s->planewidth[plane]; \
const int height = s->planeheight[plane]; \
\
const int slice_start = (height * jobnr ) / nb_jobs; \
const int slice_end = (height * (jobnr + 1)) / nb_jobs; \
\
for (y = slice_start; y < slice_end; y++) { \
const uint16_t *u = s->u[plane] + y * width; \
const uint16_t *v = s->v[plane] + y * width; \
uint##bits##_t *d = dst + y * out_linesize; \
for (x = 0; x < width; x++) \
*d++ = src[v[x] * in_linesize + u[x]]; \
} \
} \
\
return 0; \
#define DEFINE_REMAP1_LINE(bits, div) \
static void remap1_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src, \
ptrdiff_t in_linesize, \
const uint16_t *u, const uint16_t *v, const int16_t *ker) \
{ \
const uint##bits##_t *s = (const uint##bits##_t *)src; \
uint##bits##_t *d = (uint##bits##_t *)dst; \
\
in_linesize /= div; \
\
for (int x = 0; x < width; x++) \
d[x] = s[v[x] * in_linesize + u[x]]; \
}
DEFINE_REMAP1( 8, 1)
DEFINE_REMAP1(16, 2)
DEFINE_REMAP1_LINE( 8, 1)
DEFINE_REMAP1_LINE(16, 2)
typedef struct XYRemap {
uint16_t u[4][4];
@ -304,9 +198,8 @@ typedef struct XYRemap {
*
* @param ws size of interpolation window
* @param bits number of bits per pixel
* @param div number of bytes per pixel
*/
#define DEFINE_REMAP(ws, bits, div) \
#define DEFINE_REMAP(ws, bits) \
static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \
{ \
ThreadData *td = (ThreadData*)arg; \
@ -314,48 +207,85 @@ static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jo
const AVFrame *in = td->in; \
AVFrame *out = td->out; \
\
int plane, x, y, i, j; \
\
for (plane = 0; plane < s->nb_planes; plane++) { \
const int in_linesize = in->linesize[plane] / div; \
const int out_linesize = out->linesize[plane] / div; \
const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane]; \
uint##bits##_t *dst = (uint##bits##_t *)out->data[plane]; \
for (int plane = 0; plane < s->nb_planes; plane++) { \
const int in_linesize = in->linesize[plane]; \
const int out_linesize = out->linesize[plane]; \
const uint8_t *src = in->data[plane]; \
uint8_t *dst = out->data[plane]; \
const int width = s->planewidth[plane]; \
const int height = s->planeheight[plane]; \
\
const int slice_start = (height * jobnr ) / nb_jobs; \
const int slice_end = (height * (jobnr + 1)) / nb_jobs; \
\
for (y = slice_start; y < slice_end; y++) { \
uint##bits##_t *d = dst + y * out_linesize; \
for (int y = slice_start; y < slice_end; y++) { \
const uint16_t *u = s->u[plane] + y * width * ws * ws; \
const uint16_t *v = s->v[plane] + y * width * ws * ws; \
const int16_t *ker = s->ker[plane] + y * width * ws * ws; \
for (x = 0; x < width; x++) { \
const uint16_t *uu = u + x * ws * ws; \
const uint16_t *vv = v + x * ws * ws; \
const int16_t *kker = ker + x * ws * ws; \
int tmp = 0; \
\
for (i = 0; i < ws; i++) { \
for (j = 0; j < ws; j++) { \
tmp += kker[i * ws + j] * src[vv[i * ws + j] * in_linesize + uu[i * ws + j]]; \
} \
} \
\
*d++ = av_clip_uint##bits(tmp >> (15 - ws)); \
} \
s->remap_line(dst + y * out_linesize, width, src, in_linesize, u, v, ker); \
} \
} \
\
return 0; \
}
DEFINE_REMAP(2, 8, 1)
DEFINE_REMAP(4, 8, 1)
DEFINE_REMAP(2, 16, 2)
DEFINE_REMAP(4, 16, 2)
DEFINE_REMAP(1, 8)
DEFINE_REMAP(2, 8)
DEFINE_REMAP(4, 8)
DEFINE_REMAP(1, 16)
DEFINE_REMAP(2, 16)
DEFINE_REMAP(4, 16)
#define DEFINE_REMAP_LINE(ws, bits, div) \
static void remap##ws##_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src, \
ptrdiff_t in_linesize, \
const uint16_t *u, const uint16_t *v, const int16_t *ker) \
{ \
const uint##bits##_t *s = (const uint##bits##_t *)src; \
uint##bits##_t *d = (uint##bits##_t *)dst; \
\
in_linesize /= div; \
\
for (int x = 0; x < width; x++) { \
const uint16_t *uu = u + x * ws * ws; \
const uint16_t *vv = v + x * ws * ws; \
const int16_t *kker = ker + x * ws * ws; \
int tmp = 0; \
\
for (int i = 0; i < ws; i++) { \
for (int j = 0; j < ws; j++) { \
tmp += kker[i * ws + j] * s[vv[i * ws + j] * in_linesize + uu[i * ws + j]]; \
} \
} \
\
d[x] = av_clip_uint##bits(tmp >> 14); \
} \
}
DEFINE_REMAP_LINE(2, 8, 1)
DEFINE_REMAP_LINE(4, 8, 1)
DEFINE_REMAP_LINE(2, 16, 2)
DEFINE_REMAP_LINE(4, 16, 2)
void ff_v360_init(V360Context *s, int depth)
{
switch (s->interp) {
case NEAREST:
s->remap_line = depth <= 8 ? remap1_8bit_line_c : remap1_16bit_line_c;
break;
case BILINEAR:
s->remap_line = depth <= 8 ? remap2_8bit_line_c : remap2_16bit_line_c;
break;
case BICUBIC:
case LANCZOS:
s->remap_line = depth <= 8 ? remap4_8bit_line_c : remap4_16bit_line_c;
break;
}
if (ARCH_X86_64)
ff_v360_init_x86(s, depth);
}
/**
* Save nearest pixel coordinates for remapping.
@ -399,10 +329,10 @@ static void bilinear_kernel(float du, float dv, const XYRemap *r_tmp,
}
}
ker[0] = (1.f - du) * (1.f - dv) * 8192;
ker[1] = du * (1.f - dv) * 8192;
ker[2] = (1.f - du) * dv * 8192;
ker[3] = du * dv * 8192;
ker[0] = (1.f - du) * (1.f - dv) * 16384;
ker[1] = du * (1.f - dv) * 16384;
ker[2] = (1.f - du) * dv * 16384;
ker[3] = du * dv * 16384;
}
/**
@ -446,7 +376,7 @@ static void bicubic_kernel(float du, float dv, const XYRemap *r_tmp,
for (j = 0; j < 4; j++) {
u[i * 4 + j] = r_tmp->u[i][j];
v[i * 4 + j] = r_tmp->v[i][j];
ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 2048;
ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 16384;
}
}
}
@ -501,7 +431,7 @@ static void lanczos_kernel(float du, float dv, const XYRemap *r_tmp,
for (j = 0; j < 4; j++) {
u[i * 4 + j] = r_tmp->u[i][j];
v[i * 4 + j] = r_tmp->v[i][j];
ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 2048;
ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 16384;
}
}
}
@ -2038,6 +1968,8 @@ static int config_output(AVFilterLink *outlink)
av_assert0(0);
}
ff_v360_init(s, depth);
switch (s->in) {
case EQUIRECTANGULAR:
in_transform = xyz_to_equirect;

View File

@ -31,6 +31,7 @@ OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o
OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o
OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o
OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o
OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o
OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
@ -66,5 +67,6 @@ X86ASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o
X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold.o
X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o
X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o
X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o
X86ASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o

142
libavfilter/x86/vf_v360.asm Normal file
View File

@ -0,0 +1,142 @@
;*****************************************************************************
;* x86-optimized functions for v360 filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pb_mask: db 0,4,8,12,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
pd_255: times 4 dd 255
SECTION .text
; void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
; const uint16_t *u, const uint16_t *v, const int16_t *ker);
INIT_YMM avx2
cglobal remap1_8bit_line, 6, 7, 6, dst, width, src, in_linesize, u, v, x
movsxdifnidn widthq, widthd
xor xq, xq
movd xm0, in_linesized
pcmpeqw m4, m4
VBROADCASTI128 m3, [pb_mask]
vpbroadcastd m0, xm0
.loop:
pmovsxwd m1, [vq + xq * 2]
pmovsxwd m2, [uq + xq * 2]
pmulld m1, m0
paddd m1, m2
mova m2, m4
vpgatherdd m5, [srcq + m1], m2
pshufb m1, m5, m3
vextracti128 xm2, m1, 1
movd [dstq+xq], xm1
movd [dstq+xq+4], xm2
add xq, mmsize / 4
cmp xq, widthq
jl .loop
RET
INIT_YMM avx2
cglobal remap2_8bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v, ker, x
movsxdifnidn widthq, widthd
xor xq, xq
movd xm0, in_linesized
pcmpeqw m7, m7
vpbroadcastd m0, xm0
vpbroadcastd m6, [pd_255]
.loop:
pmovsxwd m1, [kerq + xq * 8]
pmovsxwd m2, [vq + xq * 8]
pmovsxwd m3, [uq + xq * 8]
pmulld m4, m2, m0
paddd m4, m3
mova m3, m7
vpgatherdd m2, [srcq + m4], m3
pand m2, m6
pmulld m2, m1
phaddd m2, m2
phaddd m1, m2, m2
psrld m1, m1, 0xe
vextracti128 xm2, m1, 1
pextrb [dstq+xq], xm1, 0
pextrb [dstq+xq+1], xm2, 0
add xq, mmsize / 16
cmp xq, widthq
jl .loop
RET
INIT_YMM avx2
cglobal remap4_8bit_line, 7, 9, 11, dst, width, src, in_linesize, u, v, ker, x, y
movsxdifnidn widthq, widthd
xor yq, yq
xor xq, xq
movd xm0, in_linesized
pcmpeqw m7, m7
vpbroadcastd m0, xm0
vpbroadcastd m6, [pd_255]
.loop:
pmovsxwd m1, [kerq + yq]
pmovsxwd m5, [kerq + yq + 16]
pmovsxwd m2, [vq + yq]
pmovsxwd m8, [vq + yq + 16]
pmovsxwd m3, [uq + yq]
pmovsxwd m9, [uq + yq + 16]
pmulld m4, m2, m0
pmulld m10, m8, m0
paddd m4, m3
paddd m10, m9
mova m3, m7
vpgatherdd m2, [srcq + m4], m3
mova m3, m7
vpgatherdd m4, [srcq + m10], m3
pand m2, m6
pand m4, m6
pmulld m2, m1
pmulld m4, m5
paddd m2, m4
vextracti128 xm1, m2, 1
paddd m1, m2
phaddd m1, m1
phaddd m1, m1
psrld m1, m1, 0xe
packuswb m1, m1
pextrb [dstq+xq], xm1, 0
add xq, 1
add yq, 32
cmp xq, widthq
jl .loop
RET
%endif

View File

@ -0,0 +1,50 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavfilter/v360.h"
void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
const uint16_t *u, const uint16_t *v, const int16_t *ker);
void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
const uint16_t *u, const uint16_t *v, const int16_t *ker);
void ff_remap4_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
const uint16_t *u, const uint16_t *v, const int16_t *ker);
av_cold void ff_v360_init_x86(V360Context *s, int depth)
{
#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == NEAREST && depth <= 8)
s->remap_line = ff_remap1_8bit_line_avx2;
if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == BILINEAR && depth <= 8)
s->remap_line = ff_remap2_8bit_line_avx2;
if (EXTERNAL_AVX2_FAST(cpu_flags) && (s->interp == BICUBIC ||
s->interp == LANCZOS) && depth <= 8)
s->remap_line = ff_remap4_8bit_line_avx2;
#endif
}