avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC bi mc functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for the HEVC bi mc functions (qpel as well as epel) in the new file hevc_mc_bi_msa.c.
It also adds the new generic macros needed for this patch to libavutil/mips/generic_macros_msa.h,
and the HEVC-specific macros needed for this patch to libavcodec/mips/hevc_macros_msa.h.

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Authored by Shivraj Patil on 2015-05-29 12:20:26 +05:30; committed by Michael Niedermayer.
parent da05c62499 · commit aede1a1a60
6 changed files with 4792 additions and 1 deletion
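
For context: an HEVC "bi" (bi-predictive) mc function blends the 8-bit samples fetched from one reference picture with the 16-bit intermediate prediction of the other reference (the src_16bit argument in the prototypes below), then rounds the sum back down to 8-bit output. A minimal scalar sketch of the simplest case, the pel/pixels copy, modeled on FFmpeg's generic C template for bit depth 8 (the function name is illustrative, not part of this patch):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch: scale the 8-bit source up to the 14-bit intermediate
 * domain (<< 6), add the 16-bit intermediate of the other reference,
 * then round and clip back to 8 bits (shift = 14 - 8 + 1 = 7). */
static void bi_pel_pixels_ref(uint8_t *dst, ptrdiff_t dst_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              const int16_t *src_16bit,
                              int height, int width)
{
    const int shift  = 7;
    const int offset = 1 << (shift - 1);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int val = ((src[x] << 6) + src_16bit[x] + offset) >> shift;
            dst[x] = val < 0 ? 0 : val > 255 ? 255 : val;
        }
        src       += src_stride;
        src_16bit += 64;   /* the intermediate buffer stride, MAX_PB_SIZE */
        dst       += dst_stride;
    }
}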

libavcodec/mips/Makefile

@@ -21,6 +21,7 @@ MIPSFPU-OBJS-$(CONFIG_AAC_ENCODER) += mips/iirfilter_mips.o
 OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_init_mips.o
 OBJS-$(CONFIG_H264DSP) += mips/h264dsp_init_mips.o
 MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
-                                   mips/hevc_mc_uni_msa.o
+                                   mips/hevc_mc_uni_msa.o \
+                                   mips/hevc_mc_bi_msa.o
 MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o
 LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o

libavcodec/mips/hevc_macros_msa.h

@@ -37,6 +37,27 @@
out = (v4i32) __msa_pckev_b((v16i8) tmp1_m, (v16i8) tmp0_m); \
}
#define HEVC_PCK_SW_SB8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1) \
{ \
v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\
PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
}
#define HEVC_PCK_SW_SB12(in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, out0, out1, out2) \
{ \
v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m; \
\
PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
PCKEV_H2_SH(in8, in9, in10, in11, tmp4_m, tmp5_m); \
PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
out2 = (v4i32) __msa_pckev_b((v16i8) tmp5_m, (v16i8) tmp4_m); \
}
#define HEVC_FILT_8TAP(in0, in1, in2, in3, \
filt0, filt1, filt2, filt3) \
( { \
@@ -48,4 +69,13 @@
out_m; \
} )
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1) \
( { \
v4i32 out_m; \
\
out_m = __msa_dotp_s_w(in0, (v8i16) filt0); \
out_m = __msa_dpadd_s_w(out_m, in1, (v8i16) filt1); \
out_m; \
} )
#endif /* AVCODEC_MIPS_HEVC_MACROS_MSA_H */
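
Stripped of the vector shuffling, HEVC_FILT_8TAP and the new HEVC_FILT_4TAP each compute a plain FIR dot product per output sample: __msa_dotp_s_w multiplies pairs of halfword lanes and adds each pair horizontally into a word lane, and __msa_dpadd_s_w does the same while accumulating. A scalar reference of the arithmetic (illustrative only):

#include <stdint.h>

/* Scalar equivalent of one output lane of HEVC_FILT_8TAP (ntaps = 8) or
 * HEVC_FILT_4TAP (ntaps = 4): a signed 16-bit FIR accumulated in 32 bits. */
static int32_t hevc_filt_ref(const int16_t *in, const int16_t *filt,
                             int ntaps)
{
    int32_t sum = 0;

    for (int i = 0; i < ntaps; i++)
        sum += (int32_t) in[i] * filt[i];
    return sum;
}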

libavcodec/mips/hevc_mc_bi_msa.c (new file; diff suppressed because it is too large)

libavcodec/mips/hevcdsp_init_mips.c

@@ -96,6 +96,74 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa;
c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_msa;
c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_msa;
c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_bi_qpel_h4_8_msa;
c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_bi_qpel_h8_8_msa;
c->put_hevc_qpel_bi[4][0][1] = ff_hevc_put_hevc_bi_qpel_h12_8_msa;
c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_msa;
c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_msa;
c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_msa;
c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_msa;
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_msa;
c->put_hevc_qpel_bi[1][1][0] = ff_hevc_put_hevc_bi_qpel_v4_8_msa;
c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_msa;
c->put_hevc_qpel_bi[4][1][0] = ff_hevc_put_hevc_bi_qpel_v12_8_msa;
c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_msa;
c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_msa;
c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_msa;
c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_msa;
c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_msa;
c->put_hevc_qpel_bi[1][1][1] = ff_hevc_put_hevc_bi_qpel_hv4_8_msa;
c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_msa;
c->put_hevc_qpel_bi[4][1][1] = ff_hevc_put_hevc_bi_qpel_hv12_8_msa;
c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_msa;
c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_msa;
c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_msa;
c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_msa;
c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_msa;
c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_msa;
c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_msa;
c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_msa;
c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_msa;
c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_msa;
c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_msa;
c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_msa;
c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_msa;
c->put_hevc_epel_bi[1][1][0] = ff_hevc_put_hevc_bi_epel_v4_8_msa;
c->put_hevc_epel_bi[2][1][0] = ff_hevc_put_hevc_bi_epel_v6_8_msa;
c->put_hevc_epel_bi[3][1][0] = ff_hevc_put_hevc_bi_epel_v8_8_msa;
c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_msa;
c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_msa;
c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_msa;
c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_msa;
c->put_hevc_epel_bi[1][1][1] = ff_hevc_put_hevc_bi_epel_hv4_8_msa;
c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_msa;
c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_msa;
c->put_hevc_epel_bi[4][1][1] = ff_hevc_put_hevc_bi_epel_hv12_8_msa;
c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_msa;
c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_msa;
c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_msa;
}
}
#endif // #if HAVE_MSA
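
The three indices of these tables encode block width class, vertical filtering, and horizontal filtering, so an entry such as put_hevc_qpel_bi[5][1][1] is the 16-wide hv variant. A hedged sketch of the width-class mapping (it mirrors FFmpeg's ff_hevc_pel_weight table; the helper name is illustrative, not decoder code):

/* Width class used as the first index: 2->0, 4->1, 6->2, 8->3, 12->4,
 * 16->5, 24->6, 32->7, 48->8, 64->9. The second index is !!my (vertical
 * filter needed), the third is !!mx (horizontal filter needed). */
static int hevc_width_idx(int width)
{
    switch (width) {
    case 2:  return 0;
    case 4:  return 1;
    case 6:  return 2;
    case 8:  return 3;
    case 12: return 4;
    case 16: return 5;
    case 24: return 6;
    case 32: return 7;
    case 48: return 8;
    default: return 9; /* 64 */
    }
}

/* e.g. a 16-wide block with mx != 0 and my != 0 dispatches through
 * c->put_hevc_qpel_bi[hevc_width_idx(16)][1][1], which this patch points
 * at ff_hevc_put_hevc_bi_qpel_hv16_8_msa. */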

libavcodec/mips/hevcdsp_mips.h

@@ -116,3 +116,83 @@ UNI_MC(qpel, hv, 48);
UNI_MC(qpel, hv, 64);
#undef UNI_MC
#define BI_MC(PEL, DIR, WIDTH) \
void ff_hevc_put_hevc_bi_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst, \
ptrdiff_t dst_stride, \
uint8_t *src, \
ptrdiff_t src_stride, \
int16_t *src_16bit, \
int height, \
intptr_t mx, \
intptr_t my, \
int width)
BI_MC(pel, pixels, 4);
BI_MC(pel, pixels, 6);
BI_MC(pel, pixels, 8);
BI_MC(pel, pixels, 12);
BI_MC(pel, pixels, 16);
BI_MC(pel, pixels, 24);
BI_MC(pel, pixels, 32);
BI_MC(pel, pixels, 48);
BI_MC(pel, pixels, 64);
BI_MC(qpel, h, 4);
BI_MC(qpel, h, 8);
BI_MC(qpel, h, 12);
BI_MC(qpel, h, 16);
BI_MC(qpel, h, 24);
BI_MC(qpel, h, 32);
BI_MC(qpel, h, 48);
BI_MC(qpel, h, 64);
BI_MC(qpel, v, 4);
BI_MC(qpel, v, 8);
BI_MC(qpel, v, 12);
BI_MC(qpel, v, 16);
BI_MC(qpel, v, 24);
BI_MC(qpel, v, 32);
BI_MC(qpel, v, 48);
BI_MC(qpel, v, 64);
BI_MC(qpel, hv, 4);
BI_MC(qpel, hv, 8);
BI_MC(qpel, hv, 12);
BI_MC(qpel, hv, 16);
BI_MC(qpel, hv, 24);
BI_MC(qpel, hv, 32);
BI_MC(qpel, hv, 48);
BI_MC(qpel, hv, 64);
BI_MC(epel, h, 4);
BI_MC(epel, h, 6);
BI_MC(epel, h, 8);
BI_MC(epel, h, 12);
BI_MC(epel, h, 16);
BI_MC(epel, h, 24);
BI_MC(epel, h, 32);
BI_MC(epel, h, 48);
BI_MC(epel, h, 64);
BI_MC(epel, v, 4);
BI_MC(epel, v, 6);
BI_MC(epel, v, 8);
BI_MC(epel, v, 12);
BI_MC(epel, v, 16);
BI_MC(epel, v, 24);
BI_MC(epel, v, 32);
BI_MC(epel, v, 48);
BI_MC(epel, v, 64);
BI_MC(epel, hv, 4);
BI_MC(epel, hv, 6);
BI_MC(epel, hv, 8);
BI_MC(epel, hv, 12);
BI_MC(epel, hv, 16);
BI_MC(epel, hv, 24);
BI_MC(epel, hv, 32);
BI_MC(epel, hv, 48);
BI_MC(epel, hv, 64);
#undef BI_MC
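
For reference, each BI_MC invocation above expands to one prototype; for example BI_MC(qpel, h, 4); produces (expansion written out by hand):

void ff_hevc_put_hevc_bi_qpel_h4_8_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                       uint8_t *src, ptrdiff_t src_stride,
                                       int16_t *src_16bit, int height,
                                       intptr_t mx, intptr_t my, int width);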

libavutil/mips/generic_macros_msa.h

@@ -29,18 +29,23 @@
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
#if (__mips_isa_rev >= 6)
@@ -328,6 +333,46 @@
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
/* Description : Load vectors with 8 halfword elements with stride
Arguments : Inputs - psrc (source pointer to load from)
- stride
Outputs - out0, out1
Details : Loads 8 halfword elements in 'out0' from (psrc)
Loads 8 halfword elements in 'out1' from (psrc + stride)
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) \
{ \
out0 = LD_H(RTYPE, (psrc)); \
out1 = LD_H(RTYPE, (psrc) + (stride)); \
}
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
LD_H2(RTYPE, (psrc), stride, out0, out1); \
LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
#define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
#define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
{ \
LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \
}
#define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)
#define LD_H8(RTYPE, psrc, stride, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
}
#define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
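
A small usage sketch for the LD_H* loaders above (assumes an MSA toolchain; the macros come from this header). Note that the stride is counted in int16_t elements, not bytes, because the pointer arithmetic in LD_H2 happens on the source pointer before the vector cast:

void copy_first_and_last_row(int16_t *src, int stride, int16_t *dst)
{
    v8i16 r0, r1, r2, r3, r4, r5, r6, r7;

    /* r0..r7 receive rows src, src + stride, ..., src + 7 * stride */
    LD_SH8(src, stride, r0, r1, r2, r3, r4, r5, r6, r7);
    ST_SH(r0, dst);     /* store row 0 */
    ST_SH(r7, dst + 8); /* store row 7 */
}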
/* Description : Store vectors of 16 byte elements with stride
Arguments : Inputs - in0, in1, stride
Outputs - pdst (destination pointer to store to)
@@ -478,6 +523,55 @@
\
SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
}
#define ST4x8_UB(in0, in1, pdst, stride) \
{ \
uint8_t *pblk_4x8 = (uint8_t *) (pdst); \
\
ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
}
/* Description : Store as 6x4 byte block to destination memory from input
vectors
Arguments : Inputs - in0, in1, pdst, stride
Return Type - unsigned byte
Details : Index 0 word element from input vector 'in0' is copied and
stored on first line followed by index 2 halfword element
Index 2 word element from input vector 'in0' is copied and
stored on second line followed by index 2 halfword element
Index 0 word element from input vector 'in1' is copied and
stored on third line followed by index 2 halfword element
Index 2 word element from input vector 'in1' is copied and
stored on fourth line followed by index 2 halfword element
*/
#define ST6x4_UB(in0, in1, pdst, stride) \
{ \
uint32_t out0_m, out1_m, out2_m, out3_m; \
uint16_t out4_m, out5_m, out6_m, out7_m; \
uint8_t *pblk_6x4_m = (uint8_t *) (pdst); \
\
out0_m = __msa_copy_u_w((v4i32) in0, 0); \
out1_m = __msa_copy_u_w((v4i32) in0, 2); \
out2_m = __msa_copy_u_w((v4i32) in1, 0); \
out3_m = __msa_copy_u_w((v4i32) in1, 2); \
\
out4_m = __msa_copy_u_h((v8i16) in0, 2); \
out5_m = __msa_copy_u_h((v8i16) in0, 6); \
out6_m = __msa_copy_u_h((v8i16) in1, 2); \
out7_m = __msa_copy_u_h((v8i16) in1, 6); \
\
SW(out0_m, pblk_6x4_m); \
SH(out4_m, (pblk_6x4_m + 4)); \
pblk_6x4_m += stride; \
SW(out1_m, pblk_6x4_m); \
SH(out5_m, (pblk_6x4_m + 4)); \
pblk_6x4_m += stride; \
SW(out2_m, pblk_6x4_m); \
SH(out6_m, (pblk_6x4_m + 4)); \
pblk_6x4_m += stride; \
SW(out3_m, pblk_6x4_m); \
SH(out7_m, (pblk_6x4_m + 4)); \
}
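
A scalar picture of what ST6x4_UB writes (illustrative only): four rows of 6 bytes each, emitted as one 4-byte word store plus one 2-byte halfword store per row; rows 0-1 come from bytes 0-5 and 8-13 of in0, rows 2-3 from the same bytes of in1.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void st6x4_ref(const uint8_t rows[4][8], uint8_t *dst,
                      ptrdiff_t stride)
{
    for (int r = 0; r < 4; r++) {
        memcpy(dst,     rows[r],     4); /* SW: word store, bytes 0-3     */
        memcpy(dst + 4, rows[r] + 4, 2); /* SH: halfword store, bytes 4-5 */
        dst += stride;
    }
}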
/* Description : Store as 8x2 byte block to destination memory from input vector
Arguments : Inputs - in, pdst, stride
@@ -529,6 +623,15 @@
ST8x4_UB(in0, in1, pblk_8x8_m, stride); \
ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride); \
}
#define ST12x4_UB(in0, in1, in2, pdst, stride) \
{ \
uint8_t *pblk_12x4_m = (uint8_t *) (pdst); \
\
/* left 8x4 */ \
ST8x4_UB(in0, in1, pblk_12x4_m, stride); \
/* right 4x4 */ \
ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride); \
}
/* Description : Store as 12x8 byte block to destination memory from
input vectors
@@ -1246,6 +1349,8 @@
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
/* Description : Pack even halfword elements of vector pairs
Arguments : Inputs - in0, in1, in2, in3
@@ -1317,6 +1422,13 @@
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5) \
{ \
XORI_B4_128(RTYPE, in0, in1, in2, in3); \
XORI_B2_128(RTYPE, in4, in5); \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
{ \
XORI_B4_128(RTYPE, in0, in1, in2, in3); \
@@ -1442,6 +1554,25 @@
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded (immediate)
Arguments : Inputs - in0, in1, in2, in3, shift
Outputs - in0, in1, in2, in3 (in place)
Return Type - as per RTYPE
Details : Each element of vector 'in0' is shifted right arithmetic by
value in 'shift'.
The last discarded bit is added to shifted value for rounding
and the result is in place written to 'in0'
Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift) \
{ \
in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift); \
in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift); \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
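
The per-element arithmetic behind SRARI_H2 is an arithmetic right shift that rounds to nearest by adding back the last bit shifted out; in scalar form (illustrative):

#include <stdint.h>

static int16_t srari_ref(int16_t in, int shift)
{
    if (shift == 0)
        return in;
    /* integer promotion makes the addition safe from int16_t overflow */
    return (int16_t) ((in + (1 << (shift - 1))) >> shift);
}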
/* Description : Shift right arithmetic rounded (immediate)
Arguments : Inputs - in0, in1, shift
Outputs - in0, in1 (in place)
@@ -1499,6 +1630,25 @@
ILVRL_B2_SH(zero_m, in, out0, out1); \
}
/* Description : Sign extend halfword elements from input vector and return
result in pair of vectors
Arguments : Inputs - in (1 input halfword vector)
Outputs - out0, out1 (sign extended 2 word vectors)
Return Type - signed word
Details : Sign bit of halfword elements from input vector 'in' is
extracted and interleaved right with the same vector 'in' to
generate 4 signed word elements in 'out0'
Then interleaved left with the same vector 'in' to
generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1) \
{ \
v8i16 tmp_m; \
\
tmp_m = __msa_clti_s_h((v8i16) in, 0); \
ILVRL_H2_SW(tmp_m, in, out0, out1); \
}
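
Net effect of UNPCK_SH_SW in scalar terms (illustrative): plain sign extension of eight int16_t lanes into two groups of four int32_t, the right (low) half into out0 and the left (high) half into out1. The __msa_clti_s_h result (all ones for negative lanes, zero otherwise) supplies the high halfword of each widened lane.

#include <stdint.h>

static void unpck_sh_sw_ref(const int16_t in[8],
                            int32_t out0[4], int32_t out1[4])
{
    for (int i = 0; i < 4; i++) {
        out0[i] = in[i];     /* right half */
        out1[i] = in[i + 4]; /* left half  */
    }
}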
/* Description : Transposes input 4x4 byte block
Arguments : Inputs - in0, in1, in2, in3 (input 4x4 byte block)
Outputs - out0, out1, out2, out3 (output 4x4 byte block)