From 36091742d182b3ad4411aae22682354b3834a974 Mon Sep 17 00:00:00 2001 From: Kieran Kunhya Date: Wed, 26 Nov 2014 15:59:14 +0000 Subject: [PATCH] v210enc: Add SIMD optimised 8-bit and 10-bit encoders Signed-off-by: Michael Niedermayer --- libavcodec/v210enc.c | 187 +++++++++++++++++++++++++++------- libavcodec/v210enc.h | 33 ++++++ libavcodec/x86/Makefile | 2 + libavcodec/x86/v210enc.asm | 145 ++++++++++++++++++++++++++ libavcodec/x86/v210enc_init.c | 37 +++++++ libavutil/x86/x86util.asm | 5 + tests/ref/vsynth/vsynth1-v210 | 6 +- tests/ref/vsynth/vsynth2-v210 | 6 +- tests/ref/vsynth/vsynth3-v210 | 6 +- 9 files changed, 379 insertions(+), 48 deletions(-) create mode 100644 libavcodec/v210enc.h create mode 100644 libavcodec/x86/v210enc.asm create mode 100644 libavcodec/x86/v210enc_init.c diff --git a/libavcodec/v210enc.c b/libavcodec/v210enc.c index 1e53bdbdda..0d40f99993 100644 --- a/libavcodec/v210enc.c +++ b/libavcodec/v210enc.c @@ -24,82 +24,190 @@ #include "avcodec.h" #include "bytestream.h" #include "internal.h" +#include "v210enc.h" + +#define CLIP(v) av_clip(v, 4, 1019) +#define CLIP8(v) av_clip(v, 1, 254) + +#define WRITE_PIXELS(a, b, c) \ + do { \ + val = CLIP(*a++); \ + val |= (CLIP(*b++) << 10) | \ + (CLIP(*c++) << 20); \ + AV_WL32(dst, val); \ + dst += 4; \ + } while (0) + +#define WRITE_PIXELS8(a, b, c) \ + do { \ + val = (CLIP8(*a++) << 2); \ + val |= (CLIP8(*b++) << 12) | \ + (CLIP8(*c++) << 22); \ + AV_WL32(dst, val); \ + dst += 4; \ + } while (0) + +static void v210_planar_pack_8_c(const uint8_t *y, const uint8_t *u, + const uint8_t *v, uint8_t *dst, ptrdiff_t width) +{ + uint32_t val; + int i; + + /* unroll this to match the assembly */ + for( i = 0; i < width-11; i += 12 ){ + WRITE_PIXELS8(u, y, v); + WRITE_PIXELS8(y, u, y); + WRITE_PIXELS8(v, y, u); + WRITE_PIXELS8(y, v, y); + WRITE_PIXELS8(u, y, v); + WRITE_PIXELS8(y, u, y); + WRITE_PIXELS8(v, y, u); + WRITE_PIXELS8(y, v, y); + } +} + +static void v210_planar_pack_10_c(const uint16_t *y, const 
uint16_t *u, + const uint16_t *v, uint8_t *dst, ptrdiff_t width) +{ + uint32_t val; + int i; + + for( i = 0; i < width-5; i += 6 ){ + WRITE_PIXELS(u, y, v); + WRITE_PIXELS(y, u, y); + WRITE_PIXELS(v, y, u); + WRITE_PIXELS(y, v, y); + } +} static av_cold int encode_init(AVCodecContext *avctx) { + V210EncContext *s = avctx->priv_data; + if (avctx->width & 1) { av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n"); return AVERROR(EINVAL); } - if (avctx->bits_per_raw_sample != 10) - av_log(avctx, AV_LOG_WARNING, "bits per raw sample: %d != 10-bit\n", - avctx->bits_per_raw_sample); - avctx->coded_frame = av_frame_alloc(); if (!avctx->coded_frame) return AVERROR(ENOMEM); avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I; + s->pack_line_8 = v210_planar_pack_8_c; + s->pack_line_10 = v210_planar_pack_10_c; + + if (ARCH_X86) + ff_v210enc_init_x86(s); + return 0; } static int encode_frame(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *pic, int *got_packet) { + V210EncContext *s = avctx->priv_data; + int aligned_width = ((avctx->width + 47) / 48) * 48; int stride = aligned_width * 8 / 3; int line_padding = stride - ((avctx->width * 8 + 11) / 12) * 4; int h, w, ret; - const uint16_t *y = (const uint16_t*)pic->data[0]; - const uint16_t *u = (const uint16_t*)pic->data[1]; - const uint16_t *v = (const uint16_t*)pic->data[2]; - PutByteContext p; + uint8_t *dst; - if ((ret = ff_alloc_packet2(avctx, pkt, avctx->height * stride)) < 0) + if ((ret = ff_alloc_packet(pkt, avctx->height * stride)) < 0) { + av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n"); return ret; + } - bytestream2_init_writer(&p, pkt->data, pkt->size); + dst = pkt->data; -#define CLIP(v) av_clip(v, 4, 1019) + if (pic->format == AV_PIX_FMT_YUV422P10) { + const uint16_t *y = (const uint16_t*)pic->data[0]; + const uint16_t *u = (const uint16_t*)pic->data[1]; + const uint16_t *v = (const uint16_t*)pic->data[2]; + for (h = 0; h < avctx->height; h++) { + uint32_t val; + w = (avctx->width / 6) * 6; + 
s->pack_line_10(y, u, v, dst, w); -#define WRITE_PIXELS(a, b, c) \ - do { \ - val = CLIP(*a++); \ - val |= (CLIP(*b++) << 10) | \ - (CLIP(*c++) << 20); \ - bytestream2_put_le32u(&p, val); \ - } while (0) + y += w; + u += w >> 1; + v += w >> 1; + dst += (w / 6) * 16; + if (w < avctx->width - 1) { + WRITE_PIXELS(u, y, v); - for (h = 0; h < avctx->height; h++) { - uint32_t val; - for (w = 0; w < avctx->width - 5; w += 6) { - WRITE_PIXELS(u, y, v); - WRITE_PIXELS(y, u, y); - WRITE_PIXELS(v, y, u); - WRITE_PIXELS(y, v, y); - } - if (w < avctx->width - 1) { - WRITE_PIXELS(u, y, v); - - val = CLIP(*y++); - if (w == avctx->width - 2) - bytestream2_put_le32u(&p, val); + val = CLIP(*y++); + if (w == avctx->width - 2) { + AV_WL32(dst, val); + dst += 4; + } + } if (w < avctx->width - 3) { val |= (CLIP(*u++) << 10) | (CLIP(*y++) << 20); - bytestream2_put_le32u(&p, val); + AV_WL32(dst, val); + dst += 4; val = CLIP(*v++) | (CLIP(*y++) << 10); - bytestream2_put_le32u(&p, val); + AV_WL32(dst, val); + dst += 4; } + + memset(dst, 0, line_padding); + dst += line_padding; + + y += pic->linesize[0] / 2 - avctx->width; + u += pic->linesize[1] / 2 - avctx->width / 2; + v += pic->linesize[2] / 2 - avctx->width / 2; } + } + else if(pic->format == AV_PIX_FMT_YUV422P) { + const uint8_t *y = pic->data[0]; + const uint8_t *u = pic->data[1]; + const uint8_t *v = pic->data[2]; + for (h = 0; h < avctx->height; h++) { + uint32_t val; + w = (avctx->width / 12) * 12; + s->pack_line_8(y, u, v, dst, w); - bytestream2_set_buffer(&p, 0, line_padding); + y += w; + u += w >> 1; + v += w >> 1; + dst += (w / 12) * 32; - y += pic->linesize[0] / 2 - avctx->width; - u += pic->linesize[1] / 2 - avctx->width / 2; - v += pic->linesize[2] / 2 - avctx->width / 2; + for( ; w < avctx->width-5; w += 6 ){ + WRITE_PIXELS8(u, y, v); + WRITE_PIXELS8(y, u, y); + WRITE_PIXELS8(v, y, u); + WRITE_PIXELS8(y, v, y); + } + if (w < avctx->width - 1) { + WRITE_PIXELS8(u, y, v); + + val = CLIP8(*y++) << 2; + if (w == avctx->width - 
2) { + AV_WL32(dst, val); + dst += 4; + } + } + if (w < avctx->width - 3) { + val |= (CLIP8(*u++) << 12) | (CLIP8(*y++) << 22); + AV_WL32(dst, val); + dst += 4; + + val = (CLIP8(*v++) << 2) | (CLIP8(*y++) << 12); + AV_WL32(dst, val); + dst += 4; + } + + memset(dst, 0, line_padding); + dst += line_padding; + + y += pic->linesize[0] - avctx->width; + u += pic->linesize[1] - avctx->width / 2; + v += pic->linesize[2] - avctx->width / 2; + } } pkt->flags |= AV_PKT_FLAG_KEY; @@ -119,8 +227,9 @@ AVCodec ff_v210_encoder = { .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"), .type = AVMEDIA_TYPE_VIDEO, .id = AV_CODEC_ID_V210, + .priv_data_size = sizeof(V210EncContext), .init = encode_init, .encode2 = encode_frame, .close = encode_close, - .pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV422P10, AV_PIX_FMT_NONE }, + .pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P, AV_PIX_FMT_NONE }, }; diff --git a/libavcodec/v210enc.h b/libavcodec/v210enc.h new file mode 100644 index 0000000000..ea6ae41c71 --- /dev/null +++ b/libavcodec/v210enc.h @@ -0,0 +1,33 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V210ENC_H
+#define AVCODEC_V210ENC_H
+
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixfmt.h"
+
+typedef struct {
+    void (*pack_line_8)(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width);
+    void (*pack_line_10)(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width);
+} V210EncContext;
+
+void ff_v210enc_init_x86(V210EncContext *s);
+
+#endif /* AVCODEC_V210ENC_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index df56fb7361..7c8e7aae63 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -54,6 +54,7 @@ OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
 OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
 OBJS-$(CONFIG_TTA_DECODER)             += x86/ttadsp_init.o
 OBJS-$(CONFIG_V210_DECODER)            += x86/v210-init.o
+OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
 OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
@@ -144,6 +145,7 @@ YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o \
 YASM-OBJS-$(CONFIG_SVQ1_ENCODER)       += x86/svq1enc.o
 YASM-OBJS-$(CONFIG_TRUEHD_DECODER)     += x86/mlpdsp.o
 YASM-OBJS-$(CONFIG_TTA_DECODER)        += x86/ttadsp.o
+YASM-OBJS-$(CONFIG_V210_ENCODER)       += x86/v210enc.o
 YASM-OBJS-$(CONFIG_V210_DECODER)       += x86/v210.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
 YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
new file mode 100644
index 0000000000..3245de3891
--- /dev/null
+++ b/libavcodec/x86/v210enc.asm
@@ -0,0 +1,145 @@
+;******************************************************************************
+;* V210 SIMD pack
+;* 
Copyright (c) 2014 Kieran Kunhya
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_enc_min_10: times 8 dw 0x4
+v210_enc_max_10: times 8 dw 0x3fb
+
+v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0
+v210_enc_luma_shuf_10: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
+
+v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0
+v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
+
+v210_enc_min_8: times 16 db 0x1
+v210_enc_max_8: times 16 db 0xfe
+
+v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
+v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0
+
+v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
+v210_enc_chroma_shuf2_8: db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
+
+v210_enc_chroma_mult_8: dw 4,16,64,0,64,4,16,0
+
+SECTION .text
+
+%macro v210_planar_pack_10 0
+
+; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
+cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
+    lea     r0, [yq+2*widthq]
+    add     uq, widthq
+    add     vq, widthq
+    neg     widthq
+
+    mova    m2, [v210_enc_min_10]
+    mova    m3, [v210_enc_max_10]
+
+.loop:
+    movu    m0, [yq+2*widthq]
+    CLIPW   m0, m2, m3
+
+    movq    m1, [uq+widthq]
+    movhps  m1, [vq+widthq]
+    CLIPW   m1, m2, m3
+
+    pmullw  m0, [v210_enc_luma_mult_10]
+    pshufb  m0, [v210_enc_luma_shuf_10]
+
+    pmullw  m1, [v210_enc_chroma_mult_10]
+    pshufb  m1, [v210_enc_chroma_shuf_10]
+
+    por     m0, m1
+
+    movu    [dstq], m0
+
+    add     dstq, mmsize
+    add     widthq, 6
+    jl .loop
+
+    RET
+%endmacro
+
+INIT_XMM ssse3
+v210_planar_pack_10
+
+%macro v210_planar_pack_8 0
+
+; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width)
+cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
+    add     yq, widthq
+    shr     widthq, 1
+    add     uq, widthq
+    add     vq, widthq
+    neg     widthq
+
+    mova    m4, [v210_enc_min_8]
+    mova    m5, [v210_enc_max_8]
+    pxor    m6, m6
+
+.loop:
+    movu    m1, [yq+2*widthq]
+    CLIPUB  m1, m4, m5
+
+    punpcklbw m0, m1, m6
+    ; punpckhbw would start at byte 8, but the second output register needs
+    ; luma bytes 6..11, so v210_enc_luma_shuf_8 picks those out instead
+    pshufb  m1, [v210_enc_luma_shuf_8]
+
+    pmullw  m0, [v210_enc_luma_mult_8]
+    pmullw  m1, [v210_enc_luma_mult_8]
+    pshufb  m0, [v210_enc_luma_shuf_10]
+    pshufb  m1, [v210_enc_luma_shuf_10]
+
+    movq    m3, [uq+widthq]
+    movhps  m3, [vq+widthq]
+    CLIPUB  m3, m4, m5
+
+    ; shuffle and multiply to get the same packing as in 10-bit
+    pshufb  m2, m3, [v210_enc_chroma_shuf1_8]
+    pshufb  m3, [v210_enc_chroma_shuf2_8]
+
+    pmullw  m2, [v210_enc_chroma_mult_8]
+    pmullw  m3, [v210_enc_chroma_mult_8]
+    pshufb  m2, [v210_enc_chroma_shuf_10]
+    pshufb  m3, [v210_enc_chroma_shuf_10]
+
+    por     m0, m2
+    por     m1, m3
+
+    movu    [dstq], m0
+    movu    [dstq+mmsize], m1
+
+    add     dstq, 2*mmsize
+    add     widthq, 6
+    jl .loop
+
+    RET
+%endmacro
+
+INIT_XMM ssse3
+v210_planar_pack_8
+INIT_XMM avx
+v210_planar_pack_8
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
new file mode 100644
index 0000000000..3ac498ac5f
--- /dev/null
+++ b/libavcodec/x86/v210enc_init.c
@@ -0,0 +1,37 @@
+/*
+ * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/cpu.h" +#include "libavcodec/v210enc.h" + +void ff_v210_planar_pack_8_ssse3(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width); +void ff_v210_planar_pack_8_avx(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width); +void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width); + +av_cold void ff_v210enc_init_x86(V210EncContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if( EXTERNAL_SSSE3(cpu_flags) ) { + s->pack_line_8 = ff_v210_planar_pack_8_ssse3; + s->pack_line_10 = ff_v210_planar_pack_10_ssse3; + } + + if( EXTERNAL_AVX(cpu_flags) ) + s->pack_line_8 = ff_v210_planar_pack_8_avx; +} diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index 0d0ef0799d..2d02f75069 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -641,6 +641,11 @@ %endif %endmacro +%macro CLIPUB 3 ;(dst, min, max) + pmaxub %1, %2 + pminub %1, %3 +%endmacro + %macro CLIPW 3 ;(dst, min, max) pmaxsw %1, %2 pminsw %1, %3 diff --git a/tests/ref/vsynth/vsynth1-v210 b/tests/ref/vsynth/vsynth1-v210 index dbafe428fe..0d12afa006 100644 --- a/tests/ref/vsynth/vsynth1-v210 +++ 
b/tests/ref/vsynth/vsynth1-v210 @@ -1,4 +1,4 @@ -895d30660eb4da017568141a8d1df4e8 *tests/data/fate/vsynth1-v210.avi +b066679e08cd90c342da21c88bec2a20 *tests/data/fate/vsynth1-v210.avi 14752448 tests/data/fate/vsynth1-v210.avi -50973792d3f1abe04a51ee0121f077f2 *tests/data/fate/vsynth1-v210.out.rawvideo -stddev: 1.85 PSNR: 42.78 MAXDIFF: 29 bytes: 7603200/ 7603200 +2ba7f4ca302f3c4147860b9dfb12b6e4 *tests/data/fate/vsynth1-v210.out.rawvideo +stddev: 1.84 PSNR: 42.81 MAXDIFF: 29 bytes: 7603200/ 7603200 diff --git a/tests/ref/vsynth/vsynth2-v210 b/tests/ref/vsynth/vsynth2-v210 index 1320c38a51..abb425ed21 100644 --- a/tests/ref/vsynth/vsynth2-v210 +++ b/tests/ref/vsynth/vsynth2-v210 @@ -1,4 +1,4 @@ -6fbbcfee1832fe4c62aacb70454cff62 *tests/data/fate/vsynth2-v210.avi +fa1c4b1b8d0e9454b4bc2269c7fe634b *tests/data/fate/vsynth2-v210.avi 14752448 tests/data/fate/vsynth2-v210.avi -a627fb50c8276200fd71383977d87ca3 *tests/data/fate/vsynth2-v210.out.rawvideo -stddev: 0.34 PSNR: 57.43 MAXDIFF: 6 bytes: 7603200/ 7603200 +7ba6e411e43c6b57c95c49d6848f41e6 *tests/data/fate/vsynth2-v210.out.rawvideo +stddev: 0.34 PSNR: 57.41 MAXDIFF: 6 bytes: 7603200/ 7603200 diff --git a/tests/ref/vsynth/vsynth3-v210 b/tests/ref/vsynth/vsynth3-v210 index d3275f05a9..0eaf041699 100644 --- a/tests/ref/vsynth/vsynth3-v210 +++ b/tests/ref/vsynth/vsynth3-v210 @@ -1,4 +1,4 @@ -d2f5e07f0c0e917d80d63f39d683919e *tests/data/fate/vsynth3-v210.avi +6618ab86d047f4fb8fdd2d633888b20b *tests/data/fate/vsynth3-v210.avi 224448 tests/data/fate/vsynth3-v210.avi -0cf7cf68724fa5146b1667e4fa08b0e1 *tests/data/fate/vsynth3-v210.out.rawvideo -stddev: 2.12 PSNR: 41.58 MAXDIFF: 26 bytes: 86700/ 86700 +198ffb24c06927d8aaac5e59d81a0934 *tests/data/fate/vsynth3-v210.out.rawvideo +stddev: 2.11 PSNR: 41.61 MAXDIFF: 27 bytes: 86700/ 86700