Merge commit '0d4d43513786f1df4d561e1fac924fb0722c6700'

* commit '0d4d43513786f1df4d561e1fac924fb0722c6700':
  hevc: Add NEON add_residual for bitdepth 8

See 03cecf45c1

Merged-by: James Almer <jamrial@gmail.com>
This commit is contained in:
James Almer 2017-10-30 17:38:57 -03:00
commit 999c2271a5
2 changed files with 92 additions and 75 deletions

View File

@ -29,75 +29,92 @@ const trans, align=4
.short 57, 43, 25, 9
endconst
/*
 * ff_hevc_add_residual_4x4_neon_8
 * Adds a 4x4 block of signed 16-bit residuals to 8-bit pixels in place,
 * saturating the result to the unsigned 8-bit range.
 *
 * In:    r0 = pixel block, r1 = residuals, r2 = pixel row stride in bytes
 *        (register roles as used by the loads and stores below)
 * Out:   r0 has advanced by four rows
 * Clobb: q0-q2, q8, q9
 */
function ff_hevc_add_residual_4x4_neon_8, export=1
        vldm            r1,  {q0-q1}            @ all 16 residuals (4 rows x 4)
        vld1.32         {d4[0]}, [r0], r2       @ gather four 4-pixel rows,
        vld1.32         {d4[1]}, [r0], r2       @ one 32-bit lane per row
        vld1.32         {d5[0]}, [r0], r2
        vld1.32         {d5[1]}, [r0], r2
        sub             r0,  r0,  r2, lsl #2    @ rewind to the first row for the stores
        vmovl.u8        q8,  d4                 @ widen rows 0-1 to 16 bit
        vmovl.u8        q9,  d5                 @ widen rows 2-3 to 16 bit
        vqadd.s16       q0,  q8                 @ residual + pixel, signed saturating
        vqadd.s16       q1,  q9
        vqmovun.s16     d0,  q0                 @ narrow with unsigned saturation
        vqmovun.s16     d1,  q1
        vst1.32         {d0[0]}, [r0], r2       @ scatter the rows back
        vst1.32         {d0[1]}, [r0], r2
        vst1.32         {d1[0]}, [r0], r2
        vst1.32         {d1[1]}, [r0], r2
        bx              lr
/*
 * void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
 *                                      ptrdiff_t stride)
 * Adds a 4x4 block of signed 16-bit residuals to 8-bit pixels in place,
 * saturating the result to the unsigned 8-bit range.
 *
 * In:    r0 = _dst, r1 = coeffs, r2 = stride in bytes
 * Out:   r0 has advanced by four rows
 * Clobb: q0-q2, q8, q9
 * Align: the :128 and :32 hints assert that coeffs is 16-byte aligned and
 *        that every pixel row is 4-byte aligned.
 */
function ff_hevc_add_residual_4x4_8_neon, export=1
        vld1.16         {q0, q1}, [r1, :128]        @ all 16 residuals (4 rows x 4)
        vld1.32         {d4[0]}, [r0, :32], r2      @ gather four 4-pixel rows,
        vld1.32         {d4[1]}, [r0, :32], r2      @ one 32-bit lane per row
        vld1.32         {d5[0]}, [r0, :32], r2
        vld1.32         {d5[1]}, [r0, :32], r2
        sub             r0,  r0,  r2, lsl #2        @ rewind to the first row for the stores
        vmovl.u8        q8,  d4                     @ widen rows 0-1 to 16 bit
        vmovl.u8        q9,  d5                     @ widen rows 2-3 to 16 bit
        vqadd.s16       q0,  q8                     @ residual + pixel, signed saturating
        vqadd.s16       q1,  q9
        vqmovun.s16     d0,  q0                     @ narrow with unsigned saturation
        vqmovun.s16     d1,  q1
        vst1.32         {d0[0]}, [r0, :32], r2      @ scatter the rows back
        vst1.32         {d0[1]}, [r0, :32], r2
        vst1.32         {d1[0]}, [r0, :32], r2
        vst1.32         {d1[1]}, [r0, :32], r2
        bx              lr
endfunc
/*
 * ff_hevc_add_residual_8x8_neon_8
 * Adds an 8x8 block of signed 16-bit residuals to 8-bit pixels in place,
 * saturating to the unsigned 8-bit range. One row is handled per pass.
 *
 * In:    r0 = pixel block, r1 = residuals, r2 = pixel row stride in bytes
 *        (register roles as used by the loads and stores below)
 * Out:   r0 advanced by eight rows, r1 advanced past the residuals
 * Clobb: r3, q0, q8, flags
 */
function ff_hevc_add_residual_8x8_neon_8, export=1
        mov             r3,  #8                 @ r3 = rows remaining
1:      subs            r3,  r3,  #1            @ flags stay live until the bne
        vld1.16         {q0},  [r1]!            @ 8 residuals for this row
        vld1.8          {d16}, [r0]             @ 8 pixels of this row
        vmovl.u8        q8,  d16                @ widen pixels to 16 bit
        vqadd.s16       q0,  q0,  q8            @ residual + pixel, signed saturating
        vqmovun.s16     d0,  q0                 @ narrow with unsigned saturation
        vst1.32         {d0},  [r0], r2         @ write the row, step to the next
        bne             1b
        bx              lr
/*
 * void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
 *                                      ptrdiff_t stride)
 * Adds an 8x8 block of signed 16-bit residuals to 8-bit pixels in place,
 * saturating to the unsigned 8-bit range. Two rows are handled per pass.
 *
 * In:    r0 = _dst, r1 = coeffs, r2 = stride in bytes
 * Out:   r0 advanced by eight rows, r1 advanced past the residuals
 * Clobb: r3, r12, q0, q1, q8, q9, flags
 * Align: the :128 and :64 hints assert that coeffs is 16-byte aligned and
 *        that every pixel row is 8-byte aligned.
 */
function ff_hevc_add_residual_8x8_8_neon, export=1
        mov             r3,  #8                     @ r3 = rows remaining
1:      subs            r3,  r3,  #2                @ flags stay live until the bne
        vld1.16         {q0, q1}, [r1, :128]!       @ residuals for two rows
        vld1.8          {d16}, [r0, :64]            @ pixels, first row of the pair
        add             r12, r0,  r2                @ r12 = second row of the pair
        vld1.8          {d17}, [r12, :64]           @ pixels, second row of the pair
        vmovl.u8        q9,  d16                    @ widen first row
        vmovl.u8        q8,  d17                    @ widen second row; q8 overlays d16/d17,
                                                    @ d16 has already been consumed
        vqadd.s16       q0,  q0,  q9                @ residual + pixel, signed saturating
        vqadd.s16       q1,  q1,  q8
        vqmovun.s16     d0,  q0                     @ narrow with unsigned saturation
        vqmovun.s16     d1,  q1
        vst1.8          {d0}, [r0, :64], r2         @ write both rows, stepping r0 twice
        vst1.8          {d1}, [r0, :64], r2
        bne             1b
        bx              lr
endfunc
/*
 * ff_hevc_add_residual_16x16_neon_8
 * Adds a 16x16 block of signed 16-bit residuals to 8-bit pixels in place,
 * saturating to the unsigned 8-bit range. One row is handled per pass.
 *
 * In:    r0 = pixel block, r1 = residuals, r2 = pixel row stride in bytes
 *        (register roles as used by the loads and stores below)
 * Out:   r0 advanced by sixteen rows, r1 advanced past the residuals
 * Clobb: r3, q0, q1, q8-q10, flags
 */
function ff_hevc_add_residual_16x16_neon_8, export=1
        mov             r3,  #16                @ r3 = rows remaining
1:      subs            r3,  r3,  #1            @ flags stay live until the bne
        vld1.16         {q0-q1}, [r1]!          @ 16 residuals for this row
        vld1.8          {q8},  [r0]             @ 16 pixels of this row
        vmovl.u8        q9,  d16                @ widen low 8 pixels
        vmovl.u8        q10, d17                @ widen high 8 pixels
        vqadd.s16       q0,  q0,  q9            @ residual + pixel, signed saturating
        vqadd.s16       q1,  q1,  q10
        vqmovun.s16     d0,  q0                 @ narrow with unsigned saturation;
        vqmovun.s16     d1,  q1                 @ d0/d1 together form q0
        vst1.8          {q0},  [r0], r2         @ write the row, step to the next
        bne             1b
        bx              lr
/*
 * void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
 *                                        ptrdiff_t stride)
 * Adds a 16x16 block of signed 16-bit residuals to 8-bit pixels in place,
 * saturating to the unsigned 8-bit range. Two rows are handled per pass
 * through two row pointers that each step by twice the stride.
 *
 * In:    r0 = _dst, r1 = coeffs, r2 = stride in bytes
 * Out:   r0 advanced by sixteen rows, r1 advanced past the residuals
 * Clobb: r2 (doubled), r3, r12, q0-q3, q8-q13, flags
 * Align: the :128 hints assert that coeffs and every pixel row are
 *        16-byte aligned.
 */
function ff_hevc_add_residual_16x16_8_neon, export=1
        mov             r3,  #16                    @ r3 = rows remaining
        add             r12, r0,  r2                @ r12 = odd-row pointer
        add             r2,  r2,  r2                @ r2 = 2 * stride from here on
1:      subs            r3,  r3,  #2                @ flags stay live until the bne
        vld1.8          {q8},     [r0, :128]        @ pixels, even row
        vld1.16         {q0-q1},  [r1, :128]!       @ residuals, even row
        vld1.8          {q11},    [r12, :128]       @ pixels, odd row
        vld1.16         {q2-q3},  [r1, :128]!       @ residuals, odd row
        vmovl.u8        q9,  d16                    @ widen even row, low half
        vmovl.u8        q10, d17                    @ widen even row, high half
        vmovl.u8        q12, d22                    @ widen odd row, low half
        vmovl.u8        q13, d23                    @ widen odd row, high half
        vqadd.s16       q0,  q0,  q9                @ residual + pixel, signed saturating
        vqadd.s16       q1,  q1,  q10
        vqadd.s16       q2,  q2,  q12
        vqadd.s16       q3,  q3,  q13
        vqmovun.s16     d0,  q0                     @ narrow with unsigned saturation;
        vqmovun.s16     d1,  q1                     @ q1 is consumed here, before d2/d3
        vqmovun.s16     d2,  q2                     @ (its halves) are overwritten
        vqmovun.s16     d3,  q3
        vst1.8          {q0},     [r0, :128], r2    @ even row out
        vst1.8          {q1},     [r12, :128], r2   @ odd row out
        bne             1b
        bx              lr
endfunc
/*
 * ff_hevc_add_residual_32x32_neon_8
 * Adds a 32x32 block of signed 16-bit residuals to 8-bit pixels in place,
 * saturating to the unsigned 8-bit range. One row is handled per pass.
 *
 * In:    r0 = pixel block, r1 = residuals, r2 = pixel row stride in bytes
 *        (register roles as used by the loads and stores below)
 * Out:   r0 advanced by 32 rows, r1 advanced past the residuals
 * Clobb: r3, q0-q3, q8-q13, flags
 */
function ff_hevc_add_residual_32x32_neon_8, export=1
        mov             r3,  #32                @ r3 = rows remaining
1:      subs            r3,  r3,  #1            @ flags stay live until the bne
        vldm            r1!, {q0-q3}            @ 32 residuals for this row
        vld1.8          {q8-q9}, [r0]           @ 32 pixels of this row
        vmovl.u8        q10, d16                @ widen pixels 0-7
        vmovl.u8        q11, d17                @ widen pixels 8-15
        vmovl.u8        q12, d18                @ widen pixels 16-23
        vmovl.u8        q13, d19                @ widen pixels 24-31
        vqadd.s16       q0,  q0,  q10           @ residual + pixel, signed saturating
        vqadd.s16       q1,  q1,  q11
        vqadd.s16       q2,  q2,  q12
        vqadd.s16       q3,  q3,  q13
        vqmovun.s16     d0,  q0                 @ narrow with unsigned saturation;
        vqmovun.s16     d1,  q1                 @ q1 is consumed here, before d2/d3
        vqmovun.s16     d2,  q2                 @ (its halves) are overwritten
        vqmovun.s16     d3,  q3
        vst1.8          {q0-q1}, [r0], r2       @ write the row, step to the next
        bne             1b
        bx              lr
/*
 * void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
 *                                        ptrdiff_t stride)
 * Adds a 32x32 block of signed 16-bit residuals to 8-bit pixels in place,
 * saturating to the unsigned 8-bit range. One row is handled per pass.
 *
 * In:    r0 = _dst, r1 = coeffs, r2 = stride in bytes
 * Out:   r0 advanced by 32 rows, r1 advanced past the residuals
 * Clobb: r3, q0-q3, q8-q13, flags
 * Align: the :128 hints assert that every pixel row is 16-byte aligned.
 */
function ff_hevc_add_residual_32x32_8_neon, export=1
        mov             r3,  #32                    @ r3 = rows remaining
1:      subs            r3,  r3,  #1                @ flags stay live until the bne
        vldm            r1!, {q0-q3}                @ 32 residuals for this row
        vld1.8          {q8-q9}, [r0, :128]         @ 32 pixels of this row
        vmovl.u8        q10, d16                    @ widen pixels 0-7
        vmovl.u8        q11, d17                    @ widen pixels 8-15
        vmovl.u8        q12, d18                    @ widen pixels 16-23
        vmovl.u8        q13, d19                    @ widen pixels 24-31
        vqadd.s16       q0,  q0,  q10               @ residual + pixel, signed saturating
        vqadd.s16       q1,  q1,  q11
        vqadd.s16       q2,  q2,  q12
        vqadd.s16       q3,  q3,  q13
        vqmovun.s16     d0,  q0                     @ narrow with unsigned saturation;
        vqmovun.s16     d1,  q1                     @ q1 is consumed here, before d2/d3
        vqmovun.s16     d2,  q2                     @ (its halves) are overwritten
        vqmovun.s16     d3,  q3
        vst1.8          {q0-q1}, [r0, :128], r2     @ write the row, step to the next
        bne             1b
        bx              lr
endfunc
/* uses registers q2 - q9 for temp values */

View File

@ -27,6 +27,14 @@ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta
void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
/* NEON add_residual routines for bit depth 8, one per square block size
 * (4x4, 8x8, 16x16, 32x32). Each takes the destination pixels, the 16-bit
 * coefficients to add, and the destination stride. */
void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@ -42,14 +50,6 @@ void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
/* NEON add_residual routines using the "_neon_8" name suffix, one per
 * square block size (4x4, 8x8, 16x16, 32x32). Each takes the destination
 * pixels, the 16-bit coefficients to add, and the destination stride. */
void ff_hevc_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
#define PUT_PIXELS(name) \
void name(int16_t *dst, uint8_t *src, \
@ -158,6 +158,10 @@ av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon;
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon;
c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon;
c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
@ -165,10 +169,6 @@ av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->idct[0] = ff_hevc_idct_4x4_8_neon;
c->idct[1] = ff_hevc_idct_8x8_8_neon;
c->idct[2] = ff_hevc_idct_16x16_8_neon;
c->add_residual[0] = ff_hevc_add_residual_4x4_neon_8;
c->add_residual[1] = ff_hevc_add_residual_8x8_neon_8;
c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8;
c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8;
c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;