Merge commit '89d9869d2491d4209d707a8e7f29c58227ae5a4e'

* commit '89d9869d2491d4209d707a8e7f29c58227ae5a4e':
  hevc: Add NEON 16x16 IDCT

Merged-by: James Almer <jamrial@gmail.com>
This commit is contained in:
James Almer 2017-10-27 18:22:39 -03:00
commit 9840ca70e7
2 changed files with 201 additions and 0 deletions

View File

@ -429,7 +429,204 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
endfunc
.endm
@ butterfly: sum and difference of two vectors of 32-bit lanes.
@   tmp_p = e + o
@   tmp_m = e - o
@ The sum is written before the difference is computed, so tmp_p has to be
@ a register other than e and o for the difference to see the inputs.
.macro butterfly e, o, tmp_p, tmp_m
        vadd.i32        \tmp_p, \e, \o
        vsub.i32        \tmp_m, \e, \o
.endm
@ tr16_8x4: transform stage on eight d registers (four signed 16-bit lanes
@ each) whose 32-bit results are written to the 128 bytes at sp + 512.
@   in0, in2, in4, in6 - passed to tr_4x4_8 (macro defined outside this
@                        section) together with q8-q15
@   in1, in3, in5, in7 - combined below into the accumulators q12-q15
@ Writes r4, q0-q7 and q12-q15 directly; q8-q15 are also handed to tr_4x4_8.
@ NOTE(review): the multiplies below read the lanes of in0 as multipliers
@ after tr_4x4_8 has run, so tr_4x4_8 presumably reloads in0 with
@ coefficients from the address in r1 - confirm against its definition.
.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7
@ q8-q11 are read as the first butterfly operand further down, so they are
@ presumably the outputs of tr_4x4_8 (q12-q15 presumably its temporaries).
tr_4x4_8 \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, q14, q15
@ Start the four accumulators: q12-q15 = in1 times lanes 0-3 of in0,
@ widened to 32 bits.
vmull.s16 q12, \in1, \in0[0]
vmull.s16 q13, \in1, \in0[1]
vmull.s16 q14, \in1, \in0[2]
vmull.s16 q15, \in1, \in0[3]
@ Fold in3, in5 and in7 into the same accumulators through sum_sub (macro
@ defined outside this section; presumably a widening multiply-accumulate
@ or multiply-subtract chosen by the trailing sign - TODO confirm).
sum_sub q12, \in3, \in0[1], +
sum_sub q13, \in3, \in0[3], -
sum_sub q14, \in3, \in0[0], -
sum_sub q15, \in3, \in0[2], -
sum_sub q12, \in5, \in0[2], +
sum_sub q13, \in5, \in0[0], -
sum_sub q14, \in5, \in0[3], +
sum_sub q15, \in5, \in0[1], +
sum_sub q12, \in7, \in0[3], +
sum_sub q13, \in7, \in0[2], -
sum_sub q14, \in7, \in0[1], +
sum_sub q15, \in7, \in0[0], -
@ Sums go to q0-q3 and differences to q7-q4 (descending), so q0-q7 end up
@ as q8+q12, q9+q13, q10+q14, q11+q15, q11-q15, q10-q14, q9-q13, q8-q12.
butterfly q8, q12, q0, q7
butterfly q9, q13, q1, q6
butterfly q10, q14, q2, q5
butterfly q11, q15, q3, q4
@ Spill q0-q7 to the 128 bytes starting at sp + 512; the :128 qualifier
@ states that the address is 128-bit aligned.
add r4, sp, #512
vst1.s16 {q0-q1}, [r4, :128]!
vst1.s16 {q2-q3}, [r4, :128]!
vst1.s16 {q4-q5}, [r4, :128]!
vst1.s16 {q6-q7}, [r4, :128]
.endm
@ load16: fill eight d registers from two read pointers.
@   in0, in2, in4, in6 are loaded through r1
@   in1, in3, in5, in7 are loaded through r3
@ Every load post-increments its pointer by r2. The :64 qualifier states
@ that each address is 64-bit aligned.
.macro load16 in0, in1, in2, in3, in4, in5, in6, in7
        vld1.16         {\in0}, [r1, :64], r2
        vld1.16         {\in1}, [r3, :64], r2
        vld1.16         {\in2}, [r1, :64], r2
        vld1.16         {\in3}, [r3, :64], r2
        vld1.16         {\in4}, [r1, :64], r2
        vld1.16         {\in5}, [r3, :64], r2
        vld1.16         {\in6}, [r1, :64], r2
        vld1.16         {\in7}, [r3, :64], r2
.endm
@ add_member: update the eight accumulators q5-q12 from one input register.
@ Accumulator number k (q5 is number 0, q12 is number 7) is passed to
@ sum_sub together with the input in, the operand t-k and the sign op-k.
@ sum_sub is a macro defined outside this section.
.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7
        sum_sub         q5,  \in, \t0, \op0
        sum_sub         q6,  \in, \t1, \op1
        sum_sub         q7,  \in, \t2, \op2
        sum_sub         q8,  \in, \t3, \op3
        sum_sub         q9,  \in, \t4, \op4
        sum_sub         q10, \in, \t5, \op5
        sum_sub         q11, \in, \t6, \op6
        sum_sub         q12, \in, \t7, \op7
.endm
@ butterfly16: four in-place sum/difference pairs on 32-bit lanes.
@   q4  = in0 + in1      in0 = in0 - in1
@   in1 = in2 + in3      in2 = in2 - in3
@   in3 = in4 + in5      in4 = in4 - in5
@   in5 = in6 + in7      in6 = in6 - in7
@ Each sum is placed in the register released by the pair before it (q4 for
@ the first pair), so the instruction order below must not change.
@ in7 is only read.
.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
        vadd.i32        q4,   \in0, \in1
        vsub.i32        \in0, \in0, \in1
        vadd.i32        \in1, \in2, \in3
        vsub.i32        \in2, \in2, \in3
        vadd.i32        \in3, \in4, \in5
        vsub.i32        \in4, \in4, \in5
        vadd.i32        \in5, \in6, \in7
        vsub.i32        \in6, \in6, \in7
.endm
@ store16: write eight d registers through two write pointers.
@   in0, in2, in4, in6 go to r1, which is post-incremented by r2
@   in1, in3, in5, in7 go to r3, which is post-incremented by r4
@ The :64 qualifier states that each address is 64-bit aligned.
.macro store16 in0, in1, in2, in3, in4, in5, in6, in7
        vst1.16         \in0, [r1, :64], r2
        vst1.16         \in1, [r3, :64], r4
        vst1.16         \in2, [r1, :64], r2
        vst1.16         \in3, [r3, :64], r4
        vst1.16         \in4, [r1, :64], r2
        vst1.16         \in5, [r3, :64], r4
        vst1.16         \in6, [r1, :64], r2
        vst1.16         \in7, [r3, :64], r4
.endm
@ scale: narrow eight q registers of signed 32-bit lanes into eight
@ d registers of signed 16-bit lanes. Every lane is shifted right by
@ shift with rounding and then saturated to the 16-bit range.
@   out0-out7 - destination d registers
@   in0-in7   - source q registers, paired with the outputs by position
@   shift     - shift amount, the same for all eight conversions
.macro scale out0, out1, out2, out3, out4, out5, out6, out7, in0, in1, in2, in3, in4, in5, in6, in7, shift
        vqrshrn.s32     \out0, \in0, \shift
        vqrshrn.s32     \out1, \in1, \shift
        vqrshrn.s32     \out2, \in2, \shift
        vqrshrn.s32     \out3, \in3, \shift
        vqrshrn.s32     \out4, \in4, \shift
        vqrshrn.s32     \out5, \in5, \shift
        vqrshrn.s32     \out6, \in6, \shift
        vqrshrn.s32     \out7, \in7, \shift
.endm
@ tr_16x4 name, shift
@ Emits the helper function func_tr_16x4_(name), which transforms sixteen
@ 8-byte input groups (four 16-bit values each) into four output rows of
@ sixteen 16-bit values.
@ Register interface, as used by the body:
@   r5    - source address; groups are read at r5 + 32 * k for k = 0..15
@   r6    - destination address; rows are written at r6 + 32 * k, k = 0..3
@   sp    - the 128 bytes at sp + 512 serve as scratch
@   shift - right-shift amount handed to the scale macro
@ Overwrites r1-r4 and q0-q15 and returns with bx lr. No registers are
@ saved here, so the caller has to preserve whatever it needs.
.macro tr_16x4 name, shift
function func_tr_16x4_\name
@ First half of the inputs: eight 8-byte loads from r5 + 0, 64, 128, ...,
@ 448 (r1 and r3 start 64 bytes apart and both advance by 128).
mov r1, r5
add r3, r5, #64
mov r2, #128
load16 d0, d1, d2, d3, d4, d5, d6, d7
@ r1 = address of trans (table defined outside this section), left in r1
@ for tr16_8x4. The results of tr16_8x4 end up at sp + 512 .. sp + 639.
movrel r1, trans
tr16_8x4 d0, d1, d2, d3, d4, d5, d6, d7
@ Second half of the inputs: eight 8-byte loads from r5 + 32, 96, 160, ...,
@ 480 into d8, d9, d2-d7.
add r1, r5, #32
add r3, r5, #(64 + 32)
mov r2, #128
load16 d8, d9, d2, d3, d4, d5, d6, d7
@ q0 (d0, d1) = the eight 16-bit values stored at trans + 16 bytes.
movrel r1, trans + 16
vld1.s16 {q0}, [r1, :128]
@ Start the eight accumulators q5-q12 with d8 times each lane of q0.
vmull.s16 q5, d8, d0[0]
vmull.s16 q6, d8, d0[1]
vmull.s16 q7, d8, d0[2]
vmull.s16 q8, d8, d1[0]
vmull.s16 q9, d8, d1[0]
vmull.s16 q10, d8, d1[1]
vmull.s16 q11, d8, d1[2]
vmull.s16 q12, d8, d1[3]
@ Fold the remaining seven inputs into q5-q12; each line lists, per
@ accumulator, which lane of q0 to use and the sign to apply.
add_member d9, d0[1], d1[0], d1[3], d1[1], d0[2], d0[0], d0[3], d1[2], +, +, +, -, -, -, -, -
add_member d2, d0[2], d1[3], d0[3], d0[1], d1[2], d1[0], d0[0], d1[1], +, +, -, -, -, +, +, +
add_member d3, d0[3], d1[1], d0[1], d1[3], d0[0], d1[2], d0[2], d1[0], +, -, -, +, +, +, -, -
add_member d4, d1[0], d0[2], d1[2], d0[0], d1[3], d0[1], d1[1], d0[3], +, -, -, +, -, -, +, +
add_member d5, d1[1], d0[0], d1[0], d1[2], d0[1], d0[3], d1[3], d0[2], +, -, +, +, -, +, +, -
add_member d6, d1[2], d0[3], d0[0], d0[2], d1[1], d1[3], d1[0], d0[1], +, -, +, -, +, +, -, +
add_member d7, d1[3], d1[2], d1[1], d1[0], d0[3], d0[2], d0[1], d0[0], +, -, +, -, +, -, +, -
@ Reload the first 64 bytes of the scratch area (the q0-q3 values that
@ tr16_8x4 stored) and combine them with the accumulators q5-q8.
add r4, sp, #512
vld1.s16 {q0-q1}, [r4, :128]!
vld1.s16 {q2-q3}, [r4, :128]!
@ After butterfly16 the sums are in q4, q5, q6, q7 and the differences in
@ q0, q1, q2, q3; scale narrows them pairwise (sum, difference) into
@ d26-d31, d16, d17.
butterfly16 q0, q5, q1, q6, q2, q7, q3, q8
scale d26, d27, d28, d29, d30, d31, d16, d17, q4, q0, q5, q1, q6, q2, q7, q3, \shift
@ transpose8_4x4 is a macro defined outside this section. The second call
@ lists the difference registers in reverse order.
transpose8_4x4 d26, d28, d30, d16
transpose8_4x4 d17, d31, d29, d27
@ r1 walks forward from r6 in steps of 32 bytes; r3 starts at r6 + 120 and
@ walks backward in steps of 32 bytes (r4 = -32).
mov r1, r6
add r3, r6, #(24 +3*32)
mov r2, #32
mov r4, #-32
store16 d26, d27, d28, d29, d30, d31, d16, d17
@ Same again for the second 64 bytes of the scratch area (the q4-q7 values
@ that tr16_8x4 stored) and the accumulators q9-q12.
add r4, sp, #576
vld1.s16 {q0-q1}, [r4, :128]!
vld1.s16 {q2-q3}, [r4, :128]
butterfly16 q0, q9, q1, q10, q2, q11, q3, q12
scale d26, d27, d28, d29, d30, d31, d8, d9, q4, q0, q9, q1, q10, q2, q11, q3, \shift
transpose8_4x4 d26, d28, d30, d8
transpose8_4x4 d9, d31, d29, d27
@ This time r1 starts at r6 + 8 (forward) and r3 at r6 + 112 (backward).
add r1, r6, #8
add r3, r6, #(16 + 3 * 32)
mov r2, #32
mov r4, #-32
store16 d26, d27, d28, d29, d30, d31, d8, d9
bx lr
endfunc
.endm
@ idct_16x16 bitdepth
@ Emits the exported function ff_hevc_idct_16x16_(bitdepth)_neon.
@ The function runs func_tr_16x4_firstpass four times, reading from the
@ buffer at r0 and writing to a temporary buffer on the stack, and then
@ func_tr_16x4_secondpass_(bitdepth) four times, reading from the temporary
@ buffer and writing back to the buffer at r0.
@ Only r0 is read as an argument. r4-r7, lr and q4-q7 are saved on entry
@ and restored on exit.
.macro idct_16x16 bitdepth
function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
@r0 - coeffs
push {r4-r7, lr}
vpush {q4-q7}
@ Align the stack, allocate a temp buffer
@ r7 = (sp & 15) + 640, so sp - r7 is a multiple of 16. The 640 bytes are
@ 512 bytes for the first-pass output (four slices of 128 bytes) plus the
@ 128 bytes of scratch that the pass helpers address as sp + 512.
@ r7 is kept until the end to undo the adjustment.
@ NOTE(review): the single-letter line prefixes below presumably select
@ between Thumb and ARM builds - confirm against the shared asm header.
T mov r7, sp
T and r7, r7, #15
A and r7, sp, #15
add r7, r7, #640
sub sp, sp, r7
@ First pass: slice i reads from r0 + 8 * i and writes to sp + 128 * i.
.irp i, 0, 1, 2, 3
add r5, r0, #(8 * \i)
add r6, sp, #(8 * \i * 16)
bl func_tr_16x4_firstpass
.endr
@ Second pass: slice i reads from sp + 8 * i and writes to r0 + 128 * i.
.irp i, 0, 1, 2, 3
add r5, sp, #(8 * \i)
add r6, r0, #(8 * \i * 16)
bl func_tr_16x4_secondpass_\bitdepth
.endr
@ Release the temporary buffer together with the alignment padding.
add sp, sp, r7
vpop {q4-q7}
pop {r4-r7, pc}
endfunc
.endm
@ Instantiate the 16x4 pass helpers. The first pass scales with a shift of
@ 7; the second pass uses 20 minus the bit depth (8-bit and 10-bit).
tr_16x4 firstpass, 7
tr_16x4 secondpass_8, 20 - 8
tr_16x4 secondpass_10, 20 - 10
@ Emit the pending literal pool here.
.ltorg
@ Instantiate the entry points for each block size at bit depths 8 and 10.
@ idct_4x4 and idct_8x8 are macros defined earlier in the file.
idct_4x4 8
idct_4x4 10
idct_8x8 8
idct_8x8 10
idct_16x16 8
idct_16x16 10

View File

@ -29,8 +29,10 @@ void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_t
void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
/* NEON inverse-transform entry points, one per block size (4x4, 8x8, 16x16)
 * and bit depth (8, 10). coeffs is the int16_t coefficient buffer.
 * NOTE(review): presumably transformed in place, as the 16x16 assembly
 * reads and writes only this buffer - confirm for the other sizes.
 * NOTE(review): whether each implementation honours col_limit is not
 * visible from these declarations - confirm per size. */
void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
@ -154,6 +156,7 @@ av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon;
c->idct[0] = ff_hevc_idct_4x4_8_neon;
c->idct[1] = ff_hevc_idct_8x8_8_neon;
c->idct[2] = ff_hevc_idct_16x16_8_neon;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
@ -227,5 +230,6 @@ av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth)
if (bit_depth == 10) {
c->idct[0] = ff_hevc_idct_4x4_10_neon;
c->idct[1] = ff_hevc_idct_8x8_10_neon;
c->idct[2] = ff_hevc_idct_16x16_10_neon;
}
}