From 89135716fd4c2963e01e0155547c47bf709f1aa3 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Thu, 20 Mar 2014 18:59:16 +0000 Subject: [PATCH] truehd: add hand-scheduled ARM asm version of ff_mlp_rematrix_channel. Profiling results for overall audio decode and the rematrix_channels function in particular are as follows: Before After Mean StdDev Mean StdDev Confidence Change 6:2 total 370.8 17.0 348.8 20.1 99.9% +6.3% 6:2 function 46.4 8.4 45.8 6.6 18.0% +1.2% (insignificant) 8:2 total 343.2 19.0 339.1 15.4 54.7% +1.2% (insignificant) 8:2 function 38.9 3.9 40.2 6.9 52.4% -3.2% (insignificant) 6:6 total 658.4 15.7 604.6 20.8 100.0% +8.9% 6:6 function 109.0 8.7 59.5 5.4 100.0% +83.3% 8:8 total 896.2 24.5 766.4 17.6 100.0% +16.9% 8:8 function 223.4 12.8 93.8 5.0 100.0% +138.3% The assembly version has also been tested with a fuzz tester to ensure that any combinations of inputs not exercised by my available test streams still generate mathematically identical results to the C version. Signed-off-by: Michael Niedermayer --- libavcodec/arm/mlpdsp_armv5te.S | 222 +++++++++++++++++++++++++++++++ libavcodec/arm/mlpdsp_init_arm.c | 12 ++ 2 files changed, 234 insertions(+) diff --git a/libavcodec/arm/mlpdsp_armv5te.S b/libavcodec/arm/mlpdsp_armv5te.S index f3295f5679..8355cdf3a2 100644 --- a/libavcodec/arm/mlpdsp_armv5te.S +++ b/libavcodec/arm/mlpdsp_armv5te.S @@ -431,3 +431,225 @@ endfunc .unreq ST3 .unreq I .unreq PSAMP + +/********************************************************************/ + +PSA .req a1 // samples +PCO .req a2 // coeffs +PBL .req a3 // bypassed_lsbs +INDEX .req a4 +CO0 .req v1 +CO1 .req v2 +CO2 .req v3 +CO3 .req v4 +SA0 .req v5 +SA1 .req v6 +SA2 .req sl +SA3 .req fp +AC0 .req ip +AC1 .req lr +NOISE .req SA0 +LSB .req SA1 +DCH .req SA2 // dest_ch +MASK .req SA3 + + // INDEX is used as follows: + // bits 0..6 index2 (values up to 17, but wider so that we can + // add to index field without needing to mask) + // bits 7..14 i (values up to 160) + // bit 15 underflow detect for i + // bits 25..31 (if access_unit_size_pow2 == 128) \ index + // bits 26..31 (if access_unit_size_pow2 == 64) / + +.macro implement_rematrix shift, index_mask, mask_minus1, maxchan + .if \maxchan == 1 + // We can just leave the coefficients in registers in this case + ldrd CO0, CO1, [PCO] + .endif +1: + .if \maxchan == 1 + ldrd SA0, SA1, [PSA] + smull AC0, AC1, CO0, SA0 + .elseif \maxchan == 5 + ldr CO0, [PCO, #0] + ldr SA0, [PSA, #0] + ldr CO1, [PCO, #4] + ldr SA1, [PSA, #4] + ldrd CO2, CO3, [PCO, #8] + smull AC0, AC1, CO0, SA0 + ldrd SA2, SA3, [PSA, #8] + smlal AC0, AC1, CO1, SA1 + ldrd CO0, CO1, [PCO, #16] + smlal AC0, AC1, CO2, SA2 + ldrd SA0, SA1, [PSA, #16] + smlal AC0, AC1, CO3, SA3 + smlal AC0, AC1, CO0, SA0 + .else // \maxchan == 7 + ldr CO2, [PCO, #0] + ldr SA2, [PSA, #0] + ldr CO3, [PCO, #4] + ldr SA3, [PSA, #4] + ldrd CO0, CO1, [PCO, #8] + smull AC0, AC1, CO2, SA2 + ldrd SA0, SA1, [PSA, #8] + smlal AC0, AC1, CO3, SA3 + ldrd CO2, CO3, [PCO, #16] + smlal AC0, AC1, CO0, SA0 + ldrd SA2, SA3, [PSA, #16] + smlal AC0, AC1, CO1, SA1 + ldrd CO0, CO1, [PCO, #24] + smlal AC0, AC1, CO2, SA2 + ldrd SA0, SA1, [PSA, #24] + smlal AC0, AC1, CO3, SA3 + smlal AC0, AC1, CO0, SA0 + .endif + ldm sp, {NOISE, DCH, MASK} + smlal AC0, AC1, CO1, SA1 + .if \shift != 0 + .if \index_mask == 63 + add NOISE, NOISE, INDEX, lsr #32-6 + ldrb LSB, [PBL], #MAX_CHANNELS + ldrsb NOISE, [NOISE] + add INDEX, INDEX, INDEX, lsl #32-6 + .else // \index_mask == 127 + add NOISE, NOISE, INDEX, lsr #32-7 + ldrb LSB, [PBL], #MAX_CHANNELS + ldrsb NOISE, [NOISE] + add INDEX, INDEX, INDEX, lsl #32-7 + .endif + sub INDEX, INDEX, #1<<7 + adds AC0, AC0, NOISE, lsl #\shift + 7 + adc AC1, AC1, NOISE, asr #31 + .else + ldrb LSB, [PBL], #MAX_CHANNELS + sub INDEX, INDEX, #1<<7 + .endif + add PSA, PSA, #MAX_CHANNELS*4 + mov AC0, AC0, lsr #14 + orr AC0, AC0, AC1, lsl #18 + .if !\mask_minus1 + and AC0, AC0, MASK + .endif + add AC0, AC0, LSB + tst INDEX, #1<<15 + str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA + beq 1b + b 98f +.endm + +.macro switch_on_maxchan shift, index_mask, mask_minus1 + cmp v4, #5 + blo 51f + beq 50f + implement_rematrix \shift, \index_mask, \mask_minus1, 7 +50: implement_rematrix \shift, \index_mask, \mask_minus1, 5 +51: implement_rematrix \shift, \index_mask, \mask_minus1, 1 +.endm + +.macro switch_on_mask shift, index_mask + cmp sl, #-1 + bne 40f + switch_on_maxchan \shift, \index_mask, 1 +40: switch_on_maxchan \shift, \index_mask, 0 +.endm + +.macro switch_on_au_size shift + .if \shift == 0 + switch_on_mask \shift, undefined + .else + teq v6, #64 + bne 30f + orr INDEX, INDEX, v1, lsl #32-6 + switch_on_mask \shift, 63 +30: orr INDEX, INDEX, v1, lsl #32-7 + switch_on_mask \shift, 127 + .endif +.endm + +/* void ff_mlp_rematrix_channel_arm(int32_t *samples, + * const int32_t *coeffs, + * const uint8_t *bypassed_lsbs, + * const int8_t *noise_buffer, + * int index, + * unsigned int dest_ch, + * uint16_t blockpos, + * unsigned int maxchan, + * int matrix_noise_shift, + * int access_unit_size_pow2, + * int32_t mask); + */ +function ff_mlp_rematrix_channel_arm, export=1 + push {v1-fp,lr} + add v1, sp, #9*4 // point at arguments on stack + ldm v1, {v1-sl} + teq v4, #1 + itt ne + teqne v4, #5 + teqne v4, #7 + bne 99f + teq v6, #64 + it ne + teqne v6, #128 + bne 99f + sub v2, v2, #MAX_CHANNELS + push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned + movs INDEX, v3, lsl #7 + beq 98f // just in case, do nothing if blockpos = 0 + subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time + adc lr, v1, v1 // calculate index2 (C was set by preceding subs) + orr INDEX, INDEX, lr + // Switch on matrix_noise_shift: values 0 and 1 are + // disproportionately common so do those in a form the branch + // predictor can accelerate. Values can only go up to 15. + cmp v5, #1 + beq 11f + blo 10f +A ldr pc, [pc, v5, lsl #2] +T tbh [pc, v5, lsl #1] +0: +A .word 0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f +T .hword 0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2 +T .hword (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2 +T .hword (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2 +10: switch_on_au_size 0 +11: switch_on_au_size 1 +12: switch_on_au_size 2 +13: switch_on_au_size 3 +14: switch_on_au_size 4 +15: switch_on_au_size 5 +16: switch_on_au_size 6 +17: switch_on_au_size 7 +18: switch_on_au_size 8 +19: switch_on_au_size 9 +20: switch_on_au_size 10 +21: switch_on_au_size 11 +22: switch_on_au_size 12 +23: switch_on_au_size 13 +24: switch_on_au_size 14 +25: switch_on_au_size 15 + +98: add sp, sp, #3*4 + pop {v1-fp,pc} +99: // Can't handle these parameters, drop back to C + pop {v1-fp,lr} + b X(ff_mlp_rematrix_channel) +endfunc + + .unreq PSA + .unreq PCO + .unreq PBL + .unreq INDEX + .unreq CO0 + .unreq CO1 + .unreq CO2 + .unreq CO3 + .unreq SA0 + .unreq SA1 + .unreq SA2 + .unreq SA3 + .unreq AC0 + .unreq AC1 + .unreq NOISE + .unreq LSB + .unreq DCH + .unreq MASK diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c index a8319ce88a..bad5f20122 100644 --- a/libavcodec/arm/mlpdsp_init_arm.c +++ b/libavcodec/arm/mlpdsp_init_arm.c @@ -29,6 +29,17 @@ void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff, int firorder, int iirorder, unsigned int filter_shift, int32_t mask, int blocksize, int32_t *sample_buffer); +void ff_mlp_rematrix_channel_arm(int32_t *samples, + const int32_t *coeffs, + const uint8_t *bypassed_lsbs, + const int8_t *noise_buffer, + int index, + unsigned int dest_ch, + uint16_t blockpos, + unsigned int maxchan, + int matrix_noise_shift, + int access_unit_size_pow2, + int32_t mask); av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c) { @@ -36,5 +47,6 @@ av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c) if (have_armv5te(cpu_flags)) { c->mlp_filter_channel = ff_mlp_filter_channel_arm; + c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm; } }