From 090d1f8d0e207dd1927b272823ea53242dc65f2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 2 May 2024 16:56:32 +0200 Subject: [PATCH] atrac3: Add a cross platform restrict modifier, use it for the vector math functions Also remove redundant parameters. There are a bunch of other places where we could use this, so just experimenting here. This enables autovectorization to work here. --- ext/at3_standalone/atrac3.cpp | 2 +- ext/at3_standalone/atrac3plusdsp.cpp | 18 ++++++++-------- ext/at3_standalone/compat.h | 10 +++++++-- ext/at3_standalone/float_dsp.h | 31 +++++++++++++--------------- 4 files changed, 32 insertions(+), 29 deletions(-) diff --git a/ext/at3_standalone/atrac3.cpp b/ext/at3_standalone/atrac3.cpp index fe0e71c841..6c1afcb166 100644 --- a/ext/at3_standalone/atrac3.cpp +++ b/ext/at3_standalone/atrac3.cpp @@ -143,7 +143,7 @@ static void imlt(ATRAC3Context *q, float *input, float *output, int odd_band) imdct_calc(&q->mdct_ctx, output, input); /* Perform windowing on the output. 
*/ - vector_fmul(output, output, mdct_window, MDCT_SIZE); + vector_fmul(output, mdct_window, MDCT_SIZE); } /* diff --git a/ext/at3_standalone/atrac3plusdsp.cpp b/ext/at3_standalone/atrac3plusdsp.cpp index 1cf345ced6..a070148e0e 100644 --- a/ext/at3_standalone/atrac3plusdsp.cpp +++ b/ext/at3_standalone/atrac3plusdsp.cpp @@ -174,7 +174,7 @@ static void waves_synth(Atrac3pWaveSynthParams *synth_param, /* invert phase if requested */ if (invert_phase) - vector_fmul_scalar(out, out, -1.0f, 128); + vector_fmul_scalar(out, -1.0f, 128); /* fade in with steep Hann window if requested */ if (envelope->has_start_point) { @@ -255,14 +255,14 @@ void ff_atrac3p_generate_tones(Atrac3pChanUnitCtx *ch_unit, int ch_num, int sb, /* Hann windowing for non-faded wave signals */ if (tones_now->num_wavs && tones_next->num_wavs && reg1_env_nonzero && reg2_env_nonzero) { - vector_fmul(wavreg1, wavreg1, &hann_window[128], 128); - vector_fmul(wavreg2, wavreg2, hann_window, 128); + vector_fmul(wavreg1, &hann_window[128], 128); + vector_fmul(wavreg2, hann_window, 128); } else { if (tones_now->num_wavs && !tones_now->curr_env.has_stop_point) - vector_fmul(wavreg1, wavreg1, &hann_window[128], 128); + vector_fmul(wavreg1, &hann_window[128], 128); if (tones_next->num_wavs && !tones_next->curr_env.has_start_point) - vector_fmul(wavreg2, wavreg2, hann_window, 128); + vector_fmul(wavreg2, hann_window, 128); } /* Overlap and add to residual */ @@ -502,15 +502,15 @@ void ff_atrac3p_imdct(FFTContext *mdct_ctx, float *pIn, * Both regions are 32 samples long. 
*/ if (wind_id & 2) { /* 1st half: steep window */ memset(pOut, 0, sizeof(float) * 32); - vector_fmul(&pOut[32], &pOut[32], av_sine_64, 64); + vector_fmul(&pOut[32], av_sine_64, 64); } else /* 1st half: simple sine window */ - vector_fmul(pOut, pOut, av_sine_128, ATRAC3P_MDCT_SIZE / 2); + vector_fmul(pOut, av_sine_128, ATRAC3P_MDCT_SIZE / 2); if (wind_id & 1) { /* 2nd half: steep window */ - vector_fmul_reverse(&pOut[160], &pOut[160], av_sine_64, 64); + vector_fmul_reverse(&pOut[160], av_sine_64, 64); memset(&pOut[224], 0, sizeof(float) * 32); } else /* 2nd half: simple sine window */ - vector_fmul_reverse(&pOut[128], &pOut[128], av_sine_128, ATRAC3P_MDCT_SIZE / 2); + vector_fmul_reverse(&pOut[128], av_sine_128, ATRAC3P_MDCT_SIZE / 2); } /* lookup table for fast modulo 23 op required for cyclic buffers of the IPQF */ diff --git a/ext/at3_standalone/compat.h b/ext/at3_standalone/compat.h index bd3e398b32..57b0244de4 100644 --- a/ext/at3_standalone/compat.h +++ b/ext/at3_standalone/compat.h @@ -4,15 +4,22 @@ // Compat hacks to make an FFMPEG-like environment, so we can keep the core code mostly unchanged. 
-#if defined(__GNUC__) +#if defined(__clang__) +#define DECLARE_ALIGNED(n, t, v) t __attribute__((aligned(n))) v +#define DECLARE_ASM_CONST(n, t, v) static const t av_used __attribute__((aligned(n))) v +#define av_restrict __restrict +#elif defined(__GNUC__) #define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v #define DECLARE_ASM_CONST(n,t,v) static const t av_used __attribute__ ((aligned (n))) v +#define av_restrict __restrict__ #elif defined(_MSC_VER) #define DECLARE_ALIGNED(n,t,v) __declspec(align(n)) t v #define DECLARE_ASM_CONST(n,t,v) __declspec(align(n)) static const t v +#define av_restrict __restrict #else #define DECLARE_ALIGNED(n,t,v) t v #define DECLARE_ASM_CONST(n,t,v) static const t v +#define av_restrict #endif #define AV_HAVE_FAST_UNALIGNED 0 @@ -22,7 +29,6 @@ // #define BITSTREAM_READER_LE #define LOCAL_ALIGNED(bits, type, name, subscript) type name subscript -#define av_restrict #define av_alias #define av_unused #define av_assert0(cond) diff --git a/ext/at3_standalone/float_dsp.h b/ext/at3_standalone/float_dsp.h index f02764382f..c9664a814b 100644 --- a/ext/at3_standalone/float_dsp.h +++ b/ext/at3_standalone/float_dsp.h @@ -18,20 +18,20 @@ #pragma once -inline void vector_fmul(float *dst, const float *src0, const float *src1, int len) { - int i; - for (i = 0; i < len; i++) - dst[i] = src0[i] * src1[i]; +#include "compat.h" + +inline void vector_fmul(float * av_restrict dst, const float * av_restrict src, int len) { + for (int i = 0; i < len; i++) + dst[i] = dst[i] * src[i]; } /** - * Multiply a vector of floats by a scalar float. Source and - * destination vectors must overlap exactly or not at all. + * Multiply a vector of floats by a scalar float, in place: + * every dst[i] for i in [0, len) is replaced by dst[i] * mul.
*/ -inline void vector_fmul_scalar(float *dst, const float *src, float mul, int len) { - int i; - for (i = 0; i < len; i++) - dst[i] = src[i] * mul; +inline void vector_fmul_scalar(float *dst, float mul, int len) { + for (int i = 0; i < len; i++) + dst[i] *= mul; } /** @@ -39,18 +39,15 @@ inline void vector_fmul_scalar(float *dst, const float *src, float mul, int len) * in a vector of floats. The second vector of floats is iterated over * in reverse order. * -* @param dst output vector +* @param dst output and first input vector * constraints: 32-byte aligned -* @param src0 first input vector -* constraints: 32-byte aligned -* @param src1 second input vector +* @param src second input vector * constraints: 32-byte aligned * @param len number of elements in the input * constraints: multiple of 16 */ -inline void vector_fmul_reverse(float *dst, const float *src0, const float *src1, int len) { - int i; - src1 += len - 1; - for (i = 0; i < len; i++) - dst[i] = src0[i] * src1[-i]; +inline void vector_fmul_reverse(float * av_restrict dst, const float * av_restrict src, int len) { + src += len - 1; + for (int i = 0; i < len; i++) + dst[i] *= src[-i]; }