From 3d7bf376fd6e73a3c32a4ef7afc83fdeb837a353 Mon Sep 17 00:00:00 2001 From: Olli Parviainen Date: Sun, 21 Jun 2020 20:38:00 +0300 Subject: [PATCH] Tuning for ARM NEON Tuning to enable ARM NEON SIMD performance improvements: - NEON detection in configure file - Remove manual loop unrolling, gcc autovectorization does better job without manually unrolled loops. - Avoid unaligned pointer accesses when using NEON --- configure.ac | 47 ++++++++++++-- include/soundtouch_config.h.in | 3 + source/SoundStretch/Makefile.am | 2 +- source/SoundTouch/FIRFilter.cpp | 21 ++----- source/SoundTouch/Makefile.am | 2 +- source/SoundTouch/TDStretch.cpp | 91 ++++++++++++---------------- source/SoundTouchDLL/make-gnu-dll.sh | 2 +- 7 files changed, 93 insertions(+), 75 deletions(-) diff --git a/configure.ac b/configure.ac index 0cfa019..1bfaa2e 100644 --- a/configure.ac +++ b/configure.ac @@ -15,7 +15,7 @@ dnl this program; if not, write to the Free Software Foundation, Inc., 59 Temple dnl Place - Suite 330, Boston, MA 02111-1307, USA # Process this file with autoconf to produce a configure script. -AC_INIT([SoundTouch], [2.1.2], [http://www.surina.net/soundtouch]) +AC_INIT([SoundTouch], [2.1.3], [http://www.surina.net/soundtouch]) dnl Default to libSoundTouch.so.$LIB_SONAME.0.0 LIB_SONAME=1 AC_SUBST(LIB_SONAME) @@ -31,6 +31,10 @@ AC_DISABLE_STATIC dnl This makes libtool only build shared libs AC_LANG(C++) +# Compiler flags. Apply -ffast-math to allow gcc autovectorization +# generate effective SIMD code. +CXXFLAGS="-O3 -ffast-math" + # Set AR_FLAGS to avoid build warning "ar: `u' modifier ignored since `D' is the default (see `U')" AR_FLAGS='cr' @@ -59,6 +63,7 @@ AC_HEADER_STDC #AC_HEADER_SYS_WAIT # add any others you want to check for here AC_CHECK_HEADERS([cpuid.h]) +AC_CHECK_HEADERS([arm_neon.h]) if test "x$ac_cv_header_cpuid_h" = "xno"; then AC_MSG_WARN([The cpuid.h file was not found therefore the x86 optimizations will be disabled.]) @@ -78,8 +83,7 @@ AC_C_INLINE AC_ARG_ENABLE(integer-samples, [AC_HELP_STRING([--enable-integer-samples], - [use integer samples instead of floats -[default=no]])],, + [use integer samples instead of floats [default=no]])],, [enable_integer_samples=no]) @@ -92,10 +96,17 @@ AC_ARG_ENABLE(openmp, # Useful when compiling on non-x86 architectures. AC_ARG_ENABLE([x86-optimizations], [AS_HELP_STRING([--enable-x86-optimizations], - [use MMX or SSE optimization -[default=yes]])],[enable_x86_optimizations="${enableval}"], + [use MMX or SSE optimization [default=yes]])],[enable_x86_optimizations="${enableval}"], [enable_x86_optimizations=yes]) +# Let the user enable/disable the x86 optimizations. +# Useful when compiling on non-x86 architectures. +AC_ARG_ENABLE([neon-optimizations], + [AS_HELP_STRING([--enable-neon-optimizations], + [use ARM NEON optimization [default=yes]])],[enable_neon_optimizations="${enableval}"], + [enable_neon_optimizations=yes]) + + # Tell the Makefile.am if the user wants to disable optimizations. # Makefile.am will enable them by default if support is available. # Note: We check if optimizations are supported a few lines down. @@ -195,6 +206,32 @@ else CPPFLAGS="-DSOUNDTOUCH_DISABLE_X86_OPTIMIZATIONS $CPPFLAGS" fi + +if test "x$enable_neon_optimizations" = "xyes" -a "x$ac_cv_header_arm_neon_h" = "xyes"; then + + # Check for ARM NEON support + original_saved_CXXFLAGS=$CXXFLAGS + have_neon=no + CXXFLAGS="-mfpu=neon -march=native $CXXFLAGS" + + # Check if can compile neon code using intrinsics, require GCC >= 4.3 for autovectorization. + AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ + #if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)) + #error "Need GCC >= 4.3 for neon autovectorization" + #endif + #include + int main () { + int32x4_t t = {1}; + return vaddq_s32(t,t)[0] == 2; + }]])],[have_neon=yes]) + CXXFLAGS=$original_saved_CXXFLAGS + if test "x$have_neon" = "xyes" ; then + echo "****** NEON support enabled ******" + CPPFLAGS="-mfpu=neon -march=native -mtune=native $CPPFLAGS" + AC_DEFINE(SOUNDTOUCH_USE_NEON,1,[Use ARM NEON extension]) + fi +fi + # Set AM_CXXFLAGS AC_SUBST([AM_CXXFLAGS], [$AM_CXXFLAGS]) diff --git a/include/soundtouch_config.h.in b/include/soundtouch_config.h.in index 74a9ccb..9b317a4 100644 --- a/include/soundtouch_config.h.in +++ b/include/soundtouch_config.h.in @@ -3,3 +3,6 @@ /* Use Integer as Sample type */ #undef SOUNDTOUCH_INTEGER_SAMPLES + +/* Use ARM NEON extension */ +#undef SOUNDTOUCH_USE_NEON diff --git a/source/SoundStretch/Makefile.am b/source/SoundStretch/Makefile.am index ac5292c..96689bd 100644 --- a/source/SoundStretch/Makefile.am +++ b/source/SoundStretch/Makefile.am @@ -44,7 +44,7 @@ soundstretch_LDADD=../SoundTouch/libSoundTouch.la -lm #soundstretch_LDFLAGS=-s ## additional compiler flags -soundstretch_CXXFLAGS=-O3 $(AM_CXXFLAGS) +soundstretch_CXXFLAGS=$(AM_CXXFLAGS) #clean-local: # -rm -f additional-files-to-remove-on-make-clean diff --git a/source/SoundTouch/FIRFilter.cpp b/source/SoundTouch/FIRFilter.cpp index 218e50e..62c7ca8 100644 --- a/source/SoundTouch/FIRFilter.cpp +++ b/source/SoundTouch/FIRFilter.cpp @@ -96,17 +96,10 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui suml = sumr = 0; ptr = src + j; - for (i = 0; i < length; i += 4) + for (i = 0; i < length; i ++) { - // loop is unrolled by factor of 4 here for efficiency - suml += ptr[2 * i + 0] * filterCoeffs[i + 0] + - ptr[2 * i + 2] * filterCoeffs[i + 1] + - ptr[2 * i + 4] * filterCoeffs[i + 2] + - ptr[2 * i + 6] * filterCoeffs[i + 3]; - sumr += ptr[2 * i + 1] * filterCoeffs[i + 0] + - ptr[2 * i + 3] * filterCoeffs[i + 1] + - ptr[2 * i + 5] * filterCoeffs[i + 2] + - ptr[2 * i + 7] * filterCoeffs[i + 3]; + suml += ptr[2 * i] * filterCoeffs[i]; + sumr += ptr[2 * i + 1] * filterCoeffs[i]; } #ifdef SOUNDTOUCH_INTEGER_SAMPLES @@ -148,13 +141,9 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint uint i; sum = 0; - for (i = 0; i < length; i += 4) + for (i = 0; i < length; i ++) { - // loop is unrolled by factor of 4 here for efficiency - sum += pSrc[i + 0] * filterCoeffs[i + 0] + - pSrc[i + 1] * filterCoeffs[i + 1] + - pSrc[i + 2] * filterCoeffs[i + 2] + - pSrc[i + 3] * filterCoeffs[i + 3]; + sum += pSrc[i] * filterCoeffs[i]; } #ifdef SOUNDTOUCH_INTEGER_SAMPLES sum >>= resultDivFactor; diff --git a/source/SoundTouch/Makefile.am b/source/SoundTouch/Makefile.am index 54e330a..eac15a9 100644 --- a/source/SoundTouch/Makefile.am +++ b/source/SoundTouch/Makefile.am @@ -33,7 +33,7 @@ libSoundTouch_la_SOURCES=AAFilter.cpp FIRFilter.cpp FIFOSampleBuffer.cpp \ InterpolateShannon.cpp # Compiler flags -AM_CXXFLAGS+=-O3 +#AM_CXXFLAGS+= # Compile the files that need MMX and SSE individually. libSoundTouch_la_LIBADD=libSoundTouchMMX.la libSoundTouchSSE.la diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp index ce49310..d381bb4 100644 --- a/source/SoundTouch/TDStretch.cpp +++ b/source/SoundTouch/TDStretch.cpp @@ -1,4 +1,4 @@ -//////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// /// /// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo /// while maintaining the original pitch by using a time domain WSOLA-like @@ -54,6 +54,10 @@ using namespace soundtouch; #define max(x, y) (((x) > (y)) ? (x) : (y)) +#if defined(SOUNDTOUCH_USE_NEON) && defined(SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION) + // SIMD mode, allow shortcuts to avoid operations that aren't aligned to 16-byte boundary + #define ST_SIMD_AVOID_UNALIGNED +#endif /***************************************************************************** * @@ -315,9 +319,10 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) { double corr; // Calculates correlation value for the mixing position corresponding to 'i' -#ifdef _OPENMP +#if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED) // in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't // iterate the loop in sequential order + // in SIMD mode, avoid accumulator version to allow avoiding unaligned positions corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm); #else // In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same @@ -830,21 +835,19 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi' // version of the routine. -void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const +void TDStretch::overlapMulti(short *poutput, const short *input) const { - SAMPLETYPE m1=(SAMPLETYPE)0; - SAMPLETYPE m2; - int i=0; + short m1; + int i = 0; - for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --) + for (m1 = 0; m1 < overlapLength; m1 ++) { + short m2 = (short)(overlapLength - m1); for (int c = 0; c < channels; c ++) { poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2) / overlapLength; i++; } - - m1++; } } @@ -889,20 +892,20 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do unsigned long lnorm; int i; + #ifdef ST_SIMD_AVOID_UNALIGNED + // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary + if (((ulongptr)mixingPos) & 15) return -1e50; + #endif + corr = lnorm = 0; - // Same routine for stereo and mono. For stereo, unroll loop for better - // efficiency and gives slightly better resolution against rounding. - // For mono it same routine, just unrolls loop by factor of 4 - for (i = 0; i < channels * overlapLength; i += 4) + // Same routine for stereo and mono + for (i = 0; i < channels * overlapLength; i += 2) { corr += (mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow - corr += (mixingPos[i + 2] * compare[i + 2] + - mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm; + mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; lnorm += (mixingPos[i] * mixingPos[i] + - mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow - lnorm += (mixingPos[i + 2] * mixingPos[i + 2] + - mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBitsNorm; + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm; + // do intermediate scalings to avoid integer overflow } if (lnorm > maxnorm) @@ -936,15 +939,11 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c } corr = 0; - // Same routine for stereo and mono. For stereo, unroll loop for better - // efficiency and gives slightly better resolution against rounding. - // For mono it same routine, just unrolls loop by factor of 4 - for (i = 0; i < channels * overlapLength; i += 4) + // Same routine for stereo and mono. + for (i = 0; i < channels * overlapLength; i += 2) { corr += (mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow - corr += (mixingPos[i + 2] * compare[i + 2] + - mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm; + mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; } // update normalizer with last samples of this round @@ -1045,27 +1044,21 @@ void TDStretch::calculateOverlapLength(int overlapInMsec) /// Calculate cross-correlation double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) { - double corr; - double norm; + float corr; + float norm; int i; + #ifdef ST_SIMD_AVOID_UNALIGNED + // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary + if (((ulongptr)mixingPos) & 15) return -1e50; + #endif + corr = norm = 0; - // Same routine for stereo and mono. For Stereo, unroll by factor of 2. - // For mono it's same routine yet unrollsd by factor of 4. - for (i = 0; i < channels * overlapLength; i += 4) + // Same routine for stereo and mono + for (i = 0; i < channels * overlapLength; i ++) { - corr += mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1]; - - norm += mixingPos[i] * mixingPos[i] + - mixingPos[i + 1] * mixingPos[i + 1]; - - // unroll the loop for better CPU efficiency: - corr += mixingPos[i + 2] * compare[i + 2] + - mixingPos[i + 3] * compare[i + 3]; - - norm += mixingPos[i + 2] * mixingPos[i + 2] + - mixingPos[i + 3] * mixingPos[i + 3]; + corr += mixingPos[i] * compare[i]; + norm += mixingPos[i] * mixingPos[i]; } anorm = norm; @@ -1076,7 +1069,7 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) { - double corr; + float corr; int i; corr = 0; @@ -1087,14 +1080,10 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c norm -= mixingPos[-i] * mixingPos[-i]; } - // Same routine for stereo and mono. For Stereo, unroll by factor of 2. - // For mono it's same routine yet unrollsd by factor of 4. - for (i = 0; i < channels * overlapLength; i += 4) + // Same routine for stereo and mono + for (i = 0; i < channels * overlapLength; i ++) { - corr += mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1] + - mixingPos[i + 2] * compare[i + 2] + - mixingPos[i + 3] * compare[i + 3]; + corr += mixingPos[i] * compare[i]; } // update normalizer with last samples of this round diff --git a/source/SoundTouchDLL/make-gnu-dll.sh b/source/SoundTouchDLL/make-gnu-dll.sh index b38c643..95af7e4 100755 --- a/source/SoundTouchDLL/make-gnu-dll.sh +++ b/source/SoundTouchDLL/make-gnu-dll.sh @@ -18,5 +18,5 @@ fi echo "Building SoundTouchDLL for $arch with flags:$flags" -g++ -O3 -shared $flags -DDLL_EXPORTS -fvisibility=hidden -I../../include \ +g++ -O3 -ffast-math -shared $flags -DDLL_EXPORTS -fvisibility=hidden -I../../include \ -I../SoundTouch -o SoundTouchDll.so SoundTouchDLL.cpp ../SoundTouch/*.cpp