From 3d7bf376fd6e73a3c32a4ef7afc83fdeb837a353 Mon Sep 17 00:00:00 2001
From: Olli Parviainen <oparviai@iki.fi>
Date: Sun, 21 Jun 2020 20:38:00 +0300
Subject: [PATCH] Tuning for ARM NEON

Tuning to enable ARM NEON SIMD performance improvements:
- NEON detection in configure file
- Remove manual loop unrolling, gcc autovectorization does better job
without manually unrolled loops.
- Avoid unaligned pointer accesses when using NEON
---
 configure.ac                         | 47 ++++++++++++--
 include/soundtouch_config.h.in       |  3 +
 source/SoundStretch/Makefile.am      |  2 +-
 source/SoundTouch/FIRFilter.cpp      | 21 ++-----
 source/SoundTouch/Makefile.am        |  2 +-
 source/SoundTouch/TDStretch.cpp      | 91 ++++++++++++----------------
 source/SoundTouchDLL/make-gnu-dll.sh |  2 +-
 7 files changed, 93 insertions(+), 75 deletions(-)

diff --git a/configure.ac b/configure.ac
index 0cfa019..1bfaa2e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -15,7 +15,7 @@ dnl this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 dnl Place - Suite 330, Boston, MA  02111-1307, USA
 # Process this file with autoconf to produce a configure script.
 
-AC_INIT([SoundTouch], [2.1.2], [http://www.surina.net/soundtouch])
+AC_INIT([SoundTouch], [2.1.3], [http://www.surina.net/soundtouch])
 dnl Default to libSoundTouch.so.$LIB_SONAME.0.0
 LIB_SONAME=1
 AC_SUBST(LIB_SONAME)
@@ -31,6 +31,10 @@ AC_DISABLE_STATIC	dnl This makes libtool only build shared libs
 
 AC_LANG(C++)
 
+# Compiler flags. Apply -ffast-math to allow gcc autovectorization
+# generate effective SIMD code.
+CXXFLAGS="-O3 -ffast-math"
+
 # Set AR_FLAGS to avoid build warning "ar: `u' modifier ignored since `D' is the default (see `U')"
 AR_FLAGS='cr'
 
@@ -59,6 +63,7 @@ AC_HEADER_STDC
 #AC_HEADER_SYS_WAIT
 #	add any others you want to check for here
 AC_CHECK_HEADERS([cpuid.h])
+AC_CHECK_HEADERS([arm_neon.h])
 
 if test "x$ac_cv_header_cpuid_h" = "xno"; then
 	AC_MSG_WARN([The cpuid.h file was not found therefore the x86 optimizations will be disabled.])
@@ -78,8 +83,7 @@ AC_C_INLINE
 
 AC_ARG_ENABLE(integer-samples,
               [AC_HELP_STRING([--enable-integer-samples],
-                              [use integer samples instead of floats
-[default=no]])],,
+                              [use integer samples instead of floats [default=no]])],,
               [enable_integer_samples=no])
 
 
@@ -92,10 +96,17 @@ AC_ARG_ENABLE(openmp,
 # Useful when compiling on non-x86 architectures.
 AC_ARG_ENABLE([x86-optimizations],
               [AS_HELP_STRING([--enable-x86-optimizations],
-                              [use MMX or SSE optimization
-[default=yes]])],[enable_x86_optimizations="${enableval}"],
+                              [use MMX or SSE optimization [default=yes]])],[enable_x86_optimizations="${enableval}"],
               [enable_x86_optimizations=yes])
 
+# Let the user enable/disable the x86 optimizations.
+# Useful when compiling on non-x86 architectures.
+AC_ARG_ENABLE([neon-optimizations],
+              [AS_HELP_STRING([--enable-neon-optimizations],
+                              [use ARM NEON optimization [default=yes]])],[enable_neon_optimizations="${enableval}"],
+              [enable_neon_optimizations=yes])
+
+
 # Tell the Makefile.am if the user wants to disable optimizations.
 # Makefile.am will enable them by default if support is available.
 # Note: We check if optimizations are supported a few lines down.
@@ -195,6 +206,32 @@ else
         CPPFLAGS="-DSOUNDTOUCH_DISABLE_X86_OPTIMIZATIONS $CPPFLAGS"
 fi
 
+
+if test "x$enable_neon_optimizations" = "xyes" -a "x$ac_cv_header_arm_neon_h" = "xyes"; then
+
+	# Check for ARM NEON support
+	original_saved_CXXFLAGS=$CXXFLAGS
+	have_neon=no
+	CXXFLAGS="-mfpu=neon -march=native $CXXFLAGS"
+
+	# Check if can compile neon code using intrinsics, require GCC >= 4.3 for autovectorization.
+	AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+	#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3))
+	#error "Need GCC >= 4.3 for neon autovectorization"
+	#endif
+	#include <arm_neon.h>
+	int main () {
+		int32x4_t t = {1};
+		return vaddq_s32(t,t)[0] == 2;
+	}]])],[have_neon=yes])
+	CXXFLAGS=$original_saved_CXXFLAGS
+	if test "x$have_neon" = "xyes" ; then
+		echo "****** NEON support enabled ******"
+		CPPFLAGS="-mfpu=neon -march=native -mtune=native $CPPFLAGS"
+		AC_DEFINE(SOUNDTOUCH_USE_NEON,1,[Use ARM NEON extension])
+	fi
+fi
+
 # Set AM_CXXFLAGS
 AC_SUBST([AM_CXXFLAGS], [$AM_CXXFLAGS])
 
diff --git a/include/soundtouch_config.h.in b/include/soundtouch_config.h.in
index 74a9ccb..9b317a4 100644
--- a/include/soundtouch_config.h.in
+++ b/include/soundtouch_config.h.in
@@ -3,3 +3,6 @@
 
 /* Use Integer as Sample type */
 #undef SOUNDTOUCH_INTEGER_SAMPLES
+
+/* Use ARM NEON extension */
+#undef SOUNDTOUCH_USE_NEON
diff --git a/source/SoundStretch/Makefile.am b/source/SoundStretch/Makefile.am
index ac5292c..96689bd 100644
--- a/source/SoundStretch/Makefile.am
+++ b/source/SoundStretch/Makefile.am
@@ -44,7 +44,7 @@ soundstretch_LDADD=../SoundTouch/libSoundTouch.la -lm
 #soundstretch_LDFLAGS=-s
 
 ## additional compiler flags
-soundstretch_CXXFLAGS=-O3 $(AM_CXXFLAGS)
+soundstretch_CXXFLAGS=$(AM_CXXFLAGS)
 
 #clean-local: 
 #	-rm -f additional-files-to-remove-on-make-clean
diff --git a/source/SoundTouch/FIRFilter.cpp b/source/SoundTouch/FIRFilter.cpp
index 218e50e..62c7ca8 100644
--- a/source/SoundTouch/FIRFilter.cpp
+++ b/source/SoundTouch/FIRFilter.cpp
@@ -96,17 +96,10 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
         suml = sumr = 0;
         ptr = src + j;
 
-        for (i = 0; i < length; i += 4) 
+        for (i = 0; i < length; i ++) 
         {
-            // loop is unrolled by factor of 4 here for efficiency
-            suml += ptr[2 * i + 0] * filterCoeffs[i + 0] +
-                    ptr[2 * i + 2] * filterCoeffs[i + 1] +
-                    ptr[2 * i + 4] * filterCoeffs[i + 2] +
-                    ptr[2 * i + 6] * filterCoeffs[i + 3];
-            sumr += ptr[2 * i + 1] * filterCoeffs[i + 0] +
-                    ptr[2 * i + 3] * filterCoeffs[i + 1] +
-                    ptr[2 * i + 5] * filterCoeffs[i + 2] +
-                    ptr[2 * i + 7] * filterCoeffs[i + 3];
+            suml += ptr[2 * i] * filterCoeffs[i];
+            sumr += ptr[2 * i + 1] * filterCoeffs[i];
         }
 
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
@@ -148,13 +141,9 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
         uint i;
 
         sum = 0;
-        for (i = 0; i < length; i += 4) 
+        for (i = 0; i < length; i ++) 
         {
-            // loop is unrolled by factor of 4 here for efficiency
-            sum += pSrc[i + 0] * filterCoeffs[i + 0] + 
-                   pSrc[i + 1] * filterCoeffs[i + 1] + 
-                   pSrc[i + 2] * filterCoeffs[i + 2] + 
-                   pSrc[i + 3] * filterCoeffs[i + 3];
+            sum += pSrc[i] * filterCoeffs[i];
         }
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
         sum >>= resultDivFactor;
diff --git a/source/SoundTouch/Makefile.am b/source/SoundTouch/Makefile.am
index 54e330a..eac15a9 100644
--- a/source/SoundTouch/Makefile.am
+++ b/source/SoundTouch/Makefile.am
@@ -33,7 +33,7 @@ libSoundTouch_la_SOURCES=AAFilter.cpp FIRFilter.cpp FIFOSampleBuffer.cpp    \
     InterpolateShannon.cpp
 
 # Compiler flags
-AM_CXXFLAGS+=-O3
+#AM_CXXFLAGS+=
 
 # Compile the files that need MMX and SSE individually.
 libSoundTouch_la_LIBADD=libSoundTouchMMX.la libSoundTouchSSE.la
diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp
index ce49310..d381bb4 100644
--- a/source/SoundTouch/TDStretch.cpp
+++ b/source/SoundTouch/TDStretch.cpp
@@ -1,4 +1,4 @@
-////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
 /// 
 /// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo 
 /// while maintaining the original pitch by using a time domain WSOLA-like 
@@ -54,6 +54,10 @@ using namespace soundtouch;
 
 #define max(x, y) (((x) > (y)) ? (x) : (y))
 
+#if defined(SOUNDTOUCH_USE_NEON) && defined(SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION)
+    // SIMD mode, allow shortcuts to avoid operations that aren't aligned to 16-byte boundary
+    #define ST_SIMD_AVOID_UNALIGNED
+#endif
 
 /*****************************************************************************
  *
@@ -315,9 +319,10 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
     {
         double corr;
         // Calculates correlation value for the mixing position corresponding to 'i'
-#ifdef _OPENMP
+#if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED)
         // in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
         // iterate the loop in sequential order
+        // in SIMD mode, avoid accumulator version to allow avoiding unaligned positions
         corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
 #else
         // In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
@@ -830,21 +835,19 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
 
 // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi'
 // version of the routine.
-void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const
+void TDStretch::overlapMulti(short *poutput, const short *input) const
 {
-    SAMPLETYPE m1=(SAMPLETYPE)0;
-    SAMPLETYPE m2;
-    int i=0;
+    short m1;
+    int i = 0;
 
-    for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --)
+    for (m1 = 0; m1 < overlapLength; m1 ++)
     {
+        short m2 = (short)(overlapLength - m1);
         for (int c = 0; c < channels; c ++)
         {
             poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2)  / overlapLength;
             i++;
         }
-
-        m1++;
     }
 }
 
@@ -889,20 +892,20 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
     unsigned long lnorm;
     int i;
 
+    #ifdef ST_SIMD_AVOID_UNALIGNED
+        // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
+        if (((ulongptr)mixingPos) & 15) return -1e50;
+    #endif
+
     corr = lnorm = 0;
-    // Same routine for stereo and mono. For stereo, unroll loop for better
-    // efficiency and gives slightly better resolution against rounding. 
-    // For mono it same routine, just  unrolls loop by factor of 4
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono
+    for (i = 0; i < channels * overlapLength; i += 2)
     {
         corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;  // notice: do intermediate division here to avoid integer overflow
-        corr += (mixingPos[i + 2] * compare[i + 2] + 
-                mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
+                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
         lnorm += (mixingPos[i] * mixingPos[i] + 
-                mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow
-        lnorm += (mixingPos[i + 2] * mixingPos[i + 2] + 
-                mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBitsNorm;
+                  mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm;
+        // do intermediate scalings to avoid integer overflow
     }
 
     if (lnorm > maxnorm)
@@ -936,15 +939,11 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
     }
 
     corr = 0;
-    // Same routine for stereo and mono. For stereo, unroll loop for better
-    // efficiency and gives slightly better resolution against rounding. 
-    // For mono it same routine, just  unrolls loop by factor of 4
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono.
+    for (i = 0; i < channels * overlapLength; i += 2) 
     {
         corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;  // notice: do intermediate division here to avoid integer overflow
-        corr += (mixingPos[i + 2] * compare[i + 2] + 
-                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
+                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
     }
 
     // update normalizer with last samples of this round
@@ -1045,27 +1044,21 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
 /// Calculate cross-correlation
 double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
 {
-    double corr;
-    double norm;
+    float corr;
+    float norm;
     int i;
 
+    #ifdef ST_SIMD_AVOID_UNALIGNED
+        // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
+        if (((ulongptr)mixingPos) & 15) return -1e50;
+    #endif
+
     corr = norm = 0;
-    // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
-    // For mono it's same routine yet unrollsd by factor of 4.
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono
+    for (i = 0; i < channels * overlapLength; i ++)
     {
-        corr += mixingPos[i] * compare[i] +
-                mixingPos[i + 1] * compare[i + 1];
-
-        norm += mixingPos[i] * mixingPos[i] + 
-                mixingPos[i + 1] * mixingPos[i + 1];
-
-        // unroll the loop for better CPU efficiency:
-        corr += mixingPos[i + 2] * compare[i + 2] +
-                mixingPos[i + 3] * compare[i + 3];
-
-        norm += mixingPos[i + 2] * mixingPos[i + 2] +
-                mixingPos[i + 3] * mixingPos[i + 3];
+        corr += mixingPos[i] * compare[i];
+        norm += mixingPos[i] * mixingPos[i];
     }
 
     anorm = norm;
@@ -1076,7 +1069,7 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
 double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
 {
-    double corr;
+    float corr;
     int i;
 
     corr = 0;
@@ -1087,14 +1080,10 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c
         norm -= mixingPos[-i] * mixingPos[-i];
     }
 
-    // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
-    // For mono it's same routine yet unrollsd by factor of 4.
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono
+    for (i = 0; i < channels * overlapLength; i ++)
     {
-        corr += mixingPos[i] * compare[i] +
-                mixingPos[i + 1] * compare[i + 1] +
-                mixingPos[i + 2] * compare[i + 2] +
-                mixingPos[i + 3] * compare[i + 3];
+        corr += mixingPos[i] * compare[i];
     }
 
     // update normalizer with last samples of this round
diff --git a/source/SoundTouchDLL/make-gnu-dll.sh b/source/SoundTouchDLL/make-gnu-dll.sh
index b38c643..95af7e4 100755
--- a/source/SoundTouchDLL/make-gnu-dll.sh
+++ b/source/SoundTouchDLL/make-gnu-dll.sh
@@ -18,5 +18,5 @@ fi
 
 echo "Building SoundTouchDLL for $arch with flags:$flags"
 
-g++ -O3 -shared $flags -DDLL_EXPORTS -fvisibility=hidden -I../../include \
+g++ -O3 -ffast-math -shared $flags -DDLL_EXPORTS -fvisibility=hidden -I../../include \
     -I../SoundTouch -o SoundTouchDll.so SoundTouchDLL.cpp ../SoundTouch/*.cpp