From 5a1e783dae584a25683e43fddac3bc9e0d575a17 Mon Sep 17 00:00:00 2001
From: Karl Tomlinson <karlt+@karlt.net>
Date: Wed, 23 Jul 2014 21:49:04 +1200
Subject: [PATCH] b=1042508 move resampler simd optimizations to separate
 translation units r=padenot

This ensures that code in resample.c will run on all Intel x86 CPUs, including
those without SSE, even when SSE support has been compiled in, and will do the
same for NEON support when it is enabled.

--HG--
rename : media/libspeex_resampler/sse-detect-runtime.patch => media/libspeex_resampler/simd-detect-runtime.patch
rename : media/libspeex_resampler/src/resample_neon.h => media/libspeex_resampler/src/resample_neon.c
rename : media/libspeex_resampler/src/resample_sse.h => media/libspeex_resampler/src/resample_sse.c
rename : media/libspeex_resampler/src/sse_detect.cpp => media/libspeex_resampler/src/simd_detect.cpp
rename : media/libspeex_resampler/src/sse_detect.h => media/libspeex_resampler/src/simd_detect.h
extra : rebase_source : bdf1ef791129dedeadce7369354a5992729a99b7
---
 .../simd-detect-runtime.patch                 | 331 ++++++++++++++++++
 media/libspeex_resampler/src/moz.build        |  10 +-
 media/libspeex_resampler/src/resample.c       |  26 +-
 .../src/{resample_neon.h => resample_neon.c}  |   6 +-
 .../src/{resample_sse.h => resample_sse.c}    |  10 +-
 .../src/{sse_detect.cpp => simd_detect.cpp}   |  20 +-
 media/libspeex_resampler/src/simd_detect.h    |  43 +++
 media/libspeex_resampler/src/sse_detect.h     |  20 --
 .../sse-detect-runtime.patch                  | 192 ----------
 media/libspeex_resampler/update.sh            |   6 +-
 10 files changed, 413 insertions(+), 251 deletions(-)
 create mode 100644 media/libspeex_resampler/simd-detect-runtime.patch
 rename media/libspeex_resampler/src/{resample_neon.h => resample_neon.c} (96%)
 rename media/libspeex_resampler/src/{resample_sse.h => resample_sse.c} (89%)
 rename media/libspeex_resampler/src/{sse_detect.cpp => simd_detect.cpp} (58%)
 create mode 100644 media/libspeex_resampler/src/simd_detect.h
 delete mode 100644 media/libspeex_resampler/src/sse_detect.h
 delete mode 100644 media/libspeex_resampler/sse-detect-runtime.patch

diff --git a/media/libspeex_resampler/simd-detect-runtime.patch b/media/libspeex_resampler/simd-detect-runtime.patch
new file mode 100644
index 000000000000..c8b182ddadda
--- /dev/null
+++ b/media/libspeex_resampler/simd-detect-runtime.patch
@@ -0,0 +1,331 @@
+diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
+--- a/media/libspeex_resampler/src/resample.c
++++ b/media/libspeex_resampler/src/resample.c
+@@ -92,23 +92,17 @@ static void speex_free (void *ptr) {free
+                
+ #define IMAX(a,b) ((a) > (b) ? (a) : (b))
+ #define IMIN(a,b) ((a) < (b) ? (a) : (b))
+ 
+ #ifndef NULL
+ #define NULL 0
+ #endif
+ 
+-#ifdef _USE_SSE
+-#include "resample_sse.h"
+-#endif
+-
+-#ifdef _USE_NEON
+-#include "resample_neon.h"
+-#endif
++#include "simd_detect.h"
+ 
+ /* Numer of elements to allocate on the stack */
+ #ifdef VAR_ARRAYS
+ #define FIXED_STACK_ALLOC 8192
+ #else
+ #define FIXED_STACK_ALLOC 1024
+ #endif
+ 
+@@ -344,17 +338,19 @@ static int resampler_basic_direct_single
+    const spx_uint32_t den_rate = st->den_rate;
+    spx_word32_t sum;
+ 
+    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
+    {
+       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
+       const spx_word16_t *iptr = & in[last_sample];
+ 
+-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
++#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
++      if (!moz_speex_have_single_simd()) {
++#endif
+       int j;
+       sum = 0;
+       for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
+ 
+ /*    This code is slower on most DSPs which have only 2 accumulators.
+       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
+       I think we can trust the compiler and let it vectorize and/or unroll itself.
+       spx_word32_t accum[4] = {0,0,0,0};
+@@ -362,18 +358,20 @@ static int resampler_basic_direct_single
+         accum[0] += MULT16_16(sinct[j], iptr[j]);
+         accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
+         accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
+         accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
+       }
+       sum = accum[0] + accum[1] + accum[2] + accum[3];
+ */
+       sum = SATURATE32PSHR(sum, 15, 32767);
+-#else
++#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
++      } else {
+       sum = inner_product_single(sinct, iptr, N);
++      }
+ #endif
+ 
+       out[out_stride * out_sample++] = sum;
+       last_sample += int_advance;
+       samp_frac_num += frac_advance;
+       if (samp_frac_num >= den_rate)
+       {
+          samp_frac_num -= den_rate;
+@@ -402,29 +400,33 @@ static int resampler_basic_direct_double
+   const spx_uint32_t den_rate = st->den_rate;
+   double sum;
+ 
+   while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
+   {
+      const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
+      const spx_word16_t *iptr = & in[last_sample];
+ 
+-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
++      if (!moz_speex_have_double_simd()) { /* SSE2 unavailable: use the scalar loop */
++#endif
+      int j;
+      double accum[4] = {0,0,0,0};
+ 
+      for(j=0;j<N;j+=4) {
+        accum[0] += sinct[j]*iptr[j];
+        accum[1] += sinct[j+1]*iptr[j+1];
+        accum[2] += sinct[j+2]*iptr[j+2];
+        accum[3] += sinct[j+3]*iptr[j+3];
+      }
+      sum = accum[0] + accum[1] + accum[2] + accum[3];
+-#else
++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
++      } else {
+      sum = inner_product_double(sinct, iptr, N);
++      }
+ #endif
+ 
+      out[out_stride * out_sample++] = PSHR32(sum, 15);
+      last_sample += int_advance;
+      samp_frac_num += frac_advance;
+      if (samp_frac_num >= den_rate)
+      {
+         samp_frac_num -= den_rate;
+@@ -458,34 +460,38 @@ static int resampler_basic_interpolate_s
+ #ifdef FIXED_POINT
+       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
+ #else
+       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
+ #endif
+       spx_word16_t interp[4];
+ 
+ 
+-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
++      if (!moz_speex_have_single_simd()) {
++#endif
+       int j;
+       spx_word32_t accum[4] = {0,0,0,0};
+ 
+       for(j=0;j<N;j++) {
+         const spx_word16_t curr_in=iptr[j];
+         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
+         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
+         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+       }
+ 
+       cubic_coef(frac, interp);
+       sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
+       sum = SATURATE32PSHR(sum, 15, 32767);
+-#else
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
++      } else {
+       cubic_coef(frac, interp);
+       sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++      }
+ #endif
+       
+       out[out_stride * out_sample++] = sum;
+       last_sample += int_advance;
+       samp_frac_num += frac_advance;
+       if (samp_frac_num >= den_rate)
+       {
+          samp_frac_num -= den_rate;
+@@ -521,33 +527,37 @@ static int resampler_basic_interpolate_d
+ #ifdef FIXED_POINT
+       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
+ #else
+       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
+ #endif
+       spx_word16_t interp[4];
+ 
+ 
+-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
++      if (!moz_speex_have_double_simd()) {
++#endif
+       int j;
+       double accum[4] = {0,0,0,0};
+ 
+       for(j=0;j<N;j++) {
+         const double curr_in=iptr[j];
+         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
+         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
+         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
+         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
+       }
+ 
+       cubic_coef(frac, interp);
+       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
+-#else
++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
++      } else {
+       cubic_coef(frac, interp);
+       sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
++      }
+ #endif
+       
+       out[out_stride * out_sample++] = PSHR32(sum,15);
+       last_sample += int_advance;
+       samp_frac_num += frac_advance;
+       if (samp_frac_num >= den_rate)
+       {
+          samp_frac_num -= den_rate;
+diff --git a/media/libspeex_resampler/src/resample_neon.c b/media/libspeex_resampler/src/resample_neon.c
+--- a/media/libspeex_resampler/src/resample_neon.c
++++ b/media/libspeex_resampler/src/resample_neon.c
+@@ -31,16 +31,18 @@
+    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
++#include "simd_detect.h"
++
+ #include <arm_neon.h>
+ 
+ #ifdef FIXED_POINT
+ #ifdef __thumb2__
+ static inline int32_t saturate_32bit_to_16bit(int32_t a) {
+     int32_t ret;
+     asm ("ssat %[ret], #16, %[a]"
+          : [ret] "=&r" (ret)
+@@ -60,17 +62,17 @@ static inline int32_t saturate_32bit_to_
+     return ret;
+ }
+ #endif
+ #undef WORD2INT
+ #define WORD2INT(x) (saturate_32bit_to_16bit(x))
+ 
+ #define OVERRIDE_INNER_PRODUCT_SINGLE
+ /* Only works when len % 4 == 0 */
+-static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
++int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
+ {
+     int32_t ret;
+     uint32_t remainder = len % 16;
+     len = len - remainder;
+ 
+     asm volatile ("	 cmp %[len], #0\n"
+ 		  "	 bne 1f\n"
+ 		  "	 vld1.16 {d16}, [%[b]]!\n"
+@@ -134,17 +136,17 @@ static inline int32_t saturate_float_to_
+          : "q0");
+     return ret;
+ }
+ #undef WORD2INT
+ #define WORD2INT(x) (saturate_float_to_16bit(x))
+ 
+ #define OVERRIDE_INNER_PRODUCT_SINGLE
+ /* Only works when len % 4 == 0 */
+-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
++float inner_product_single(const float *a, const float *b, unsigned int len)
+ {
+     float ret;
+     uint32_t remainder = len % 16;
+     len = len - remainder;
+ 
+     asm volatile ("	 cmp %[len], #0\n"
+ 		  "	 bne 1f\n"
+ 		  "	 vld1.32 {q4}, [%[b]]!\n"
+diff --git a/media/libspeex_resampler/src/resample_sse.c b/media/libspeex_resampler/src/resample_sse.c
+--- a/media/libspeex_resampler/src/resample_sse.c
++++ b/media/libspeex_resampler/src/resample_sse.c
+@@ -29,37 +29,39 @@
+    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
++#include "simd_detect.h"
++
+ #include <xmmintrin.h>
+ 
+ #define OVERRIDE_INNER_PRODUCT_SINGLE
+-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
++float inner_product_single(const float *a, const float *b, unsigned int len)
+ {
+    int i;
+    float ret;
+    __m128 sum = _mm_setzero_ps();
+    for (i=0;i<len;i+=8)
+    {
+       sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
+       sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
+    }
+    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+    _mm_store_ss(&ret, sum);
+    return ret;
+ }
+ 
+ #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+-static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
++float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+   int i;
+   float ret;
+   __m128 sum = _mm_setzero_ps();
+   __m128 f = _mm_loadu_ps(frac);
+   for(i=0;i<len;i+=2)
+   {
+     sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
+     sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
+@@ -70,17 +72,17 @@ static inline float interpolate_product_
+    _mm_store_ss(&ret, sum);
+    return ret;
+ }
+ 
+ #ifdef _USE_SSE2
+ #include <emmintrin.h>
+ #define OVERRIDE_INNER_PRODUCT_DOUBLE
+ 
+-static inline double inner_product_double(const float *a, const float *b, unsigned int len)
++double inner_product_double(const float *a, const float *b, unsigned int len)
+ {
+    int i;
+    double ret;
+    __m128d sum = _mm_setzero_pd();
+    __m128 t;
+    for (i=0;i<len;i+=8)
+    {
+       t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
+@@ -92,17 +94,17 @@ static inline double inner_product_doubl
+       sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+    }
+    sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
+    _mm_store_sd(&ret, sum);
+    return ret;
+ }
+ 
+ #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+-static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
++double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+   int i;
+   double ret;
+   __m128d sum;
+   __m128d sum1 = _mm_setzero_pd();
+   __m128d sum2 = _mm_setzero_pd();
+   __m128 f = _mm_loadu_ps(frac);
+   __m128d f1 = _mm_cvtps_pd(f);
+   __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
diff --git a/media/libspeex_resampler/src/moz.build b/media/libspeex_resampler/src/moz.build
index bb59bff118c6..5d6cd8fc3414 100644
--- a/media/libspeex_resampler/src/moz.build
+++ b/media/libspeex_resampler/src/moz.build
@@ -10,12 +10,9 @@ EXPORTS.speex += [
     'speex_resampler.h',
 ]
 
-SOURCES += [
-  'sse_detect.cpp',
-]
-
 SOURCES += [
     'resample.c',
+    'simd_detect.cpp',
 ]
 
 MSVC_ENABLE_PGO = True
@@ -40,7 +37,10 @@ else:
 if CONFIG['INTEL_ARCHITECTURE'] and not CONFIG['MOZ_SAMPLE_TYPE_S16']:
     DEFINES['_USE_SSE'] = True
     DEFINES['_USE_SSE2'] = True
-    SOURCES['resample.c'].flags += CONFIG['SSE2_FLAGS']
+    SOURCES += [
+        'resample_sse.c'
+    ]
+    SOURCES['resample_sse.c'].flags += CONFIG['SSE2_FLAGS']
 
 # Suppress warnings in third-party code.
 if CONFIG['GNU_CC']:
diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
index 6bd81d73b7eb..4268ec6b1cc9 100644
--- a/media/libspeex_resampler/src/resample.c
+++ b/media/libspeex_resampler/src/resample.c
@@ -97,23 +97,7 @@ static void speex_free (void *ptr) {free(ptr);}
 #define NULL 0
 #endif
 
-#include "sse_detect.h"
-
-/* We compile SSE code on x86 all the time, but we only use it if we find at
- * runtime that the CPU supports it. */
-#ifdef _USE_SSE
-#ifdef _MSC_VER
-#define inline __inline
-#endif
-#include "resample_sse.h"
-#ifdef _MSC_VER
-#undef inline
-#endif
-#endif
-
-#ifdef _USE_NEON
-#include "resample_neon.h"
-#endif
+#include "simd_detect.h"
 
 /* Numer of elements to allocate on the stack */
 #ifdef VAR_ARRAYS
@@ -360,7 +344,7 @@ static int resampler_basic_direct_single(SpeexResamplerState *st, spx_uint32_t c
       const spx_word16_t *iptr = & in[last_sample];
 
 #ifdef OVERRIDE_INNER_PRODUCT_SINGLE
-      if (!moz_has_sse()) {
+      if (!moz_speex_have_single_simd()) {
 #endif
       int j;
       sum = 0;
@@ -422,7 +406,7 @@ static int resampler_basic_direct_double(SpeexResamplerState *st, spx_uint32_t c
       const spx_word16_t *iptr = & in[last_sample];
 
 #ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
-      if(moz_has_sse2()) {
+      if (!moz_speex_have_double_simd()) { /* SSE2 unavailable: use the scalar loop */
 #endif
       int j;
       double accum[4] = {0,0,0,0};
@@ -482,7 +466,7 @@ static int resampler_basic_interpolate_single(SpeexResamplerState *st, spx_uint3
 
 
 #ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-      if (!moz_has_sse()) {
+      if (!moz_speex_have_single_simd()) {
 #endif
       int j;
       spx_word32_t accum[4] = {0,0,0,0};
@@ -549,7 +533,7 @@ static int resampler_basic_interpolate_double(SpeexResamplerState *st, spx_uint3
 
 
 #ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-      if (!moz_has_sse2()) {
+      if (!moz_speex_have_double_simd()) {
 #endif
       int j;
       double accum[4] = {0,0,0,0};
diff --git a/media/libspeex_resampler/src/resample_neon.h b/media/libspeex_resampler/src/resample_neon.c
similarity index 96%
rename from media/libspeex_resampler/src/resample_neon.h
rename to media/libspeex_resampler/src/resample_neon.c
index 0acbd27b9a7f..0236e81f0532 100644
--- a/media/libspeex_resampler/src/resample_neon.h
+++ b/media/libspeex_resampler/src/resample_neon.c
@@ -36,6 +36,8 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#include "simd_detect.h"
+
 #include <arm_neon.h>
 
 #ifdef FIXED_POINT
@@ -65,7 +67,7 @@ static inline int32_t saturate_32bit_to_16bit(int32_t a) {
 
 #define OVERRIDE_INNER_PRODUCT_SINGLE
 /* Only works when len % 4 == 0 */
-static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
+int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
 {
     int32_t ret;
     uint32_t remainder = len % 16;
@@ -139,7 +141,7 @@ static inline int32_t saturate_float_to_16bit(float a) {
 
 #define OVERRIDE_INNER_PRODUCT_SINGLE
 /* Only works when len % 4 == 0 */
-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+float inner_product_single(const float *a, const float *b, unsigned int len)
 {
     float ret;
     uint32_t remainder = len % 16;
diff --git a/media/libspeex_resampler/src/resample_sse.h b/media/libspeex_resampler/src/resample_sse.c
similarity index 89%
rename from media/libspeex_resampler/src/resample_sse.h
rename to media/libspeex_resampler/src/resample_sse.c
index 64be8a161612..2eb7929ce21b 100644
--- a/media/libspeex_resampler/src/resample_sse.h
+++ b/media/libspeex_resampler/src/resample_sse.c
@@ -34,10 +34,12 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#include "simd_detect.h"
+
 #include <xmmintrin.h>
 
 #define OVERRIDE_INNER_PRODUCT_SINGLE
-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+float inner_product_single(const float *a, const float *b, unsigned int len)
 {
    int i;
    float ret;
@@ -54,7 +56,7 @@ static inline float inner_product_single(const float *a, const float *b, unsigne
 }
 
 #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
   int i;
   float ret;
   __m128 sum = _mm_setzero_ps();
@@ -75,7 +77,7 @@ static inline float interpolate_product_single(const float *a, const float *b, u
 #include <emmintrin.h>
 #define OVERRIDE_INNER_PRODUCT_DOUBLE
 
-static inline double inner_product_double(const float *a, const float *b, unsigned int len)
+double inner_product_double(const float *a, const float *b, unsigned int len)
 {
    int i;
    double ret;
@@ -97,7 +99,7 @@ static inline double inner_product_double(const float *a, const float *b, unsign
 }
 
 #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
   int i;
   double ret;
   __m128d sum;
diff --git a/media/libspeex_resampler/src/sse_detect.cpp b/media/libspeex_resampler/src/simd_detect.cpp
similarity index 58%
rename from media/libspeex_resampler/src/sse_detect.cpp
rename to media/libspeex_resampler/src/simd_detect.cpp
index b37112b66075..50111273b591 100644
--- a/media/libspeex_resampler/src/sse_detect.cpp
+++ b/media/libspeex_resampler/src/simd_detect.cpp
@@ -3,13 +3,25 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include "mozilla/SSE.h"
-#include "sse_detect.h"
+#include "simd_detect.h"
 
-int moz_has_sse2() {
+#include "mozilla/SSE.h"
+#include "mozilla/arm.h"
+
+#ifdef _USE_SSE2
+int moz_speex_have_double_simd() {
   return mozilla::supports_sse2() ? 1 : 0;
 }
+#endif
 
-int moz_has_sse() {
+#ifdef _USE_SSE
+int moz_speex_have_single_simd() {
   return mozilla::supports_sse() ? 1 : 0;
 }
+#endif
+
+#ifdef _USE_NEON
+int moz_speex_have_single_simd() {
+  return mozilla::supports_neon() ? 1 : 0;
+}
+#endif
diff --git a/media/libspeex_resampler/src/simd_detect.h b/media/libspeex_resampler/src/simd_detect.h
new file mode 100644
index 000000000000..f563b82b9e3c
--- /dev/null
+++ b/media/libspeex_resampler/src/simd_detect.h
@@ -0,0 +1,53 @@
+/* vim: set shiftwidth=2 tabstop=8 autoindent cindent expandtab: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* Shared declarations for the resampler's SIMD support: runtime CPU-feature
+ * queries (implemented in simd_detect.cpp) and prototypes for the SIMD kernels
+ * that resample.c calls (implemented in resample_sse.c / resample_neon.c). */
+
+#ifndef simd_detect_h
+#define simd_detect_h
+
+#include "speex_resampler.h"
+#include "arch.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Returns 1 if mozilla::supports_sse() (or supports_neon() in NEON builds)
+ * reports support, else 0.  Only defined when _USE_SSE or _USE_NEON is set. */
+int moz_speex_have_single_simd(void);
+/* Returns 1 if mozilla::supports_sse2() reports support, else 0.  Only defined
+ * when _USE_SSE2 is set. */
+int moz_speex_have_double_simd(void);
+
+/* Each OVERRIDE_* macro tells resample.c that a SIMD kernel is linked in;
+ * resample.c then picks the kernel or its C loop using the queries above. */
+#if defined(_USE_SSE) || defined(_USE_NEON)
+#define OVERRIDE_INNER_PRODUCT_SINGLE
+#define inner_product_single CAT_PREFIX(RANDOM_PREFIX,_inner_product_single)
+spx_word32_t inner_product_single(const spx_word16_t *a, const spx_word16_t *b, unsigned int len);
+#endif
+#if defined(_USE_SSE)
+#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+#define interpolate_product_single CAT_PREFIX(RANDOM_PREFIX,_interpolate_product_single)
+spx_word32_t interpolate_product_single(const spx_word16_t *a, const spx_word16_t *b, unsigned int len, const spx_uint32_t oversample, float *frac);
+#endif
+
+#if defined(_USE_SSE2)
+#define OVERRIDE_INNER_PRODUCT_DOUBLE
+#define inner_product_double CAT_PREFIX(RANDOM_PREFIX,_inner_product_double)
+double inner_product_double(const float *a, const float *b, unsigned int len);
+#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+#define interpolate_product_double CAT_PREFIX(RANDOM_PREFIX,_interpolate_product_double)
+double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // simd_detect_h
diff --git a/media/libspeex_resampler/src/sse_detect.h b/media/libspeex_resampler/src/sse_detect.h
deleted file mode 100644
index b246bb5c7c24..000000000000
--- a/media/libspeex_resampler/src/sse_detect.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* vim: set shiftwidth=2 tabstop=8 autoindent cindent expandtab: */
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#ifndef SSE_DETECT
-#define SSE_DETECT
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-  int moz_has_sse2();
-  int moz_has_sse();
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // SSE_DETECT
diff --git a/media/libspeex_resampler/sse-detect-runtime.patch b/media/libspeex_resampler/sse-detect-runtime.patch
deleted file mode 100644
index f24f07ee0608..000000000000
--- a/media/libspeex_resampler/sse-detect-runtime.patch
+++ /dev/null
@@ -1,192 +0,0 @@
-diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
---- a/media/libspeex_resampler/src/resample.c
-+++ b/media/libspeex_resampler/src/resample.c
-@@ -90,18 +90,28 @@ static void speex_free (void *ptr) {free
-                
- #define IMAX(a,b) ((a) > (b) ? (a) : (b))
- #define IMIN(a,b) ((a) < (b) ? (a) : (b))
- 
- #ifndef NULL
- #define NULL 0
- #endif
- 
-+#include "sse_detect.h"
-+
-+/* We compile SSE code on x86 all the time, but we only use it if we find at
-+ * runtime that the CPU supports it. */
- #ifdef _USE_SSE
-+#ifdef _MSC_VER
-+#define inline __inline
-+#endif
- #include "resample_sse.h"
-+#ifdef _MSC_VER
-+#undef inline
-+#endif
- #endif
- 
- #ifdef _USE_NEON
- #include "resample_neon.h"
- #endif
- 
- /* Numer of elements to allocate on the stack */
- #ifdef VAR_ARRAYS
-@@ -342,17 +352,19 @@ static int resampler_basic_direct_single
-    const spx_uint32_t den_rate = st->den_rate;
-    spx_word32_t sum;
- 
-    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
-    {
-       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
-       const spx_word16_t *iptr = & in[last_sample];
- 
--#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
-+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
-+      if (!moz_has_sse()) {
-+#endif
-       int j;
-       sum = 0;
-       for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
- 
- /*    This code is slower on most DSPs which have only 2 accumulators.
-       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
-       I think we can trust the compiler and let it vectorize and/or unroll itself.
-       spx_word32_t accum[4] = {0,0,0,0};
-@@ -360,18 +372,20 @@ static int resampler_basic_direct_single
-         accum[0] += MULT16_16(sinct[j], iptr[j]);
-         accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
-         accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
-         accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
-       }
-       sum = accum[0] + accum[1] + accum[2] + accum[3];
- */
-       sum = SATURATE32PSHR(sum, 15, 32767);
--#else
-+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
-+      } else {
-       sum = inner_product_single(sinct, iptr, N);
-+      }
- #endif
- 
-       out[out_stride * out_sample++] = sum;
-       last_sample += int_advance;
-       samp_frac_num += frac_advance;
-       if (samp_frac_num >= den_rate)
-       {
-          samp_frac_num -= den_rate;
-@@ -400,29 +414,33 @@ static int resampler_basic_direct_double
-    const spx_uint32_t den_rate = st->den_rate;
-    double sum;
- 
-    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
-    {
-       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
-       const spx_word16_t *iptr = & in[last_sample];
- 
--#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
-+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
-+      if(moz_has_sse2()) {
-+#endif
-       int j;
-       double accum[4] = {0,0,0,0};
- 
-       for(j=0;j<N;j+=4) {
-         accum[0] += sinct[j]*iptr[j];
-         accum[1] += sinct[j+1]*iptr[j+1];
-         accum[2] += sinct[j+2]*iptr[j+2];
-         accum[3] += sinct[j+3]*iptr[j+3];
-       }
-       sum = accum[0] + accum[1] + accum[2] + accum[3];
--#else
-+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
-+      } else {
-       sum = inner_product_double(sinct, iptr, N);
-+      }
- #endif
- 
-       out[out_stride * out_sample++] = PSHR32(sum, 15);
-       last_sample += int_advance;
-       samp_frac_num += frac_advance;
-       if (samp_frac_num >= den_rate)
-       {
-          samp_frac_num -= den_rate;
-@@ -456,34 +474,38 @@ static int resampler_basic_interpolate_s
- #ifdef FIXED_POINT
-       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
- #else
-       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
- #endif
-       spx_word16_t interp[4];
- 
- 
--#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-+      if (!moz_has_sse()) {
-+#endif
-       int j;
-       spx_word32_t accum[4] = {0,0,0,0};
- 
-       for(j=0;j<N;j++) {
-         const spx_word16_t curr_in=iptr[j];
-         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
-         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
-         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
-         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
-       }
- 
-       cubic_coef(frac, interp);
-       sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
-       sum = SATURATE32PSHR(sum, 15, 32767);
--#else
-+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-+      } else {
-       cubic_coef(frac, interp);
-       sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
-+      }
- #endif
-       
-       out[out_stride * out_sample++] = sum;
-       last_sample += int_advance;
-       samp_frac_num += frac_advance;
-       if (samp_frac_num >= den_rate)
-       {
-          samp_frac_num -= den_rate;
-@@ -519,33 +541,37 @@ static int resampler_basic_interpolate_d
- #ifdef FIXED_POINT
-       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
- #else
-       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
- #endif
-       spx_word16_t interp[4];
- 
- 
--#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-+      if (!moz_has_sse2()) {
-+#endif
-       int j;
-       double accum[4] = {0,0,0,0};
- 
-       for(j=0;j<N;j++) {
-         const double curr_in=iptr[j];
-         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
-         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
-         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
-         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
-       }
- 
-       cubic_coef(frac, interp);
-       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
--#else
-+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-+      } else {
-       cubic_coef(frac, interp);
-       sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
-+      }
- #endif
-       
-       out[out_stride * out_sample++] = PSHR32(sum,15);
-       last_sample += int_advance;
-       samp_frac_num += frac_advance;
-       if (samp_frac_num >= den_rate)
-       {
-          samp_frac_num -= den_rate;
diff --git a/media/libspeex_resampler/update.sh b/media/libspeex_resampler/update.sh
index 721ce41b435f..6f1f5007b7e6 100644
--- a/media/libspeex_resampler/update.sh
+++ b/media/libspeex_resampler/update.sh
@@ -10,8 +10,8 @@
 set -e -x
 
 cp $1/libspeexdsp/resample.c src
-cp $1/libspeexdsp/resample_sse.h src
-cp $1/libspeexdsp/resample_neon.h src
+cp $1/libspeexdsp/resample_sse.h src/resample_sse.c
+cp $1/libspeexdsp/resample_neon.h src/resample_neon.c
 cp $1/libspeexdsp/arch.h src
 cp $1/libspeexdsp/stack_alloc.h src
 cp $1/libspeexdsp/fixed_generic.h src
@@ -21,6 +21,6 @@ cp $1/COPYING .
 
 # apply outstanding local patches
 patch -p3 < outside-speex.patch
-patch -p3 < sse-detect-runtime.patch
+patch -p3 < simd-detect-runtime.patch
 patch -p3 < set-skip-frac.patch
 patch -p3 < hugemem.patch