mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-23 21:01:08 +00:00
b=1042508 move resampler simd optimizations to separate translation units r=padenot
This ensures that code in resample.c will run on Intel x86 cpus even when SSE support has been compiled, and will provide similarly for neon support when enabled. --HG-- rename : media/libspeex_resampler/sse-detect-runtime.patch => media/libspeex_resampler/simd-detect-runtime.patch rename : media/libspeex_resampler/src/resample_neon.h => media/libspeex_resampler/src/resample_neon.c rename : media/libspeex_resampler/src/resample_sse.h => media/libspeex_resampler/src/resample_sse.c rename : media/libspeex_resampler/src/sse_detect.cpp => media/libspeex_resampler/src/simd_detect.cpp rename : media/libspeex_resampler/src/sse_detect.h => media/libspeex_resampler/src/simd_detect.h extra : rebase_source : bdf1ef791129dedeadce7369354a5992729a99b7
This commit is contained in:
parent
75884b1238
commit
5a1e783dae
331
media/libspeex_resampler/simd-detect-runtime.patch
Normal file
331
media/libspeex_resampler/simd-detect-runtime.patch
Normal file
@ -0,0 +1,331 @@
|
||||
diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
|
||||
--- a/media/libspeex_resampler/src/resample.c
|
||||
+++ b/media/libspeex_resampler/src/resample.c
|
||||
@@ -92,23 +92,17 @@ static void speex_free (void *ptr) {free
|
||||
|
||||
#define IMAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
#define IMIN(a,b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
#ifndef NULL
|
||||
#define NULL 0
|
||||
#endif
|
||||
|
||||
-#ifdef _USE_SSE
|
||||
-#include "resample_sse.h"
|
||||
-#endif
|
||||
-
|
||||
-#ifdef _USE_NEON
|
||||
-#include "resample_neon.h"
|
||||
-#endif
|
||||
+#include "simd_detect.h"
|
||||
|
||||
/* Numer of elements to allocate on the stack */
|
||||
#ifdef VAR_ARRAYS
|
||||
#define FIXED_STACK_ALLOC 8192
|
||||
#else
|
||||
#define FIXED_STACK_ALLOC 1024
|
||||
#endif
|
||||
|
||||
@@ -344,17 +338,19 @@ static int resampler_basic_direct_single
|
||||
const spx_uint32_t den_rate = st->den_rate;
|
||||
spx_word32_t sum;
|
||||
|
||||
while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
|
||||
{
|
||||
const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
|
||||
const spx_word16_t *iptr = & in[last_sample];
|
||||
|
||||
-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
+ if (!moz_speex_have_single_simd()) {
|
||||
+#endif
|
||||
int j;
|
||||
sum = 0;
|
||||
for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
|
||||
|
||||
/* This code is slower on most DSPs which have only 2 accumulators.
|
||||
Plus this this forces truncation to 32 bits and you lose the HW guard bits.
|
||||
I think we can trust the compiler and let it vectorize and/or unroll itself.
|
||||
spx_word32_t accum[4] = {0,0,0,0};
|
||||
@@ -362,18 +358,20 @@ static int resampler_basic_direct_single
|
||||
accum[0] += MULT16_16(sinct[j], iptr[j]);
|
||||
accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
|
||||
accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
|
||||
accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
|
||||
}
|
||||
sum = accum[0] + accum[1] + accum[2] + accum[3];
|
||||
*/
|
||||
sum = SATURATE32PSHR(sum, 15, 32767);
|
||||
-#else
|
||||
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
+ } else {
|
||||
sum = inner_product_single(sinct, iptr, N);
|
||||
+ }
|
||||
#endif
|
||||
|
||||
out[out_stride * out_sample++] = sum;
|
||||
last_sample += int_advance;
|
||||
samp_frac_num += frac_advance;
|
||||
if (samp_frac_num >= den_rate)
|
||||
{
|
||||
samp_frac_num -= den_rate;
|
||||
@@ -402,29 +400,33 @@ static int resampler_basic_direct_double
|
||||
const spx_uint32_t den_rate = st->den_rate;
|
||||
double sum;
|
||||
|
||||
while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
|
||||
{
|
||||
const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
|
||||
const spx_word16_t *iptr = & in[last_sample];
|
||||
|
||||
-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
|
||||
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
|
||||
+ if(moz_speex_have_double_simd()) {
|
||||
+#endif
|
||||
int j;
|
||||
double accum[4] = {0,0,0,0};
|
||||
|
||||
for(j=0;j<N;j+=4) {
|
||||
accum[0] += sinct[j]*iptr[j];
|
||||
accum[1] += sinct[j+1]*iptr[j+1];
|
||||
accum[2] += sinct[j+2]*iptr[j+2];
|
||||
accum[3] += sinct[j+3]*iptr[j+3];
|
||||
}
|
||||
sum = accum[0] + accum[1] + accum[2] + accum[3];
|
||||
-#else
|
||||
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
|
||||
+ } else {
|
||||
sum = inner_product_double(sinct, iptr, N);
|
||||
+ }
|
||||
#endif
|
||||
|
||||
out[out_stride * out_sample++] = PSHR32(sum, 15);
|
||||
last_sample += int_advance;
|
||||
samp_frac_num += frac_advance;
|
||||
if (samp_frac_num >= den_rate)
|
||||
{
|
||||
samp_frac_num -= den_rate;
|
||||
@@ -458,34 +460,38 @@ static int resampler_basic_interpolate_s
|
||||
#ifdef FIXED_POINT
|
||||
const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
|
||||
#else
|
||||
const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
|
||||
#endif
|
||||
spx_word16_t interp[4];
|
||||
|
||||
|
||||
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
|
||||
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
|
||||
+ if (!moz_speex_have_single_simd()) {
|
||||
+#endif
|
||||
int j;
|
||||
spx_word32_t accum[4] = {0,0,0,0};
|
||||
|
||||
for(j=0;j<N;j++) {
|
||||
const spx_word16_t curr_in=iptr[j];
|
||||
accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
|
||||
accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
|
||||
accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
|
||||
accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
|
||||
}
|
||||
|
||||
cubic_coef(frac, interp);
|
||||
sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
|
||||
sum = SATURATE32PSHR(sum, 15, 32767);
|
||||
-#else
|
||||
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
|
||||
+ } else {
|
||||
cubic_coef(frac, interp);
|
||||
sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
|
||||
+ }
|
||||
#endif
|
||||
|
||||
out[out_stride * out_sample++] = sum;
|
||||
last_sample += int_advance;
|
||||
samp_frac_num += frac_advance;
|
||||
if (samp_frac_num >= den_rate)
|
||||
{
|
||||
samp_frac_num -= den_rate;
|
||||
@@ -521,33 +527,37 @@ static int resampler_basic_interpolate_d
|
||||
#ifdef FIXED_POINT
|
||||
const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
|
||||
#else
|
||||
const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
|
||||
#endif
|
||||
spx_word16_t interp[4];
|
||||
|
||||
|
||||
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
|
||||
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
|
||||
+ if (!moz_speex_have_double_simd()) {
|
||||
+#endif
|
||||
int j;
|
||||
double accum[4] = {0,0,0,0};
|
||||
|
||||
for(j=0;j<N;j++) {
|
||||
const double curr_in=iptr[j];
|
||||
accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
|
||||
accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
|
||||
accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
|
||||
accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
|
||||
}
|
||||
|
||||
cubic_coef(frac, interp);
|
||||
sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
|
||||
-#else
|
||||
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
|
||||
+ } else {
|
||||
cubic_coef(frac, interp);
|
||||
sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
|
||||
+ }
|
||||
#endif
|
||||
|
||||
out[out_stride * out_sample++] = PSHR32(sum,15);
|
||||
last_sample += int_advance;
|
||||
samp_frac_num += frac_advance;
|
||||
if (samp_frac_num >= den_rate)
|
||||
{
|
||||
samp_frac_num -= den_rate;
|
||||
diff --git a/media/libspeex_resampler/src/resample_neon.c b/media/libspeex_resampler/src/resample_neon.c
|
||||
--- a/media/libspeex_resampler/src/resample_neon.c
|
||||
+++ b/media/libspeex_resampler/src/resample_neon.c
|
||||
@@ -31,16 +31,18 @@
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
+#include "simd_detect.h"
|
||||
+
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifdef FIXED_POINT
|
||||
#ifdef __thumb2__
|
||||
static inline int32_t saturate_32bit_to_16bit(int32_t a) {
|
||||
int32_t ret;
|
||||
asm ("ssat %[ret], #16, %[a]"
|
||||
: [ret] "=&r" (ret)
|
||||
@@ -60,17 +62,17 @@ static inline int32_t saturate_32bit_to_
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
#undef WORD2INT
|
||||
#define WORD2INT(x) (saturate_32bit_to_16bit(x))
|
||||
|
||||
#define OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
/* Only works when len % 4 == 0 */
|
||||
-static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
|
||||
+int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
|
||||
{
|
||||
int32_t ret;
|
||||
uint32_t remainder = len % 16;
|
||||
len = len - remainder;
|
||||
|
||||
asm volatile (" cmp %[len], #0\n"
|
||||
" bne 1f\n"
|
||||
" vld1.16 {d16}, [%[b]]!\n"
|
||||
@@ -134,17 +136,17 @@ static inline int32_t saturate_float_to_
|
||||
: "q0");
|
||||
return ret;
|
||||
}
|
||||
#undef WORD2INT
|
||||
#define WORD2INT(x) (saturate_float_to_16bit(x))
|
||||
|
||||
#define OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
/* Only works when len % 4 == 0 */
|
||||
-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
|
||||
+float inner_product_single(const float *a, const float *b, unsigned int len)
|
||||
{
|
||||
float ret;
|
||||
uint32_t remainder = len % 16;
|
||||
len = len - remainder;
|
||||
|
||||
asm volatile (" cmp %[len], #0\n"
|
||||
" bne 1f\n"
|
||||
" vld1.32 {q4}, [%[b]]!\n"
|
||||
diff --git a/media/libspeex_resampler/src/resample_sse.c b/media/libspeex_resampler/src/resample_sse.c
|
||||
--- a/media/libspeex_resampler/src/resample_sse.c
|
||||
+++ b/media/libspeex_resampler/src/resample_sse.c
|
||||
@@ -29,37 +29,39 @@
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
+#include "simd_detect.h"
|
||||
+
|
||||
#include <xmmintrin.h>
|
||||
|
||||
#define OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
|
||||
+float inner_product_single(const float *a, const float *b, unsigned int len)
|
||||
{
|
||||
int i;
|
||||
float ret;
|
||||
__m128 sum = _mm_setzero_ps();
|
||||
for (i=0;i<len;i+=8)
|
||||
{
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
|
||||
}
|
||||
sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
|
||||
sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
|
||||
_mm_store_ss(&ret, sum);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
|
||||
-static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
|
||||
+float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
|
||||
int i;
|
||||
float ret;
|
||||
__m128 sum = _mm_setzero_ps();
|
||||
__m128 f = _mm_loadu_ps(frac);
|
||||
for(i=0;i<len;i+=2)
|
||||
{
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
|
||||
@@ -70,17 +72,17 @@ static inline float interpolate_product_
|
||||
_mm_store_ss(&ret, sum);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef _USE_SSE2
|
||||
#include <emmintrin.h>
|
||||
#define OVERRIDE_INNER_PRODUCT_DOUBLE
|
||||
|
||||
-static inline double inner_product_double(const float *a, const float *b, unsigned int len)
|
||||
+double inner_product_double(const float *a, const float *b, unsigned int len)
|
||||
{
|
||||
int i;
|
||||
double ret;
|
||||
__m128d sum = _mm_setzero_pd();
|
||||
__m128 t;
|
||||
for (i=0;i<len;i+=8)
|
||||
{
|
||||
t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
|
||||
@@ -92,17 +94,17 @@ static inline double inner_product_doubl
|
||||
sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
|
||||
}
|
||||
sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
|
||||
_mm_store_sd(&ret, sum);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
|
||||
-static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
|
||||
+double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
|
||||
int i;
|
||||
double ret;
|
||||
__m128d sum;
|
||||
__m128d sum1 = _mm_setzero_pd();
|
||||
__m128d sum2 = _mm_setzero_pd();
|
||||
__m128 f = _mm_loadu_ps(frac);
|
||||
__m128d f1 = _mm_cvtps_pd(f);
|
||||
__m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
|
@ -10,12 +10,9 @@ EXPORTS.speex += [
|
||||
'speex_resampler.h',
|
||||
]
|
||||
|
||||
SOURCES += [
|
||||
'sse_detect.cpp',
|
||||
]
|
||||
|
||||
SOURCES += [
|
||||
'resample.c',
|
||||
'simd_detect.cpp',
|
||||
]
|
||||
|
||||
MSVC_ENABLE_PGO = True
|
||||
@ -40,7 +37,10 @@ else:
|
||||
if CONFIG['INTEL_ARCHITECTURE'] and not CONFIG['MOZ_SAMPLE_TYPE_S16']:
|
||||
DEFINES['_USE_SSE'] = True
|
||||
DEFINES['_USE_SSE2'] = True
|
||||
SOURCES['resample.c'].flags += CONFIG['SSE2_FLAGS']
|
||||
SOURCES += [
|
||||
'resample_sse.c'
|
||||
]
|
||||
SOURCES['resample_sse.c'].flags += CONFIG['SSE2_FLAGS']
|
||||
|
||||
# Suppress warnings in third-party code.
|
||||
if CONFIG['GNU_CC']:
|
||||
|
@ -97,23 +97,7 @@ static void speex_free (void *ptr) {free(ptr);}
|
||||
#define NULL 0
|
||||
#endif
|
||||
|
||||
#include "sse_detect.h"
|
||||
|
||||
/* We compile SSE code on x86 all the time, but we only use it if we find at
|
||||
* runtime that the CPU supports it. */
|
||||
#ifdef _USE_SSE
|
||||
#ifdef _MSC_VER
|
||||
#define inline __inline
|
||||
#endif
|
||||
#include "resample_sse.h"
|
||||
#ifdef _MSC_VER
|
||||
#undef inline
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef _USE_NEON
|
||||
#include "resample_neon.h"
|
||||
#endif
|
||||
#include "simd_detect.h"
|
||||
|
||||
/* Numer of elements to allocate on the stack */
|
||||
#ifdef VAR_ARRAYS
|
||||
@ -360,7 +344,7 @@ static int resampler_basic_direct_single(SpeexResamplerState *st, spx_uint32_t c
|
||||
const spx_word16_t *iptr = & in[last_sample];
|
||||
|
||||
#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
if (!moz_has_sse()) {
|
||||
if (!moz_speex_have_single_simd()) {
|
||||
#endif
|
||||
int j;
|
||||
sum = 0;
|
||||
@ -422,7 +406,7 @@ static int resampler_basic_direct_double(SpeexResamplerState *st, spx_uint32_t c
|
||||
const spx_word16_t *iptr = & in[last_sample];
|
||||
|
||||
#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
|
||||
if(moz_has_sse2()) {
|
||||
if(moz_speex_have_double_simd()) {
|
||||
#endif
|
||||
int j;
|
||||
double accum[4] = {0,0,0,0};
|
||||
@ -482,7 +466,7 @@ static int resampler_basic_interpolate_single(SpeexResamplerState *st, spx_uint3
|
||||
|
||||
|
||||
#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
|
||||
if (!moz_has_sse()) {
|
||||
if (!moz_speex_have_single_simd()) {
|
||||
#endif
|
||||
int j;
|
||||
spx_word32_t accum[4] = {0,0,0,0};
|
||||
@ -549,7 +533,7 @@ static int resampler_basic_interpolate_double(SpeexResamplerState *st, spx_uint3
|
||||
|
||||
|
||||
#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
|
||||
if (!moz_has_sse2()) {
|
||||
if (!moz_speex_have_double_simd()) {
|
||||
#endif
|
||||
int j;
|
||||
double accum[4] = {0,0,0,0};
|
||||
|
@ -36,6 +36,8 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "simd_detect.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifdef FIXED_POINT
|
||||
@ -65,7 +67,7 @@ static inline int32_t saturate_32bit_to_16bit(int32_t a) {
|
||||
|
||||
#define OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
/* Only works when len % 4 == 0 */
|
||||
static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
|
||||
int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
|
||||
{
|
||||
int32_t ret;
|
||||
uint32_t remainder = len % 16;
|
||||
@ -139,7 +141,7 @@ static inline int32_t saturate_float_to_16bit(float a) {
|
||||
|
||||
#define OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
/* Only works when len % 4 == 0 */
|
||||
static inline float inner_product_single(const float *a, const float *b, unsigned int len)
|
||||
float inner_product_single(const float *a, const float *b, unsigned int len)
|
||||
{
|
||||
float ret;
|
||||
uint32_t remainder = len % 16;
|
@ -34,10 +34,12 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "simd_detect.h"
|
||||
|
||||
#include <xmmintrin.h>
|
||||
|
||||
#define OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
static inline float inner_product_single(const float *a, const float *b, unsigned int len)
|
||||
float inner_product_single(const float *a, const float *b, unsigned int len)
|
||||
{
|
||||
int i;
|
||||
float ret;
|
||||
@ -54,7 +56,7 @@ static inline float inner_product_single(const float *a, const float *b, unsigne
|
||||
}
|
||||
|
||||
#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
|
||||
static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
|
||||
float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
|
||||
int i;
|
||||
float ret;
|
||||
__m128 sum = _mm_setzero_ps();
|
||||
@ -75,7 +77,7 @@ static inline float interpolate_product_single(const float *a, const float *b, u
|
||||
#include <emmintrin.h>
|
||||
#define OVERRIDE_INNER_PRODUCT_DOUBLE
|
||||
|
||||
static inline double inner_product_double(const float *a, const float *b, unsigned int len)
|
||||
double inner_product_double(const float *a, const float *b, unsigned int len)
|
||||
{
|
||||
int i;
|
||||
double ret;
|
||||
@ -97,7 +99,7 @@ static inline double inner_product_double(const float *a, const float *b, unsign
|
||||
}
|
||||
|
||||
#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
|
||||
static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
|
||||
double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
|
||||
int i;
|
||||
double ret;
|
||||
__m128d sum;
|
@ -3,13 +3,25 @@
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "mozilla/SSE.h"
|
||||
#include "sse_detect.h"
|
||||
#include "simd_detect.h"
|
||||
|
||||
int moz_has_sse2() {
|
||||
#include "mozilla/SSE.h"
|
||||
#include "mozilla/arm.h"
|
||||
|
||||
#ifdef _USE_SSE2
|
||||
int moz_speex_have_double_simd() {
|
||||
return mozilla::supports_sse2() ? 1 : 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
int moz_has_sse() {
|
||||
#ifdef _USE_SSE
|
||||
int moz_speex_have_single_simd() {
|
||||
return mozilla::supports_sse() ? 1 : 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef _USE_NEON
|
||||
int moz_speex_have_single_simd() {
|
||||
return mozilla::supports_neon() ? 1 : 0;
|
||||
}
|
||||
#endif
|
43
media/libspeex_resampler/src/simd_detect.h
Normal file
43
media/libspeex_resampler/src/simd_detect.h
Normal file
@ -0,0 +1,43 @@
|
||||
/* vim: set shiftwidth=2 tabstop=8 autoindent cindent expandtab: */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef simd_detect_h
|
||||
#define simd_detect_h
|
||||
|
||||
#include "speex_resampler.h"
|
||||
#include "arch.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
int moz_speex_have_single_simd();
|
||||
int moz_speex_have_double_simd();
|
||||
|
||||
#if defined(_USE_SSE) || defined(_USE_NEON)
|
||||
#define OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
#define inner_product_single CAT_PREFIX(RANDOM_PREFIX,_inner_product_single)
|
||||
spx_word32_t inner_product_single(const spx_word16_t *a, const spx_word16_t *b, unsigned int len);
|
||||
#endif
|
||||
#if defined(_USE_SSE)
|
||||
#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
|
||||
#define interpolate_product_single CAT_PREFIX(RANDOM_PREFIX,_interpolate_product_single)
|
||||
spx_word32_t interpolate_product_single(const spx_word16_t *a, const spx_word16_t *b, unsigned int len, const spx_uint32_t oversample, float *frac);
|
||||
#endif
|
||||
|
||||
#if defined(_USE_SSE2)
|
||||
#define OVERRIDE_INNER_PRODUCT_DOUBLE
|
||||
#define inner_product_double CAT_PREFIX(RANDOM_PREFIX,_inner_product_double)
|
||||
double inner_product_double(const float *a, const float *b, unsigned int len);
|
||||
#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
|
||||
#define interpolate_product_double CAT_PREFIX(RANDOM_PREFIX,_interpolate_product_double)
|
||||
double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // simd_detect_h
|
@ -1,20 +0,0 @@
|
||||
/* vim: set shiftwidth=2 tabstop=8 autoindent cindent expandtab: */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef SSE_DETECT
|
||||
#define SSE_DETECT
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
int moz_has_sse2();
|
||||
int moz_has_sse();
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // SSE_DETECT
|
@ -1,192 +0,0 @@
|
||||
diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
|
||||
--- a/media/libspeex_resampler/src/resample.c
|
||||
+++ b/media/libspeex_resampler/src/resample.c
|
||||
@@ -90,18 +90,28 @@ static void speex_free (void *ptr) {free
|
||||
|
||||
#define IMAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
#define IMIN(a,b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
#ifndef NULL
|
||||
#define NULL 0
|
||||
#endif
|
||||
|
||||
+#include "sse_detect.h"
|
||||
+
|
||||
+/* We compile SSE code on x86 all the time, but we only use it if we find at
|
||||
+ * runtime that the CPU supports it. */
|
||||
#ifdef _USE_SSE
|
||||
+#ifdef _MSC_VER
|
||||
+#define inline __inline
|
||||
+#endif
|
||||
#include "resample_sse.h"
|
||||
+#ifdef _MSC_VER
|
||||
+#undef inline
|
||||
+#endif
|
||||
#endif
|
||||
|
||||
#ifdef _USE_NEON
|
||||
#include "resample_neon.h"
|
||||
#endif
|
||||
|
||||
/* Numer of elements to allocate on the stack */
|
||||
#ifdef VAR_ARRAYS
|
||||
@@ -342,17 +352,19 @@ static int resampler_basic_direct_single
|
||||
const spx_uint32_t den_rate = st->den_rate;
|
||||
spx_word32_t sum;
|
||||
|
||||
while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
|
||||
{
|
||||
const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
|
||||
const spx_word16_t *iptr = & in[last_sample];
|
||||
|
||||
-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
+ if (!moz_has_sse()) {
|
||||
+#endif
|
||||
int j;
|
||||
sum = 0;
|
||||
for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
|
||||
|
||||
/* This code is slower on most DSPs which have only 2 accumulators.
|
||||
Plus this this forces truncation to 32 bits and you lose the HW guard bits.
|
||||
I think we can trust the compiler and let it vectorize and/or unroll itself.
|
||||
spx_word32_t accum[4] = {0,0,0,0};
|
||||
@@ -360,18 +372,20 @@ static int resampler_basic_direct_single
|
||||
accum[0] += MULT16_16(sinct[j], iptr[j]);
|
||||
accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
|
||||
accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
|
||||
accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
|
||||
}
|
||||
sum = accum[0] + accum[1] + accum[2] + accum[3];
|
||||
*/
|
||||
sum = SATURATE32PSHR(sum, 15, 32767);
|
||||
-#else
|
||||
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
|
||||
+ } else {
|
||||
sum = inner_product_single(sinct, iptr, N);
|
||||
+ }
|
||||
#endif
|
||||
|
||||
out[out_stride * out_sample++] = sum;
|
||||
last_sample += int_advance;
|
||||
samp_frac_num += frac_advance;
|
||||
if (samp_frac_num >= den_rate)
|
||||
{
|
||||
samp_frac_num -= den_rate;
|
||||
@@ -400,29 +414,33 @@ static int resampler_basic_direct_double
|
||||
const spx_uint32_t den_rate = st->den_rate;
|
||||
double sum;
|
||||
|
||||
while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
|
||||
{
|
||||
const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
|
||||
const spx_word16_t *iptr = & in[last_sample];
|
||||
|
||||
-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
|
||||
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
|
||||
+ if(moz_has_sse2()) {
|
||||
+#endif
|
||||
int j;
|
||||
double accum[4] = {0,0,0,0};
|
||||
|
||||
for(j=0;j<N;j+=4) {
|
||||
accum[0] += sinct[j]*iptr[j];
|
||||
accum[1] += sinct[j+1]*iptr[j+1];
|
||||
accum[2] += sinct[j+2]*iptr[j+2];
|
||||
accum[3] += sinct[j+3]*iptr[j+3];
|
||||
}
|
||||
sum = accum[0] + accum[1] + accum[2] + accum[3];
|
||||
-#else
|
||||
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
|
||||
+ } else {
|
||||
sum = inner_product_double(sinct, iptr, N);
|
||||
+ }
|
||||
#endif
|
||||
|
||||
out[out_stride * out_sample++] = PSHR32(sum, 15);
|
||||
last_sample += int_advance;
|
||||
samp_frac_num += frac_advance;
|
||||
if (samp_frac_num >= den_rate)
|
||||
{
|
||||
samp_frac_num -= den_rate;
|
||||
@@ -456,34 +474,38 @@ static int resampler_basic_interpolate_s
|
||||
#ifdef FIXED_POINT
|
||||
const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
|
||||
#else
|
||||
const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
|
||||
#endif
|
||||
spx_word16_t interp[4];
|
||||
|
||||
|
||||
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
|
||||
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
|
||||
+ if (!moz_has_sse()) {
|
||||
+#endif
|
||||
int j;
|
||||
spx_word32_t accum[4] = {0,0,0,0};
|
||||
|
||||
for(j=0;j<N;j++) {
|
||||
const spx_word16_t curr_in=iptr[j];
|
||||
accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
|
||||
accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
|
||||
accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
|
||||
accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
|
||||
}
|
||||
|
||||
cubic_coef(frac, interp);
|
||||
sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
|
||||
sum = SATURATE32PSHR(sum, 15, 32767);
|
||||
-#else
|
||||
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
|
||||
+ } else {
|
||||
cubic_coef(frac, interp);
|
||||
sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
|
||||
+ }
|
||||
#endif
|
||||
|
||||
out[out_stride * out_sample++] = sum;
|
||||
last_sample += int_advance;
|
||||
samp_frac_num += frac_advance;
|
||||
if (samp_frac_num >= den_rate)
|
||||
{
|
||||
samp_frac_num -= den_rate;
|
||||
@@ -519,33 +541,37 @@ static int resampler_basic_interpolate_d
|
||||
#ifdef FIXED_POINT
|
||||
const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
|
||||
#else
|
||||
const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
|
||||
#endif
|
||||
spx_word16_t interp[4];
|
||||
|
||||
|
||||
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
|
||||
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
|
||||
+ if (!moz_has_sse2()) {
|
||||
+#endif
|
||||
int j;
|
||||
double accum[4] = {0,0,0,0};
|
||||
|
||||
for(j=0;j<N;j++) {
|
||||
const double curr_in=iptr[j];
|
||||
accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
|
||||
accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
|
||||
accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
|
||||
accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
|
||||
}
|
||||
|
||||
cubic_coef(frac, interp);
|
||||
sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
|
||||
-#else
|
||||
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
|
||||
+ } else {
|
||||
cubic_coef(frac, interp);
|
||||
sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
|
||||
+ }
|
||||
#endif
|
||||
|
||||
out[out_stride * out_sample++] = PSHR32(sum,15);
|
||||
last_sample += int_advance;
|
||||
samp_frac_num += frac_advance;
|
||||
if (samp_frac_num >= den_rate)
|
||||
{
|
||||
samp_frac_num -= den_rate;
|
@ -10,8 +10,8 @@
|
||||
set -e -x
|
||||
|
||||
cp $1/libspeexdsp/resample.c src
|
||||
cp $1/libspeexdsp/resample_sse.h src
|
||||
cp $1/libspeexdsp/resample_neon.h src
|
||||
cp $1/libspeexdsp/resample_sse.h src/resample_sse.c
|
||||
cp $1/libspeexdsp/resample_neon.h src/resample_neon.c
|
||||
cp $1/libspeexdsp/arch.h src
|
||||
cp $1/libspeexdsp/stack_alloc.h src
|
||||
cp $1/libspeexdsp/fixed_generic.h src
|
||||
@ -21,6 +21,6 @@ cp $1/COPYING .
|
||||
|
||||
# apply outstanding local patches
|
||||
patch -p3 < outside-speex.patch
|
||||
patch -p3 < sse-detect-runtime.patch
|
||||
patch -p3 < simd-detect-runtime.patch
|
||||
patch -p3 < set-skip-frac.patch
|
||||
patch -p3 < hugemem.patch
|
||||
|
Loading…
Reference in New Issue
Block a user