Backed out 4 changesets (bug 1854912) for causing failures in sub-sample-scheduling.html

Backed out changeset 6d23b44c6fc0 (bug 1854912)
Backed out changeset 3719cb3c102a (bug 1854912)
Backed out changeset 7fe68331b4e9 (bug 1854912)
Backed out changeset 1715d7aafa06 (bug 1854912)
This commit is contained in:
Noemi Erli 2023-10-02 21:19:40 +03:00
parent 26e0b0be0a
commit 99f938e4bd
14 changed files with 374 additions and 382 deletions

View File

@ -1,37 +1,22 @@
diff --git a/src/resample.c b/src/resample.c
--- a/src/resample.c
+++ b/src/resample.c
@@ -91,23 +91,17 @@ static void speex_free(void *ptr) {free(
#ifndef NULL
#define NULL 0
@@ -94,13 +94,7 @@ static void speex_free (void *ptr) {free(ptr);}
#define UINT32_MAX 4294967296U
#endif
#ifndef UINT32_MAX
#define UINT32_MAX 4294967295U
#endif
-#ifdef USE_SSE
-#ifdef _USE_SSE
-#include "resample_sse.h"
-#endif
-
-#ifdef USE_NEON
-#ifdef _USE_NEON
-#include "resample_neon.h"
-#endif
+#include "simd_detect.h"
/* Number of elements to allocate on the stack */
/* Numer of elements to allocate on the stack */
#ifdef VAR_ARRAYS
#define FIXED_STACK_ALLOC 8192
#else
#define FIXED_STACK_ALLOC 1024
#endif
@@ -341,17 +335,19 @@ static int resampler_basic_direct_single
const spx_uint32_t den_rate = st->den_rate;
spx_word32_t sum;
while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
{
@@ -346,7 +340,9 @@ static int resampler_basic_direct_single(SpeexResamplerState *st, spx_uint32_t c
const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
const spx_word16_t *iptr = & in[last_sample];
@ -42,17 +27,7 @@ diff --git a/src/resample.c b/src/resample.c
int j;
sum = 0;
for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
/* This code is slower on most DSPs which have only 2 accumulators.
Plus this this forces truncation to 32 bits and you lose the HW guard bits.
I think we can trust the compiler and let it vectorize and/or unroll itself.
spx_word32_t accum[4] = {0,0,0,0};
@@ -359,18 +355,20 @@ static int resampler_basic_direct_single
accum[0] += MULT16_16(sinct[j], iptr[j]);
accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
}
@@ -364,8 +360,10 @@ static int resampler_basic_direct_single(SpeexResamplerState *st, spx_uint32_t c
sum = accum[0] + accum[1] + accum[2] + accum[3];
*/
sum = SATURATE32PSHR(sum, 15, 32767);
@ -64,17 +39,7 @@ diff --git a/src/resample.c b/src/resample.c
#endif
out[out_stride * out_sample++] = sum;
last_sample += int_advance;
samp_frac_num += frac_advance;
if (samp_frac_num >= den_rate)
{
samp_frac_num -= den_rate;
@@ -399,29 +397,33 @@ static int resampler_basic_direct_double
const spx_uint32_t den_rate = st->den_rate;
double sum;
while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
{
@@ -404,7 +402,9 @@ static int resampler_basic_direct_double(SpeexResamplerState *st, spx_uint32_t c
const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
const spx_word16_t *iptr = & in[last_sample];
@ -85,10 +50,7 @@ diff --git a/src/resample.c b/src/resample.c
int j;
double accum[4] = {0,0,0,0};
for(j=0;j<N;j+=4) {
accum[0] += sinct[j]*iptr[j];
accum[1] += sinct[j+1]*iptr[j+1];
accum[2] += sinct[j+2]*iptr[j+2];
@@ -415,8 +415,10 @@ static int resampler_basic_direct_double(SpeexResamplerState *st, spx_uint32_t c
accum[3] += sinct[j+3]*iptr[j+3];
}
sum = accum[0] + accum[1] + accum[2] + accum[3];
@ -100,17 +62,7 @@ diff --git a/src/resample.c b/src/resample.c
#endif
out[out_stride * out_sample++] = PSHR32(sum, 15);
last_sample += int_advance;
samp_frac_num += frac_advance;
if (samp_frac_num >= den_rate)
{
samp_frac_num -= den_rate;
@@ -455,34 +457,38 @@ static int resampler_basic_interpolate_s
#ifdef FIXED_POINT
const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
#else
const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
#endif
@@ -460,7 +462,9 @@ static int resampler_basic_interpolate_single(SpeexResamplerState *st, spx_uint3
spx_word16_t interp[4];
@ -121,16 +73,9 @@ diff --git a/src/resample.c b/src/resample.c
int j;
spx_word32_t accum[4] = {0,0,0,0};
for(j=0;j<N;j++) {
const spx_word16_t curr_in=iptr[j];
accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
}
@@ -475,9 +479,11 @@ static int resampler_basic_interpolate_single(SpeexResamplerState *st, spx_uint3
cubic_coef(frac, interp);
sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
sum = SATURATE32PSHR(sum, 15, 32767);
-#else
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
@ -141,17 +86,7 @@ diff --git a/src/resample.c b/src/resample.c
#endif
out[out_stride * out_sample++] = sum;
last_sample += int_advance;
samp_frac_num += frac_advance;
if (samp_frac_num >= den_rate)
{
samp_frac_num -= den_rate;
@@ -518,33 +524,37 @@ static int resampler_basic_interpolate_d
#ifdef FIXED_POINT
const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
#else
const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
#endif
@@ -523,7 +529,9 @@ static int resampler_basic_interpolate_double(SpeexResamplerState *st, spx_uint3
spx_word16_t interp[4];
@ -162,13 +97,7 @@ diff --git a/src/resample.c b/src/resample.c
int j;
double accum[4] = {0,0,0,0};
for(j=0;j<N;j++) {
const double curr_in=iptr[j];
accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
}
@@ -537,9 +545,11 @@ static int resampler_basic_interpolate_double(SpeexResamplerState *st, spx_uint3
cubic_coef(frac, interp);
sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
@ -181,15 +110,11 @@ diff --git a/src/resample.c b/src/resample.c
#endif
out[out_stride * out_sample++] = PSHR32(sum,15);
last_sample += int_advance;
samp_frac_num += frac_advance;
if (samp_frac_num >= den_rate)
{
samp_frac_num -= den_rate;
diff --git a/src/resample_neon.c b/src/resample_neon.c
--- a/src/resample_neon.c
+++ b/src/resample_neon.c
@@ -32,16 +32,17 @@
@@ -31,16 +31,18 @@
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
@ -197,46 +122,27 @@ diff --git a/src/resample_neon.c b/src/resample_neon.c
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdint.h>
+#include "simd_detect.h"
+
#include <arm_neon.h>
#ifdef FIXED_POINT
#if defined(__aarch64__)
#ifdef __thumb2__
static inline int32_t saturate_32bit_to_16bit(int32_t a) {
int32_t ret;
asm ("fmov s0, %w[a]\n"
"sqxtn h0, s0\n"
"sxtl v0.4s, v0.4h\n"
@@ -73,17 +74,17 @@
asm ("ssat %[ret], #16, %[a]"
: [ret] "=&r" (ret)
@@ -60,17 +62,17 @@ static inline int32_t saturate_32bit_to_
return ret;
}
#endif
#undef WORD2INT
#define WORD2INT(x) (saturate_32bit_to_16bit(x))
#define OVERRIDE_INNER_PRODUCT_SINGLE
/* Only works when len % 4 == 0 and len >= 4 */
#if defined(__aarch64__)
/* Only works when len % 4 == 0 */
-static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
+inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
{
int32_t ret;
uint32_t remainder = len % 16;
len = len - remainder;
asm volatile (" cmp %w[len], #0\n"
" b.ne 1f\n"
" ld1 {v16.4h}, [%[b]], #8\n"
@@ -128,17 +129,17 @@
: [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
[len] "+r" (len), [remainder] "+r" (remainder)
:
: "cc", "v0",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
return ret;
}
#else
-static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
+inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
+int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
{
int32_t ret;
uint32_t remainder = len % 16;
@ -245,36 +151,17 @@ diff --git a/src/resample_neon.c b/src/resample_neon.c
asm volatile (" cmp %[len], #0\n"
" bne 1f\n"
" vld1.16 {d16}, [%[b]]!\n"
@@ -218,17 +219,17 @@
#endif
@@ -134,17 +136,17 @@ static inline int32_t saturate_float_to_
: "q0");
return ret;
}
#undef WORD2INT
#define WORD2INT(x) (saturate_float_to_16bit(x))
#define OVERRIDE_INNER_PRODUCT_SINGLE
/* Only works when len % 4 == 0 and len >= 4 */
#if defined(__aarch64__)
/* Only works when len % 4 == 0 */
-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+inline float inner_product_single(const float *a, const float *b, unsigned int len)
{
float ret;
uint32_t remainder = len % 16;
len = len - remainder;
asm volatile (" cmp %w[len], #0\n"
" b.ne 1f\n"
" ld1 {v16.4s}, [%[b]], #16\n"
@@ -273,17 +274,17 @@
: [ret] "=w" (ret), [a] "+r" (a), [b] "+r" (b),
[len] "+r" (len), [remainder] "+r" (remainder)
:
: "cc", "v1", "v2", "v3", "v4",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
return ret;
}
#else
-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+inline float inner_product_single(const float *a, const float *b, unsigned int len)
+float inner_product_single(const float *a, const float *b, unsigned int len)
{
float ret;
uint32_t remainder = len % 16;
@ -333,7 +220,7 @@ diff --git a/src/resample_sse.c b/src/resample_sse.c
return ret;
}
#ifdef USE_SSE2
#ifdef _USE_SSE2
#include <emmintrin.h>
#define OVERRIDE_INNER_PRODUCT_DOUBLE

View File

@ -14,13 +14,13 @@ diff --git a/src/resample.c b/src/resample.c
+
#ifdef OUTSIDE_SPEEX
#include <stdlib.h>
static void *speex_alloc(int size) {return calloc(size,1);}
static void *speex_realloc(void *ptr, int size) {return realloc(ptr, size);}
static void speex_free(void *ptr) {free(ptr);}
#ifndef EXPORT
#define EXPORT
#endif
@@ -633,25 +645,26 @@ static int update_filter(SpeexResamplerS
static void *speex_alloc (int size) {return calloc(size,1);}
static void *speex_realloc (void *ptr, int size) {return realloc(ptr, size);}
static void speex_free (void *ptr) {free(ptr);}
#include "speex_resampler.h"
#include "arch.h"
#else /* OUTSIDE_SPEEX */
@@ -643,25 +645,26 @@ static int update_filter(SpeexResamplerS
st->oversample >>= 1;
if (st->oversample < 1)
st->oversample = 1;
@ -29,24 +29,23 @@ diff --git a/src/resample.c b/src/resample.c
st->cutoff = quality_map[st->quality].upsample_bandwidth;
}
- /* Choose the resampling type that requires the least amount of memory */
-#ifdef RESAMPLE_FULL_SINC_TABLE
- use_direct = 1;
- if (INT_MAX/sizeof(spx_word16_t)/st->den_rate < st->filt_len)
- goto fail;
-#else
- /* Choose the resampling type that requires the least amount of memory */
- use_direct = st->filt_len*st->den_rate <= st->filt_len*st->oversample+8
- && INT_MAX/sizeof(spx_word16_t)/st->den_rate >= st->filt_len;
+ use_direct =
+#ifdef RESAMPLE_HUGEMEM
+ /* Choose the direct resampler, even with higher initialization costs,
+ when resampling any multiple of 100 to 44100. */
+ st->den_rate <= 441
+#else
#else
- use_direct = st->filt_len*st->den_rate <= st->filt_len*st->oversample+8
+ /* Choose the resampling type that requires the least amount of memory */
+ st->filt_len*st->den_rate <= st->filt_len*st->oversample+8
#endif
+ && INT_MAX/sizeof(spx_word16_t)/st->den_rate >= st->filt_len;
+#endif
&& INT_MAX/sizeof(spx_word16_t)/st->den_rate >= st->filt_len;
-#endif
if (use_direct)
{
min_sinc_table_length = st->filt_len*st->den_rate;
@ -55,3 +54,4 @@ diff --git a/src/resample.c b/src/resample.c
goto fail;
min_sinc_table_length = st->filt_len*st->oversample+8;

View File

@ -0,0 +1,33 @@
https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Extended-Asm says
asm [volatile] ( AssemblerTemplate : [OutputOperands] [ : [InputOperands] [ : [Clobbers] ] ] )
which implies that Clobbers is optional even after the third colon, but
the gcc used for b2g_try_emulator_dep builds says
resample_neon.c: In function 'saturate_32bit_to_16bit':
resample_neon.c:50: error: expected string literal before ')' token
diff --git a/src/resample_neon.c b/src/resample_neon.c
--- a/src/resample_neon.c
+++ b/src/resample_neon.c
@@ -41,18 +41,17 @@
#include <arm_neon.h>
#ifdef FIXED_POINT
#ifdef __thumb2__
static inline int32_t saturate_32bit_to_16bit(int32_t a) {
int32_t ret;
asm ("ssat %[ret], #16, %[a]"
: [ret] "=&r" (ret)
- : [a] "r" (a)
- : );
+ : [a] "r" (a));
return ret;
}
#else
static inline int32_t saturate_32bit_to_16bit(int32_t a) {
int32_t ret;
asm ("vmov.s32 d0[0], %[a]\n"
"vqmovn.s32 d0, q0\n"
"vmov.s16 %[ret], d0[0]\n"

View File

@ -1,9 +1,7 @@
This is a fix for https://bugzilla.mozilla.org/show_bug.cgi?id=1274083
diff --git a/src/resample.c b/src/resample.c
--- a/src/resample.c
+++ b/src/resample.c
@@ -1129,18 +1129,19 @@ EXPORT int speex_resampler_set_rate_frac
@@ -1141,18 +1141,19 @@ EXPORT int speex_resampler_set_rate_frac
st->num_rate /= fact;
st->den_rate /= fact;
@ -12,10 +10,10 @@ diff --git a/src/resample.c b/src/resample.c
{
for (i=0;i<st->nb_channels;i++)
{
- if (multiply_frac(&st->samp_frac_num[i],st->samp_frac_num[i],st->den_rate,old_den) != RESAMPLER_ERR_SUCCESS)
- if (_muldiv(&st->samp_frac_num[i],st->samp_frac_num[i],st->den_rate,old_den) != RESAMPLER_ERR_SUCCESS)
- return RESAMPLER_ERR_OVERFLOW;
+ if (multiply_frac(&st->samp_frac_num[i],st->samp_frac_num[i],st->den_rate,old_den) != RESAMPLER_ERR_SUCCESS) {
+ st->samp_frac_num[i] = st->den_rate-1;
+ if (_muldiv(&st->samp_frac_num[i],st->samp_frac_num[i],st->den_rate,old_den) != RESAMPLER_ERR_SUCCESS) {
+ st->samp_frac_num[i] = st->den_rate-1;
+ }
/* Safety net */
if (st->samp_frac_num[i] >= st->den_rate)

View File

@ -0,0 +1,95 @@
diff --git a/src/arch.h b/src/arch.h
--- a/src/arch.h
+++ b/src/arch.h
@@ -172,26 +172,23 @@ typedef float spx_word32_t;
#define SHL(a,shift) (a)
#define SATURATE(x,a) (x)
#define ADD16(a,b) ((a)+(b))
#define SUB16(a,b) ((a)-(b))
#define ADD32(a,b) ((a)+(b))
#define SUB32(a,b) ((a)-(b))
#define MULT16_16_16(a,b) ((a)*(b))
+#define MULT16_32_32(a,b) ((a)*(b))
#define MULT16_16(a,b) ((spx_word32_t)(a)*(spx_word32_t)(b))
#define MAC16_16(c,a,b) ((c)+(spx_word32_t)(a)*(spx_word32_t)(b))
-#define MULT16_32_Q11(a,b) ((a)*(b))
-#define MULT16_32_Q13(a,b) ((a)*(b))
-#define MULT16_32_Q14(a,b) ((a)*(b))
#define MULT16_32_Q15(a,b) ((a)*(b))
#define MULT16_32_P15(a,b) ((a)*(b))
-#define MAC16_32_Q11(c,a,b) ((c)+(a)*(b))
#define MAC16_32_Q15(c,a,b) ((c)+(a)*(b))
#define MAC16_16_Q11(c,a,b) ((c)+(a)*(b))
#define MAC16_16_Q13(c,a,b) ((c)+(a)*(b))
#define MAC16_16_P13(c,a,b) ((c)+(a)*(b))
#define MULT16_16_Q11_32(a,b) ((a)*(b))
#define MULT16_16_Q13(a,b) ((a)*(b))
#define MULT16_16_Q14(a,b) ((a)*(b))
diff --git a/src/fixed_generic.h b/src/fixed_generic.h
--- a/src/fixed_generic.h
+++ b/src/fixed_generic.h
@@ -64,32 +64,27 @@
#define ADD16(a,b) ((spx_word16_t)((spx_word16_t)(a)+(spx_word16_t)(b)))
#define SUB16(a,b) ((spx_word16_t)(a)-(spx_word16_t)(b))
#define ADD32(a,b) ((spx_word32_t)(a)+(spx_word32_t)(b))
#define SUB32(a,b) ((spx_word32_t)(a)-(spx_word32_t)(b))
/* result fits in 16 bits */
-#define MULT16_16_16(a,b) ((((spx_word16_t)(a))*((spx_word16_t)(b))))
+#define MULT16_16_16(a,b) (((spx_word16_t)(a))*((spx_word16_t)(b)))
+/* result fits in 32 bits */
+#define MULT16_32_32(a,b) (((spx_word16_t)(a))*((spx_word32_t)(b)))
/* (spx_word32_t)(spx_word16_t) gives TI compiler a hint that it's 16x16->32 multiply */
#define MULT16_16(a,b) (((spx_word32_t)(spx_word16_t)(a))*((spx_word32_t)(spx_word16_t)(b)))
#define MAC16_16(c,a,b) (ADD32((c),MULT16_16((a),(b))))
-#define MULT16_32_Q12(a,b) ADD32(MULT16_16((a),SHR((b),12)), SHR(MULT16_16((a),((b)&0x00000fff)),12))
-#define MULT16_32_Q13(a,b) ADD32(MULT16_16((a),SHR((b),13)), SHR(MULT16_16((a),((b)&0x00001fff)),13))
-#define MULT16_32_Q14(a,b) ADD32(MULT16_16((a),SHR((b),14)), SHR(MULT16_16((a),((b)&0x00003fff)),14))
-
-#define MULT16_32_Q11(a,b) ADD32(MULT16_16((a),SHR((b),11)), SHR(MULT16_16((a),((b)&0x000007ff)),11))
-#define MAC16_32_Q11(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),11)), SHR(MULT16_16((a),((b)&0x000007ff)),11)))
-
-#define MULT16_32_P15(a,b) ADD32(MULT16_16((a),SHR((b),15)), PSHR(MULT16_16((a),((b)&0x00007fff)),15))
-#define MULT16_32_Q15(a,b) ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15))
-#define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
+#define MULT16_32_P15(a,b) ADD32(MULT16_32_32(a,SHR((b),15)), PSHR(MULT16_16((a),((b)&0x00007fff)),15))
+#define MULT16_32_Q15(a,b) ADD32(MULT16_32_32(a,SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15))
+#define MAC16_32_Q15(c,a,b) ADD32(c,MULT16_32_Q15(a,b))
#define MAC16_16_Q11(c,a,b) (ADD32((c),SHR(MULT16_16((a),(b)),11)))
#define MAC16_16_Q13(c,a,b) (ADD32((c),SHR(MULT16_16((a),(b)),13)))
#define MAC16_16_P13(c,a,b) (ADD32((c),SHR(ADD32(4096,MULT16_16((a),(b))),13)))
#define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11))
#define MULT16_16_Q13(a,b) (SHR(MULT16_16((a),(b)),13))
diff --git a/src/resample.c b/src/resample.c
--- a/src/resample.c
+++ b/src/resample.c
@@ -474,17 +474,17 @@ static int resampler_basic_interpolate_s
const spx_word16_t curr_in=iptr[j];
accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
}
cubic_coef(frac, interp);
- sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
+ sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
sum = SATURATE32PSHR(sum, 15, 32767);
#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
} else {
cubic_coef(frac, interp);
sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
}
#endif

View File

@ -1,7 +1,7 @@
Copyright 2002-2008 Xiph.org Foundation
Copyright 2002-2008 Jean-Marc Valin
Copyright 2005-2007 Analog Devices Inc.
Copyright 2005-2008 Commonwealth Scientific and Industrial Research
Copyright 2005-2008 Commonwealth Scientific and Industrial Research
Organisation (CSIRO)
Copyright 1993, 2002, 2006 David Rowe
Copyright 2003 EpicGames

View File

@ -10,8 +10,8 @@ origin:
url: https://speex.org/
release: 738e17905e1ca2a1fa932ddd9c2a85d089f4e845 (2023-02-28T16:25:59.000-05:00).
revision: 738e17905e1ca2a1fa932ddd9c2a85d089f4e845
release: 79822c8fb79d202cbe7e899f7406acf17beb247c (2017-01-22T14:16:02.000-05:00).
revision: 79822c8fb79d202cbe7e899f7406acf17beb247c
license: BSD-3-Clause
license-file: COPYING
@ -35,6 +35,7 @@ vendoring:
- libspeexdsp/resample_sse.h
- libspeexdsp/resample_neon.h
- libspeexdsp/arch.h
- libspeexdsp/stack_alloc.h
- libspeexdsp/fixed_generic.h
- include/speex/speex_resampler.h
- AUTHORS
@ -57,6 +58,9 @@ vendoring:
- action: move-file
from: '{vendor_dir}/libspeexdsp/arch.h'
to: '{vendor_dir}/src/arch.h'
- action: move-file
from: '{vendor_dir}/libspeexdsp/stack_alloc.h'
to: '{vendor_dir}/src/stack_alloc.h'
- action: move-file
from: '{vendor_dir}/libspeexdsp/fixed_generic.h'
to: '{vendor_dir}/src/fixed_generic.h'
@ -69,4 +73,6 @@ vendoring:
- 02_simd-detect-runtime.patch
- 03_set-skip-frac.patch
- 04_hugemem.patch
- 05_set-rate-overflow-no-return.patch
- 05_remove-empty-asm-clobber.patch
- 06_set-rate-overflow-no-return.patch
- 07_integer-halving.patch

View File

@ -41,10 +41,10 @@
#ifdef FLOATING_POINT
#error You cannot compile as floating point and fixed point at the same time
#endif
#ifdef USE_SSE
#ifdef _USE_SSE
#error SSE is only for floating-point
#endif
#if defined(ARM4_ASM) + defined(ARM5E_ASM) + defined(BFIN_ASM) > 1
#if ((defined (ARM4_ASM)||defined (ARM4_ASM)) && defined(BFIN_ASM)) || (defined (ARM4_ASM)&&defined(ARM5E_ASM))
#error Make up your mind. What CPU do you have?
#endif
#ifdef VORBIS_PSYCHO
@ -56,10 +56,10 @@
#ifndef FLOATING_POINT
#error You now need to define either FIXED_POINT or FLOATING_POINT
#endif
#if defined(ARM4_ASM) || defined(ARM5E_ASM) || defined(BFIN_ASM)
#if defined (ARM4_ASM) || defined(ARM5E_ASM) || defined(BFIN_ASM)
#error I suppose you can have a [ARM4/ARM5E/Blackfin] that has float instructions?
#endif
#ifdef FIXED_DEBUG
#ifdef FIXED_POINT_DEBUG
#error "Don't you think enabling fixed-point is a good thing to do if you want to debug that?"
#endif
@ -117,9 +117,9 @@ typedef spx_word32_t spx_sig_t;
#ifdef ARM5E_ASM
#include "fixed_arm5e.h"
#elif defined(ARM4_ASM)
#elif defined (ARM4_ASM)
#include "fixed_arm4.h"
#elif defined(BFIN_ASM)
#elif defined (BFIN_ASM)
#include "fixed_bfin.h"
#endif
@ -207,7 +207,7 @@ typedef float spx_word32_t;
#endif
#if defined(CONFIG_TI_C54X) || defined(CONFIG_TI_C55X)
#if defined (CONFIG_TI_C54X) || defined (CONFIG_TI_C55X)
/* 2 on TI C5x DSP */
#define BYTES_PER_CHAR 2

View File

@ -77,7 +77,6 @@
#define MULT16_16(a,b) (((spx_word32_t)(spx_word16_t)(a))*((spx_word32_t)(spx_word16_t)(b)))
#define MAC16_16(c,a,b) (ADD32((c),MULT16_16((a),(b))))
#define MULT16_32_P15(a,b) ADD32(MULT16_32_32(a,SHR((b),15)), PSHR(MULT16_16((a),((b)&0x00007fff)),15))
#define MULT16_32_Q15(a,b) ADD32(MULT16_32_32(a,SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15))
#define MAC16_32_Q15(c,a,b) ADD32(c,MULT16_32_Q15(a,b))

View File

@ -28,15 +28,15 @@ DEFINES['FLOATING_POINT'] = True
# Only use SSE code when using floating point samples, and on x86
if CONFIG['INTEL_ARCHITECTURE']:
DEFINES['USE_SSE'] = True
DEFINES['USE_SSE2'] = True
DEFINES['_USE_SSE'] = True
DEFINES['_USE_SSE2'] = True
SOURCES += [
'resample_sse.c'
]
SOURCES['resample_sse.c'].flags += CONFIG['SSE2_FLAGS']
if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['BUILD_ARM_NEON']:
DEFINES['USE_NEON'] = True
DEFINES['_USE_NEON'] = True
SOURCES += [
'resample_neon.c'
]

View File

@ -46,7 +46,7 @@
Smith, Julius O. Digital Audio Resampling Home Page
Center for Computer Research in Music and Acoustics (CCRMA),
Stanford University, 2007.
Web published at https://ccrma.stanford.edu/~jos/resample/.
Web published at http://ccrma.stanford.edu/~jos/resample/.
There is one main difference, though. This resampler uses cubic
interpolation instead of linear interpolation in the above paper. This
@ -65,12 +65,9 @@
#ifdef OUTSIDE_SPEEX
#include <stdlib.h>
static void *speex_alloc(int size) {return calloc(size,1);}
static void *speex_realloc(void *ptr, int size) {return realloc(ptr, size);}
static void speex_free(void *ptr) {free(ptr);}
#ifndef EXPORT
#define EXPORT
#endif
static void *speex_alloc (int size) {return calloc(size,1);}
static void *speex_realloc (void *ptr, int size) {return realloc(ptr, size);}
static void speex_free (void *ptr) {free(ptr);}
#include "speex_resampler.h"
#include "arch.h"
#else /* OUTSIDE_SPEEX */
@ -80,6 +77,7 @@ static void speex_free(void *ptr) {free(ptr);}
#include "os_support.h"
#endif /* OUTSIDE_SPEEX */
#include "stack_alloc.h"
#include <math.h>
#include <limits.h>
@ -95,12 +93,12 @@ static void speex_free(void *ptr) {free(ptr);}
#endif
#ifndef UINT32_MAX
#define UINT32_MAX 4294967295U
#define UINT32_MAX 4294967296U
#endif
#include "simd_detect.h"
/* Number of elements to allocate on the stack */
/* Numer of elements to allocate on the stack */
#ifdef VAR_ARRAYS
#define FIXED_STACK_ALLOC 8192
#else
@ -192,14 +190,16 @@ struct FuncDef {
int oversample;
};
static const struct FuncDef kaiser12_funcdef = {kaiser12_table, 64};
#define KAISER12 (&kaiser12_funcdef)
static const struct FuncDef kaiser10_funcdef = {kaiser10_table, 32};
#define KAISER10 (&kaiser10_funcdef)
static const struct FuncDef kaiser8_funcdef = {kaiser8_table, 32};
#define KAISER8 (&kaiser8_funcdef)
static const struct FuncDef kaiser6_funcdef = {kaiser6_table, 32};
#define KAISER6 (&kaiser6_funcdef)
static const struct FuncDef _KAISER12 = {kaiser12_table, 64};
#define KAISER12 (&_KAISER12)
/*static struct FuncDef _KAISER12 = {kaiser12_table, 32};
#define KAISER12 (&_KAISER12)*/
static const struct FuncDef _KAISER10 = {kaiser10_table, 32};
#define KAISER10 (&_KAISER10)
static const struct FuncDef _KAISER8 = {kaiser8_table, 32};
#define KAISER8 (&_KAISER8)
static const struct FuncDef _KAISER6 = {kaiser6_table, 32};
#define KAISER6 (&_KAISER6)
struct QualityMapping {
int base_length;
@ -584,7 +584,6 @@ static int resampler_basic_zero(SpeexResamplerState *st, spx_uint32_t channel_in
const int frac_advance = st->frac_advance;
const spx_uint32_t den_rate = st->den_rate;
(void)in;
while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
{
out[out_stride * out_sample++] = 0;
@ -602,15 +601,16 @@ static int resampler_basic_zero(SpeexResamplerState *st, spx_uint32_t channel_in
return out_sample;
}
static int multiply_frac(spx_uint32_t *result, spx_uint32_t value, spx_uint32_t num, spx_uint32_t den)
static int _muldiv(spx_uint32_t *result, spx_uint32_t value, spx_uint32_t mul, spx_uint32_t div)
{
spx_uint32_t major = value / den;
spx_uint32_t remain = value % den;
speex_assert(result);
spx_uint32_t major = value / div;
spx_uint32_t remainder = value % div;
/* TODO: Could use 64 bits operation to check for overflow. But only guaranteed in C99+ */
if (remain > UINT32_MAX / num || major > UINT32_MAX / num
|| major * num > UINT32_MAX - remain * num / den)
if (remainder > UINT32_MAX / mul || major > UINT32_MAX / mul
|| major * mul > UINT32_MAX - remainder * mul / div)
return RESAMPLER_ERR_OVERFLOW;
*result = remain * num / den + major * num;
*result = remainder * mul / div + major * mul;
return RESAMPLER_ERR_SUCCESS;
}
@ -631,7 +631,7 @@ static int update_filter(SpeexResamplerState *st)
{
/* down-sampling */
st->cutoff = quality_map[st->quality].downsample_bandwidth * st->den_rate / st->num_rate;
if (multiply_frac(&st->filt_len,st->filt_len,st->num_rate,st->den_rate) != RESAMPLER_ERR_SUCCESS)
if (_muldiv(&st->filt_len,st->filt_len,st->num_rate,st->den_rate) != RESAMPLER_ERR_SUCCESS)
goto fail;
/* Round up to make sure we have a multiple of 8 for SSE */
st->filt_len = ((st->filt_len-1)&(~0x7))+8;
@ -746,18 +746,16 @@ static int update_filter(SpeexResamplerState *st)
{
spx_uint32_t j;
spx_uint32_t olen = old_length;
spx_uint32_t start = i*st->mem_alloc_size;
spx_uint32_t magic_samples = st->magic_samples[i];
/*if (st->magic_samples[i])*/
{
/* Try and remove the magic samples as if nothing had happened */
/* FIXME: This is wrong but for now we need it to avoid going over the array bounds */
olen = old_length + 2*magic_samples;
for (j=old_length-1+magic_samples;j--;)
st->mem[start+j+magic_samples] = st->mem[i*old_alloc_size+j];
for (j=0;j<magic_samples;j++)
st->mem[start+j] = 0;
olen = old_length + 2*st->magic_samples[i];
for (j=old_length-1+st->magic_samples[i];j--;)
st->mem[i*st->mem_alloc_size+j+st->magic_samples[i]] = st->mem[i*old_alloc_size+j];
for (j=0;j<st->magic_samples[i];j++)
st->mem[i*st->mem_alloc_size+j] = 0;
st->magic_samples[i] = 0;
}
if (st->filt_len > olen)
@ -765,18 +763,17 @@ static int update_filter(SpeexResamplerState *st)
/* If the new filter length is still bigger than the "augmented" length */
/* Copy data going backward */
for (j=0;j<olen-1;j++)
st->mem[start+(st->filt_len-2-j)] = st->mem[start+(olen-2-j)];
st->mem[i*st->mem_alloc_size+(st->filt_len-2-j)] = st->mem[i*st->mem_alloc_size+(olen-2-j)];
/* Then put zeros for lack of anything better */
for (;j<st->filt_len-1;j++)
st->mem[start+(st->filt_len-2-j)] = 0;
st->mem[i*st->mem_alloc_size+(st->filt_len-2-j)] = 0;
/* Adjust last_sample */
st->last_sample[i] += (st->filt_len - olen)/2;
} else {
/* Put back some of the magic! */
magic_samples = (olen - st->filt_len)/2;
for (j=0;j<st->filt_len-1+magic_samples;j++)
st->mem[start+j] = st->mem[start+j+magic_samples];
st->magic_samples[i] = magic_samples;
st->magic_samples[i] = (olen - st->filt_len)/2;
for (j=0;j<st->filt_len-1+st->magic_samples[i];j++)
st->mem[i*st->mem_alloc_size+j] = st->mem[i*st->mem_alloc_size+j+st->magic_samples[i]];
}
}
} else if (st->filt_len < old_length)
@ -814,6 +811,7 @@ EXPORT SpeexResamplerState *speex_resampler_init(spx_uint32_t nb_channels, spx_u
EXPORT SpeexResamplerState *speex_resampler_init_frac(spx_uint32_t nb_channels, spx_uint32_t ratio_num, spx_uint32_t ratio_den, spx_uint32_t in_rate, spx_uint32_t out_rate, int quality, int *err)
{
spx_uint32_t i;
SpeexResamplerState *st;
int filter_err;
@ -993,7 +991,8 @@ EXPORT int speex_resampler_process_int(SpeexResamplerState *st, spx_uint32_t cha
const spx_uint32_t xlen = st->mem_alloc_size - (st->filt_len - 1);
#ifdef VAR_ARRAYS
const unsigned int ylen = (olen < FIXED_STACK_ALLOC) ? olen : FIXED_STACK_ALLOC;
spx_word16_t ystack[ylen];
VARDECL(spx_word16_t *ystack);
ALLOC(ystack, ylen, spx_word16_t);
#else
const unsigned int ylen = FIXED_STACK_ALLOC;
spx_word16_t ystack[FIXED_STACK_ALLOC];
@ -1108,7 +1107,7 @@ EXPORT void speex_resampler_get_rate(SpeexResamplerState *st, spx_uint32_t *in_r
*out_rate = st->out_rate;
}
static inline spx_uint32_t compute_gcd(spx_uint32_t a, spx_uint32_t b)
static inline spx_uint32_t _gcd(spx_uint32_t a, spx_uint32_t b)
{
while (b != 0)
{
@ -1138,7 +1137,7 @@ EXPORT int speex_resampler_set_rate_frac(SpeexResamplerState *st, spx_uint32_t r
st->num_rate = ratio_num;
st->den_rate = ratio_den;
fact = compute_gcd(st->num_rate, st->den_rate);
fact = _gcd (st->num_rate, st->den_rate);
st->num_rate /= fact;
st->den_rate /= fact;
@ -1147,8 +1146,8 @@ EXPORT int speex_resampler_set_rate_frac(SpeexResamplerState *st, spx_uint32_t r
{
for (i=0;i<st->nb_channels;i++)
{
if (multiply_frac(&st->samp_frac_num[i],st->samp_frac_num[i],st->den_rate,old_den) != RESAMPLER_ERR_SUCCESS) {
st->samp_frac_num[i] = st->den_rate-1;
if (_muldiv(&st->samp_frac_num[i],st->samp_frac_num[i],st->den_rate,old_den) != RESAMPLER_ERR_SUCCESS) {
st->samp_frac_num[i] = st->den_rate-1;
}
/* Safety net */
if (st->samp_frac_num[i] >= st->den_rate)

View File

@ -36,29 +36,17 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdint.h>
#include "simd_detect.h"
#include <arm_neon.h>
#ifdef FIXED_POINT
#if defined(__aarch64__)
static inline int32_t saturate_32bit_to_16bit(int32_t a) {
int32_t ret;
asm ("fmov s0, %w[a]\n"
"sqxtn h0, s0\n"
"sxtl v0.4s, v0.4h\n"
"fmov %w[ret], s0\n"
: [ret] "=r" (ret)
: [a] "r" (a)
: "v0" );
return ret;
}
#elif defined(__thumb2__)
#ifdef __thumb2__
static inline int32_t saturate_32bit_to_16bit(int32_t a) {
int32_t ret;
asm ("ssat %[ret], #16, %[a]"
: [ret] "=r" (ret)
: [a] "r" (a)
: );
: [ret] "=&r" (ret)
: [a] "r" (a));
return ret;
}
#else
@ -67,7 +55,7 @@ static inline int32_t saturate_32bit_to_16bit(int32_t a) {
asm ("vmov.s32 d0[0], %[a]\n"
"vqmovn.s32 d0, q0\n"
"vmov.s16 %[ret], d0[0]\n"
: [ret] "=r" (ret)
: [ret] "=&r" (ret)
: [a] "r" (a)
: "q0");
return ret;
@ -77,64 +65,8 @@ static inline int32_t saturate_32bit_to_16bit(int32_t a) {
#define WORD2INT(x) (saturate_32bit_to_16bit(x))
#define OVERRIDE_INNER_PRODUCT_SINGLE
/* Only works when len % 4 == 0 and len >= 4 */
#if defined(__aarch64__)
inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
{
int32_t ret;
uint32_t remainder = len % 16;
len = len - remainder;
asm volatile (" cmp %w[len], #0\n"
" b.ne 1f\n"
" ld1 {v16.4h}, [%[b]], #8\n"
" ld1 {v20.4h}, [%[a]], #8\n"
" subs %w[remainder], %w[remainder], #4\n"
" smull v0.4s, v16.4h, v20.4h\n"
" b.ne 4f\n"
" b 5f\n"
"1:"
" ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [%[b]], #32\n"
" ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [%[a]], #32\n"
" subs %w[len], %w[len], #16\n"
" smull v0.4s, v16.4h, v20.4h\n"
" smlal v0.4s, v17.4h, v21.4h\n"
" smlal v0.4s, v18.4h, v22.4h\n"
" smlal v0.4s, v19.4h, v23.4h\n"
" b.eq 3f\n"
"2:"
" ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [%[b]], #32\n"
" ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [%[a]], #32\n"
" subs %w[len], %w[len], #16\n"
" smlal v0.4s, v16.4h, v20.4h\n"
" smlal v0.4s, v17.4h, v21.4h\n"
" smlal v0.4s, v18.4h, v22.4h\n"
" smlal v0.4s, v19.4h, v23.4h\n"
" b.ne 2b\n"
"3:"
" cmp %w[remainder], #0\n"
" b.eq 5f\n"
"4:"
" ld1 {v18.4h}, [%[b]], #8\n"
" ld1 {v22.4h}, [%[a]], #8\n"
" subs %w[remainder], %w[remainder], #4\n"
" smlal v0.4s, v18.4h, v22.4h\n"
" b.ne 4b\n"
"5:"
" saddlv d0, v0.4s\n"
" sqxtn s0, d0\n"
" sqrshrn h0, s0, #15\n"
" sxtl v0.4s, v0.4h\n"
" fmov %w[ret], s0\n"
: [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
[len] "+r" (len), [remainder] "+r" (remainder)
:
: "cc", "v0",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
return ret;
}
#else
inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
/* Only works when len % 4 == 0 */
int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
{
int32_t ret;
uint32_t remainder = len % 16;
@ -181,105 +113,34 @@ inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned
" vqmovn.s64 d0, q0\n"
" vqrshrn.s32 d0, q0, #15\n"
" vmov.s16 %[ret], d0[0]\n"
: [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
: [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b),
[len] "+r" (len), [remainder] "+r" (remainder)
:
: "cc", "q0",
"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23");
"d16", "d17", "d18", "d19",
"d20", "d21", "d22", "d23");
return ret;
}
#endif // !defined(__aarch64__)
#elif defined(FLOATING_POINT)
#if defined(__aarch64__)
static inline int32_t saturate_float_to_16bit(float a) {
int32_t ret;
asm ("fcvtas s1, %s[a]\n"
"sqxtn h1, s1\n"
"sxtl v1.4s, v1.4h\n"
"fmov %w[ret], s1\n"
: [ret] "=r" (ret)
: [a] "w" (a)
: "v1");
return ret;
}
#else
static inline int32_t saturate_float_to_16bit(float a) {
int32_t ret;
asm ("vmov.f32 d0[0], %[a]\n"
"vcvt.s32.f32 d0, d0, #15\n"
"vqrshrn.s32 d0, q0, #15\n"
"vmov.s16 %[ret], d0[0]\n"
: [ret] "=r" (ret)
: [ret] "=&r" (ret)
: [a] "r" (a)
: "q0");
return ret;
}
#endif
#undef WORD2INT
#define WORD2INT(x) (saturate_float_to_16bit(x))
#define OVERRIDE_INNER_PRODUCT_SINGLE
/* Only works when len % 4 == 0 and len >= 4 */
#if defined(__aarch64__)
inline float inner_product_single(const float *a, const float *b, unsigned int len)
{
float ret;
uint32_t remainder = len % 16;
len = len - remainder;
asm volatile (" cmp %w[len], #0\n"
" b.ne 1f\n"
" ld1 {v16.4s}, [%[b]], #16\n"
" ld1 {v20.4s}, [%[a]], #16\n"
" subs %w[remainder], %w[remainder], #4\n"
" fmul v1.4s, v16.4s, v20.4s\n"
" b.ne 4f\n"
" b 5f\n"
"1:"
" ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[b]], #64\n"
" ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%[a]], #64\n"
" subs %w[len], %w[len], #16\n"
" fmul v1.4s, v16.4s, v20.4s\n"
" fmul v2.4s, v17.4s, v21.4s\n"
" fmul v3.4s, v18.4s, v22.4s\n"
" fmul v4.4s, v19.4s, v23.4s\n"
" b.eq 3f\n"
"2:"
" ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[b]], #64\n"
" ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%[a]], #64\n"
" subs %w[len], %w[len], #16\n"
" fmla v1.4s, v16.4s, v20.4s\n"
" fmla v2.4s, v17.4s, v21.4s\n"
" fmla v3.4s, v18.4s, v22.4s\n"
" fmla v4.4s, v19.4s, v23.4s\n"
" b.ne 2b\n"
"3:"
" fadd v16.4s, v1.4s, v2.4s\n"
" fadd v17.4s, v3.4s, v4.4s\n"
" cmp %w[remainder], #0\n"
" fadd v1.4s, v16.4s, v17.4s\n"
" b.eq 5f\n"
"4:"
" ld1 {v18.4s}, [%[b]], #16\n"
" ld1 {v22.4s}, [%[a]], #16\n"
" subs %w[remainder], %w[remainder], #4\n"
" fmla v1.4s, v18.4s, v22.4s\n"
" b.ne 4b\n"
"5:"
" faddp v1.4s, v1.4s, v1.4s\n"
" faddp %[ret].4s, v1.4s, v1.4s\n"
: [ret] "=w" (ret), [a] "+r" (a), [b] "+r" (b),
[len] "+r" (len), [remainder] "+r" (remainder)
:
: "cc", "v1", "v2", "v3", "v4",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
return ret;
}
#else
inline float inner_product_single(const float *a, const float *b, unsigned int len)
/* Only works when len % 4 == 0 */
float inner_product_single(const float *a, const float *b, unsigned int len)
{
float ret;
uint32_t remainder = len % 16;
@ -331,12 +192,11 @@ inline float inner_product_single(const float *a, const float *b, unsigned int l
" vadd.f32 d0, d0, d1\n"
" vpadd.f32 d0, d0, d0\n"
" vmov.f32 %[ret], d0[0]\n"
: [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
: [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b),
[len] "+l" (len), [remainder] "+l" (remainder)
:
: "cc", "q0", "q1", "q2", "q3",
"q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
: "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10", "q11");
return ret;
}
#endif // defined(__aarch64__)
#endif

View File

@ -73,7 +73,7 @@ float interpolate_product_single(const float *a, const float *b, unsigned int le
return ret;
}
#ifdef USE_SSE2
#ifdef _USE_SSE2
#include <emmintrin.h>
#define OVERRIDE_INNER_PRODUCT_DOUBLE

View File

@ -0,0 +1,115 @@
/* Copyright (C) 2002 Jean-Marc Valin */
/**
@file stack_alloc.h
@brief Temporary memory allocation on stack
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of the Xiph.org Foundation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef STACK_ALLOC_H
#define STACK_ALLOC_H
#ifdef USE_ALLOCA
# ifdef WIN32
# include <malloc.h>
# else
# ifdef HAVE_ALLOCA_H
# include <alloca.h>
# else
# include <stdlib.h>
# endif
# endif
#endif
/**
* @def ALIGN(stack, size)
*
* Aligns the stack to a 'size' boundary
*
* @param stack Stack
* @param size New size boundary
*/
/**
* @def PUSH(stack, size, type)
*
* Allocates 'size' elements of type 'type' on the stack
*
* @param stack Stack
* @param size Number of elements
* @param type Type of element
*/
/**
* @def VARDECL(var)
*
* Declare variable on stack
*
* @param var Variable to declare
*/
/**
* @def ALLOC(var, size, type)
*
* Allocate 'size' elements of 'type' on stack
*
* @param var Name of variable to allocate
* @param size Number of elements
* @param type Type of element
*/
#ifdef ENABLE_VALGRIND
#include <valgrind/memcheck.h>
#define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
#define PUSH(stack, size, type) (VALGRIND_MAKE_NOACCESS(stack, 1000),ALIGN((stack),sizeof(type)),VALGRIND_MAKE_WRITABLE(stack, ((size)*sizeof(type))),(stack)+=((size)*sizeof(type)),(type*)((stack)-((size)*sizeof(type))))
#else
#define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)),(stack)+=((size)*sizeof(type)),(type*)((stack)-((size)*sizeof(type))))
#endif
#if defined(VAR_ARRAYS)
#define VARDECL(var)
#define ALLOC(var, size, type) type var[size]
#elif defined(USE_ALLOCA)
#define VARDECL(var) var
#define ALLOC(var, size, type) var = alloca(sizeof(type)*(size))
#else
#define VARDECL(var) var
#define ALLOC(var, size, type) var = PUSH(stack, size, type)
#endif
#endif