mirror of
https://github.com/libretro/ppsspp.git
synced 2025-03-03 14:09:45 +00:00
Merge pull request #9874 from unknownbrackets/vol-overflow
Core: Handle 20-bit volumes in SIMD optimizations
This commit is contained in:
commit
2b1ad21bd5
@ -27,7 +27,7 @@
|
||||
|
||||
void AdjustVolumeBlockStandard(s16 *out, s16 *in, size_t size, int leftVol, int rightVol) {
|
||||
#ifdef _M_SSE
|
||||
if (leftVol <= 0x7fff && rightVol <= 0x7fff) {
|
||||
if (leftVol <= 0x7fff && -leftVol <= 0x8000 && rightVol <= 0x7fff && -rightVol <= 0x8000) {
|
||||
__m128i volume = _mm_set_epi16(leftVol, rightVol, leftVol, rightVol, leftVol, rightVol, leftVol, rightVol);
|
||||
while (size >= 16) {
|
||||
__m128i indata1 = _mm_loadu_si128((__m128i *)in);
|
||||
@ -39,15 +39,25 @@ void AdjustVolumeBlockStandard(s16 *out, s16 *in, size_t size, int leftVol, int
|
||||
size -= 16;
|
||||
}
|
||||
} else {
|
||||
// We have to shift inside the loop to avoid the signed multiply issue.
|
||||
leftVol >>= 1;
|
||||
rightVol >>= 1;
|
||||
__m128i volume = _mm_set_epi16(leftVol, rightVol, leftVol, rightVol, leftVol, rightVol, leftVol, rightVol);
|
||||
// We have to shift inside the loop to avoid the signed 16-bit multiply issue.
|
||||
int leftShift = 0;
|
||||
int leftVol16 = leftVol;
|
||||
while (leftVol16 > 0x7fff || -leftVol16 > 0x8000) {
|
||||
++leftShift;
|
||||
leftVol16 >>= 1;
|
||||
}
|
||||
int rightShift = 0;
|
||||
int rightVol16 = rightVol;
|
||||
while (rightVol16 > 0x7fff || -rightVol16 > 0x8000) {
|
||||
++rightShift;
|
||||
rightVol16 >>= 1;
|
||||
}
|
||||
__m128i volume = _mm_set_epi16(leftVol16, rightVol16, leftVol16, rightVol16, leftVol16, rightVol16, leftVol16, rightVol16);
|
||||
while (size >= 16) {
|
||||
__m128i indata1 = _mm_loadu_si128((__m128i *)in);
|
||||
__m128i indata2 = _mm_loadu_si128((__m128i *)(in + 8));
|
||||
_mm_storeu_si128((__m128i *)out, _mm_slli_epi16(_mm_mulhi_epi16(indata1, volume), 1));
|
||||
_mm_storeu_si128((__m128i *)(out + 8), _mm_slli_epi16(_mm_mulhi_epi16(indata2, volume), 1));
|
||||
_mm_storeu_si128((__m128i *)out, _mm_slli_epi16(_mm_mulhi_epi16(indata1, volume), leftShift));
|
||||
_mm_storeu_si128((__m128i *)(out + 8), _mm_slli_epi16(_mm_mulhi_epi16(indata2, volume), rightShift));
|
||||
in += 16;
|
||||
out += 16;
|
||||
size -= 16;
|
||||
|
@ -30,29 +30,32 @@
|
||||
// Scratch storage for the per-channel volume multipliers used by
// AdjustVolumeBlockNEON.  That function fills the four lanes as
// { leftVol >> 1, rightVol >> 1, leftVol >> 1, rightVol >> 1 } and then loads
// them into a NEON register with vld1_s16().
// MEMORY_ALIGNED16 is a project macro; presumably it requests 16-byte
// alignment for the array - TODO confirm against its definition.
// NOTE(review): this is file-scope mutable state rewritten on each call, so
// overlapping calls from different threads would share it - confirm that
// callers never run concurrently.
static s16 MEMORY_ALIGNED16(volumeValues[4]) = {};
|
||||
|
||||
void AdjustVolumeBlockNEON(s16 *out, s16 *in, size_t size, int leftVol, int rightVol) {
|
||||
volumeValues[0] = leftVol >> 1;
|
||||
volumeValues[1] = rightVol >> 1;
|
||||
volumeValues[2] = leftVol >> 1;
|
||||
volumeValues[3] = rightVol >> 1;
|
||||
if (leftVol <= 0xFFFF && -leftVol <= 0x10000 && rightVol <= 0xFFFF && -rightVol <= 0x10000) {
|
||||
// Note: vqshrn_n_s32 takes a const argument, so we always go with 1 here, 15 there.
|
||||
volumeValues[0] = leftVol >> 1;
|
||||
volumeValues[1] = rightVol >> 1;
|
||||
volumeValues[2] = leftVol >> 1;
|
||||
volumeValues[3] = rightVol >> 1;
|
||||
|
||||
const int16x4_t vol = vld1_s16(volumeValues);
|
||||
const int16x4_t vol = vld1_s16(volumeValues);
|
||||
|
||||
while (size >= 16) {
|
||||
int16x8_t indata1 = vld1q_s16(in);
|
||||
int16x8_t indata2 = vld1q_s16(in + 8);
|
||||
while (size >= 16) {
|
||||
int16x8_t indata1 = vld1q_s16(in);
|
||||
int16x8_t indata2 = vld1q_s16(in + 8);
|
||||
|
||||
int32x4_t outh1 = vmull_s16(vget_high_s16(indata1), vol);
|
||||
int32x4_t outh2 = vmull_s16(vget_high_s16(indata2), vol);
|
||||
int32x4_t outl1 = vmull_s16(vget_low_s16(indata1), vol);
|
||||
int32x4_t outl2 = vmull_s16(vget_low_s16(indata2), vol);
|
||||
int32x4_t outh1 = vmull_s16(vget_high_s16(indata1), vol);
|
||||
int32x4_t outh2 = vmull_s16(vget_high_s16(indata2), vol);
|
||||
int32x4_t outl1 = vmull_s16(vget_low_s16(indata1), vol);
|
||||
int32x4_t outl2 = vmull_s16(vget_low_s16(indata2), vol);
|
||||
|
||||
int16x8_t outdata1 = vcombine_s16(vqshrn_n_s32(outl1, 15), vqshrn_n_s32(outh1, 15));
|
||||
int16x8_t outdata2 = vcombine_s16(vqshrn_n_s32(outl2, 15), vqshrn_n_s32(outh2, 15));
|
||||
vst1q_s16(out, outdata1);
|
||||
vst1q_s16(out + 8, outdata2);
|
||||
in += 16;
|
||||
out += 16;
|
||||
size -= 16;
|
||||
int16x8_t outdata1 = vcombine_s16(vqshrn_n_s32(outl1, 15), vqshrn_n_s32(outh1, 15));
|
||||
int16x8_t outdata2 = vcombine_s16(vqshrn_n_s32(outl2, 15), vqshrn_n_s32(outh2, 15));
|
||||
vst1q_s16(out, outdata1);
|
||||
vst1q_s16(out + 8, outdata2);
|
||||
in += 16;
|
||||
out += 16;
|
||||
size -= 16;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < size; i += 2) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user