Optimize state_manager_push a bit.

Avoid a redundant memcpy and add an SSE2-accelerated memory diff search.
Might get replaced by Alcaro's implementation soon though.
This commit is contained in:
Themaister 2014-02-22 13:13:18 +01:00
parent 8536488955
commit d896d0f6e4
2 changed files with 57 additions and 11 deletions

View File

@ -2214,7 +2214,10 @@ static void check_rewind(void)
if (cnt == 0)
#endif
{
RARCH_PERFORMANCE_INIT(rewind_serialize);
RARCH_PERFORMANCE_START(rewind_serialize);
pretro_serialize(g_extern.state_buf, g_extern.state_size);
RARCH_PERFORMANCE_STOP(rewind_serialize);
state_manager_push(g_extern.state_manager, g_extern.state_buf);
}
}

View File

@ -14,6 +14,7 @@
*/
#include "rewind.h"
#include "performance.h"
#include <stdlib.h>
#include <stdint.h>
#include "boolean.h"
@ -147,11 +148,45 @@ static void reassign_bottom(state_manager_t *state)
state->bottom_ptr = (state->bottom_ptr + 1) & state->buf_size_mask;
}
#if __SSE2__
#include <emmintrin.h>
// You'd think libc would have an equivalent, but it doesn't. C++ has std::mismatch, but it's not optimized at all. :(
// SSE2 variant: returns the index of the first 32-bit word where a and b differ,
// or samples when the two ranges are identical.
// Words are compared four at a time with unaligned loads; any leftover
// (samples % 4) words are compared one by one.
static unsigned find_mismatch(const uint32_t *a, const uint32_t *b, unsigned samples)
{
   unsigned idx = 0;
   const unsigned vec_end = samples & ~3; // Largest multiple of 4 not exceeding samples.

   while (idx < vec_end)
   {
      const __m128i lhs = _mm_loadu_si128((const __m128i*)(a + idx));
      const __m128i rhs = _mm_loadu_si128((const __m128i*)(b + idx));

      // One bit per byte; a 32-bit lane that compares equal contributes four set bits.
      uint32_t eq_bits = _mm_movemask_epi8(_mm_cmpeq_epi32(lhs, rhs));

      // The lowest cleared bit belongs to the first differing lane; 4 bits per lane.
      if (eq_bits != 0xffff)
         return idx + (__builtin_ctz(~eq_bits) >> 2);

      idx += 4;
   }

   // Scalar tail for the remaining words.
   while (idx < samples && a[idx] == b[idx])
      idx++;

   return idx;
}
#else
// Portable scalar fallback: returns the index of the first 32-bit word where
// a and b differ, or samples when the two ranges are identical.
static unsigned find_mismatch(const uint32_t *a, const uint32_t *b, unsigned samples)
{
   unsigned pos = 0;

   // Advance while still in range and the words match.
   while (pos < samples && a[pos] == b[pos])
      pos++;

   return pos;
}
#endif
static void generate_delta(state_manager_t *state, const void *data)
{
uint64_t i;
size_t i;
bool crossed = false;
const uint32_t *old_state = state->tmp_state;
uint32_t *old_state = state->tmp_state;
const uint32_t *new_state = (const uint32_t*)data;
state->buffer[state->top_ptr++] = 0; // For each separate delta, we have a 0 value sentinel in between.
@ -163,20 +198,25 @@ static void generate_delta(state_manager_t *state, const void *data)
for (i = 0; i < state->state_size; i++)
{
uint64_t xor_ = old_state[i] ^ new_state[i];
unsigned avail = state->state_size - i;
unsigned pos = find_mismatch(old_state + i, new_state + i, avail);
if (pos == avail)
break;
i += pos;
// If the data differs (xor != 0), we push that xor on the stack with index and xor.
// This can be reversed by reapplying the xor.
// Thus, if states don't really differ much, we'll save lots of space :)
// Hopefully this will work really well with save states.
if (xor_)
{
state->buffer[state->top_ptr] = (i << 32) | xor_;
state->top_ptr = (state->top_ptr + 1) & state->buf_size_mask;
uint32_t xor_ = old_state[i] ^ new_state[i];
old_state[i] = new_state[i];
if (state->top_ptr == state->bottom_ptr)
crossed = true;
}
state->buffer[state->top_ptr] = ((uint64_t)i << 32) | xor_;
state->top_ptr = (state->top_ptr + 1) & state->buf_size_mask;
if (state->top_ptr == state->bottom_ptr)
crossed = true;
}
if (crossed)
@ -185,8 +225,11 @@ static void generate_delta(state_manager_t *state, const void *data)
bool state_manager_push(state_manager_t *state, const void *data)
{
RARCH_PERFORMANCE_INIT(gen_delta);
RARCH_PERFORMANCE_START(gen_delta);
generate_delta(state, data);
memcpy(state->tmp_state, data, state->state_size * sizeof(uint32_t));
RARCH_PERFORMANCE_STOP(gen_delta);
state->first_pop = true;
return true;