mirror of
https://github.com/libretro/RetroArch.git
synced 2024-11-24 00:20:01 +00:00
Optimize state_manager_push a bit.
Avoid a redundant memcpy and add an SSE2-accelerated memory diff search. Might get replaced by Alcaro's implementation soon though.
This commit is contained in:
parent
8536488955
commit
d896d0f6e4
@ -2214,7 +2214,10 @@ static void check_rewind(void)
|
||||
if (cnt == 0)
|
||||
#endif
|
||||
{
|
||||
RARCH_PERFORMANCE_INIT(rewind_serialize);
|
||||
RARCH_PERFORMANCE_START(rewind_serialize);
|
||||
pretro_serialize(g_extern.state_buf, g_extern.state_size);
|
||||
RARCH_PERFORMANCE_STOP(rewind_serialize);
|
||||
state_manager_push(g_extern.state_manager, g_extern.state_buf);
|
||||
}
|
||||
}
|
||||
|
65
rewind.c
65
rewind.c
@ -14,6 +14,7 @@
|
||||
*/
|
||||
|
||||
#include "rewind.h"
|
||||
#include "performance.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include "boolean.h"
|
||||
@ -147,11 +148,45 @@ static void reassign_bottom(state_manager_t *state)
|
||||
state->bottom_ptr = (state->bottom_ptr + 1) & state->buf_size_mask;
|
||||
}
|
||||
|
||||
#if __SSE2__
|
||||
#include <emmintrin.h>
|
||||
// libc has no equivalent (you'd think it would); C++ has std::mismatch, but it's not optimized at all. :(
|
||||
// Returns the index of the first 32-bit word at which a and b differ,
// or samples if the first `samples` words of both buffers are equal.
// SSE2 path: compares four words per iteration, then finishes the tail word by word.
static unsigned find_mismatch(const uint32_t *a, const uint32_t *b, unsigned samples)
{
   unsigned pos = 0;
   const unsigned vec_end = samples & ~3u; // Largest multiple of four not exceeding samples.

   while (pos < vec_end)
   {
      // Unaligned loads, so neither buffer needs 16-byte alignment.
      const __m128i lhs = _mm_loadu_si128((const __m128i*)(a + pos));
      const __m128i rhs = _mm_loadu_si128((const __m128i*)(b + pos));
      const uint32_t eq_bits = _mm_movemask_epi8(_mm_cmpeq_epi32(lhs, rhs));

      // Every equal word sets four mask bits; anything but 0xffff means some word differs.
      // The lowest cleared bit, divided by four, is the index of that word within the vector.
      if (eq_bits != 0xffff)
         return pos + (__builtin_ctz(~eq_bits) >> 2);

      pos += 4;
   }

   // Remaining zero to three words that do not fill a whole vector.
   while (pos < samples)
   {
      if (a[pos] != b[pos])
         return pos;
      pos++;
   }

   return samples;
}
|
||||
#else
|
||||
// Portable fallback used when SSE2 is unavailable.
// Returns the index of the first 32-bit word at which a and b differ,
// or samples if the first `samples` words of both buffers are equal.
static unsigned find_mismatch(const uint32_t *a, const uint32_t *b, unsigned samples)
{
   unsigned pos = 0;

   // Advance while the words match; stops at the first difference or at the end.
   while (pos < samples && a[pos] == b[pos])
      pos++;

   return pos;
}
|
||||
#endif
|
||||
|
||||
static void generate_delta(state_manager_t *state, const void *data)
|
||||
{
|
||||
uint64_t i;
|
||||
size_t i;
|
||||
bool crossed = false;
|
||||
const uint32_t *old_state = state->tmp_state;
|
||||
uint32_t *old_state = state->tmp_state;
|
||||
const uint32_t *new_state = (const uint32_t*)data;
|
||||
|
||||
state->buffer[state->top_ptr++] = 0; // For each separate delta, we have a 0 value sentinel in between.
|
||||
@ -163,20 +198,25 @@ static void generate_delta(state_manager_t *state, const void *data)
|
||||
|
||||
for (i = 0; i < state->state_size; i++)
|
||||
{
|
||||
uint64_t xor_ = old_state[i] ^ new_state[i];
|
||||
unsigned avail = state->state_size - i;
|
||||
unsigned pos = find_mismatch(old_state + i, new_state + i, avail);
|
||||
if (pos == avail)
|
||||
break;
|
||||
|
||||
i += pos;
|
||||
|
||||
// If the data differs (xor != 0), we push that xor on the stack with index and xor.
|
||||
// This can be reversed by reapplying the xor.
|
||||
// Thus, if states don't really differ much, we'll save lots of space :)
|
||||
// Hopefully this will work really well with save states.
|
||||
if (xor_)
|
||||
{
|
||||
state->buffer[state->top_ptr] = (i << 32) | xor_;
|
||||
state->top_ptr = (state->top_ptr + 1) & state->buf_size_mask;
|
||||
uint32_t xor_ = old_state[i] ^ new_state[i];
|
||||
old_state[i] = new_state[i];
|
||||
|
||||
if (state->top_ptr == state->bottom_ptr)
|
||||
crossed = true;
|
||||
}
|
||||
state->buffer[state->top_ptr] = ((uint64_t)i << 32) | xor_;
|
||||
state->top_ptr = (state->top_ptr + 1) & state->buf_size_mask;
|
||||
|
||||
if (state->top_ptr == state->bottom_ptr)
|
||||
crossed = true;
|
||||
}
|
||||
|
||||
if (crossed)
|
||||
@ -185,8 +225,11 @@ static void generate_delta(state_manager_t *state, const void *data)
|
||||
|
||||
bool state_manager_push(state_manager_t *state, const void *data)
|
||||
{
|
||||
RARCH_PERFORMANCE_INIT(gen_delta);
|
||||
RARCH_PERFORMANCE_START(gen_delta);
|
||||
generate_delta(state, data);
|
||||
memcpy(state->tmp_state, data, state->state_size * sizeof(uint32_t));
|
||||
RARCH_PERFORMANCE_STOP(gen_delta);
|
||||
|
||||
state->first_pop = true;
|
||||
|
||||
return true;
|
||||
|
Loading…
Reference in New Issue
Block a user