mirror of
https://github.com/libretro/RetroArch.git
synced 2025-01-24 18:36:31 +00:00
Split these functions out. ctult needs them for netplay.
This commit is contained in:
parent
361879bc22
commit
e71d8f852a
510
rewind.c
510
rewind.c
@ -19,7 +19,6 @@
|
||||
#include "rewind.h"
|
||||
#include "performance.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <retro_inline.h>
|
||||
#include "intl/intl.h"
|
||||
@ -63,6 +62,260 @@ repeat {
|
||||
size thisstart;
|
||||
#endif
|
||||
|
||||
/* Returns an upper bound, in bytes, on the size of a patch that
 * state_manager_raw_compress() can emit for a state of 'uncomp' bytes.
 *
 * The bound is: the state size rounded up to whole uint16_t units,
 * plus two uint16_t of header per compressed block,
 * plus three uint16_t for the terminator record. */
size_t state_manager_raw_maxsize(size_t uncomp)
{
   /* Bytes covered by a single compressed block. */
   const size_t maxcblkcover = UINT16_MAX * sizeof(uint16_t);
   /* Uncompressed size, rounded up to a multiple of sizeof(uint16_t).
    * The mask has to clear the low bit, i.e. ~(size - 1); using ~size
    * would clear bit 1 instead and round e.g. 2 down to 1. */
   size_t uncomp16           = (uncomp + sizeof(uint16_t) - 1)
                               & ~(sizeof(uint16_t) - 1);
   /* Number of blocks needed to cover the state. */
   size_t maxcblks           = (uncomp + maxcblkcover - 1) / maxcblkcover;

   return uncomp16
      + maxcblks * sizeof(uint16_t) * 2 /* two u16 overhead per block */
      + sizeof(uint16_t) * 3;           /* three u16 to end it */
}
|
||||
|
||||
/* Allocates a zero-filled state buffer of at least 'len' bytes, laid out
 * for use with state_manager_raw_compress().
 *
 * Returns NULL if the allocation fails. Release the buffer with free(). */
void *state_manager_raw_alloc(size_t len, uint16_t uniq)
{
   /* Round up to a whole number of uint16_t; the mask must be
    * ~(size - 1) so that only the low bit is cleared. */
   size_t len16  = (len + sizeof(uint16_t) - 1) & ~(sizeof(uint16_t) - 1);
   uint16_t *ret = (uint16_t*)calloc(len16 + sizeof(uint16_t) * 4 + 16, 1);

   if (!ret)
      return NULL;

   /* Force in a different byte at the end, so we don't need to check
    * bounds in the innermost loop (it's expensive).
    *
    * There is also a large amount of data that's the same, to stop
    * the other scan.
    *
    * There is also some padding at the end. This is so we don't
    * read outside the buffer end if we're reading in large blocks;
    *
    * It doesn't make any difference to us, but sacrificing 16 bytes to get
    * Valgrind happy is worth it. */
   ret[len16 / sizeof(uint16_t) + 3] = uniq;

   return ret;
}
|
||||
|
||||
#if __SSE2__
|
||||
#if defined(__GNUC__)
|
||||
/* Number of trailing zero bits in x, via the compiler builtin.
 * NOTE(review): the GCC documentation leaves the result undefined
 * for x == 0 - confirm callers never pass zero. */
static INLINE int compat_ctz(unsigned x)
{
   const int trailing_zeros = __builtin_ctz(x);

   return trailing_zeros;
}
|
||||
#else
|
||||
|
||||
/* Only checks at nibble granularity,
|
||||
* because that's what we need. */
|
||||
|
||||
static INLINE int compat_ctz(unsigned x)
|
||||
{
|
||||
if (x & 0x000f)
|
||||
return 0;
|
||||
if (x & 0x00f0)
|
||||
return 4;
|
||||
if (x & 0x0f00)
|
||||
return 8;
|
||||
if (x & 0xf000)
|
||||
return 12;
|
||||
return 16;
|
||||
}
|
||||
#endif
|
||||
|
||||
#include <emmintrin.h>
|
||||
/* There's no equivalent in libc, you'd think so ...
|
||||
* std::mismatch exists, but it's not optimized at all. */
|
||||
|
||||
/* Returns the index (in uint16_t units) of the first element at which
 * 'a' and 'b' differ, comparing 16 bytes per iteration.
 *
 * There is no bounds check: the loop only stops once a difference is
 * found, so the caller must guarantee that one exists. */
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
   const __m128i *va = (const __m128i*)a;
   const __m128i *vb = (const __m128i*)b;

   for (;; va++, vb++)
   {
      __m128i lhs          = _mm_loadu_si128(va);
      __m128i rhs          = _mm_loadu_si128(vb);
      /* One bit per byte; the compare works on 32-bit lanes,
       * so bits come in groups of four. */
      uint32_t equal_bytes = _mm_movemask_epi8(_mm_cmpeq_epi32(lhs, rhs));
      size_t idx;

      if (equal_bytes == 0xffff)
         continue; /* All 16 bytes identical, keep scanning. */

      /* Something has changed, figure out where: byte offset of this
       * vector, combined with the byte offset of the first mismatching
       * 32-bit lane, converted to a uint16_t index. */
      idx = (((uint8_t*)va - (uint8_t*)a) |
            (compat_ctz(~equal_bytes))) >> 1;

      /* The lane holds two uint16_t; if the first one matches,
       * the difference is in the second. */
      return idx | (a[idx] == b[idx]);
   }
}
|
||||
#else
|
||||
/* Returns the index (in uint16_t units) of the first element at which
 * 'a' and 'b' differ.
 *
 * There is no bounds check: the scan only stops at a difference,
 * so the caller must guarantee that one exists. */
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
   const uint16_t *start = a;
#ifdef NO_UNALIGNED_MEM
   /* Step one element at a time until 'a' is size_t-aligned
    * (or a difference shows up first). */
   for (; ((uintptr_t)a & (sizeof(size_t) - 1)) && *a == *b; a++, b++)
      ;

   if (*a == *b)
#endif
   {
      const size_t *wide_a = (const size_t*)a;
      const size_t *wide_b = (const size_t*)b;

      /* Coarse scan, one size_t at a time. */
      for (; *wide_a == *wide_b; wide_a++, wide_b++)
         ;

      a = (const uint16_t*)wide_a;
      b = (const uint16_t*)wide_b;

      /* Fine scan inside the size_t that differed. */
      for (; *a == *b; a++, b++)
         ;
   }

   return a - start;
}
|
||||
#endif
|
||||
|
||||
/* Returns the number of uint16_t elements, counted from the start of
 * 'a'/'b', before the two buffers agree again.
 *
 * There is no bounds check: the scan only stops once a matching
 * 32-bit word is found, so the caller must guarantee that one exists. */
static INLINE size_t find_same(const uint16_t *a, const uint16_t *b)
{
   const uint16_t *start = a;
#ifdef NO_UNALIGNED_MEM
   /* Consume one differing element if that is needed to reach
    * uint32_t alignment. */
   if (((uintptr_t)a & (sizeof(uint32_t) - 1)) && *a != *b)
   {
      a += 1;
      b += 1;
   }

   if (*a != *b)
#endif
   {
      /* With this, it's random whether two consecutive identical
       * words are caught.
       *
       * Luckily, compression rate is the same for both cases, and
       * three is always caught.
       *
       * (We prefer to miss two-word blocks, anyways; fewer iterations
       * of the outer loop, as well as in the decompressor.) */
      const uint32_t *wide_a = (const uint32_t*)a;
      const uint32_t *wide_b = (const uint32_t*)b;

      for (; *wide_a != *wide_b; wide_a++, wide_b++)
         ;

      a = (const uint16_t*)wide_a;
      b = (const uint16_t*)wide_b;

      /* The element just before the matching 32-bit word may
       * already match; if so, the identical run starts there. */
      if (a != start && a[-1] == b[-1])
      {
         a -= 1;
         b -= 1;
      }
   }

   return a - start;
}
|
||||
|
||||
/* Writes a delta between the states 'src' and 'dst' into 'patch' and
 * returns the number of bytes written.
 *
 * Patch layout, as a sequence of native-endian uint16_t records:
 *   - { changed != 0, skip, changed x data }: advance 'skip' elements,
 *     then overwrite 'changed' elements with the stored data.
 *   - { 0, skip_lo, skip_hi }: advance a 32-bit number of elements.
 *   - { 0, 0, 0 }: end of patch.
 *
 * No bounds are checked while scanning; this relies on the sentinel and
 * padding that state_manager_raw_alloc() places after the data.
 *
 * NOTE(review): the stored data words are copied from 'src', so applying
 * the patch to a buffer holding 'dst' reproduces 'src'. The header
 * comment describes the opposite direction - confirm which is intended. */
size_t state_manager_raw_compress(const void *src, const void *dst, size_t len, void *patch)
{
   const uint16_t *old16  = (const uint16_t*)src;
   const uint16_t *new16  = (const uint16_t*)dst;
   uint16_t *compressed16 = (uint16_t*)patch;
   size_t num16s          = (len + sizeof(uint16_t) - 1) / sizeof(uint16_t);

   while (num16s)
   {
      size_t i;
      size_t changed;
      size_t skip = find_change(old16, new16);

      /* The next difference is the end sentinel: nothing left to encode. */
      if (skip >= num16s)
         break;

      /* A skip record holds at most 32 bits. Clamp BEFORE advancing, so
       * the distance recorded in the patch matches the distance actually
       * consumed; the remainder is picked up by the next iteration.
       *
       * This only hits on 8GB unchanged data anyways, and if you're
       * doing that, you've got bigger problems. */
      if (skip > UINT32_MAX)
         skip = UINT32_MAX;

      old16  += skip;
      new16  += skip;
      num16s -= skip;

      if (skip > UINT16_MAX)
      {
         /* Too far for the 16-bit skip field; emit a long skip record. */
         *compressed16++ = 0;
         *compressed16++ = (uint16_t)skip;
         *compressed16++ = (uint16_t)(skip >> 16);
         continue;
      }

      changed = find_same(old16, new16);
      if (changed > UINT16_MAX)
         changed = UINT16_MAX;

      *compressed16++ = (uint16_t)changed;
      *compressed16++ = (uint16_t)skip;

      for (i = 0; i < changed; i++)
         compressed16[i] = old16[i];

      old16        += changed;
      new16        += changed;
      num16s       -= changed;
      compressed16 += changed;
   }

   /* Terminator record. */
   compressed16[0] = 0;
   compressed16[1] = 0;
   compressed16[2] = 0;

   return (size_t)((uint8_t*)(compressed16 + 3) - (uint8_t*)patch);
}
|
||||
|
||||
/* Applies 'patch' (as produced by state_manager_raw_compress()) to
 * 'data' in place.
 *
 * The patch is read as native-endian uint16_t records until the
 * all-zero terminator record is reached. 'patchlen' and 'datalen' are
 * currently unused: no bounds are checked, so a patch that does not
 * match 'data' can read or write out of range. */
void state_manager_raw_decompress(const void *patch, size_t patchlen, void *data, size_t datalen)
{
   uint16_t *out16         = (uint16_t*)data;
   const uint16_t *patch16 = (const uint16_t*)patch;

   (void)patchlen;
   (void)datalen;

   for (;;)
   {
      uint16_t i;
      uint16_t numchanged = *(patch16++);

      if (numchanged)
      {
         /* Short record: skip count, then 'numchanged' data words. */
         out16 += *patch16++;

         /* We could do memcpy, but it seems that memcpy has a
          * constant-per-call overhead that actually shows up.
          *
          * Our average size in here seems to be 8 or something.
          * Therefore, we do something with lower overhead. */
         for (i = 0; i < numchanged; i++)
            out16[i] = patch16[i];

         patch16 += numchanged;
         out16   += numchanged;
      }
      else
      {
         /* Long skip record. Widen to uint32_t before shifting: the
          * operand would otherwise be promoted to (signed) int, and
          * shifting a value >= 0x8000 left by 16 overflows it. */
         uint32_t numunchanged = (uint32_t)patch16[0]
                                 | ((uint32_t)patch16[1] << 16);

         /* A zero-length skip marks the end of the patch. */
         if (!numunchanged)
            break;

         patch16 += 2;
         out16   += numunchanged;
      }
   }
}
|
||||
|
||||
/* The start offsets point to 'nextstart' of any given compressed frame.
|
||||
* Each uint16 is stored native endian; anything that claims any other
|
||||
* endianness refers to the endianness of this specific item.
|
||||
@ -125,46 +378,21 @@ struct state_manager
|
||||
|
||||
state_manager_t *state_manager_new(size_t state_size, size_t buffer_size)
|
||||
{
|
||||
size_t newblocksize;
|
||||
int maxcblks;
|
||||
const int maxcblkcover = UINT16_MAX * sizeof(uint16_t);
|
||||
state_manager_t *state = (state_manager_t*)calloc(1, sizeof(*state));
|
||||
|
||||
if (!state)
|
||||
return NULL;
|
||||
|
||||
newblocksize = ((state_size - 1) | (sizeof(uint16_t) - 1)) + 1;
|
||||
state->blocksize = newblocksize;
|
||||
|
||||
maxcblks = (state->blocksize + maxcblkcover - 1) / maxcblkcover;
|
||||
state->maxcompsize = state->blocksize + maxcblks * sizeof(uint16_t) * 2 +
|
||||
sizeof(uint16_t) + sizeof(uint32_t) + sizeof(size_t) * 2;
|
||||
|
||||
state->blocksize = (state_size + sizeof(uint16_t) - 1) & ~sizeof(uint16_t);
|
||||
/* the compressed data is surrounded by pointers to the other side */
|
||||
state->maxcompsize = state_manager_raw_maxsize(state_size) + sizeof(size_t) * 2;
|
||||
state->data = (uint8_t*)malloc(buffer_size);
|
||||
|
||||
state->thisblock = (uint8_t*)
|
||||
calloc(state->blocksize + sizeof(uint16_t) * 4 + 16, 1);
|
||||
state->nextblock = (uint8_t*)
|
||||
calloc(state->blocksize + sizeof(uint16_t) * 4 + 16, 1);
|
||||
state->thisblock = (uint8_t*)state_manager_raw_alloc(state_size, 0);
|
||||
state->nextblock = (uint8_t*)state_manager_raw_alloc(state_size, 1);
|
||||
if (!state->data || !state->thisblock || !state->nextblock)
|
||||
goto error;
|
||||
|
||||
/* Force in a different byte at the end, so we don't need to check
|
||||
* bounds in the innermost loop (it's expensive).
|
||||
*
|
||||
* There is also a large amount of data that's the same, to stop
|
||||
* the other scan.
|
||||
*
|
||||
* There is also some padding at the end. This is so we don't
|
||||
* read outside the buffer end if we're reading in large blocks;
|
||||
*
|
||||
* It doesn't make any difference to us, but sacrificing 16 bytes to get
|
||||
* Valgrind happy is worth it. */
|
||||
*(uint16_t*)(state->thisblock + state->blocksize + sizeof(uint16_t) * 3) =
|
||||
0xFFFF;
|
||||
*(uint16_t*)(state->nextblock + state->blocksize + sizeof(uint16_t) * 3) =
|
||||
0x0000;
|
||||
|
||||
state->capacity = buffer_size;
|
||||
|
||||
state->head = state->data + sizeof(size_t);
|
||||
@ -215,42 +443,7 @@ bool state_manager_pop(state_manager_t *state, const void **data)
|
||||
compressed = state->data + start + sizeof(size_t);
|
||||
out = state->thisblock;
|
||||
|
||||
/* Begin decompression code
|
||||
* out is the last pushed (or returned) state */
|
||||
compressed16 = (const uint16_t*)compressed;
|
||||
out16 = (uint16_t*)out;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
uint16_t i;
|
||||
uint16_t numchanged = *(compressed16++);
|
||||
|
||||
if (numchanged)
|
||||
{
|
||||
out16 += *compressed16++;
|
||||
|
||||
/* We could do memcpy, but it seems that memcpy has a
|
||||
* constant-per-call overhead that actually shows up.
|
||||
*
|
||||
* Our average size in here seems to be 8 or something.
|
||||
* Therefore, we do something with lower overhead. */
|
||||
for (i = 0; i < numchanged; i++)
|
||||
out16[i] = compressed16[i];
|
||||
|
||||
compressed16 += numchanged;
|
||||
out16 += numchanged;
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32_t numunchanged = compressed16[0] | (compressed16[1] << 16);
|
||||
|
||||
if (!numunchanged)
|
||||
break;
|
||||
compressed16 += 2;
|
||||
out16 += numunchanged;
|
||||
}
|
||||
}
|
||||
/* End decompression code */
|
||||
state_manager_raw_decompress(compressed, state->maxcompsize, out, state->blocksize);
|
||||
|
||||
state->entries--;
|
||||
*data = state->thisblock;
|
||||
@ -276,132 +469,6 @@ void state_manager_push_where(state_manager_t *state, void **data)
|
||||
*data = state->nextblock;
|
||||
}
|
||||
|
||||
#if __SSE2__
|
||||
#if defined(__GNUC__)
|
||||
/* Number of trailing zero bits in x, via the compiler builtin.
 * NOTE(review): the GCC documentation leaves the result undefined
 * for x == 0 - confirm callers never pass zero. */
static INLINE int compat_ctz(unsigned x)
{
   const int trailing_zeros = __builtin_ctz(x);

   return trailing_zeros;
}
|
||||
#else
|
||||
|
||||
/* Only checks at nibble granularity,
|
||||
* because that's what we need. */
|
||||
|
||||
static INLINE int compat_ctz(unsigned x)
|
||||
{
|
||||
if (x & 0x000f)
|
||||
return 0;
|
||||
if (x & 0x00f0)
|
||||
return 4;
|
||||
if (x & 0x0f00)
|
||||
return 8;
|
||||
if (x & 0xf000)
|
||||
return 12;
|
||||
return 16;
|
||||
}
|
||||
#endif
|
||||
|
||||
#include <emmintrin.h>
|
||||
/* There's no equivalent in libc, you'd think so ...
|
||||
* std::mismatch exists, but it's not optimized at all. */
|
||||
|
||||
/* Returns the index (in uint16_t units) of the first element at which
 * 'a' and 'b' differ, comparing 16 bytes per iteration.
 *
 * There is no bounds check: the loop only stops once a difference is
 * found, so the caller must guarantee that one exists. */
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
   const __m128i *va = (const __m128i*)a;
   const __m128i *vb = (const __m128i*)b;

   for (;; va++, vb++)
   {
      __m128i lhs          = _mm_loadu_si128(va);
      __m128i rhs          = _mm_loadu_si128(vb);
      /* One bit per byte; the compare works on 32-bit lanes,
       * so bits come in groups of four. */
      uint32_t equal_bytes = _mm_movemask_epi8(_mm_cmpeq_epi32(lhs, rhs));
      size_t idx;

      if (equal_bytes == 0xffff)
         continue; /* All 16 bytes identical, keep scanning. */

      /* Something has changed, figure out where: byte offset of this
       * vector, combined with the byte offset of the first mismatching
       * 32-bit lane, converted to a uint16_t index. */
      idx = (((uint8_t*)va - (uint8_t*)a) |
            (compat_ctz(~equal_bytes))) >> 1;

      /* The lane holds two uint16_t; if the first one matches,
       * the difference is in the second. */
      return idx | (a[idx] == b[idx]);
   }
}
|
||||
#else
|
||||
/* Returns the index (in uint16_t units) of the first element at which
 * 'a' and 'b' differ.
 *
 * There is no bounds check: the scan only stops at a difference,
 * so the caller must guarantee that one exists. */
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
   const uint16_t *start = a;
#ifdef NO_UNALIGNED_MEM
   /* Step one element at a time until 'a' is size_t-aligned
    * (or a difference shows up first). */
   for (; ((uintptr_t)a & (sizeof(size_t) - 1)) && *a == *b; a++, b++)
      ;

   if (*a == *b)
#endif
   {
      const size_t *wide_a = (const size_t*)a;
      const size_t *wide_b = (const size_t*)b;

      /* Coarse scan, one size_t at a time. */
      for (; *wide_a == *wide_b; wide_a++, wide_b++)
         ;

      a = (const uint16_t*)wide_a;
      b = (const uint16_t*)wide_b;

      /* Fine scan inside the size_t that differed. */
      for (; *a == *b; a++, b++)
         ;
   }

   return a - start;
}
|
||||
#endif
|
||||
|
||||
/* Returns the number of uint16_t elements, counted from the start of
 * 'a'/'b', before the two buffers agree again.
 *
 * There is no bounds check: the scan only stops once a matching
 * 32-bit word is found, so the caller must guarantee that one exists. */
static INLINE size_t find_same(const uint16_t *a, const uint16_t *b)
{
   const uint16_t *start = a;
#ifdef NO_UNALIGNED_MEM
   /* Consume one differing element if that is needed to reach
    * uint32_t alignment. */
   if (((uintptr_t)a & (sizeof(uint32_t) - 1)) && *a != *b)
   {
      a += 1;
      b += 1;
   }

   if (*a != *b)
#endif
   {
      /* With this, it's random whether two consecutive identical
       * words are caught.
       *
       * Luckily, compression rate is the same for both cases, and
       * three is always caught.
       *
       * (We prefer to miss two-word blocks, anyways; fewer iterations
       * of the outer loop, as well as in the decompressor.) */
      const uint32_t *wide_a = (const uint32_t*)a;
      const uint32_t *wide_b = (const uint32_t*)b;

      for (; *wide_a != *wide_b; wide_a++, wide_b++)
         ;

      a = (const uint16_t*)wide_a;
      b = (const uint16_t*)wide_b;

      /* The element just before the matching 32-bit word may
       * already match; if so, the identical run starts there. */
      if (a != start && a[-1] == b[-1])
      {
         a -= 1;
         b -= 1;
      }
   }

   return a - start;
}
|
||||
|
||||
void state_manager_push_do(state_manager_t *state)
|
||||
{
|
||||
uint8_t *swap = NULL;
|
||||
@ -438,62 +505,7 @@ recheckcapacity:;
|
||||
newb = state->nextblock;
|
||||
compressed = state->head + sizeof(size_t);
|
||||
|
||||
/* Begin compression code; 'compressed' will point to
|
||||
* the end of the compressed data (excluding the prev pointer). */
|
||||
old16 = (const uint16_t*)oldb;
|
||||
new16 = (const uint16_t*)newb;
|
||||
compressed16 = (uint16_t*)compressed;
|
||||
num16s = state->blocksize / sizeof(uint16_t);
|
||||
|
||||
while (num16s)
|
||||
{
|
||||
size_t i;
|
||||
size_t skip = find_change(old16, new16);
|
||||
|
||||
if (skip >= num16s)
|
||||
break;
|
||||
|
||||
old16 += skip;
|
||||
new16 += skip;
|
||||
num16s -= skip;
|
||||
|
||||
if (skip > UINT16_MAX)
|
||||
{
|
||||
if (skip > UINT32_MAX)
|
||||
{
|
||||
/* This will make it scan the entire thing again,
|
||||
* but it only hits on 8GB unchanged data anyways,
|
||||
* and if you're doing that, you've got bigger problems. */
|
||||
skip = UINT32_MAX;
|
||||
}
|
||||
*compressed16++ = 0;
|
||||
*compressed16++ = skip;
|
||||
*compressed16++ = skip >> 16;
|
||||
skip = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t changed = find_same(old16, new16);
|
||||
if (changed > UINT16_MAX)
|
||||
changed = UINT16_MAX;
|
||||
|
||||
*compressed16++ = changed;
|
||||
*compressed16++ = skip;
|
||||
|
||||
for (i = 0; i < changed; i++)
|
||||
compressed16[i] = old16[i];
|
||||
|
||||
old16 += changed;
|
||||
new16 += changed;
|
||||
num16s -= changed;
|
||||
compressed16 += changed;
|
||||
}
|
||||
|
||||
compressed16[0] = 0;
|
||||
compressed16[1] = 0;
|
||||
compressed16[2] = 0;
|
||||
compressed = (uint8_t*)(compressed16 + 3);
|
||||
/* End compression code. */
|
||||
compressed += state_manager_raw_compress(oldb, newb, state->blocksize, compressed);
|
||||
|
||||
if (compressed - state->data + state->maxcompsize > state->capacity)
|
||||
{
|
||||
|
26
rewind.h
26
rewind.h
@ -24,6 +24,7 @@ extern "C" {
|
||||
|
||||
#include <stddef.h>
|
||||
#include <boolean.h>
|
||||
#include <stdint.h>
|
||||
|
||||
typedef struct state_manager state_manager_t;
|
||||
|
||||
@ -42,6 +43,31 @@ void state_manager_capacity(state_manager_t *state,
|
||||
|
||||
void init_rewind(void);
|
||||
|
||||
|
||||
/* Returns the maximum compressed size of a savestate. It is very likely to compress to far less. */
|
||||
size_t state_manager_raw_maxsize(size_t uncomp);
|
||||
|
||||
/*
|
||||
* See state_manager_raw_compress for information about this.
|
||||
* When you're done with it, send it to free().
|
||||
*/
|
||||
void *state_manager_raw_alloc(size_t len, uint16_t uniq);
|
||||
|
||||
/*
|
||||
* Takes two savestates and creates a patch that turns 'src' into 'dst'.
|
||||
* Both 'src' and 'dst' must be returned from state_manager_raw_alloc(), with the same 'len', and different 'uniq'.
|
||||
* 'patch' must be size 'state_manager_raw_maxsize(len)' or more.
|
||||
* Returns the number of bytes actually written to 'patch'.
|
||||
*/
|
||||
size_t state_manager_raw_compress(const void *src, const void *dst, size_t len, void *patch);
|
||||
|
||||
/*
|
||||
* Takes 'patch' from a previous call to 'state_manager_raw_compress' and applies it to 'data' ('src' from that call),
|
||||
* yielding 'dst' in that call.
|
||||
* If the given arguments do not match a previous call to state_manager_raw_compress(), anything at all can happen.
|
||||
*/
|
||||
void state_manager_raw_decompress(const void *patch, size_t patchlen, void *data, size_t datalen);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user