Split these functions out; netplay needs them.

Alcaro 2015-06-27 04:53:37 +02:00
parent 361879bc22
commit e71d8f852a
2 changed files with 287 additions and 249 deletions

rewind.c

@@ -19,7 +19,6 @@
#include "rewind.h"
#include "performance.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <retro_inline.h>
#include "intl/intl.h"
@@ -63,6 +62,260 @@ repeat {
size thisstart;
#endif
size_t state_manager_raw_maxsize(size_t uncomp)
{
const int maxcblkcover = UINT16_MAX * sizeof(uint16_t); /* bytes covered by a compressed block */
size_t uncomp16 = (uncomp + sizeof(uint16_t) - 1) & ~(sizeof(uint16_t) - 1); /* uncompressed size, rounded up to a whole number of u16s */
size_t maxcblks = (uncomp + maxcblkcover - 1) / maxcblkcover; /* number of blocks */
return uncomp16 + maxcblks * sizeof(uint16_t)*2 /* two u16 overhead per block */ + sizeof(uint16_t)*3; /* three u16 to end it */
}
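/* Worked example (hypothetical size, not from the commit): for
 * uncomp = 1048576 (1MiB), maxcblkcover = 131070, so maxcblks = 9
 * and the bound is 1048576 + 9*4 + 6 = 1048618 bytes. */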
void *state_manager_raw_alloc(size_t len, uint16_t uniq)
{
size_t len16 = (len + sizeof(uint16_t) - 1) & ~(sizeof(uint16_t) - 1);
uint16_t *ret = (uint16_t*)calloc(len16 + sizeof(uint16_t) * 4 + 16, 1);
/* Force in a different byte at the end, so we don't need to check
* bounds in the innermost loop (it's expensive).
*
* There is also a large amount of data that's the same, to stop
* the other scan.
*
* There is also some padding at the end, so we don't read outside
* the buffer end when reading in large blocks.
*
* It doesn't make any difference to us, but sacrificing 16 bytes to
* keep Valgrind happy is worth it. */
ret[len16/sizeof(uint16_t) + 3] = uniq;
return ret;
}
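/* The resulting layout (sketch, assuming len = 8 and uniq = 0xFFFF):
 *
 * [ 8 bytes of zeroed state | u16 0,0,0 | u16 0xFFFF | 16 zero bytes ]
 *
 * Two such buffers with different 'uniq' are guaranteed to differ
 * before the padding runs out, which stops find_change(); their
 * zeroed tails are equal, which stops find_same(). */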
#if __SSE2__
#if defined(__GNUC__)
static INLINE int compat_ctz(unsigned x)
{
return __builtin_ctz(x);
}
#else
/* Only checks at nibble granularity,
* because that's what we need. */
static INLINE int compat_ctz(unsigned x)
{
if (x & 0x000f)
return 0;
if (x & 0x00f0)
return 4;
if (x & 0x0f00)
return 8;
if (x & 0xf000)
return 12;
return 16;
}
#endif
#include <emmintrin.h>
/* You'd think libc would have an equivalent, but it doesn't;
* std::mismatch exists, but it's not optimized at all. */
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
const __m128i *a128 = (const __m128i*)a;
const __m128i *b128 = (const __m128i*)b;
for (;;)
{
__m128i v0 = _mm_loadu_si128(a128);
__m128i v1 = _mm_loadu_si128(b128);
__m128i c = _mm_cmpeq_epi32(v0, v1);
uint32_t mask = _mm_movemask_epi8(c);
if (mask != 0xffff) /* Something has changed, figure out where. */
{
size_t ret = (((uint8_t*)a128 - (uint8_t*)a) |
(compat_ctz(~mask))) >> 1;
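/* The compare above is at 32-bit granularity and compat_ctz() at
 * nibble granularity, so 'ret' is the even u16 index of the first
 * differing 32-bit pair; if that u16 still matches, bump to the
 * odd one. */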
return ret | (a[ret] == b[ret]);
}
a128++;
b128++;
}
}
#else
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
const uint16_t *a_org = a;
#ifdef NO_UNALIGNED_MEM
while (((uintptr_t)a & (sizeof(size_t) - 1)) && *a == *b)
{
a++;
b++;
}
if (*a == *b)
#endif
{
const size_t *a_big = (const size_t*)a;
const size_t *b_big = (const size_t*)b;
while (*a_big == *b_big)
{
a_big++;
b_big++;
}
a = (const uint16_t*)a_big;
b = (const uint16_t*)b_big;
while (*a == *b)
{
a++;
b++;
}
}
return a - a_org;
}
#endif
static INLINE size_t find_same(const uint16_t *a, const uint16_t *b)
{
const uint16_t *a_org = a;
#ifdef NO_UNALIGNED_MEM
if (((uintptr_t)a & (sizeof(uint32_t) - 1)) && *a != *b)
{
a++;
b++;
}
if (*a != *b)
#endif
{
/* With this, it's random whether two consecutive identical
* words are caught.
*
* Luckily, compression rate is the same for both cases, and
* three is always caught.
*
* (We prefer to miss two-word blocks, anyways; fewer iterations
* of the outer loop, as well as in the decompressor.) */
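/* Example (hypothetical data): if a and b agree only at u16 offsets
 * 5 and 6 from the scan start, both u32 reads covering them (4-5 and
 * 6-7) still differ, so the pair is missed; agreement at 5, 6 and 7
 * makes the pair 6-7 equal, which is caught, and the backtrack below
 * then extends the match to offset 5. */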
const uint32_t *a_big = (const uint32_t*)a;
const uint32_t *b_big = (const uint32_t*)b;
while (*a_big != *b_big)
{
a_big++;
b_big++;
}
a = (const uint16_t*)a_big;
b = (const uint16_t*)b_big;
if (a != a_org && a[-1] == b[-1])
{
a--;
b--;
}
}
return a - a_org;
}
size_t state_manager_raw_compress(const void *src, const void *dst, size_t len, void *patch)
{
const uint16_t *old16 = (const uint16_t*)src;
const uint16_t *new16 = (const uint16_t*)dst;
uint16_t *compressed16 = (uint16_t*)patch;
size_t num16s = (len + sizeof(uint16_t) - 1) / sizeof(uint16_t);
while (num16s)
{
size_t i, changed;
size_t skip = find_change(old16, new16);
if (skip >= num16s)
break;
old16 += skip;
new16 += skip;
num16s -= skip;
if (skip > UINT16_MAX)
{
if (skip > UINT32_MAX)
{
/* This will make it scan the entire thing again,
* but it only hits on 8GB unchanged data anyways,
* and if you're doing that, you've got bigger problems. */
skip = UINT32_MAX;
}
*compressed16++ = 0;
*compressed16++ = skip;
*compressed16++ = skip >> 16;
skip = 0;
continue;
}
changed = find_same(old16, new16);
if (changed > UINT16_MAX)
changed = UINT16_MAX;
*compressed16++ = changed;
*compressed16++ = skip;
for (i = 0; i < changed; i++)
compressed16[i] = old16[i];
old16 += changed;
new16 += changed;
num16s -= changed;
compressed16 += changed;
}
compressed16[0] = 0;
compressed16[1] = 0;
compressed16[2] = 0;
return (uint8_t*)(compressed16+3) - (uint8_t*)patch;
}
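/* Worked example (hypothetical input): for raw_alloc'd buffers with
 * src = {1,2,3,4,5,6,7,8} and dst = {1,2,9,9,5,6,7,8} (len = 16),
 * one pass emits changed=2, skip=2, then the src words 3,4; the next
 * find_change() overruns num16s (it only stops at the 'uniq'
 * sentinel), so the loop breaks and the 0,0,0 terminator ends the
 * patch. Resulting u16 stream: 2,2,3,4,0,0,0; the function
 * returns 14. */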
void state_manager_raw_decompress(const void *patch, size_t patchlen, void *data, size_t datalen)
{
uint16_t *out16 = (uint16_t*)data;
const uint16_t *patch16 = (const uint16_t*)patch;
(void)patchlen;
(void)datalen;
for (;;)
{
uint16_t i;
uint16_t numchanged = *(patch16++);
if (numchanged)
{
out16 += *patch16++;
/* We could do memcpy, but it seems that memcpy has a
* constant-per-call overhead that actually shows up.
*
* Our average size in here seems to be 8 or something.
* Therefore, we do something with lower overhead. */
for (i = 0; i < numchanged; i++)
out16[i] = patch16[i];
patch16 += numchanged;
out16 += numchanged;
}
else
{
uint32_t numunchanged = patch16[0] | ((uint32_t)patch16[1] << 16);
if (!numunchanged)
break;
patch16 += 2;
out16 += numunchanged;
}
}
}
/* The start offsets point to 'nextstart' of any given compressed frame.
* Each uint16 is stored native endian; anything that claims any other
* endianness refers to the endianness of this specific item.
@@ -125,46 +378,21 @@ struct state_manager
state_manager_t *state_manager_new(size_t state_size, size_t buffer_size)
{
size_t newblocksize;
int maxcblks;
const int maxcblkcover = UINT16_MAX * sizeof(uint16_t);
state_manager_t *state = (state_manager_t*)calloc(1, sizeof(*state));
if (!state)
return NULL;
newblocksize = ((state_size - 1) | (sizeof(uint16_t) - 1)) + 1;
state->blocksize = newblocksize;
maxcblks = (state->blocksize + maxcblkcover - 1) / maxcblkcover;
state->maxcompsize = state->blocksize + maxcblks * sizeof(uint16_t) * 2 +
sizeof(uint16_t) + sizeof(uint32_t) + sizeof(size_t) * 2;
state->blocksize = (state_size + sizeof(uint16_t) - 1) & ~(sizeof(uint16_t) - 1);
/* the compressed data is surrounded by pointers to the other side */
state->maxcompsize = state_manager_raw_maxsize(state_size) + sizeof(size_t) * 2;
state->data = (uint8_t*)malloc(buffer_size);
state->thisblock = (uint8_t*)
calloc(state->blocksize + sizeof(uint16_t) * 4 + 16, 1);
state->nextblock = (uint8_t*)
calloc(state->blocksize + sizeof(uint16_t) * 4 + 16, 1);
state->thisblock = (uint8_t*)state_manager_raw_alloc(state_size, 0);
state->nextblock = (uint8_t*)state_manager_raw_alloc(state_size, 1);
if (!state->data || !state->thisblock || !state->nextblock)
goto error;
/* Force in a different byte at the end, so we don't need to check
* bounds in the innermost loop (it's expensive).
*
* There is also a large amount of data that's the same, to stop
* the other scan.
*
* There is also some padding at the end. This is so we don't
* read outside the buffer end if we're reading in large blocks;
*
* It doesn't make any difference to us, but sacrificing 16 bytes to get
* Valgrind happy is worth it. */
*(uint16_t*)(state->thisblock + state->blocksize + sizeof(uint16_t) * 3) =
0xFFFF;
*(uint16_t*)(state->nextblock + state->blocksize + sizeof(uint16_t) * 3) =
0x0000;
state->capacity = buffer_size;
state->head = state->data + sizeof(size_t);
@@ -215,42 +443,7 @@ bool state_manager_pop(state_manager_t *state, const void **data)
compressed = state->data + start + sizeof(size_t);
out = state->thisblock;
/* Begin decompression code
* out is the last pushed (or returned) state */
compressed16 = (const uint16_t*)compressed;
out16 = (uint16_t*)out;
for (;;)
{
uint16_t i;
uint16_t numchanged = *(compressed16++);
if (numchanged)
{
out16 += *compressed16++;
/* We could do memcpy, but it seems that memcpy has a
* constant-per-call overhead that actually shows up.
*
* Our average size in here seems to be 8 or something.
* Therefore, we do something with lower overhead. */
for (i = 0; i < numchanged; i++)
out16[i] = compressed16[i];
compressed16 += numchanged;
out16 += numchanged;
}
else
{
uint32_t numunchanged = compressed16[0] | (compressed16[1] << 16);
if (!numunchanged)
break;
compressed16 += 2;
out16 += numunchanged;
}
}
/* End decompression code */
state_manager_raw_decompress(compressed, state->maxcompsize, out, state->blocksize);
state->entries--;
*data = state->thisblock;
@@ -276,132 +469,6 @@ void state_manager_push_where(state_manager_t *state, void **data)
*data = state->nextblock;
}
#if __SSE2__
#if defined(__GNUC__)
static INLINE int compat_ctz(unsigned x)
{
return __builtin_ctz(x);
}
#else
/* Only checks at nibble granularity,
* because that's what we need. */
static INLINE int compat_ctz(unsigned x)
{
if (x & 0x000f)
return 0;
if (x & 0x00f0)
return 4;
if (x & 0x0f00)
return 8;
if (x & 0xf000)
return 12;
return 16;
}
#endif
#include <emmintrin.h>
/* There's no equivalent in libc, you'd think so ...
* std::mismatch exists, but it's not optimized at all. */
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
const __m128i *a128 = (const __m128i*)a;
const __m128i *b128 = (const __m128i*)b;
for (;;)
{
__m128i v0 = _mm_loadu_si128(a128);
__m128i v1 = _mm_loadu_si128(b128);
__m128i c = _mm_cmpeq_epi32(v0, v1);
uint32_t mask = _mm_movemask_epi8(c);
if (mask != 0xffff) /* Something has changed, figure out where. */
{
size_t ret = (((uint8_t*)a128 - (uint8_t*)a) |
(compat_ctz(~mask))) >> 1;
return ret | (a[ret] == b[ret]);
}
a128++;
b128++;
}
}
#else
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
const uint16_t *a_org = a;
#ifdef NO_UNALIGNED_MEM
while (((uintptr_t)a & (sizeof(size_t) - 1)) && *a == *b)
{
a++;
b++;
}
if (*a == *b)
#endif
{
const size_t *a_big = (const size_t*)a;
const size_t *b_big = (const size_t*)b;
while (*a_big == *b_big)
{
a_big++;
b_big++;
}
a = (const uint16_t*)a_big;
b = (const uint16_t*)b_big;
while (*a == *b)
{
a++;
b++;
}
}
return a - a_org;
}
#endif
static INLINE size_t find_same(const uint16_t *a, const uint16_t *b)
{
const uint16_t *a_org = a;
#ifdef NO_UNALIGNED_MEM
if (((uintptr_t)a & (sizeof(uint32_t) - 1)) && *a != *b)
{
a++;
b++;
}
if (*a != *b)
#endif
{
/* With this, it's random whether two consecutive identical
* words are caught.
*
* Luckily, compression rate is the same for both cases, and
* three is always caught.
*
* (We prefer to miss two-word blocks, anyways; fewer iterations
* of the outer loop, as well as in the decompressor.) */
const uint32_t *a_big = (const uint32_t*)a;
const uint32_t *b_big = (const uint32_t*)b;
while (*a_big != *b_big)
{
a_big++;
b_big++;
}
a = (const uint16_t*)a_big;
b = (const uint16_t*)b_big;
if (a != a_org && a[-1] == b[-1])
{
a--;
b--;
}
}
return a - a_org;
}
void state_manager_push_do(state_manager_t *state)
{
uint8_t *swap = NULL;
@@ -438,62 +505,7 @@ recheckcapacity:;
newb = state->nextblock;
compressed = state->head + sizeof(size_t);
/* Begin compression code; 'compressed' will point to
* the end of the compressed data (excluding the prev pointer). */
old16 = (const uint16_t*)oldb;
new16 = (const uint16_t*)newb;
compressed16 = (uint16_t*)compressed;
num16s = state->blocksize / sizeof(uint16_t);
while (num16s)
{
size_t i;
size_t skip = find_change(old16, new16);
if (skip >= num16s)
break;
old16 += skip;
new16 += skip;
num16s -= skip;
if (skip > UINT16_MAX)
{
if (skip > UINT32_MAX)
{
/* This will make it scan the entire thing again,
* but it only hits on 8GB unchanged data anyways,
* and if you're doing that, you've got bigger problems. */
skip = UINT32_MAX;
}
*compressed16++ = 0;
*compressed16++ = skip;
*compressed16++ = skip >> 16;
skip = 0;
continue;
}
size_t changed = find_same(old16, new16);
if (changed > UINT16_MAX)
changed = UINT16_MAX;
*compressed16++ = changed;
*compressed16++ = skip;
for (i = 0; i < changed; i++)
compressed16[i] = old16[i];
old16 += changed;
new16 += changed;
num16s -= changed;
compressed16 += changed;
}
compressed16[0] = 0;
compressed16[1] = 0;
compressed16[2] = 0;
compressed = (uint8_t*)(compressed16 + 3);
/* End compression code. */
compressed += state_manager_raw_compress(oldb, newb, state->blocksize, compressed);
if (compressed - state->data + state->maxcompsize > state->capacity)
{

rewind.h

@@ -24,6 +24,7 @@ extern "C" {
#include <stddef.h>
#include <boolean.h>
#include <stdint.h>
typedef struct state_manager state_manager_t;
@@ -42,6 +43,31 @@ void state_manager_capacity(state_manager_t *state,
void init_rewind(void);
/* Returns the maximum compressed size of a savestate. It is very likely to compress to far less. */
size_t state_manager_raw_maxsize(size_t uncomp);
/*
* Allocates a buffer for use with the functions below; see
* state_manager_raw_compress() for how 'uniq' is used.
* When you're done with it, pass it to free().
*/
void *state_manager_raw_alloc(size_t len, uint16_t uniq);
/*
* Takes two savestates and creates a patch that turns 'dst' back into 'src'
* (see state_manager_raw_decompress()).
* Both 'src' and 'dst' must come from state_manager_raw_alloc(), with the
* same 'len' and different 'uniq'.
* 'patch' must be of size state_manager_raw_maxsize(len) or more.
* Returns the number of bytes actually written to 'patch'.
*/
size_t state_manager_raw_compress(const void *src, const void *dst, size_t len, void *patch);
/*
* Takes 'patch' from a previous call to state_manager_raw_compress() and
* applies it to 'data', which must hold 'dst' from that call; afterwards,
* 'data' holds 'src' from that call.
* If the given arguments do not match a previous call to
* state_manager_raw_compress(), anything at all can happen.
*/
void state_manager_raw_decompress(const void *patch, size_t patchlen, void *data, size_t datalen);
#ifdef __cplusplus
}
#endif
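
For context, a minimal usage sketch of the API this commit splits out (the state size, fill pattern, and single changed byte below are hypothetical, and error checking is omitted):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "rewind.h"

int main(void)
{
   size_t len   = 4096;                            /* hypothetical savestate size */
   void  *src   = state_manager_raw_alloc(len, 0); /* e.g. the previous state */
   void  *dst   = state_manager_raw_alloc(len, 1); /* the current state; note the different 'uniq' */
   void  *patch = malloc(state_manager_raw_maxsize(len));
   size_t patchlen;

   memset(src, 0xAA, len);
   memset(dst, 0xAA, len);
   ((uint8_t*)dst)[100] = 0x55;                    /* pretend one byte changed */

   patchlen = state_manager_raw_compress(src, dst, len, patch);

   /* Applying the patch to a buffer holding 'dst' recreates 'src'. */
   state_manager_raw_decompress(patch, patchlen, dst, len);
   assert(memcmp(src, dst, len) == 0);

   free(src);
   free(dst);
   free(patch);
   return 0;
}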