Switch to master copy, in case anyone wants to compare.

Alcaro 2014-02-22 19:21:26 +01:00
parent 5caca3539f
commit f055060878

rewind.c

@@ -1,25 +1,5 @@
-/* RetroArch - A frontend for libretro.
- * Copyright (C) 2010-2014 - Hans-Kristian Arntzen
- *
- * RetroArch is free software: you can redistribute it and/or modify it under the terms
- * of the GNU General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * RetroArch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with RetroArch.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-#define __STDC_LIMIT_MACROS
-#include "rewind.h"
+#include "minir.h"
 #include <stdlib.h>
-#include "msvc/msvc-stdint/stdint.h"
-//#define NO_UNALIGNED_MEM
-//Uncomment the above if alignment is enforced.
 //Format per frame:
 //size nextstart;
@@ -43,7 +23,7 @@
 //Each size value is stored native endian if alignment is not enforced; if it is, they're little endian.
 //The start of the buffer contains a size pointing to the end of the buffer; the end points to its start.
 //Wrapping is handled by returning to the start of the buffer if the compressed data could potentially hit the edge;
-// if the compressed data could potentially overwrite the tail pointer, the tail retreats until it can no longer collide.
+//if the compressed data could potentially overwrite the tail pointer, the tail retreats until it can no longer collide.
 //This means that on average, ~2*maxcompsize is unused at any given moment.
 #if SIZE_MAX == 0xFFFFFFFF
@@ -61,29 +41,29 @@ extern char double_check_sizeof_size_t[(sizeof(size_t)==8)?1:-1];
 #ifdef USE_64BIT
 static inline void write_size_t(uint16_t* ptr, size_t val)
 {
-ptr[0] = val>>0;
-ptr[1] = val>>16;
-ptr[2] = val>>32;
-ptr[3] = val>>48;
+ptr[0]=val>>0;
+ptr[1]=val>>16;
+ptr[2]=val>>32;
+ptr[3]=val>>48;
 }
 static inline size_t read_size_t(uint16_t* ptr)
 {
 return ((size_t)ptr[0]<<0 |
 (size_t)ptr[1]<<16 |
 (size_t)ptr[2]<<32 |
 (size_t)ptr[3]<<48);
 }
 #else
 static inline void write_size_t(uint16_t* ptr, size_t val)
 {
-ptr[0] = val;
-ptr[1] = val>>16;
+ptr[0]=val;
+ptr[1]=val>>16;
 }
 static inline size_t read_size_t(uint16_t* ptr)
 {
 return (ptr[0] | (size_t)ptr[1]<<16);
 }
 #endif
@@ -92,163 +72,95 @@ static inline size_t read_size_t(uint16_t* ptr)
 #define write_size_t(ptr, val) (*(size_t*)(ptr) = (val))
 #endif
-struct state_manager {
-char *data;
-size_t capacity;
-char *head;//Reading and writing is done here.
-char *tail;//If head comes close to this, discard a frame.
-char *thisblock;
-char *nextblock;
-bool thisblock_valid;
-size_t blocksize;//This one is runded up from reset::blocksize.
-size_t maxcompsize;//size_t+(blocksize+131071)/131072*(blocksize+u16+u16)+u16+u32+size_t (yes, the math is a bit ugly)
-unsigned int entries;
-};
+struct rewindstack_impl {
+struct rewindstack i;
+char * data;
+size_t capacity;
+char * head;//Reading and writing is done here.
+char * tail;//If head comes close to this, discard a frame.
+char * thisblock;
+char * nextblock;
+bool thisblock_valid;
+size_t blocksize;//This one is runded up from reset::blocksize.
+size_t maxcompsize;//size_t+(blocksize+131071)/131072*(blocksize+u16+u16)+u16+u32+size_t (yes, the math is a bit ugly)
+unsigned int entries;
+};
-state_manager_t *state_manager_new(size_t state_size, size_t buffer_size)
-{
-state_manager_t *state = (state_manager_t*)malloc(sizeof(*state));
-state->capacity = 0;
-state->blocksize = 0;
-int newblocksize = ((state_size-1)|(sizeof(uint16_t)-1))+1;
-state->blocksize = newblocksize;
-const int maxcblkcover = UINT16_MAX*sizeof(uint16_t);
-const int maxcblks = (state->blocksize+maxcblkcover-1)/maxcblkcover;
-state->maxcompsize = state->blocksize + maxcblks*sizeof(uint16_t)*2 + sizeof(uint16_t)+sizeof(uint32_t) + sizeof(size_t)*2;
-state->data = (char*)malloc(buffer_size);
-state->thisblock = (char*)calloc(state->blocksize+sizeof(uint16_t)*4+16, 1);
-state->nextblock = (char*)calloc(state->blocksize+sizeof(uint16_t)*4+16, 1);
-if (!state->data || !state->thisblock || !state->nextblock)
-{
-free(state->data);
-free(state->thisblock);
-free(state->nextblock);
-free(state);
-return NULL;
-}
-//Force in a different byte at the end, so we don't need to check bounds in the innermost loop (it's expensive).
-//There is also a large amount of data that's the same, to stop the other scan
-//There is also some padding at the end. This is so we don't read outside the buffer end if we're reading in large blocks;
-// it doesn't make any difference to us, but sacrificing 16 bytes to get Valgrind happy is worth it.
-*(uint16_t*)(state->thisblock+state->blocksize+sizeof(uint16_t)*3) = 0xFFFF;
-*(uint16_t*)(state->nextblock+state->blocksize+sizeof(uint16_t)*3) = 0x0000;
-state->capacity=buffer_size;
-state->head = state->data+sizeof(size_t);
-state->tail = state->data+sizeof(size_t);
-state->thisblock_valid = false;
-state->entries = 0;
-return state;
-}
+static void reset(struct rewindstack * this_, size_t blocksize, size_t capacity)
+{
+struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+int newblocksize=((blocksize-1)|(sizeof(uint16_t)-1))+1;
+if (this->blocksize!=newblocksize)
+{
+this->blocksize=newblocksize;
+const int maxcblkcover=UINT16_MAX*sizeof(uint16_t);
+const int maxcblks=(this->blocksize+maxcblkcover-1)/maxcblkcover;
+this->maxcompsize=this->blocksize + maxcblks*sizeof(uint16_t)*2 + sizeof(uint16_t)+sizeof(uint32_t) + sizeof(size_t)*2;
+free(this->thisblock);
+free(this->nextblock);
+this->thisblock=calloc(this->blocksize+sizeof(uint16_t)*4+16, 1);
+this->nextblock=calloc(this->blocksize+sizeof(uint16_t)*4+16, 1);
+//Force in a different byte at the end, so we don't need to check bounds in the innermost loop (it's expensive).
+//There is also a large amount of data that's the same, to stop the other scan
+//There is also some padding at the end. This is so we don't read outside the buffer end if we're reading in large blocks;
+// it doesn't make any difference to us, but sacrificing 16 bytes to get Valgrind happy is worth it.
+*(uint16_t*)(this->thisblock+this->blocksize+sizeof(uint16_t)*3)=0xFFFF;
+*(uint16_t*)(this->nextblock+this->blocksize+sizeof(uint16_t)*3)=0x0000;
+}
+if (capacity!=this->capacity)
+{
+free(this->data);
+this->data=malloc(capacity);
+this->capacity=capacity;
+}
+this->head=this->data+sizeof(size_t);
+this->tail=this->data+sizeof(size_t);
+this->thisblock_valid=false;
+this->entries=0;
+}
-void state_manager_free(state_manager_t *state)
-{
-free(state->data);
-free(state->thisblock);
-free(state->nextblock);
-free(state);
-}
-bool state_manager_pop(state_manager_t *state, const void **data)
-{
-*data = NULL;
-if (state->thisblock_valid)
-{
-state->thisblock_valid = false;
-state->entries--;
-*data = state->thisblock;
-return true;
-}
-if (state->head == state->tail) return false;
-size_t start = read_size_t((uint16_t*)(state->head - sizeof(size_t)));
-state->head = state->data+start;
-const char *compressed = state->data+start+sizeof(size_t);
-char *out = state->thisblock;
-//Begin decompression code
-//out is the last pushed (or returned) state
-const uint16_t *compressed16 = (const uint16_t*)compressed;
-uint16_t *out16 = (uint16_t*)out;
-while (true)
-{
-uint16_t numchanged = *(compressed16++);
-if (numchanged)
-{
-out16 += *(compressed16++);
-//We could do memcpy, but it seems that memcpy has a constant-per-call overhead that actually shows up.
-//Our average size in here seems to be 8 or something.
-//Therefore, we do something with lower overhead.
-for (int i=0;i<numchanged;i++) out16[i] = compressed16[i];
-compressed16 += numchanged;
-out16 += numchanged;
-}
-else
-{
-uint32_t numunchanged = compressed16[0] | compressed16[1]<<16;
-if (!numunchanged) break;
-compressed16 += 2;
-out16 += numunchanged;
-}
-}
-//End decompression code
-state->entries--;
-*data = state->thisblock;
-return true;
-}
-void state_manager_push_where(state_manager_t *state, void **data)
-{
-//We need to ensure we have an uncompressed copy of the last pushed state, or we could
-// end up applying a 'patch' to wrong savestate, and that'd blow up rather quickly.
-if (!state->thisblock_valid)
-{
-const void *ignored;
-if (state_manager_pop(state, &ignored))
-{
-state->thisblock_valid = true;
-state->entries++;
-}
-}
-*data=state->nextblock;
-}
+static void * push_begin(struct rewindstack * this_)
+{
+struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+//We need to ensure we have an uncompressed copy of the last pushed state, or we could
+// end up applying a 'patch' to wrong savestate, and that'd blow up rather quickly.
+if (!this->thisblock_valid)
+{
+if (this_->pull(this_))
+{
+this->thisblock_valid=true;
+this->entries++;
+}
+}
+return this->nextblock;
+}
 #if __SSE2__
 #if defined(__GNUC__)
 static inline int compat_ctz(unsigned int x)
 {
 return __builtin_ctz(x);
 }
 #else
 // Only checks at nibble granularity, because that's what we need.
 static inline int compat_ctz(unsigned int x)
 {
-if (x & 0x000f)
-return 0;
-if (x & 0x00f0)
-return 4;
-if (x & 0x0f00)
-return 8;
-if (x & 0xf000)
-return 12;
-return 16;
+if (x&0x000f) return 0;
+if (x&0x00f0) return 4;
+if (x&0x0f00) return 8;
+if (x&0xf000) return 12;
+return 16;
 }
 #endif
@@ -308,11 +220,11 @@ static inline size_t find_change(const uint16_t * a, const uint16_t * b)
 }
 #endif
-#if __SSE2__x // this is not a typo - do not fix unless you can show evidence that this version is faster
+#if __SSE2__x
 //This one can give different answers than the C version in some cases. However, the compression ratio remains unaffected.
 //It also appears to be slower. Probably due to the low average duration of this loop.
 static inline size_t find_same(const uint16_t * a, const uint16_t * b)
-{
+{gfgf
 if (a[0]==b[0] && a[1]==b[1]) return 0;
 if (a[1]==b[1] && a[2]==b[2]) return 1;
 if (a[2]==b[2] && a[3]==b[3]) return 2;
@@ -337,6 +249,8 @@ static inline size_t find_same(const uint16_t * a, const uint16_t * b)
 }
 }
 #else
+//desired comp ratio: 4.074198%
+//*
 static inline size_t find_same(const uint16_t * a, const uint16_t * b)
 {
 const uint16_t * a_org=a;
@@ -371,105 +285,261 @@ static inline size_t find_same(const uint16_t * a, const uint16_t * b)
 }
 return a-a_org;
 }
+/*/
+static inline size_t find_same(const uint16_t * a, const uint16_t * b)
+{
+const uint16_t * a_org=a;
+//Comparing two or three words makes no real difference.
+//With two, the smaller blocks are less likely to be chopped up elsewhere due to 64KB;
+// with three, we get larger blocks which should be a minuscle bit faster to decompress,
+// but probably a little slower to compress. Since compression is more bottleneck than decompression is, we favor that.
+while (a[0]!=b[0] || a[1]!=b[1])
+{
+a++;
+b++;
+//Optimize this by only checking one at the time for as long as possible.
+while (*a!=*b)
+{
+a++;
+b++;
+}
+}
+return a-a_org;
+}
+//*/
 #endif
-void state_manager_push_do(state_manager_t *state)
-{
-if (state->thisblock_valid)
-{
-if (state->capacity<sizeof(size_t)+state->maxcompsize) return;
-recheckcapacity:;
-size_t headpos = (state->head-state->data);
-size_t tailpos = (state->tail-state->data);
-size_t remaining = (tailpos+state->capacity-sizeof(size_t)-headpos-1)%state->capacity + 1;
-if (remaining <= state->maxcompsize)
-{
-state->tail = state->data + read_size_t((uint16_t*)state->tail);
-state->entries--;
-goto recheckcapacity;
-}
-const char *oldb = state->thisblock;
-const char *newb = state->nextblock;
-char *compressed = state->head+sizeof(size_t);
-//Begin compression code; 'compressed' will point to the end of the compressed data (excluding the prev pointer).
-const uint16_t *old16 = (const uint16_t*)oldb;
-const uint16_t *new16 = (const uint16_t*)newb;
-uint16_t *compressed16 = (uint16_t*)compressed;
-size_t num16s = state->blocksize/sizeof(uint16_t);
-while (num16s)
-{
-size_t skip = find_change(old16, new16);
-if (skip >= num16s) break;
-old16+=skip;
-new16+=skip;
-num16s-=skip;
-if (skip > UINT16_MAX)
-{
-if (skip > UINT32_MAX)
-{
-//This will make it scan the entire thing again, but it only hits on 8GB unchanged
-//data anyways, and if you're doing that, you've got bigger problems.
-skip = UINT32_MAX;
-}
-*(compressed16++) = 0;
-*(compressed16++) = skip;
-*(compressed16++) = skip>>16;
-skip = 0;
-continue;
-}
-size_t changed=find_same(old16, new16);
-if (changed>UINT16_MAX) changed=UINT16_MAX;
-*(compressed16++)=changed;
-*(compressed16++)=skip;
-for (int i=0;i<changed;i++) compressed16[i]=old16[i];
-old16+=changed;
-new16+=changed;
-compressed16+=changed;
-num16s-=changed;
-}
-compressed16[0] = 0;
-compressed16[1] = 0;
-compressed16[2] = 0;
-compressed = (char*)(compressed16+3);
-//End compression code.
-if (compressed - state->data + state->maxcompsize > state->capacity)
-{
-compressed = state->data;
-if (state->tail == state->data+sizeof(size_t)) state->tail = state->data + *(size_t*)state->tail;
-}
-write_size_t((uint16_t*)compressed, state->head-state->data);
-compressed += sizeof(size_t);
-write_size_t((uint16_t*)state->head, compressed-state->data);
-state->head = compressed;
-}
-else
-{
-state->thisblock_valid = true;
-}
-char *swap = state->thisblock;
-state->thisblock = state->nextblock;
-state->nextblock = swap;
-state->entries++;
-return;
-}
+#include<stdio.h>
+static void push_end(struct rewindstack * this_)
+{
+struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+if (this->thisblock_valid)
+{
+/*
+if(1)
+{
+static FILE * out=NULL;
+bool q=0;
+if(!out)out=fopen("diff.bin", "wb"),q=1;
+int p=0;
+while (p<this->blocksize)
+{
+int pp=p;
+while(this->thisblock[p]==this->nextblock[p]) p++;
+unsigned int o2=p-pp;
+while(o2>0xFFFF)
+{
+unsigned short l=0;
+unsigned short h=0xFFFF;
+fwrite(&h, 2,1, out);
+fwrite(&l, 2,1, out);
+o2-=0xFFFF;
+}
+unsigned short o=o2;
+fwrite(&o, 2,1, out);
+pp=p;
+while(this->thisblock[p]!=this->nextblock[p]) p++;
+o2=p-pp;
+while(o2>0xFFFF)
+{
+unsigned short l=0;
+unsigned short h=0xFFFF;
+fwrite(&l, 2,1, out);
+fwrite(&h, 2,1, out);
+o2-=0xFFFF;
+}
+o=o2;
+fwrite(&o, 2,1, out);
+}
+fflush(out);
+if(q)printf("[%i %zu]\n",p,this->blocksize);
+}
+// */
+if (this->capacity<sizeof(size_t)+this->maxcompsize) return;
+recheckcapacity:;
+size_t headpos=(this->head-this->data);
+size_t tailpos=(this->tail-this->data);
+size_t remaining=(tailpos+this->capacity-sizeof(size_t)-headpos-1)%this->capacity + 1;
+if (remaining<=this->maxcompsize)
+{
+this->tail=this->data + read_size_t((uint16_t*)this->tail);
+this->entries--;
+goto recheckcapacity;
+}
+const char* old=this->thisblock;
+const char* new=this->nextblock;
+char* compressed=this->head+sizeof(size_t);
+//Begin compression code; 'compressed' will point to the end of the compressed data (excluding the prev pointer).
+const uint16_t * old16=(const uint16_t*)old;
+const uint16_t * new16=(const uint16_t*)new;
+uint16_t * compressed16=(uint16_t*)compressed;
+size_t num16s=this->blocksize/sizeof(uint16_t);
+while (num16s)
+{
+size_t skip=find_change(old16, new16);
+//size_t skip=find_change_b(old16, new16);
+//if (skip!=skip2) abort();
+if (skip>=num16s) break;
+old16+=skip;
+new16+=skip;
+num16s-=skip;
+if (skip>UINT16_MAX)
+{
+if (skip>UINT32_MAX)
+{
+//This will make it scan the entire thing again, but it only hits on 8GB unchanged
+//data anyways, and if you're doing that, you've got bigger problems.
+skip=UINT32_MAX;
+}
+*(compressed16++)=0;
+*(compressed16++)=skip;
+*(compressed16++)=skip>>16;
+skip=0;
+continue;
+}
+size_t changed=find_same(old16, new16);
+if (changed>UINT16_MAX) changed=UINT16_MAX;
+*(compressed16++)=changed;
+*(compressed16++)=skip;
+for (int i=0;i<changed;i++) compressed16[i]=old16[i];
+old16+=changed;
+new16+=changed;
+compressed16+=changed;
+num16s-=changed;
+}
+compressed16[0]=0;
+compressed16[1]=0;
+compressed16[2]=0;
+compressed=(char*)(compressed16+3);
+//End compression code.
+if (compressed-this->data+this->maxcompsize > this->capacity)
+{
+compressed=this->data;
+if (this->tail==this->data+sizeof(size_t)) this->tail=this->data + *(size_t*)this->tail;
+}
+write_size_t((uint16_t*)compressed, this->head-this->data);
+compressed+=sizeof(size_t);
+write_size_t((uint16_t*)this->head, compressed-this->data);
+this->head=compressed;
+}
+else
+{
+this->thisblock_valid=true;
+}
+char * swap=this->thisblock;
+this->thisblock=this->nextblock;
+this->nextblock=swap;
+this->entries++;
+}
-void state_manager_capacity(state_manager_t *state, unsigned int * entries, size_t * bytes, bool * full)
-{
-size_t headpos = (state->head-state->data);
-size_t tailpos = (state->tail-state->data);
-size_t remaining = (tailpos+state->capacity-sizeof(size_t)-headpos-1)%state->capacity + 1;
-if (entries) *entries = state->entries;
-if (bytes) *bytes = (state->capacity-remaining);
-if (full) *full = (remaining<=state->maxcompsize*2);
-}
+static void push_cancel(struct rewindstack * this_)
+{
+//struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+//We ignore this. push_begin just returns a pointer anyways.
+}
+static const void * pull(struct rewindstack * this_)
+{
+struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+if (this->thisblock_valid)
+{
+this->thisblock_valid=false;
+this->entries--;
+return this->thisblock;
+}
+if (this->head==this->tail) return NULL;
+size_t start=read_size_t((uint16_t*)(this->head - sizeof(size_t)));
+this->head=this->data+start;
+const char * compressed=this->data+start+sizeof(size_t);
+char * out=this->thisblock;
+//Begin decompression code
+//out is the last pushed (or returned) state
+const uint16_t * compressed16=(const uint16_t*)compressed;
+uint16_t * out16=(uint16_t*)out;
+while (true)
+{
+uint16_t numchanged=*(compressed16++);
+if (numchanged)
+{
+out16+=*(compressed16++);
+//We could do memcpy, but it seems that memcpy has a constant-per-call overhead that actually shows up.
+//Our average size in here seems to be 8 or something.
+//Therefore, we do something with lower overhead.
+for (int i=0;i<numchanged;i++) out16[i]=compressed16[i];
+compressed16+=numchanged;
+out16+=numchanged;
+}
+else
+{
+uint32_t numunchanged=compressed16[0] | compressed16[1]<<16;
+if (!numunchanged) break;
+compressed16+=2;
+out16+=numunchanged;
+}
+}
+//End decompression code
+this->entries--;
+return this->thisblock;
+}
+static void capacity_f(struct rewindstack * this_, unsigned int * entries, size_t * bytes, bool * full)
+{
+struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+size_t headpos=(this->head-this->data);
+size_t tailpos=(this->tail-this->data);
+size_t remaining=(tailpos+this->capacity-sizeof(size_t)-headpos-1)%this->capacity + 1;
+if (entries) *entries=this->entries;
+if (bytes) *bytes=(this->capacity-remaining);
+if (full) *full=(remaining<=this->maxcompsize*2);
+}
+static void free_(struct rewindstack * this_)
+{
+struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+free(this->data);
+free(this->thisblock);
+free(this->nextblock);
+free(this);
+}
+struct rewindstack * rewindstack_create(size_t blocksize, size_t capacity)
+{
+struct rewindstack_impl * this=malloc(sizeof(struct rewindstack_impl));
+this->i.reset=reset;
+this->i.push_begin=push_begin;
+this->i.push_end=push_end;
+this->i.push_cancel=push_cancel;
+this->i.pull=pull;
+this->i.capacity=capacity_f;
+this->i.free=free_;
+this->data=NULL;
+this->thisblock=NULL;
+this->nextblock=NULL;
+this->capacity=0;
+this->blocksize=0;
+reset((struct rewindstack*)this, blocksize, capacity);
+return (struct rewindstack*)this;
+}
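
For anyone comparing the two APIs, here is a minimal usage sketch of the new interface. It assumes the struct rewindstack declaration in minir.h exposes the function pointers wired up in rewindstack_create() above (reset, push_begin, push_end, push_cancel, pull, capacity, free); save_state() and load_state() are hypothetical stand-ins for whatever serializer the caller already has, and are not part of this commit.

#include "minir.h"

//Hypothetical serializer callbacks; not part of this commit.
extern void save_state(void * buf);
extern void load_state(const void * buf);

void rewind_example(size_t state_size)
{
//One block per pushed frame; a 16MB ring buffer of compressed history (both numbers are arbitrary here).
struct rewindstack * stack=rewindstack_create(state_size, 16*1024*1024);

//Record a frame: grab the scratch block, fill it with the current state, then commit it.
void * buf=stack->push_begin(stack);
save_state(buf);
stack->push_end(stack);

//Rewind a frame: pull() returns the most recent uncompressed state, or NULL if the buffer is empty.
const void * prev=stack->pull(stack);
if (prev) load_state(prev);

stack->free(stack);
}

The capacity callback (entries, bytes, full) can be polled the same way to drive a rewind meter or to stop pushing once the buffer reports full; that query is unchanged in spirit from the old state_manager_capacity().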