diff --git a/rewind.c b/rewind.c
index f19470c889..f1a1760d9e 100644
--- a/rewind.c
+++ b/rewind.c
@@ -1,25 +1,5 @@
-/* RetroArch - A frontend for libretro.
- * Copyright (C) 2010-2014 - Hans-Kristian Arntzen
- *
- * RetroArch is free software: you can redistribute it and/or modify it under the terms
- * of the GNU General Public License as published by the Free Software Found-
- * ation, either version 3 of the License, or (at your option) any later version.
- *
- * RetroArch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with RetroArch.
- * If not, see .
- */
-
-#define __STDC_LIMIT_MACROS
-#include "rewind.h"
+#include "minir.h"
#include
-#include "msvc/msvc-stdint/stdint.h"
-
-//#define NO_UNALIGNED_MEM
-//Uncomment the above if alignment is enforced.
//Format per frame:
//size nextstart;
@@ -43,7 +23,7 @@
//Each size value is stored native endian if alignment is not enforced; if it is, they're little endian.
//The start of the buffer contains a size pointing to the end of the buffer; the end points to its start.
//Wrapping is handled by returning to the start of the buffer if the compressed data could potentially hit the edge;
-// if the compressed data could potentially overwrite the tail pointer, the tail retreats until it can no longer collide.
+//if the compressed data could potentially overwrite the tail pointer, the tail retreats until it can no longer collide.
//This means that on average, ~2*maxcompsize is unused at any given moment.
#if SIZE_MAX == 0xFFFFFFFF
@@ -61,29 +41,29 @@ extern char double_check_sizeof_size_t[(sizeof(size_t)==8)?1:-1];
#ifdef USE_64BIT
static inline void write_size_t(uint16_t* ptr, size_t val)
{
- ptr[0] = val>>0;
- ptr[1] = val>>16;
- ptr[2] = val>>32;
- ptr[3] = val>>48;
+ ptr[0]=val>>0;
+ ptr[1]=val>>16;
+ ptr[2]=val>>32;
+ ptr[3]=val>>48;
}
static inline size_t read_size_t(uint16_t* ptr)
{
- return ((size_t)ptr[0]<<0 |
- (size_t)ptr[1]<<16 |
- (size_t)ptr[2]<<32 |
- (size_t)ptr[3]<<48);
+ return ((size_t)ptr[0]<<0 |
+ (size_t)ptr[1]<<16 |
+ (size_t)ptr[2]<<32 |
+ (size_t)ptr[3]<<48);
}
#else
static inline void write_size_t(uint16_t* ptr, size_t val)
{
- ptr[0] = val;
- ptr[1] = val>>16;
+ ptr[0]=val;
+ ptr[1]=val>>16;
}
static inline size_t read_size_t(uint16_t* ptr)
{
- return (ptr[0] | (size_t)ptr[1]<<16);
+ return (ptr[0] | (size_t)ptr[1]<<16);
}
#endif
@@ -92,163 +72,95 @@ static inline size_t read_size_t(uint16_t* ptr)
#define write_size_t(ptr, val) (*(size_t*)(ptr) = (val))
#endif
-struct state_manager {
- char *data;
- size_t capacity;
- char *head;//Reading and writing is done here.
- char *tail;//If head comes close to this, discard a frame.
-
- char *thisblock;
- char *nextblock;
- bool thisblock_valid;
-
- size_t blocksize;//This one is runded up from reset::blocksize.
- size_t maxcompsize;//size_t+(blocksize+131071)/131072*(blocksize+u16+u16)+u16+u32+size_t (yes, the math is a bit ugly)
-
- unsigned int entries;
+struct rewindstack_impl {
+ struct rewindstack i;
+
+ char * data;
+ size_t capacity;
+ char * head;//Reading and writing is done here.
+ char * tail;//If head comes close to this, discard a frame.
+
+ char * thisblock;
+ char * nextblock;
+ bool thisblock_valid;
+
+ size_t blocksize;//This one is runded up from reset::blocksize.
+ size_t maxcompsize;//size_t+(blocksize+131071)/131072*(blocksize+u16+u16)+u16+u32+size_t (yes, the math is a bit ugly)
+
+ unsigned int entries;
};
-state_manager_t *state_manager_new(size_t state_size, size_t buffer_size)
+static void reset(struct rewindstack * this_, size_t blocksize, size_t capacity)
{
- state_manager_t *state = (state_manager_t*)malloc(sizeof(*state));
-
- state->capacity = 0;
- state->blocksize = 0;
-
- int newblocksize = ((state_size-1)|(sizeof(uint16_t)-1))+1;
- state->blocksize = newblocksize;
-
- const int maxcblkcover = UINT16_MAX*sizeof(uint16_t);
- const int maxcblks = (state->blocksize+maxcblkcover-1)/maxcblkcover;
- state->maxcompsize = state->blocksize + maxcblks*sizeof(uint16_t)*2 + sizeof(uint16_t)+sizeof(uint32_t) + sizeof(size_t)*2;
-
- state->data = (char*)malloc(buffer_size);
-
- state->thisblock = (char*)calloc(state->blocksize+sizeof(uint16_t)*4+16, 1);
- state->nextblock = (char*)calloc(state->blocksize+sizeof(uint16_t)*4+16, 1);
- if (!state->data || !state->thisblock || !state->nextblock)
- {
- free(state->data);
- free(state->thisblock);
- free(state->nextblock);
- free(state);
- return NULL;
- }
- //Force in a different byte at the end, so we don't need to check bounds in the innermost loop (it's expensive).
- //There is also a large amount of data that's the same, to stop the other scan
- //There is also some padding at the end. This is so we don't read outside the buffer end if we're reading in large blocks;
- // it doesn't make any difference to us, but sacrificing 16 bytes to get Valgrind happy is worth it.
- *(uint16_t*)(state->thisblock+state->blocksize+sizeof(uint16_t)*3) = 0xFFFF;
- *(uint16_t*)(state->nextblock+state->blocksize+sizeof(uint16_t)*3) = 0x0000;
-
- state->capacity=buffer_size;
-
- state->head = state->data+sizeof(size_t);
- state->tail = state->data+sizeof(size_t);
-
- state->thisblock_valid = false;
-
- state->entries = 0;
-
- return state;
+ struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+
+ int newblocksize=((blocksize-1)|(sizeof(uint16_t)-1))+1;
+ if (this->blocksize!=newblocksize)
+ {
+ this->blocksize=newblocksize;
+
+ const int maxcblkcover=UINT16_MAX*sizeof(uint16_t);
+ const int maxcblks=(this->blocksize+maxcblkcover-1)/maxcblkcover;
+ this->maxcompsize=this->blocksize + maxcblks*sizeof(uint16_t)*2 + sizeof(uint16_t)+sizeof(uint32_t) + sizeof(size_t)*2;
+
+ free(this->thisblock);
+ free(this->nextblock);
+ this->thisblock=calloc(this->blocksize+sizeof(uint16_t)*4+16, 1);
+ this->nextblock=calloc(this->blocksize+sizeof(uint16_t)*4+16, 1);
+ //Force in a different byte at the end, so we don't need to check bounds in the innermost loop (it's expensive).
+ //There is also a large amount of data that's the same, to stop the other scan
+ //There is also some padding at the end. This is so we don't read outside the buffer end if we're reading in large blocks;
+ // it doesn't make any difference to us, but sacrificing 16 bytes to get Valgrind happy is worth it.
+ *(uint16_t*)(this->thisblock+this->blocksize+sizeof(uint16_t)*3)=0xFFFF;
+ *(uint16_t*)(this->nextblock+this->blocksize+sizeof(uint16_t)*3)=0x0000;
+ }
+
+ if (capacity!=this->capacity)
+ {
+ free(this->data);
+ this->data=malloc(capacity);
+ this->capacity=capacity;
+ }
+
+ this->head=this->data+sizeof(size_t);
+ this->tail=this->data+sizeof(size_t);
+
+ this->thisblock_valid=false;
+
+ this->entries=0;
}
-void state_manager_free(state_manager_t *state)
+static void * push_begin(struct rewindstack * this_)
{
- free(state->data);
- free(state->thisblock);
- free(state->nextblock);
- free(state);
-}
-
-bool state_manager_pop(state_manager_t *state, const void **data)
-{
- *data = NULL;
-
- if (state->thisblock_valid)
- {
- state->thisblock_valid = false;
- state->entries--;
- *data = state->thisblock;
- return true;
- }
-
- if (state->head == state->tail) return false;
-
- size_t start = read_size_t((uint16_t*)(state->head - sizeof(size_t)));
- state->head = state->data+start;
-
- const char *compressed = state->data+start+sizeof(size_t);
- char *out = state->thisblock;
- //Begin decompression code
- //out is the last pushed (or returned) state
- const uint16_t *compressed16 = (const uint16_t*)compressed;
- uint16_t *out16 = (uint16_t*)out;
- while (true)
- {
- uint16_t numchanged = *(compressed16++);
- if (numchanged)
- {
- out16 += *(compressed16++);
- //We could do memcpy, but it seems that memcpy has a constant-per-call overhead that actually shows up.
- //Our average size in here seems to be 8 or something.
- //Therefore, we do something with lower overhead.
- for (int i=0;ientries--;
-
- *data = state->thisblock;
- return true;
-}
-
-void state_manager_push_where(state_manager_t *state, void **data)
-{
- //We need to ensure we have an uncompressed copy of the last pushed state, or we could
- // end up applying a 'patch' to wrong savestate, and that'd blow up rather quickly.
- if (!state->thisblock_valid)
- {
- const void *ignored;
- if (state_manager_pop(state, &ignored))
- {
- state->thisblock_valid = true;
- state->entries++;
- }
- }
-
- *data=state->nextblock;
+ struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+ //We need to ensure we have an uncompressed copy of the last pushed state, or we could
+ // end up applying a 'patch' to wrong savestate, and that'd blow up rather quickly.
+ if (!this->thisblock_valid)
+ {
+ if (this_->pull(this_))
+ {
+ this->thisblock_valid=true;
+ this->entries++;
+ }
+ }
+ return this->nextblock;
}
#if __SSE2__
#if defined(__GNUC__)
static inline int compat_ctz(unsigned int x)
{
- return __builtin_ctz(x);
+ return __builtin_ctz(x);
}
#else
// Only checks at nibble granularity, because that's what we need.
static inline int compat_ctz(unsigned int x)
{
- if (x & 0x000f)
- return 0;
- if (x & 0x00f0)
- return 4;
- if (x & 0x0f00)
- return 8;
- if (x & 0xf000)
- return 12;
- return 16;
+ if (x&0x000f) return 0;
+ if (x&0x00f0) return 4;
+ if (x&0x0f00) return 8;
+ if (x&0xf000) return 12;
+ return 16;
}
#endif
@@ -308,11 +220,11 @@ static inline size_t find_change(const uint16_t * a, const uint16_t * b)
}
#endif
-#if __SSE2__x // this is not a typo - do not fix unless you can show evidence that this version is faster
+#if __SSE2__x
//This one can give different answers than the C version in some cases. However, the compression ratio remains unaffected.
//It also appears to be slower. Probably due to the low average duration of this loop.
static inline size_t find_same(const uint16_t * a, const uint16_t * b)
-{
+{gfgf
if (a[0]==b[0] && a[1]==b[1]) return 0;
if (a[1]==b[1] && a[2]==b[2]) return 1;
if (a[2]==b[2] && a[3]==b[3]) return 2;
@@ -337,6 +249,8 @@ static inline size_t find_same(const uint16_t * a, const uint16_t * b)
}
}
#else
+//desired comp ratio: 4.074198%
+//*
static inline size_t find_same(const uint16_t * a, const uint16_t * b)
{
const uint16_t * a_org=a;
@@ -371,105 +285,261 @@ static inline size_t find_same(const uint16_t * a, const uint16_t * b)
}
return a-a_org;
}
+/*/
+static inline size_t find_same(const uint16_t * a, const uint16_t * b)
+{
+ const uint16_t * a_org=a;
+
+ //Comparing two or three words makes no real difference.
+ //With two, the smaller blocks are less likely to be chopped up elsewhere due to 64KB;
+ // with three, we get larger blocks which should be a minuscle bit faster to decompress,
+ // but probably a little slower to compress. Since compression is more bottleneck than decompression is, we favor that.
+ while (a[0]!=b[0] || a[1]!=b[1])
+ {
+ a++;
+ b++;
+ //Optimize this by only checking one at the time for as long as possible.
+ while (*a!=*b)
+ {
+ a++;
+ b++;
+ }
+ }
+
+ return a-a_org;
+}
+//*/
#endif
-void state_manager_push_do(state_manager_t *state)
+#include
+static void push_end(struct rewindstack * this_)
{
- if (state->thisblock_valid)
- {
- if (state->capacitymaxcompsize) return;
-
- recheckcapacity:;
- size_t headpos = (state->head-state->data);
- size_t tailpos = (state->tail-state->data);
- size_t remaining = (tailpos+state->capacity-sizeof(size_t)-headpos-1)%state->capacity + 1;
- if (remaining <= state->maxcompsize)
- {
- state->tail = state->data + read_size_t((uint16_t*)state->tail);
- state->entries--;
- goto recheckcapacity;
- }
-
- const char *oldb = state->thisblock;
- const char *newb = state->nextblock;
- char *compressed = state->head+sizeof(size_t);
-
- //Begin compression code; 'compressed' will point to the end of the compressed data (excluding the prev pointer).
- const uint16_t *old16 = (const uint16_t*)oldb;
- const uint16_t *new16 = (const uint16_t*)newb;
- uint16_t *compressed16 = (uint16_t*)compressed;
- size_t num16s = state->blocksize/sizeof(uint16_t);
- while (num16s)
- {
- size_t skip = find_change(old16, new16);
-
- if (skip >= num16s) break;
- old16+=skip;
- new16+=skip;
- num16s-=skip;
-
- if (skip > UINT16_MAX)
- {
- if (skip > UINT32_MAX)
- {
- //This will make it scan the entire thing again, but it only hits on 8GB unchanged
- //data anyways, and if you're doing that, you've got bigger problems.
- skip = UINT32_MAX;
- }
- *(compressed16++) = 0;
- *(compressed16++) = skip;
- *(compressed16++) = skip>>16;
- skip = 0;
- continue;
- }
-
- size_t changed=find_same(old16, new16);
- if (changed>UINT16_MAX) changed=UINT16_MAX;
- *(compressed16++)=changed;
- *(compressed16++)=skip;
- for (int i=0;idata + state->maxcompsize > state->capacity)
- {
- compressed = state->data;
- if (state->tail == state->data+sizeof(size_t)) state->tail = state->data + *(size_t*)state->tail;
- }
- write_size_t((uint16_t*)compressed, state->head-state->data);
- compressed += sizeof(size_t);
- write_size_t((uint16_t*)state->head, compressed-state->data);
- state->head = compressed;
- }
- else
- {
- state->thisblock_valid = true;
- }
-
- char *swap = state->thisblock;
- state->thisblock = state->nextblock;
- state->nextblock = swap;
-
- state->entries++;
-
- return;
+ struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+ if (this->thisblock_valid)
+ {
+/*
+if(1)
+{
+static FILE * out=NULL;
+bool q=0;
+if(!out)out=fopen("diff.bin", "wb"),q=1;
+int p=0;
+while (pblocksize)
+{
+int pp=p;
+while(this->thisblock[p]==this->nextblock[p]) p++;
+unsigned int o2=p-pp;
+while(o2>0xFFFF)
+{
+unsigned short l=0;
+unsigned short h=0xFFFF;
+fwrite(&h, 2,1, out);
+fwrite(&l, 2,1, out);
+o2-=0xFFFF;
+}
+unsigned short o=o2;
+fwrite(&o, 2,1, out);
+pp=p;
+while(this->thisblock[p]!=this->nextblock[p]) p++;
+o2=p-pp;
+while(o2>0xFFFF)
+{
+unsigned short l=0;
+unsigned short h=0xFFFF;
+fwrite(&l, 2,1, out);
+fwrite(&h, 2,1, out);
+o2-=0xFFFF;
+}
+o=o2;
+fwrite(&o, 2,1, out);
+}
+fflush(out);
+if(q)printf("[%i %zu]\n",p,this->blocksize);
+}
+// */
+ if (this->capacitymaxcompsize) return;
+
+ recheckcapacity:;
+ size_t headpos=(this->head-this->data);
+ size_t tailpos=(this->tail-this->data);
+ size_t remaining=(tailpos+this->capacity-sizeof(size_t)-headpos-1)%this->capacity + 1;
+ if (remaining<=this->maxcompsize)
+ {
+ this->tail=this->data + read_size_t((uint16_t*)this->tail);
+ this->entries--;
+ goto recheckcapacity;
+ }
+
+ const char* old=this->thisblock;
+ const char* new=this->nextblock;
+ char* compressed=this->head+sizeof(size_t);
+
+ //Begin compression code; 'compressed' will point to the end of the compressed data (excluding the prev pointer).
+ const uint16_t * old16=(const uint16_t*)old;
+ const uint16_t * new16=(const uint16_t*)new;
+ uint16_t * compressed16=(uint16_t*)compressed;
+ size_t num16s=this->blocksize/sizeof(uint16_t);
+ while (num16s)
+ {
+ size_t skip=find_change(old16, new16);
+ //size_t skip=find_change_b(old16, new16);
+ //if (skip!=skip2) abort();
+
+ if (skip>=num16s) break;
+ old16+=skip;
+ new16+=skip;
+ num16s-=skip;
+
+ if (skip>UINT16_MAX)
+ {
+ if (skip>UINT32_MAX)
+ {
+ //This will make it scan the entire thing again, but it only hits on 8GB unchanged
+ //data anyways, and if you're doing that, you've got bigger problems.
+ skip=UINT32_MAX;
+ }
+ *(compressed16++)=0;
+ *(compressed16++)=skip;
+ *(compressed16++)=skip>>16;
+ skip=0;
+ continue;
+ }
+
+ size_t changed=find_same(old16, new16);
+ if (changed>UINT16_MAX) changed=UINT16_MAX;
+ *(compressed16++)=changed;
+ *(compressed16++)=skip;
+ for (int i=0;idata+this->maxcompsize > this->capacity)
+ {
+ compressed=this->data;
+ if (this->tail==this->data+sizeof(size_t)) this->tail=this->data + *(size_t*)this->tail;
+ }
+ write_size_t((uint16_t*)compressed, this->head-this->data);
+ compressed+=sizeof(size_t);
+ write_size_t((uint16_t*)this->head, compressed-this->data);
+ this->head=compressed;
+ }
+ else
+ {
+ this->thisblock_valid=true;
+ }
+
+ char * swap=this->thisblock;
+ this->thisblock=this->nextblock;
+ this->nextblock=swap;
+
+ this->entries++;
}
-void state_manager_capacity(state_manager_t *state, unsigned int * entries, size_t * bytes, bool * full)
+static void push_cancel(struct rewindstack * this_)
{
- size_t headpos = (state->head-state->data);
- size_t tailpos = (state->tail-state->data);
- size_t remaining = (tailpos+state->capacity-sizeof(size_t)-headpos-1)%state->capacity + 1;
-
- if (entries) *entries = state->entries;
- if (bytes) *bytes = (state->capacity-remaining);
- if (full) *full = (remaining<=state->maxcompsize*2);
+ //struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+ //We ignore this. push_begin just returns a pointer anyways.
+}
+
+static const void * pull(struct rewindstack * this_)
+{
+ struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+
+ if (this->thisblock_valid)
+ {
+ this->thisblock_valid=false;
+ this->entries--;
+ return this->thisblock;
+ }
+
+ if (this->head==this->tail) return NULL;
+
+ size_t start=read_size_t((uint16_t*)(this->head - sizeof(size_t)));
+ this->head=this->data+start;
+
+ const char * compressed=this->data+start+sizeof(size_t);
+ char * out=this->thisblock;
+ //Begin decompression code
+ //out is the last pushed (or returned) state
+ const uint16_t * compressed16=(const uint16_t*)compressed;
+ uint16_t * out16=(uint16_t*)out;
+ while (true)
+ {
+ uint16_t numchanged=*(compressed16++);
+ if (numchanged)
+ {
+ out16+=*(compressed16++);
+ //We could do memcpy, but it seems that memcpy has a constant-per-call overhead that actually shows up.
+ //Our average size in here seems to be 8 or something.
+ //Therefore, we do something with lower overhead.
+ for (int i=0;ientries--;
+
+ return this->thisblock;
+}
+
+static void capacity_f(struct rewindstack * this_, unsigned int * entries, size_t * bytes, bool * full)
+{
+ struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+
+ size_t headpos=(this->head-this->data);
+ size_t tailpos=(this->tail-this->data);
+ size_t remaining=(tailpos+this->capacity-sizeof(size_t)-headpos-1)%this->capacity + 1;
+
+ if (entries) *entries=this->entries;
+ if (bytes) *bytes=(this->capacity-remaining);
+ if (full) *full=(remaining<=this->maxcompsize*2);
+}
+
+static void free_(struct rewindstack * this_)
+{
+ struct rewindstack_impl * this=(struct rewindstack_impl*)this_;
+ free(this->data);
+ free(this->thisblock);
+ free(this->nextblock);
+ free(this);
+}
+
+struct rewindstack * rewindstack_create(size_t blocksize, size_t capacity)
+{
+ struct rewindstack_impl * this=malloc(sizeof(struct rewindstack_impl));
+ this->i.reset=reset;
+ this->i.push_begin=push_begin;
+ this->i.push_end=push_end;
+ this->i.push_cancel=push_cancel;
+ this->i.pull=pull;
+ this->i.capacity=capacity_f;
+ this->i.free=free_;
+
+ this->data=NULL;
+ this->thisblock=NULL;
+ this->nextblock=NULL;
+
+ this->capacity=0;
+ this->blocksize=0;
+
+ reset((struct rewindstack*)this, blocksize, capacity);
+
+ return (struct rewindstack*)this;
}