diff --git a/rewind.c b/rewind.c index f19470c889..f1a1760d9e 100644 --- a/rewind.c +++ b/rewind.c @@ -1,25 +1,5 @@ -/* RetroArch - A frontend for libretro. - * Copyright (C) 2010-2014 - Hans-Kristian Arntzen - * - * RetroArch is free software: you can redistribute it and/or modify it under the terms - * of the GNU General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * RetroArch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with RetroArch. - * If not, see . - */ - -#define __STDC_LIMIT_MACROS -#include "rewind.h" +#include "minir.h" #include -#include "msvc/msvc-stdint/stdint.h" - -//#define NO_UNALIGNED_MEM -//Uncomment the above if alignment is enforced. //Format per frame: //size nextstart; @@ -43,7 +23,7 @@ //Each size value is stored native endian if alignment is not enforced; if it is, they're little endian. //The start of the buffer contains a size pointing to the end of the buffer; the end points to its start. //Wrapping is handled by returning to the start of the buffer if the compressed data could potentially hit the edge; -// if the compressed data could potentially overwrite the tail pointer, the tail retreats until it can no longer collide. +//if the compressed data could potentially overwrite the tail pointer, the tail retreats until it can no longer collide. //This means that on average, ~2*maxcompsize is unused at any given moment. #if SIZE_MAX == 0xFFFFFFFF @@ -61,29 +41,29 @@ extern char double_check_sizeof_size_t[(sizeof(size_t)==8)?1:-1]; #ifdef USE_64BIT static inline void write_size_t(uint16_t* ptr, size_t val) { - ptr[0] = val>>0; - ptr[1] = val>>16; - ptr[2] = val>>32; - ptr[3] = val>>48; + ptr[0]=val>>0; + ptr[1]=val>>16; + ptr[2]=val>>32; + ptr[3]=val>>48; } static inline size_t read_size_t(uint16_t* ptr) { - return ((size_t)ptr[0]<<0 | - (size_t)ptr[1]<<16 | - (size_t)ptr[2]<<32 | - (size_t)ptr[3]<<48); + return ((size_t)ptr[0]<<0 | + (size_t)ptr[1]<<16 | + (size_t)ptr[2]<<32 | + (size_t)ptr[3]<<48); } #else static inline void write_size_t(uint16_t* ptr, size_t val) { - ptr[0] = val; - ptr[1] = val>>16; + ptr[0]=val; + ptr[1]=val>>16; } static inline size_t read_size_t(uint16_t* ptr) { - return (ptr[0] | (size_t)ptr[1]<<16); + return (ptr[0] | (size_t)ptr[1]<<16); } #endif @@ -92,163 +72,95 @@ static inline size_t read_size_t(uint16_t* ptr) #define write_size_t(ptr, val) (*(size_t*)(ptr) = (val)) #endif -struct state_manager { - char *data; - size_t capacity; - char *head;//Reading and writing is done here. - char *tail;//If head comes close to this, discard a frame. - - char *thisblock; - char *nextblock; - bool thisblock_valid; - - size_t blocksize;//This one is runded up from reset::blocksize. - size_t maxcompsize;//size_t+(blocksize+131071)/131072*(blocksize+u16+u16)+u16+u32+size_t (yes, the math is a bit ugly) - - unsigned int entries; +struct rewindstack_impl { + struct rewindstack i; + + char * data; + size_t capacity; + char * head;//Reading and writing is done here. + char * tail;//If head comes close to this, discard a frame. + + char * thisblock; + char * nextblock; + bool thisblock_valid; + + size_t blocksize;//This one is runded up from reset::blocksize. + size_t maxcompsize;//size_t+(blocksize+131071)/131072*(blocksize+u16+u16)+u16+u32+size_t (yes, the math is a bit ugly) + + unsigned int entries; }; -state_manager_t *state_manager_new(size_t state_size, size_t buffer_size) +static void reset(struct rewindstack * this_, size_t blocksize, size_t capacity) { - state_manager_t *state = (state_manager_t*)malloc(sizeof(*state)); - - state->capacity = 0; - state->blocksize = 0; - - int newblocksize = ((state_size-1)|(sizeof(uint16_t)-1))+1; - state->blocksize = newblocksize; - - const int maxcblkcover = UINT16_MAX*sizeof(uint16_t); - const int maxcblks = (state->blocksize+maxcblkcover-1)/maxcblkcover; - state->maxcompsize = state->blocksize + maxcblks*sizeof(uint16_t)*2 + sizeof(uint16_t)+sizeof(uint32_t) + sizeof(size_t)*2; - - state->data = (char*)malloc(buffer_size); - - state->thisblock = (char*)calloc(state->blocksize+sizeof(uint16_t)*4+16, 1); - state->nextblock = (char*)calloc(state->blocksize+sizeof(uint16_t)*4+16, 1); - if (!state->data || !state->thisblock || !state->nextblock) - { - free(state->data); - free(state->thisblock); - free(state->nextblock); - free(state); - return NULL; - } - //Force in a different byte at the end, so we don't need to check bounds in the innermost loop (it's expensive). - //There is also a large amount of data that's the same, to stop the other scan - //There is also some padding at the end. This is so we don't read outside the buffer end if we're reading in large blocks; - // it doesn't make any difference to us, but sacrificing 16 bytes to get Valgrind happy is worth it. - *(uint16_t*)(state->thisblock+state->blocksize+sizeof(uint16_t)*3) = 0xFFFF; - *(uint16_t*)(state->nextblock+state->blocksize+sizeof(uint16_t)*3) = 0x0000; - - state->capacity=buffer_size; - - state->head = state->data+sizeof(size_t); - state->tail = state->data+sizeof(size_t); - - state->thisblock_valid = false; - - state->entries = 0; - - return state; + struct rewindstack_impl * this=(struct rewindstack_impl*)this_; + + int newblocksize=((blocksize-1)|(sizeof(uint16_t)-1))+1; + if (this->blocksize!=newblocksize) + { + this->blocksize=newblocksize; + + const int maxcblkcover=UINT16_MAX*sizeof(uint16_t); + const int maxcblks=(this->blocksize+maxcblkcover-1)/maxcblkcover; + this->maxcompsize=this->blocksize + maxcblks*sizeof(uint16_t)*2 + sizeof(uint16_t)+sizeof(uint32_t) + sizeof(size_t)*2; + + free(this->thisblock); + free(this->nextblock); + this->thisblock=calloc(this->blocksize+sizeof(uint16_t)*4+16, 1); + this->nextblock=calloc(this->blocksize+sizeof(uint16_t)*4+16, 1); + //Force in a different byte at the end, so we don't need to check bounds in the innermost loop (it's expensive). + //There is also a large amount of data that's the same, to stop the other scan + //There is also some padding at the end. This is so we don't read outside the buffer end if we're reading in large blocks; + // it doesn't make any difference to us, but sacrificing 16 bytes to get Valgrind happy is worth it. + *(uint16_t*)(this->thisblock+this->blocksize+sizeof(uint16_t)*3)=0xFFFF; + *(uint16_t*)(this->nextblock+this->blocksize+sizeof(uint16_t)*3)=0x0000; + } + + if (capacity!=this->capacity) + { + free(this->data); + this->data=malloc(capacity); + this->capacity=capacity; + } + + this->head=this->data+sizeof(size_t); + this->tail=this->data+sizeof(size_t); + + this->thisblock_valid=false; + + this->entries=0; } -void state_manager_free(state_manager_t *state) +static void * push_begin(struct rewindstack * this_) { - free(state->data); - free(state->thisblock); - free(state->nextblock); - free(state); -} - -bool state_manager_pop(state_manager_t *state, const void **data) -{ - *data = NULL; - - if (state->thisblock_valid) - { - state->thisblock_valid = false; - state->entries--; - *data = state->thisblock; - return true; - } - - if (state->head == state->tail) return false; - - size_t start = read_size_t((uint16_t*)(state->head - sizeof(size_t))); - state->head = state->data+start; - - const char *compressed = state->data+start+sizeof(size_t); - char *out = state->thisblock; - //Begin decompression code - //out is the last pushed (or returned) state - const uint16_t *compressed16 = (const uint16_t*)compressed; - uint16_t *out16 = (uint16_t*)out; - while (true) - { - uint16_t numchanged = *(compressed16++); - if (numchanged) - { - out16 += *(compressed16++); - //We could do memcpy, but it seems that memcpy has a constant-per-call overhead that actually shows up. - //Our average size in here seems to be 8 or something. - //Therefore, we do something with lower overhead. - for (int i=0;ientries--; - - *data = state->thisblock; - return true; -} - -void state_manager_push_where(state_manager_t *state, void **data) -{ - //We need to ensure we have an uncompressed copy of the last pushed state, or we could - // end up applying a 'patch' to wrong savestate, and that'd blow up rather quickly. - if (!state->thisblock_valid) - { - const void *ignored; - if (state_manager_pop(state, &ignored)) - { - state->thisblock_valid = true; - state->entries++; - } - } - - *data=state->nextblock; + struct rewindstack_impl * this=(struct rewindstack_impl*)this_; + //We need to ensure we have an uncompressed copy of the last pushed state, or we could + // end up applying a 'patch' to wrong savestate, and that'd blow up rather quickly. + if (!this->thisblock_valid) + { + if (this_->pull(this_)) + { + this->thisblock_valid=true; + this->entries++; + } + } + return this->nextblock; } #if __SSE2__ #if defined(__GNUC__) static inline int compat_ctz(unsigned int x) { - return __builtin_ctz(x); + return __builtin_ctz(x); } #else // Only checks at nibble granularity, because that's what we need. static inline int compat_ctz(unsigned int x) { - if (x & 0x000f) - return 0; - if (x & 0x00f0) - return 4; - if (x & 0x0f00) - return 8; - if (x & 0xf000) - return 12; - return 16; + if (x&0x000f) return 0; + if (x&0x00f0) return 4; + if (x&0x0f00) return 8; + if (x&0xf000) return 12; + return 16; } #endif @@ -308,11 +220,11 @@ static inline size_t find_change(const uint16_t * a, const uint16_t * b) } #endif -#if __SSE2__x // this is not a typo - do not fix unless you can show evidence that this version is faster +#if __SSE2__x //This one can give different answers than the C version in some cases. However, the compression ratio remains unaffected. //It also appears to be slower. Probably due to the low average duration of this loop. static inline size_t find_same(const uint16_t * a, const uint16_t * b) -{ +{gfgf if (a[0]==b[0] && a[1]==b[1]) return 0; if (a[1]==b[1] && a[2]==b[2]) return 1; if (a[2]==b[2] && a[3]==b[3]) return 2; @@ -337,6 +249,8 @@ static inline size_t find_same(const uint16_t * a, const uint16_t * b) } } #else +//desired comp ratio: 4.074198% +//* static inline size_t find_same(const uint16_t * a, const uint16_t * b) { const uint16_t * a_org=a; @@ -371,105 +285,261 @@ static inline size_t find_same(const uint16_t * a, const uint16_t * b) } return a-a_org; } +/*/ +static inline size_t find_same(const uint16_t * a, const uint16_t * b) +{ + const uint16_t * a_org=a; + + //Comparing two or three words makes no real difference. + //With two, the smaller blocks are less likely to be chopped up elsewhere due to 64KB; + // with three, we get larger blocks which should be a minuscle bit faster to decompress, + // but probably a little slower to compress. Since compression is more bottleneck than decompression is, we favor that. + while (a[0]!=b[0] || a[1]!=b[1]) + { + a++; + b++; + //Optimize this by only checking one at the time for as long as possible. + while (*a!=*b) + { + a++; + b++; + } + } + + return a-a_org; +} +//*/ #endif -void state_manager_push_do(state_manager_t *state) +#include +static void push_end(struct rewindstack * this_) { - if (state->thisblock_valid) - { - if (state->capacitymaxcompsize) return; - - recheckcapacity:; - size_t headpos = (state->head-state->data); - size_t tailpos = (state->tail-state->data); - size_t remaining = (tailpos+state->capacity-sizeof(size_t)-headpos-1)%state->capacity + 1; - if (remaining <= state->maxcompsize) - { - state->tail = state->data + read_size_t((uint16_t*)state->tail); - state->entries--; - goto recheckcapacity; - } - - const char *oldb = state->thisblock; - const char *newb = state->nextblock; - char *compressed = state->head+sizeof(size_t); - - //Begin compression code; 'compressed' will point to the end of the compressed data (excluding the prev pointer). - const uint16_t *old16 = (const uint16_t*)oldb; - const uint16_t *new16 = (const uint16_t*)newb; - uint16_t *compressed16 = (uint16_t*)compressed; - size_t num16s = state->blocksize/sizeof(uint16_t); - while (num16s) - { - size_t skip = find_change(old16, new16); - - if (skip >= num16s) break; - old16+=skip; - new16+=skip; - num16s-=skip; - - if (skip > UINT16_MAX) - { - if (skip > UINT32_MAX) - { - //This will make it scan the entire thing again, but it only hits on 8GB unchanged - //data anyways, and if you're doing that, you've got bigger problems. - skip = UINT32_MAX; - } - *(compressed16++) = 0; - *(compressed16++) = skip; - *(compressed16++) = skip>>16; - skip = 0; - continue; - } - - size_t changed=find_same(old16, new16); - if (changed>UINT16_MAX) changed=UINT16_MAX; - *(compressed16++)=changed; - *(compressed16++)=skip; - for (int i=0;idata + state->maxcompsize > state->capacity) - { - compressed = state->data; - if (state->tail == state->data+sizeof(size_t)) state->tail = state->data + *(size_t*)state->tail; - } - write_size_t((uint16_t*)compressed, state->head-state->data); - compressed += sizeof(size_t); - write_size_t((uint16_t*)state->head, compressed-state->data); - state->head = compressed; - } - else - { - state->thisblock_valid = true; - } - - char *swap = state->thisblock; - state->thisblock = state->nextblock; - state->nextblock = swap; - - state->entries++; - - return; + struct rewindstack_impl * this=(struct rewindstack_impl*)this_; + if (this->thisblock_valid) + { +/* +if(1) +{ +static FILE * out=NULL; +bool q=0; +if(!out)out=fopen("diff.bin", "wb"),q=1; +int p=0; +while (pblocksize) +{ +int pp=p; +while(this->thisblock[p]==this->nextblock[p]) p++; +unsigned int o2=p-pp; +while(o2>0xFFFF) +{ +unsigned short l=0; +unsigned short h=0xFFFF; +fwrite(&h, 2,1, out); +fwrite(&l, 2,1, out); +o2-=0xFFFF; +} +unsigned short o=o2; +fwrite(&o, 2,1, out); +pp=p; +while(this->thisblock[p]!=this->nextblock[p]) p++; +o2=p-pp; +while(o2>0xFFFF) +{ +unsigned short l=0; +unsigned short h=0xFFFF; +fwrite(&l, 2,1, out); +fwrite(&h, 2,1, out); +o2-=0xFFFF; +} +o=o2; +fwrite(&o, 2,1, out); +} +fflush(out); +if(q)printf("[%i %zu]\n",p,this->blocksize); +} +// */ + if (this->capacitymaxcompsize) return; + + recheckcapacity:; + size_t headpos=(this->head-this->data); + size_t tailpos=(this->tail-this->data); + size_t remaining=(tailpos+this->capacity-sizeof(size_t)-headpos-1)%this->capacity + 1; + if (remaining<=this->maxcompsize) + { + this->tail=this->data + read_size_t((uint16_t*)this->tail); + this->entries--; + goto recheckcapacity; + } + + const char* old=this->thisblock; + const char* new=this->nextblock; + char* compressed=this->head+sizeof(size_t); + + //Begin compression code; 'compressed' will point to the end of the compressed data (excluding the prev pointer). + const uint16_t * old16=(const uint16_t*)old; + const uint16_t * new16=(const uint16_t*)new; + uint16_t * compressed16=(uint16_t*)compressed; + size_t num16s=this->blocksize/sizeof(uint16_t); + while (num16s) + { + size_t skip=find_change(old16, new16); + //size_t skip=find_change_b(old16, new16); + //if (skip!=skip2) abort(); + + if (skip>=num16s) break; + old16+=skip; + new16+=skip; + num16s-=skip; + + if (skip>UINT16_MAX) + { + if (skip>UINT32_MAX) + { + //This will make it scan the entire thing again, but it only hits on 8GB unchanged + //data anyways, and if you're doing that, you've got bigger problems. + skip=UINT32_MAX; + } + *(compressed16++)=0; + *(compressed16++)=skip; + *(compressed16++)=skip>>16; + skip=0; + continue; + } + + size_t changed=find_same(old16, new16); + if (changed>UINT16_MAX) changed=UINT16_MAX; + *(compressed16++)=changed; + *(compressed16++)=skip; + for (int i=0;idata+this->maxcompsize > this->capacity) + { + compressed=this->data; + if (this->tail==this->data+sizeof(size_t)) this->tail=this->data + *(size_t*)this->tail; + } + write_size_t((uint16_t*)compressed, this->head-this->data); + compressed+=sizeof(size_t); + write_size_t((uint16_t*)this->head, compressed-this->data); + this->head=compressed; + } + else + { + this->thisblock_valid=true; + } + + char * swap=this->thisblock; + this->thisblock=this->nextblock; + this->nextblock=swap; + + this->entries++; } -void state_manager_capacity(state_manager_t *state, unsigned int * entries, size_t * bytes, bool * full) +static void push_cancel(struct rewindstack * this_) { - size_t headpos = (state->head-state->data); - size_t tailpos = (state->tail-state->data); - size_t remaining = (tailpos+state->capacity-sizeof(size_t)-headpos-1)%state->capacity + 1; - - if (entries) *entries = state->entries; - if (bytes) *bytes = (state->capacity-remaining); - if (full) *full = (remaining<=state->maxcompsize*2); + //struct rewindstack_impl * this=(struct rewindstack_impl*)this_; + //We ignore this. push_begin just returns a pointer anyways. +} + +static const void * pull(struct rewindstack * this_) +{ + struct rewindstack_impl * this=(struct rewindstack_impl*)this_; + + if (this->thisblock_valid) + { + this->thisblock_valid=false; + this->entries--; + return this->thisblock; + } + + if (this->head==this->tail) return NULL; + + size_t start=read_size_t((uint16_t*)(this->head - sizeof(size_t))); + this->head=this->data+start; + + const char * compressed=this->data+start+sizeof(size_t); + char * out=this->thisblock; + //Begin decompression code + //out is the last pushed (or returned) state + const uint16_t * compressed16=(const uint16_t*)compressed; + uint16_t * out16=(uint16_t*)out; + while (true) + { + uint16_t numchanged=*(compressed16++); + if (numchanged) + { + out16+=*(compressed16++); + //We could do memcpy, but it seems that memcpy has a constant-per-call overhead that actually shows up. + //Our average size in here seems to be 8 or something. + //Therefore, we do something with lower overhead. + for (int i=0;ientries--; + + return this->thisblock; +} + +static void capacity_f(struct rewindstack * this_, unsigned int * entries, size_t * bytes, bool * full) +{ + struct rewindstack_impl * this=(struct rewindstack_impl*)this_; + + size_t headpos=(this->head-this->data); + size_t tailpos=(this->tail-this->data); + size_t remaining=(tailpos+this->capacity-sizeof(size_t)-headpos-1)%this->capacity + 1; + + if (entries) *entries=this->entries; + if (bytes) *bytes=(this->capacity-remaining); + if (full) *full=(remaining<=this->maxcompsize*2); +} + +static void free_(struct rewindstack * this_) +{ + struct rewindstack_impl * this=(struct rewindstack_impl*)this_; + free(this->data); + free(this->thisblock); + free(this->nextblock); + free(this); +} + +struct rewindstack * rewindstack_create(size_t blocksize, size_t capacity) +{ + struct rewindstack_impl * this=malloc(sizeof(struct rewindstack_impl)); + this->i.reset=reset; + this->i.push_begin=push_begin; + this->i.push_end=push_end; + this->i.push_cancel=push_cancel; + this->i.pull=pull; + this->i.capacity=capacity_f; + this->i.free=free_; + + this->data=NULL; + this->thisblock=NULL; + this->nextblock=NULL; + + this->capacity=0; + this->blocksize=0; + + reset((struct rewindstack*)this, blocksize, capacity); + + return (struct rewindstack*)this; }