Update to v094r29 release.

byuu says:

Note: for Windows users, please go to nall/intrinsics.hpp line 60 and
correct the typo from "DISPLAY_WINDOW" to "DISPLAY_WINDOWS" before
compiling, otherwise things won't work at all.

This will be a really major WIP for the core SNES emulation, so please
test as thoroughly as possible.

I rewrote the 65816 CPU core's dispatcher from a jump table to a switch
table. This was so that I could pass class variables as parameters to
opcodes without crazy theatrics.

With that, I killed the regs.r[N] stuff, the flag_t operator|=, &=, ^=
stuff, and all of the template versions of opcodes.

I also removed some stupid pointless flag tests in xcn and pflag that
would always be true.

I sure hope that AWJ is happy with this; because this change was so that
my flag assignments and branch tests won't need to build regs.P into
a full 8-bit variable anymore.

It does of course incur a slight performance hit when you pass in
variables by-value to functions, but it should help with binary size
(and thus cache) by reducing a lot of extra functions. (I know I could
have used template parameters for some things even with a switch table,
but chose not to for the aforementioned reasons.)

Overall, it's about a ~1% speedup from the previous build. The CPU core
instructions were never a bottleneck, but I did want to fix the P flag
building stuff because that really was a dumb mistake v_v'
This commit is contained in:
Tim Allen 2015-06-22 23:31:49 +10:00
parent ddffcd7600
commit 6b44980c6c
8 changed files with 198 additions and 56 deletions

104
amd64.c
View File

@ -1,11 +1,16 @@
/*
libco.amd64 (2009-10-12)
libco.amd64 (2015-06-19)
author: byuu
license: public domain
*/
#define LIBCO_C
#include "libco.h"
//Win64 only: provides a substantial speed-up, but will thrash XMM regs
//do not use this unless you are certain your application won't use SSE
//#define LIBCO_AMD64_NO_SSE
#include <assert.h>
#include <stdlib.h>
@ -18,21 +23,54 @@ static thread_local cothread_t co_active_handle = 0;
static void (*co_swap)(cothread_t, cothread_t) = 0;
#ifdef _WIN32
//ABI: Win64
/* ABI: Win64 */
static unsigned char co_swap_function[] = {
0x48, 0x89, 0x22, 0x48, 0x8B, 0x21, 0x58, 0x48, 0x89, 0x6A, 0x08, 0x48, 0x89, 0x72, 0x10, 0x48,
0x89, 0x7A, 0x18, 0x48, 0x89, 0x5A, 0x20, 0x4C, 0x89, 0x62, 0x28, 0x4C, 0x89, 0x6A, 0x30, 0x4C,
0x89, 0x72, 0x38, 0x4C, 0x89, 0x7A, 0x40, 0x48, 0x81, 0xC2, 0x80, 0x00, 0x00, 0x00, 0x48, 0x83,
0xE2, 0xF0, 0x0F, 0x29, 0x32, 0x0F, 0x29, 0x7A, 0x10, 0x44, 0x0F, 0x29, 0x42, 0x20, 0x44, 0x0F,
0x29, 0x4A, 0x30, 0x44, 0x0F, 0x29, 0x52, 0x40, 0x44, 0x0F, 0x29, 0x5A, 0x50, 0x44, 0x0F, 0x29,
0x62, 0x60, 0x44, 0x0F, 0x29, 0x6A, 0x70, 0x44, 0x0F, 0x29, 0xB2, 0x80, 0x00, 0x00, 0x00, 0x44,
0x0F, 0x29, 0xBA, 0x90, 0x00, 0x00, 0x00, 0x48, 0x8B, 0x69, 0x08, 0x48, 0x8B, 0x71, 0x10, 0x48,
0x8B, 0x79, 0x18, 0x48, 0x8B, 0x59, 0x20, 0x4C, 0x8B, 0x61, 0x28, 0x4C, 0x8B, 0x69, 0x30, 0x4C,
0x8B, 0x71, 0x38, 0x4C, 0x8B, 0x79, 0x40, 0x48, 0x81, 0xC1, 0x80, 0x00, 0x00, 0x00, 0x48, 0x83,
0xE1, 0xF0, 0x0F, 0x29, 0x31, 0x0F, 0x29, 0x79, 0x10, 0x44, 0x0F, 0x29, 0x41, 0x20, 0x44, 0x0F,
0x29, 0x49, 0x30, 0x44, 0x0F, 0x29, 0x51, 0x40, 0x44, 0x0F, 0x29, 0x59, 0x50, 0x44, 0x0F, 0x29,
0x61, 0x60, 0x44, 0x0F, 0x29, 0x69, 0x70, 0x44, 0x0F, 0x29, 0xB1, 0x80, 0x00, 0x00, 0x00, 0x44,
0x0F, 0x29, 0xB9, 0x90, 0x00, 0x00, 0x00, 0xFF, 0xE0,
0x48, 0x89, 0x22, /* mov [rdx],rsp */
0x48, 0x8b, 0x21, /* mov rsp,[rcx] */
0x58, /* pop rax */
0x48, 0x89, 0x6a, 0x08, /* mov [rdx+ 8],rbp */
0x48, 0x89, 0x72, 0x10, /* mov [rdx+16],rsi */
0x48, 0x89, 0x7a, 0x18, /* mov [rdx+24],rdi */
0x48, 0x89, 0x5a, 0x20, /* mov [rdx+32],rbx */
0x4c, 0x89, 0x62, 0x28, /* mov [rdx+40],r12 */
0x4c, 0x89, 0x6a, 0x30, /* mov [rdx+48],r13 */
0x4c, 0x89, 0x72, 0x38, /* mov [rdx+56],r14 */
0x4c, 0x89, 0x7a, 0x40, /* mov [rdx+64],r15 */
#if !defined(LIBCO_AMD64_NO_SSE)
0x0f, 0x29, 0x72, 0x50, /* movaps [rdx+ 80],xmm6 */
0x0f, 0x29, 0x7a, 0x60, /* movaps [rdx+ 96],xmm7 */
0x44, 0x0f, 0x29, 0x42, 0x70, /* movaps [rdx+112],xmm8 */
0x48, 0x83, 0xc2, 0x70, /* add rdx,112 */
0x44, 0x0f, 0x29, 0x4a, 0x10, /* movaps [rdx+ 16],xmm9 */
0x44, 0x0f, 0x29, 0x52, 0x20, /* movaps [rdx+ 32],xmm10 */
0x44, 0x0f, 0x29, 0x5a, 0x30, /* movaps [rdx+ 48],xmm11 */
0x44, 0x0f, 0x29, 0x62, 0x40, /* movaps [rdx+ 64],xmm12 */
0x44, 0x0f, 0x29, 0x6a, 0x50, /* movaps [rdx+ 80],xmm13 */
0x44, 0x0f, 0x29, 0x72, 0x60, /* movaps [rdx+ 96],xmm14 */
0x44, 0x0f, 0x29, 0x7a, 0x70, /* movaps [rdx+112],xmm15 */
#endif
0x48, 0x8b, 0x69, 0x08, /* mov rbp,[rcx+ 8] */
0x48, 0x8b, 0x71, 0x10, /* mov rsi,[rcx+16] */
0x48, 0x8b, 0x79, 0x18, /* mov rdi,[rcx+24] */
0x48, 0x8b, 0x59, 0x20, /* mov rbx,[rcx+32] */
0x4c, 0x8b, 0x61, 0x28, /* mov r12,[rcx+40] */
0x4c, 0x8b, 0x69, 0x30, /* mov r13,[rcx+48] */
0x4c, 0x8b, 0x71, 0x38, /* mov r14,[rcx+56] */
0x4c, 0x8b, 0x79, 0x40, /* mov r15,[rcx+64] */
#if !defined(LIBCO_AMD64_NO_SSE)
0x0f, 0x28, 0x71, 0x50, /* movaps xmm6, [rcx+ 80] */
0x0f, 0x28, 0x79, 0x60, /* movaps xmm7, [rcx+ 96] */
0x44, 0x0f, 0x28, 0x41, 0x70, /* movaps xmm8, [rcx+112] */
0x48, 0x83, 0xc1, 0x70, /* add rcx,112 */
0x44, 0x0f, 0x28, 0x49, 0x10, /* movaps xmm9, [rcx+ 16] */
0x44, 0x0f, 0x28, 0x51, 0x20, /* movaps xmm10,[rcx+ 32] */
0x44, 0x0f, 0x28, 0x59, 0x30, /* movaps xmm11,[rcx+ 48] */
0x44, 0x0f, 0x28, 0x61, 0x40, /* movaps xmm12,[rcx+ 64] */
0x44, 0x0f, 0x28, 0x69, 0x50, /* movaps xmm13,[rcx+ 80] */
0x44, 0x0f, 0x28, 0x71, 0x60, /* movaps xmm14,[rcx+ 96] */
0x44, 0x0f, 0x28, 0x79, 0x70, /* movaps xmm15,[rcx+112] */
#endif
0xff, 0xe0, /* jmp rax */
};
#include <windows.h>
@ -42,12 +80,24 @@ static void (*co_swap)(cothread_t, cothread_t) = 0;
VirtualProtect(co_swap_function, sizeof co_swap_function, PAGE_EXECUTE_READWRITE, &old_privileges);
}
#else
//ABI: SystemV
/* ABI: SystemV */
static unsigned char co_swap_function[] = {
0x48, 0x89, 0x26, 0x48, 0x8B, 0x27, 0x58, 0x48, 0x89, 0x6E, 0x08, 0x48, 0x89, 0x5E, 0x10, 0x4C,
0x89, 0x66, 0x18, 0x4C, 0x89, 0x6E, 0x20, 0x4C, 0x89, 0x76, 0x28, 0x4C, 0x89, 0x7E, 0x30, 0x48,
0x8B, 0x6F, 0x08, 0x48, 0x8B, 0x5F, 0x10, 0x4C, 0x8B, 0x67, 0x18, 0x4C, 0x8B, 0x6F, 0x20, 0x4C,
0x8B, 0x77, 0x28, 0x4C, 0x8B, 0x7F, 0x30, 0xFF, 0xE0,
0x48, 0x89, 0x26, /* mov [rsi],rsp */
0x48, 0x8b, 0x27, /* mov rsp,[rdi] */
0x58, /* pop rax */
0x48, 0x89, 0x6e, 0x08, /* mov [rsi+ 8],rbp */
0x48, 0x89, 0x5e, 0x10, /* mov [rsi+16],rbx */
0x4c, 0x89, 0x66, 0x18, /* mov [rsi+24],r12 */
0x4c, 0x89, 0x6e, 0x20, /* mov [rsi+32],r13 */
0x4c, 0x89, 0x76, 0x28, /* mov [rsi+40],r14 */
0x4c, 0x89, 0x7e, 0x30, /* mov [rsi+48],r15 */
0x48, 0x8b, 0x6f, 0x08, /* mov rbp,[rdi+ 8] */
0x48, 0x8b, 0x5f, 0x10, /* mov rbx,[rdi+16] */
0x4c, 0x8b, 0x67, 0x18, /* mov r12,[rdi+24] */
0x4c, 0x8b, 0x6f, 0x20, /* mov r13,[rdi+32] */
0x4c, 0x8b, 0x77, 0x28, /* mov r14,[rdi+40] */
0x4c, 0x8b, 0x7f, 0x30, /* mov r15,[rdi+48] */
0xff, 0xe0, /* jmp rax */
};
#include <unistd.h>
@ -62,7 +112,7 @@ static void (*co_swap)(cothread_t, cothread_t) = 0;
#endif
static void crash() {
assert(0); /* called only if cothread_t entrypoint returns */
assert(0); /* called only if cothread_t entrypoint returns */
}
cothread_t co_active() {
@ -77,14 +127,14 @@ cothread_t co_create(unsigned int size, void (*entrypoint)(void)) {
co_swap = (void (*)(cothread_t, cothread_t))co_swap_function;
}
if(!co_active_handle) co_active_handle = &co_active_buffer;
size += 512; /* allocate additional space for storage */
size &= ~15; /* align stack to 16-byte boundary */
size += 512; /* allocate additional space for storage */
size &= ~15; /* align stack to 16-byte boundary */
if(handle = (cothread_t)malloc(size)) {
long long *p = (long long*)((char*)handle + size); /* seek to top of stack */
*--p = (long long)crash; /* crash if entrypoint returns */
*--p = (long long)entrypoint; /* start of function */
*(long long*)handle = (long long)p; /* stack pointer */
long long *p = (long long*)((char*)handle + size); /* seek to top of stack */
*--p = (long long)crash; /* crash if entrypoint returns */
*--p = (long long)entrypoint; /* start of function */
*(long long*)handle = (long long)p; /* stack pointer */
}
return handle;

71
arm.c Normal file
View File

@ -0,0 +1,71 @@
/*
libco.arm (2015-06-18)
author: byuu
license: public domain
*/
#define LIBCO_C
#include "libco.h"
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#ifdef __cplusplus
extern "C" {
#endif
static thread_local unsigned long co_active_buffer[64];
static thread_local cothread_t co_active_handle = 0;
static void (*co_swap)(cothread_t, cothread_t) = 0;
static unsigned long co_swap_function[] = {
0xe8a16ff0, /* stmia r1!, {r4-r11,sp,lr} */
0xe8b0aff0, /* ldmia r0!, {r4-r11,sp,pc} */
0xe12fff1e, /* bx lr */
};
void co_init() {
unsigned long addr = (unsigned long)co_swap_function;
unsigned long base = addr - (addr % sysconf(_SC_PAGESIZE));
unsigned long size = (addr - base) + sizeof co_swap_function;
mprotect((void*)base, size, PROT_READ | PROT_WRITE | PROT_EXEC);
}
cothread_t co_active() {
if(!co_active_handle) co_active_handle = &co_active_buffer;
return co_active_handle;
}
cothread_t co_create(unsigned int size, void (*entrypoint)(void)) {
unsigned long* handle = 0;
if(!co_swap) {
co_init();
co_swap = (void (*)(cothread_t, cothread_t))co_swap_function;
}
if(!co_active_handle) co_active_handle = &co_active_buffer;
size += 256;
size &= ~15;
if(handle = (unsigned long*)malloc(size)) {
unsigned long* p = (unsigned long*)((unsigned char*)handle + size);
handle[8] = (unsigned long)p;
handle[9] = (unsigned long)entrypoint;
}
return handle;
}
void co_delete(cothread_t handle) {
free(handle);
}
void co_switch(cothread_t handle) {
cothread_t co_previous_handle = co_active_handle;
co_swap(co_active_handle = handle, co_previous_handle);
}
#ifdef __cplusplus
}
#endif

View File

@ -6,9 +6,9 @@
#define LIBCO_C
#include "libco.h"
#define WINVER 0x0400
#define _WIN32_WINNT 0x0400
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#ifdef __cplusplus

35
libco.c
View File

@ -1,23 +1,30 @@
/*
libco
auto-selection module
license: public domain
*/
#if defined(__GNUC__) && defined(__i386__)
#include "x86.c"
#elif defined(__GNUC__) && defined(__amd64__)
#include "amd64.c"
#elif defined(__GNUC__) && defined(_ARCH_PPC)
#include "ppc.c"
#elif defined(__GNUC__)
#include "sjlj.c"
#elif defined(_MSC_VER) && defined(_M_IX86)
#include "x86.c"
#elif defined(_MSC_VER) && defined(_M_AMD64)
#include "amd64.c"
#if defined(__clang__) || defined(__GNUC__)
#if defined(__i386__)
#include "x86.c"
#elif defined(__amd64__)
#include "amd64.c"
#elif defined(__arm__)
#include "arm.c"
#elif defined(_ARCH_PPC)
#include "ppc.c"
#elif defined(_WIN32)
#include "fiber.c"
#else
#include "sjlj.c"
#endif
#elif defined(_MSC_VER)
#include "fiber.c"
#if defined(_M_IX86)
#include "x86.c"
#elif defined(_M_AMD64)
#include "amd64.c"
#else
#include "fiber.c"
#endif
#else
#error "libco: unsupported processor, compiler or operating system"
#endif

View File

@ -1,6 +1,7 @@
/*
libco
version: 0.16 (2010-12-24)
version: 0.17 (2015-06-18)
author: byuu
license: public domain
*/

1
ppc.c
View File

@ -9,6 +9,7 @@ floating-point and AltiVec save/restore */
#define LIBCO_C
#include "libco.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

1
sjlj.c
View File

@ -13,6 +13,7 @@
#define LIBCO_C
#include "libco.h"
#include <stdlib.h>
#include <signal.h>
#include <setjmp.h>

37
x86.c
View File

@ -6,6 +6,7 @@
#define LIBCO_C
#include "libco.h"
#include <assert.h>
#include <stdlib.h>
@ -13,10 +14,10 @@
extern "C" {
#endif
#if defined(_MSC_VER)
#define fastcall __fastcall
#elif defined(__GNUC__)
#if defined(__clang__) || defined(__GNUC__)
#define fastcall __attribute__((fastcall))
#elif defined(_MSC_VER)
#define fastcall __fastcall
#else
#error "libco: please define fastcall macro"
#endif
@ -25,10 +26,20 @@ static thread_local long co_active_buffer[64];
static thread_local cothread_t co_active_handle = 0;
static void (fastcall *co_swap)(cothread_t, cothread_t) = 0;
//ABI: fastcall
/* ABI: fastcall */
static unsigned char co_swap_function[] = {
0x89, 0x22, 0x8B, 0x21, 0x58, 0x89, 0x6A, 0x04, 0x89, 0x72, 0x08, 0x89, 0x7A, 0x0C, 0x89, 0x5A,
0x10, 0x8B, 0x69, 0x04, 0x8B, 0x71, 0x08, 0x8B, 0x79, 0x0C, 0x8B, 0x59, 0x10, 0xFF, 0xE0,
0x89, 0x22, /* mov [edx],esp */
0x8b, 0x21, /* mov esp,[ecx] */
0x58, /* pop eax */
0x89, 0x6a, 0x04, /* mov [edx+ 4],ebp */
0x89, 0x72, 0x08, /* mov [edx+ 8],esi */
0x89, 0x7a, 0x0c, /* mov [edx+12],edi */
0x89, 0x5a, 0x10, /* mov [edx+16],ebx */
0x8b, 0x69, 0x04, /* mov ebp,[ecx+ 4] */
0x8b, 0x71, 0x08, /* mov esi,[ecx+ 8] */
0x8b, 0x79, 0x0c, /* mov edi,[ecx+12] */
0x8b, 0x59, 0x10, /* mov ebx,[ecx+16] */
0xff, 0xe0, /* jmp eax */
};
#ifdef _WIN32
@ -51,7 +62,7 @@ static unsigned char co_swap_function[] = {
#endif
static void crash() {
assert(0); /* called only if cothread_t entrypoint returns */
assert(0); /* called only if cothread_t entrypoint returns */
}
cothread_t co_active() {
@ -66,14 +77,14 @@ cothread_t co_create(unsigned int size, void (*entrypoint)(void)) {
co_swap = (void (fastcall*)(cothread_t, cothread_t))co_swap_function;
}
if(!co_active_handle) co_active_handle = &co_active_buffer;
size += 256; /* allocate additional space for storage */
size &= ~15; /* align stack to 16-byte boundary */
size += 256; /* allocate additional space for storage */
size &= ~15; /* align stack to 16-byte boundary */
if(handle = (cothread_t)malloc(size)) {
long *p = (long*)((char*)handle + size); /* seek to top of stack */
*--p = (long)crash; /* crash if entrypoint returns */
*--p = (long)entrypoint; /* start of function */
*(long*)handle = (long)p; /* stack pointer */
long *p = (long*)((char*)handle + size); /* seek to top of stack */
*--p = (long)crash; /* crash if entrypoint returns */
*--p = (long)entrypoint; /* start of function */
*(long*)handle = (long)p; /* stack pointer */
}
return handle;