Initial commit.

This commit is contained in:
Hans-Kristian Arntzen 2020-01-29 14:58:56 +01:00
commit 4312dcecde
57 changed files with 6678 additions and 0 deletions

6
.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
*.o
*.bin
*.elf
/cmake-build-*
/.idea
*.iml

61
CMakeLists.txt Normal file
View File

@ -0,0 +1,61 @@
cmake_minimum_required(VERSION 3.5)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_C_STANDARD 99)
project(parallel-rsp LANGUAGES CXX C)

# Compiler-specific warning/definition flags; applied to both targets below.
if (CMAKE_COMPILER_IS_GNUCXX OR (${CMAKE_CXX_COMPILER_ID} MATCHES "Clang"))
	set(PARALLEL_RSP_CXX_FLAGS -Wall -Wextra -Wno-missing-field-initializers -Wno-empty-body -ffast-math -Wno-unused-parameter)
elseif (MSVC)
	set(PARALLEL_RSP_CXX_FLAGS /D_CRT_SECURE_NO_WARNINGS /wd4267 /wd4244 /wd4309 /wd4005 /MP /DNOMINMAX)
endif()

# Static library. Headers are listed next to the sources so IDE generators
# show them; each file appears exactly once.
add_library(parallel-rsp STATIC
		main.cpp
		rsp/vfunctions.cpp
		rsp.cpp rsp.hpp
		debug_jit.cpp debug_jit.hpp
		rsp/ls.cpp rsp/pipeline.h
		rsp/reciprocal.cpp rsp/reciprocal.h
		rsp_1.1.h
		rsp/cp0.cpp rsp/cp2.cpp
		arch/x86_64/rsp/rsp_core.cpp
		arch/x86_64/rsp/clamp.h
		arch/x86_64/rsp/rsp.h
		arch/x86_64/rsp/rsp_impl.h
		arch/x86_64/rsp/vabs.h
		arch/x86_64/rsp/vadd.h
		arch/x86_64/rsp/vaddc.h
		arch/x86_64/rsp/vand.h
		arch/x86_64/rsp/vch.h
		arch/x86_64/rsp/vcl.h
		arch/x86_64/rsp/vcr.h
		arch/x86_64/rsp/vcmp.h
		arch/x86_64/rsp/vdivh.h
		arch/x86_64/rsp/vmac.h
		arch/x86_64/rsp/vmov.h
		arch/x86_64/rsp/vmrg.h
		arch/x86_64/rsp/vmudh.h
		arch/x86_64/rsp/vmul.h
		arch/x86_64/rsp/vmull.h
		arch/x86_64/rsp/vmulh.h
		arch/x86_64/rsp/vmuln.h
		arch/x86_64/rsp/vor.h
		arch/x86_64/rsp/vrcpsq.h
		arch/x86_64/rsp/vrsq.h
		arch/x86_64/rsp/vsub.h
		arch/x86_64/rsp/vsubc.h
		arch/x86_64/rsp/vxor.h
		arch/x86_64/rsp/vmulm.h)
target_include_directories(parallel-rsp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(parallel-rsp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/arch/x86_64/rsp)
target_compile_options(parallel-rsp PRIVATE ${PARALLEL_RSP_CXX_FLAGS})
target_compile_definitions(parallel-rsp PUBLIC DEBUG_JIT)
# CMAKE_DL_LIBS names the platform's dlopen library ("dl" on Linux, empty
# where no separate library exists) instead of hard-coding "dl".
target_link_libraries(parallel-rsp ${CMAKE_DL_LIBS})

# Stand-alone runner executable built on top of the library.
add_executable(rsp-runner main.cpp)
target_link_libraries(rsp-runner PRIVATE parallel-rsp)
target_compile_options(rsp-runner PRIVATE ${PARALLEL_RSP_CXX_FLAGS})
# -rdynamic exports the executable's symbols to the dynamic symbol table.
# NOTE(review): this is a GNU-style linker flag; confirm it is not reached
# when building with MSVC.
set_target_properties(rsp-runner PROPERTIES LINK_FLAGS "-rdynamic")

13
CREDITS.txt Normal file
View File

@ -0,0 +1,13 @@
Written by Themaister.
The code is heavily reliant on MarathonMan's CEN64 RSP implementation, as well as CXD4's RSP implementation.
MIPS core: Rewritten from scratch
CP0: Near copy-pasta from CEN64
CP2: Near copy-pasta from CEN64
LS pipe: Near copy-pasta from CXD4
Mupen64plus glue code: Reused most of CXD4.
Lightning jitter interface: Written from scratch
The plugin's focus is to support dynamic recompilation for performance,
instead of being pure interpreters as CEN64 and CXD4's implementations are.

46
arch/x86_64/rsp/clamp.h Normal file
View File

@ -0,0 +1,46 @@
//
// arch/x86_64/rsp/clamp.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
//
// Signed clamp of the accumulator to 16 bits.
//
// Each lane's (acc_hi:acc_md) pair is treated as one signed 32-bit value
// (HI is the upper half) and narrowed to 16 bits with signed saturation.
//
static inline __m128i rsp_sclamp_acc_tomd(
  __m128i acc_md, __m128i acc_hi) {
  // Interleave MD (low word) with HI (high word): lanes 0-3, then lanes 4-7.
  const __m128i lanes_0_to_3 = _mm_unpacklo_epi16(acc_md, acc_hi);
  const __m128i lanes_4_to_7 = _mm_unpackhi_epi16(acc_md, acc_hi);

  return _mm_packs_epi32(lanes_0_to_3, lanes_4_to_7);
}
//
// Unsigned clamp of a result against the accumulator's MD/HI words.
//
// A lane of `val` is returned untouched when acc_hi is a pure sign
// extension (0x0000 or 0xFFFF) that agrees with the sign bit of acc_md.
// Otherwise the lane is replaced by 0x0000 (acc_hi negative) or 0xFFFF
// (acc_hi non-negative). `zero` is expected to be an all-zero vector.
//
static inline __m128i rsp_uclamp_acc(__m128i val,
  __m128i acc_md, __m128i acc_hi, __m128i zero) {
  // All-ones in every lane whose HI / MD word has its sign bit set.
  const __m128i hi_sign = _mm_srai_epi16(acc_hi, 15);
  const __m128i md_sign = _mm_srai_epi16(acc_md, 15);

  // Lanes that do not need clamping.
  const __m128i keep = _mm_and_si128(
    _mm_cmpeq_epi16(hi_sign, md_sign),
    _mm_cmpeq_epi16(hi_sign, acc_hi));

  // Value substituted into clamped lanes:
  //   HI negative     => 0x0000
  //   HI non-negative => 0xFFFF
  const __m128i saturated = _mm_cmpeq_epi16(hi_sign, zero);

#ifdef __SSE4_1__
  return _mm_blendv_epi8(saturated, val, keep);
#else
  return _mm_or_si128(
    _mm_andnot_si128(keep, saturated),
    _mm_and_si128(keep, val));
#endif
}

147
arch/x86_64/rsp/rsp.h Normal file
View File

@ -0,0 +1,147 @@
//
// arch/x86_64/rsp/rsp.h
//
// Extern declarations for host RSP functions.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#ifndef __arch_rsp_h__
#define __arch_rsp_h__
#ifdef __SSE4_2__
#include <nmmintrin.h>
#elif defined(__SSE4_1__)
#include <smmintrin.h>
#elif defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE3__)
#include <pmmintrin.h>
#else
#include <emmintrin.h>
#endif
#include <stdint.h>
// Vector type used for RSP vector operands on this host: one 128-bit SSE
// integer register, accessed throughout this header as 8 x 16-bit lanes.
typedef __m128i rsp_vect_t;
// Forward declaration only; the definition of CPUState is not in this header.
namespace RSP
{
struct CPUState;
}
// Loads an 8 x 16-bit vector from `src` and rearranges its lanes according
// to `element`.
//
// With SSSE3 this is a single PSHUFB using row `element` of shuffle_keys,
// so `element` must be below 16 (no bounds check is performed) and `src`
// must be 16-byte aligned. Without SSSE3 an out-of-line implementation is
// declared here and defined elsewhere.
#ifdef __SSSE3__
extern const uint16_t shuffle_keys[16][8];
static inline __m128i rsp_vect_load_and_shuffle_operand(
  const uint16_t *src, unsigned element) {
  const __m128i lanes = _mm_load_si128((const __m128i *) src);
  const __m128i selector =
    _mm_load_si128((const __m128i *) shuffle_keys[element]);

  return _mm_shuffle_epi8(lanes, selector);
}
#else
__m128i rsp_vect_load_and_shuffle_operand(
  const uint16_t *src, unsigned element);
#endif
// Loads 8 x 16-bit lanes from `src` in memory order, with no lane shuffle.
// Uses an aligned load, so `src` must be 16-byte aligned.
static inline __m128i rsp_vect_load_unshuffled_operand(const uint16_t *src) {
  const __m128i *aligned_src = (const __m128i *) src;

  return _mm_load_si128(aligned_src);
}
// Stores all 8 x 16-bit lanes of `src` to `dest`.
// Uses an aligned store, so `dest` must be 16-byte aligned.
static inline void rsp_vect_write_operand(uint16_t *dest, __m128i src) {
  __m128i *aligned_dest = (__m128i *) dest;

  _mm_store_si128(aligned_dest, src);
}
// Accumulator readers. Layout in uint16_t units:
//   [0..7] = HI, [8..15] = MD, [16..23] = LO.
static inline __m128i read_acc_lo(const uint16_t *acc) {
  return rsp_vect_load_unshuffled_operand(&acc[16]);
}
static inline __m128i read_acc_md(const uint16_t *acc) {
  return rsp_vect_load_unshuffled_operand(&acc[8]);
}
static inline __m128i read_acc_hi(const uint16_t *acc) {
  return rsp_vect_load_unshuffled_operand(&acc[0]);
}

// VCC readers. Layout: [0..7] = HI, [8..15] = LO.
static inline __m128i read_vcc_lo(const uint16_t *vcc) {
  return rsp_vect_load_unshuffled_operand(&vcc[8]);
}
static inline __m128i read_vcc_hi(const uint16_t *vcc) {
  return rsp_vect_load_unshuffled_operand(&vcc[0]);
}

// VCO readers. Layout: [0..7] = HI, [8..15] = LO.
static inline __m128i read_vco_lo(const uint16_t *vco) {
  return rsp_vect_load_unshuffled_operand(&vco[8]);
}
static inline __m128i read_vco_hi(const uint16_t *vco) {
  return rsp_vect_load_unshuffled_operand(&vco[0]);
}

// VCE reader. The value lives in the second group of eight lanes, [8..15].
static inline __m128i read_vce(const uint16_t *vce) {
  return rsp_vect_load_unshuffled_operand(&vce[8]);
}
// Accumulator writers. Layout in uint16_t units:
//   [0..7] = HI, [8..15] = MD, [16..23] = LO.
static inline void write_acc_lo(uint16_t *acc, __m128i acc_lo) {
  rsp_vect_write_operand(&acc[16], acc_lo);
}
static inline void write_acc_md(uint16_t *acc, __m128i acc_md) {
  rsp_vect_write_operand(&acc[8], acc_md);
}
static inline void write_acc_hi(uint16_t *acc, __m128i acc_hi) {
  rsp_vect_write_operand(&acc[0], acc_hi);
}

// VCC writers. Layout: [0..7] = HI, [8..15] = LO.
static inline void write_vcc_lo(uint16_t *vcc, __m128i vcc_lo) {
  rsp_vect_write_operand(&vcc[8], vcc_lo);
}
static inline void write_vcc_hi(uint16_t *vcc, __m128i vcc_hi) {
  rsp_vect_write_operand(&vcc[0], vcc_hi);
}

// VCO writers. Layout: [0..7] = HI, [8..15] = LO.
static inline void write_vco_lo(uint16_t *vco, __m128i vco_lo) {
  rsp_vect_write_operand(&vco[8], vco_lo);
}
static inline void write_vco_hi(uint16_t *vco, __m128i vco_hi) {
  rsp_vect_write_operand(&vco[0], vco_hi);
}

// VCE writer. The value lives in the second group of eight lanes, [8..15].
static inline void write_vce(uint16_t *vce, __m128i vce_r) {
  rsp_vect_write_operand(&vce[8], vce_r);
}
// Collapses a 16-lane flag register (VCO/VCC/VCE) into a scalar bitmask.
//
// Lanes flags[8..15] supply bits 0..7 and lanes flags[0..7] supply bits
// 8..15. Each lane is narrowed to a byte with signed saturation and its top
// bit becomes the result bit, so a lane holding 0xFFFF yields 1 and a lane
// holding 0x0000 yields 0. `flags` must be 16-byte aligned.
static inline int16_t rsp_get_flags(const uint16_t *flags) {
  const __m128i upper_bits = _mm_load_si128((const __m128i *) &flags[0]);
  const __m128i lower_bits = _mm_load_si128((const __m128i *) &flags[8]);
  const __m128i as_bytes = _mm_packs_epi16(lower_bits, upper_bits);

  return (int16_t) _mm_movemask_epi8(as_bytes);
}

// Inverse direction: expands the scalar bitmask `rt` into the 16 lanes of
// `flags`. Defined out of line.
void rsp_set_flags(uint16_t *flags, uint16_t rt);
// Returns a vector with every bit cleared.
static inline __m128i rsp_vzero(void) {
  const __m128i all_zero = _mm_setzero_si128();

  return all_zero;
}
// Lookup table defined elsewhere; it is not used inside this header.
extern const uint16_t vdiv_mask_table[8][8];

// Address swizzles applied before indexing memory at a narrower width.
// NOTE(review): the names presumably stand for halfword/byte/"middle"
// endian swap, i.e. memory kept as host-order 32-bit words - confirm.
#define HES(x) ((x) ^ 2)
#define BES(x) ((x) ^ 3)
#define MES(x) ((x) ^ 1)

// Memory accessors. `addr` is a byte address; the 16- and 32-bit variants
// convert it to an element index by shifting. Every macro argument is
// parenthesized so that expression arguments such as `a & 0xFFF` or
// `c ? x : y` keep their intended grouping after expansion.
#define READ_MEM_U8(mem, addr) \
  (reinterpret_cast<const uint8_t*>(mem)[BES(addr)])
#define READ_MEM_U16(mem, addr) \
  (reinterpret_cast<const uint16_t*>(mem)[HES(addr) >> 1])
#define READ_MEM_U32(mem, addr) \
  (reinterpret_cast<const uint32_t*>(mem)[(addr) >> 2])
#define WRITE_MEM_U8(mem, addr, data) \
  (reinterpret_cast<uint8_t*>(mem)[BES(addr)] = (data))
#define WRITE_MEM_U16(mem, addr, data) \
  (reinterpret_cast<uint16_t*>(mem)[HES(addr) >> 1] = (data))
#define WRITE_MEM_U32(mem, addr, data) \
  (reinterpret_cast<uint32_t*>(mem)[(addr) >> 2] = (data))
#endif

View File

@ -0,0 +1,600 @@
//
// arch/x86_64/rsp/rsp_core.cpp
//
// Definitions for host RSP functions.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#include "rsp.h"
#include <string.h>
#include "../../../rsp.hpp"
#ifdef __SSSE3__
//
// This table is used to "shuffle" the RSP vector after loading it.
//
// Row index is the `element` argument of rsp_vect_load_and_shuffle_operand.
// Each 16-bit entry holds the two PSHUFB byte selectors for one destination
// lane: 0x0100 selects source bytes 0-1 (lane 0), 0x0302 lane 1, and so on.
// Rows 0-1 are the identity; rows 2-3 duplicate every even/odd lane into its
// pair; rows 4-7 broadcast one lane within each half; rows 8-15 broadcast a
// single lane across the whole vector.
//
alignas(64) const uint16_t shuffle_keys[16][8] = {
/* -- */ {0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E},
/* -- */ {0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E},
/* 0q */ {0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0D0C, 0x0D0C},
/* 1q */ {0x0302, 0x0302, 0x0706, 0x0706, 0x0B0A, 0x0B0A, 0x0F0E, 0x0F0E},
/* 0h */ {0x0100, 0x0100, 0x0100, 0x0100, 0x0908, 0x0908, 0x0908, 0x0908},
/* 1h */ {0x0302, 0x0302, 0x0302, 0x0302, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A},
/* 2h */ {0x0504, 0x0504, 0x0504, 0x0504, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C},
/* 3h */ {0x0706, 0x0706, 0x0706, 0x0706, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E},
/* 0w */ {0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100},
/* 1w */ {0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302},
/* 2w */ {0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504},
/* 3w */ {0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706},
/* 4w */ {0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908},
/* 5w */ {0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A},
/* 6w */ {0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C},
/* 7w */ {0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E},
};
#endif
//
// These tables are used to shift data loaded from DMEM.
// In addition to shifting, they also take into account that
// DMEM uses big-endian byte ordering, whereas vectors are
// 2-byte little-endian.
//
// Every table is indexed by a byte count 0..15 and each row is one 16-byte
// shuffle key (two selector bytes per 16-bit entry). A selector byte of 0x80
// produces a zero byte in the result.
//
// Shift left LUT; shifts in zeros from the right, one byte at a time.
// Row 0 is a pure byte swap within each 16-bit lane. Used by
// rsp_vload_group1 to move the data/quad mask up to the starting element.
alignas(64) static const uint16_t sll_b2l_keys[16][8] = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x8000, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E},
{0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
{0x8080, 0x8000, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C},
{0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
{0x8080, 0x8080, 0x8000, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A},
{0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
{0x8080, 0x8080, 0x8080, 0x8000, 0x0102, 0x0304, 0x0506, 0x0708},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x0102, 0x0304, 0x0506},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x0102, 0x0304},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x0102},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000},
};
// Shift left LUT; shifts low order to high order, inserting 0x00s.
// Row index is a byte count; a selector byte of 0x80 yields a zero byte.
// Used by rsp_vstore_group1 (indexed by the address offset within 8 bytes,
// so only rows 0..7 are reached from that caller).
alignas(64) static const uint16_t sll_l2b_keys[16][8] = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
// NOTE(review): the last entry below is 0x0E0C, while every other odd row
// continues the +0x0202 progression (which would give 0x0F0C here).
// Confirm whether this is intentional before relying on row 1.
{0x0180, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0E0C},
{0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
{0x8080, 0x0180, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A},
{0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
{0x8080, 0x8080, 0x0180, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08},
{0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
{0x8080, 0x8080, 0x8080, 0x0180, 0x0300, 0x0502, 0x0704, 0x0906},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0180, 0x0300, 0x0502, 0x0704},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0180, 0x0300, 0x0502},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0180, 0x0300},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0180},
};
// Shift right LUT; shifts in zeros from the left, one byte at a time.
// Row index is a byte count; a selector byte of 0x80 yields a zero byte.
// NOTE(review): no function visible in this part of the file references
// this table - confirm whether it is still needed.
alignas(64) static const uint16_t srl_b2l_keys[16][8] = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80},
{0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080},
{0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080},
{0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080},
{0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080},
{0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080},
{0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080},
{0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080},
{0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0F80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
};
// Rotate right LUT; like srl_b2l_keys, but the source bytes that fall off the
// low end wrap around to the high end instead of being replaced by zeros.
// Row index is a byte count. Used by rsp_vload_group1 and rsp_vload_group4
// to align loaded DMEM data with the destination lanes.
alignas(64) static const uint16_t ror_b2l_keys[16][8] = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00},
{0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001},
{0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102},
{0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203},
{0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304},
{0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405},
{0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506},
{0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607},
{0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506, 0x0708},
{0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
{0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A},
{0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
{0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C},
{0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
{0x0F00, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E},
};
// Rotate left LUT; rotates high order bytes back to low order.
// Row index is a byte count. Used by rsp_vstore_group4 to line the register
// up with the destination offset inside the 16-byte DMEM line.
alignas(64) static const uint16_t rol_l2b_keys[16][8] = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C},
{0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
{0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A},
{0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
{0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08},
{0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
{0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906},
{0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607},
{0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704},
{0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405},
{0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502},
{0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203},
{0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300},
{0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001},
{0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E},
};
// Rotate right LUT; rotates low order bytes back to high order (the mirror
// image of rol_l2b_keys: row N here equals row 16-N there).
// Row index is a byte count. Used by rsp_vstore_group1 to align the register
// with the shifted quad mask.
alignas(64) static const uint16_t ror_l2b_keys[16][8] = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E},
{0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001},
{0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300},
{0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203},
{0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502},
{0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405},
{0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704},
{0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607},
{0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906},
{0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
{0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08},
{0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
{0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A},
{0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
{0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C},
};
#ifndef __SSSE3__
// SSE2-only stand-in for PSHUFB, driven by 16 selector bytes in `keys`.
//
// Selectors 0x00..0x0F pick the corresponding byte of `v`; selector 0x80
// produces zero. Other selector values are not supported: they would index
// scratch bytes that were never written for that purpose.
static inline __m128i sse2_pshufb_loop8(__m128i v, const uint8_t *keys) {
  // Scratch layout: [0..15] source bytes, [16..31] shuffled result,
  // [0x80] a constant zero so that selector 0x80 needs no special case.
  alignas(16) uint8_t scratch[0x80 + 1];
  uint8_t *const result = scratch + 16;
  unsigned i;

  _mm_store_si128((__m128i *) scratch, v);
  scratch[0x80] = 0;

  for (i = 0; i < 16; i++)
    result[i] = scratch[keys[i]];

  return _mm_load_si128((const __m128i *) result);
}
// Convenience wrapper: takes a shuffle key stored as eight 16-bit entries
// (one row of the *_keys tables) and views it as the 16 selector bytes that
// sse2_pshufb_loop8 expects.
static inline __m128i sse2_pshufb(__m128i v, const uint16_t *keys) {
  const uint8_t *selector_bytes = (const uint8_t *) keys;

  return sse2_pshufb_loop8(v, selector_bytes);
}
#endif
// Expands the scalar bitmask `rt` into the 16 lanes of a flag register.
//
// Bit N of `rt` becomes lane 0xFFFF (set) or 0x0000 (clear). Bits 0..7 land
// in flags[8..15] and bits 8..15 land in flags[0..7], which is the inverse
// of the bit order produced by rsp_get_flags. All 16 lanes are written.
void rsp_set_flags(uint16_t *flags, uint16_t rt) {
  unsigned bit;

  for (bit = 0; bit < 16; bit++) {
    // Low byte of rt maps to the upper eight lanes and vice versa.
    const unsigned lane = bit ^ 8;

    flags[lane] = ((rt >> bit) & 1) ? 0xFFFF : 0x0000;
  }
}
#ifndef __SSSE3__
// SSE2 fallback for the PSHUFB-based operand load: loads the 8 x 16-bit
// vector at `src` and rearranges its lanes according to `element`.
//
//   element 0-1  : vector unchanged
//   element 2-3  : every even (2) / odd (3) lane duplicated into its pair
//   element 4-7  : lane (element-4) broadcast over lanes 0-3 and lane
//                  `element` broadcast over lanes 4-7
//   element 8-15 : lane (element-8) broadcast over all eight lanes
//
// Any other value of `element` is a caller error: it reaches
// __builtin_unreachable() when NDEBUG is defined and traps otherwise.
// Cases 0-3 use an aligned 16-byte load, so `src` must be 16-byte aligned.
__m128i rsp_vect_load_and_shuffle_operand(
const uint16_t *src, unsigned element) {
__m128i v;
switch(element) {
case 0:
case 1:
v = _mm_load_si128((__m128i *) src);
return v;
// element => 0q
case 2:
v = _mm_load_si128((__m128i *) src);
v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(2,2,0,0));
v = _mm_shufflehi_epi16(v, _MM_SHUFFLE(2,2,0,0));
return v;
// element => 1q
case 3:
v = _mm_load_si128((__m128i *) src);
v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3,3,1,1));
v = _mm_shufflehi_epi16(v, _MM_SHUFFLE(3,3,1,1));
return v;
// element => 0h ... 3h
case 4:
case 5:
case 6:
case 7:
// Empty asm with an "=x" output: emits no instruction but makes the compiler
// treat v as written. Presumably this keeps the inserts below from operating
// on a formally uninitialized value - TODO confirm the original motivation.
__asm__("" : "=x"(v)); /* Do not remove. */
v = _mm_insert_epi16(v, src[element - 4], 0);
v = _mm_insert_epi16(v, src[element - 0], 1);
// Lanes 0-3 become {a, a, b, b}; the dword shuffle then spreads a over
// lanes 0-3 and b over lanes 4-7.
v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(1,1,0,0));
v = _mm_shuffle_epi32(v, _MM_SHUFFLE(1,1,0,0));
return v;
// element => 0w ... 7w
case 8:
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
// Same empty-asm trick as above.
__asm__("" : "=x"(v)); /* Do not remove. */
v = _mm_insert_epi16(v, src[element - 8], 0);
// Duplicate lane 0 into lane 1, then broadcast that dword everywhere.
v = _mm_unpacklo_epi16(v, v);
v = _mm_shuffle_epi32(v, _MM_SHUFFLE(0,0,0,0));
return v;
}
#ifdef NDEBUG
__builtin_unreachable();
#else
__builtin_trap();
#endif
}
#endif
//
// SSSE3+ accelerated loads for group I. Byteswap big-endian to 2-byte
// little-endian vector. Start at vector element offset, discarding any
// wraparound as necessary.
//
// TODO: Reverse-engineer what happens when loads to vector elements must
// wraparound. Do we just discard the data, as below, or does the
// data effectively get rotated around the edge of the vector?
//
// Parameters:
//   rsp     - CPU state; only rsp->dmem is read.
//   addr    - DMEM byte address of the load. NOTE(review): the low-chunk
//             address is not masked with 0xFFF here, so addr is presumably
//             already within DMEM - confirm against callers.
//   element - starting byte position inside the destination vector; selects
//             the row of sll_b2l_keys (must be below 16).
//   regp    - destination register storage (aligned 16-byte store).
//   reg     - current contents of the destination register.
//   dqm     - byte mask selecting how many bytes are loaded; bytes set in the
//             (shifted) mask take DMEM data, the rest keep `reg`.
//
// Without SSSE3 the shuffles go through sse2_pshufb; the result is the same.
//
void rsp_vload_group1(RSP::CPUState *rsp, uint32_t addr, unsigned element,
uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
__m128i data;
unsigned offset = addr & 0x7;
// May wrap below zero; only the low four bits are used (ror & 0xF) below.
unsigned ror = offset - element;
// Always load in 8-byte chunks to emulate wraparound.
if (offset) {
// Unaligned: fetch the enclosing 8-byte chunk and the following one
// (second address wrapped to 12 bits) and join them into 16 bytes.
uint32_t aligned_addr_lo = addr & ~0x7;
uint32_t aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
__m128i temp;
data = _mm_loadl_epi64((__m128i *) (rsp->dmem + aligned_addr_lo));
temp = _mm_loadl_epi64((__m128i *) (rsp->dmem + aligned_addr_hi));
data = _mm_unpacklo_epi64(data, temp);
}
else
// Aligned: a single 8-byte load; the upper 8 bytes of data are zero.
data = _mm_loadl_epi64((__m128i *) (rsp->dmem + addr));
// Shift the DQM up to the point where we mux in the data.
#ifndef __SSSE3__
dqm = sse2_pshufb(dqm, sll_b2l_keys[element]);
#else
// ekey is declared here and reused by the SSSE3 branch further down.
__m128i ekey = _mm_load_si128((__m128i *) (sll_b2l_keys[element]));
dqm = _mm_shuffle_epi8(dqm, ekey);
#endif
// Align the data to the DQM so we can mask it in.
#ifndef __SSSE3__
data = sse2_pshufb(data, ror_b2l_keys[ror & 0xF]);
#else
ekey = _mm_load_si128((__m128i *) (ror_b2l_keys[ror & 0xF]));
data = _mm_shuffle_epi8(data, ekey);
#endif
// Mask and mux in the data.
#ifdef __SSE4_1__
reg = _mm_blendv_epi8(reg, data, dqm);
#else
data = _mm_and_si128(dqm, data);
reg = _mm_andnot_si128(dqm, reg);
reg = _mm_or_si128(data, reg);
#endif
_mm_store_si128((__m128i *) regp, reg);
}
//
// Accelerated loads for group II: reads eight consecutive DMEM bytes and
// expands each one into its own 16-bit lane.
//
// Each byte is placed in the upper half of its lane and the lane is then
// shifted right by one bit, so byte b becomes (b << 7). The eight bytes are
// always fetched as 8-byte chunks; for an unaligned address the enclosing
// chunk and the following chunk (address wrapped to 12 bits) are spliced
// together.
//
// `element`, `reg` and `dqm` are accepted for signature compatibility with
// the other load/store handlers but are not used: the whole register at
// `regp` is overwritten (aligned 16-byte store).
//
// TODO: Reverse-engineer what happens when loads to vector elements must
// wraparound. Do we just discard the data, as below, or does the
// data effectively get rotated around the edge of the vector?
//
// TODO: Reverse-engineer what happens when element != 0.
//
void rsp_vload_group2(RSP::CPUState *rsp, uint32_t addr, unsigned element,
    uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  const unsigned misalign = addr & 0x7;
  __m128i bytes;

  if (misalign == 0) {
    bytes = _mm_loadl_epi64((__m128i *) (rsp->dmem + addr));
  }
  else {
    const uint32_t chunk_lo = addr & ~0x7;
    const uint32_t chunk_hi = (chunk_lo + 8) & 0xFFF;
    uint64_t first, second, spliced;

    memcpy(&first, rsp->dmem + chunk_lo, sizeof(first));
    memcpy(&second, rsp->dmem + chunk_hi, sizeof(second));

    // TODO: Get rid of GNU extensions.
    // Work in big-endian order so the splice is a plain shift/or, then
    // swap back to memory order.
    first = __builtin_bswap64(first) << (misalign << 3);
    second = __builtin_bswap64(second) >> ((8 - misalign) << 3);
    spliced = __builtin_bswap64(second | first);

    bytes = _mm_loadl_epi64((__m128i *) &spliced);
  }

  // "Unpack" the data: byte -> high half of a 16-bit lane.
  bytes = _mm_unpacklo_epi8(_mm_setzero_si128(), bytes);

  // The shift used to be conditional on the pipeline request type not being
  // RSP_MEM_REQUEST_PACK; that check is disabled, so it is always applied.
  bytes = _mm_srli_epi16(bytes, 1);

  _mm_store_si128((__m128i *) regp, bytes);
}
//
// SSSE3+ accelerated loads for group IV. Byteswap big-endian to 2-byte
// little-endian vector. Stop loading at quadword boundaries.
//
// TODO: Reverse-engineer what happens when loads from vector elements
// must wraparound (i.e., the address offset is small, starting
// element is large).
//
// Parameters:
//   rsp     - CPU state; only rsp->dmem is read.
//   addr    - DMEM byte address; the 16-byte line containing it
//             (addr & 0xFF0) is loaded with an aligned load, which assumes
//             rsp->dmem itself is 16-byte aligned - TODO confirm.
//   element - starting byte position inside the destination vector.
//   regp    - destination register storage (aligned 16-byte store).
//   reg     - current contents of the destination register.
//   dqm     - byte mask; it is inverted unconditionally below because the
//             request-type check is compiled out.
//
void rsp_vload_group4(RSP::CPUState *rsp, uint32_t addr, unsigned element,
uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
uint32_t aligned_addr = addr & 0xFF0;
unsigned offset = addr & 0xF;
unsigned ror;
__m128i data = _mm_load_si128((__m128i *) (rsp->dmem + aligned_addr));
// TODO: Use of element is almost certainly wrong...
ror = 16 - element + offset;
#if 0
if (rsp->pipeline.exdf_latch.request.type != RSP_MEM_REQUEST_QUAD)
#endif
// Invert the mask: bytes that were zero become 0xFF and vice versa.
dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);
// Rotate data and mask by the same amount so they stay aligned.
#ifndef __SSSE3__
data = sse2_pshufb(data, ror_b2l_keys[ror & 0xF]);
dqm = sse2_pshufb(dqm, ror_b2l_keys[ror & 0xF]);
#else
__m128i dkey = _mm_load_si128((__m128i *) (ror_b2l_keys[ror & 0xF]));
data = _mm_shuffle_epi8(data, dkey);
dqm = _mm_shuffle_epi8(dqm, dkey);
#endif
// Mask and mux in the data.
#ifdef __SSE4_1__
data = _mm_blendv_epi8(reg, data, dqm);
#else
data = _mm_and_si128(dqm, data);
reg = _mm_andnot_si128(dqm, reg);
data = _mm_or_si128(data, reg);
#endif
_mm_store_si128((__m128i *) regp, data);
}
//
// SSSE3+ accelerated stores for group I. Byteswap 2-byte little-endian
// vector back to big-endian. Start at vector element offset, wrapping
// around the edge of the vector as necessary.
//
// TODO: Reverse-engineer what happens when stores from vector elements
// must wraparound. Do we just stop storing the data, or do we
// continue storing from the front of the vector, as below?
//
// Parameters:
//   rsp     - CPU state; rsp->dmem is read and written.
//   addr    - DMEM byte address of the store. NOTE(review): the low-chunk
//             address is not masked with 0xFFF here - confirm callers pass
//             an in-range address.
//   element - starting byte position inside the source vector.
//   regp    - unused by this function.
//   reg     - register contents to store.
//   dqm     - byte mask selecting how many bytes are written.
//
// The store is a read-modify-write of one 8-byte chunk (aligned address) or
// two 8-byte chunks (unaligned; the second chunk address wraps to 12 bits).
//
void rsp_vstore_group1(RSP::CPUState *rsp, uint32_t addr, unsigned element,
uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
unsigned offset = addr & 0x7;
// May wrap below zero; only the low four bits are used (ror & 0xF) below.
unsigned ror = element - offset;
__m128i data;
// Shift the DQM up to the point where we mux in the data.
#ifndef __SSSE3__
dqm = sse2_pshufb(dqm, sll_l2b_keys[offset]);
#else
// ekey is declared here and reused by the SSSE3 branch further down.
__m128i ekey = _mm_load_si128((__m128i *) (sll_l2b_keys[offset]));
dqm = _mm_shuffle_epi8(dqm, ekey);
#endif
// Rotate the reg to align with the DQM.
#ifndef __SSSE3__
reg = sse2_pshufb(reg, ror_l2b_keys[ror & 0xF]);
#else
ekey = _mm_load_si128((__m128i *) (ror_l2b_keys[ror & 0xF]));
reg = _mm_shuffle_epi8(reg, ekey);
#endif
// Always load in 8-byte chunks to emulate wraparound.
if (offset) {
uint32_t aligned_addr_lo = addr & ~0x7;
uint32_t aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
__m128i temp;
data = _mm_loadl_epi64((__m128i *) (rsp->dmem + aligned_addr_lo));
temp = _mm_loadl_epi64((__m128i *) (rsp->dmem + aligned_addr_hi));
data = _mm_unpacklo_epi64(data, temp);
// Mask and mux in the data.
#ifdef __SSE4_1__
data = _mm_blendv_epi8(data, reg, dqm);
#else
data = _mm_andnot_si128(dqm, data);
reg = _mm_and_si128(dqm, reg);
data = _mm_or_si128(data, reg);
#endif
// Write the low 8 bytes back to the first chunk, then move the high
// 8 bytes down and write them to the second chunk.
_mm_storel_epi64((__m128i *) (rsp->dmem + aligned_addr_lo), data);
data = _mm_srli_si128(data, 8);
_mm_storel_epi64((__m128i *) (rsp->dmem + aligned_addr_hi), data);
}
else {
data = _mm_loadl_epi64((__m128i *) (rsp->dmem + addr));
// Mask and mux in the data.
#ifdef __SSE4_1__
data = _mm_blendv_epi8(data, reg, dqm);
#else
data = _mm_andnot_si128(dqm, data);
reg = _mm_and_si128(dqm, reg);
data = _mm_or_si128(data, reg);
#endif
_mm_storel_epi64((__m128i *) (rsp->dmem + addr), data);
}
}
//
// Accelerated stores for group II: narrows each 16-bit lane of `reg` to one
// byte and writes the eight bytes to DMEM at `addr`.
//
// Each lane is shifted left by one bit and its upper byte is kept, so the
// stored byte is bits 14..7 of the lane. This is the inverse of the
// expansion done by rsp_vload_group2.
//
// `element`, `regp` and `dqm` are accepted for signature compatibility with
// the other load/store handlers but are not used.
//
// TODO: Reverse-engineer what happens when stores from vector elements
// must wraparound. Do we just stop storing the data, or do we
// continue storing from the front of the vector, as below?
//
// TODO: Reverse-engineer what happens when element != 0.
//
void rsp_vstore_group2(RSP::CPUState *rsp, uint32_t addr, unsigned element,
    uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  // "Pack" the data. The left shift used to be conditional on the pipeline
  // request type not being RSP_MEM_REQUEST_PACK; that check is disabled, so
  // it is always applied.
  const __m128i shifted = _mm_slli_epi16(reg, 1);

  // Sign-extend the upper byte into the lane; the saturating pack below
  // then reproduces that byte exactly.
  const __m128i upper_bytes = _mm_srai_epi16(shifted, 8);
  const __m128i packed = _mm_packs_epi16(upper_bytes, upper_bytes);

  // TODO: Always store in 8-byte chunks to emulate wraparound.
  _mm_storel_epi64((__m128i *) (rsp->dmem + addr), packed);
}
//
// SSSE3+ accelerated stores for group IV. Byteswap 2-byte little-endian
// vector back to big-endian. Stop storing at quadword boundaries.
//
// Parameters:
//   rsp     - CPU state; rsp->dmem is read and written.
//   addr    - DMEM byte address; the enclosing 16-byte line (addr & 0xFF0)
//             is read, merged and written back with aligned accesses, which
//             assumes rsp->dmem itself is 16-byte aligned - TODO confirm.
//   element - currently has no effect: the only statement that uses it sits
//             in a branch that is compiled as `if (0)`.
//   regp    - unused by this function.
//   reg     - register contents to store.
//   dqm     - byte mask; always inverted below (see the `if (0)` note).
//
void rsp_vstore_group4(RSP::CPUState *rsp, uint32_t addr, unsigned element,
uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
uint32_t aligned_addr = addr & 0xFF0;
unsigned offset = addr & 0xF;
unsigned rol = offset;
__m128i data = _mm_load_si128((__m128i *) (rsp->dmem + aligned_addr));
// With the request-type check compiled out, the condition is a constant 0:
// the `rol -= element` branch is dead and the else branch always runs.
#if 0
if (rsp->pipeline.exdf_latch.request.type == RSP_MEM_REQUEST_QUAD)
#else
if (0)
#endif
rol -= element;
// TODO: How is this adjusted for SRV when e != 0?
else
dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);
// Rotate the register so its bytes line up with the destination offset.
#ifndef __SSSE3__
reg = sse2_pshufb(reg, rol_l2b_keys[rol & 0xF]);
#else
__m128i ekey = _mm_load_si128((__m128i *) (rol_l2b_keys[rol & 0xF]));
reg = _mm_shuffle_epi8(reg, ekey);
#endif
// Mask and mux out the data, write.
#ifdef __SSE4_1__
data = _mm_blendv_epi8(data, reg, dqm);
#else
reg = _mm_and_si128(dqm, reg);
data = _mm_andnot_si128(dqm, data);
data = _mm_or_si128(data, reg);
#endif
_mm_store_si128((__m128i *) (rsp->dmem + aligned_addr), data);
}

View File

@ -0,0 +1,29 @@
#ifndef RSP_IMPL_H
#define RSP_IMPL_H
#include "clamp.h"
#include "vabs.h"
#include "vadd.h"
#include "vaddc.h"
#include "vand.h"
#include "vch.h"
#include "vcmp.h"
#include "vcl.h"
#include "vcr.h"
#include "vmac.h"
#include "vmrg.h"
#include "vmul.h"
#include "vmulh.h"
#include "vmull.h"
#include "vmulm.h"
#include "vmuln.h"
#include "vor.h"
#include "vsub.h"
#include "vsubc.h"
#include "vxor.h"
#include "vrcpsq.h"
#include "vmov.h"
#include "vdivh.h"
#include "vrsq.h"
#endif

19
arch/x86_64/rsp/vabs.h Normal file
View File

@ -0,0 +1,19 @@
//
// arch/x86_64/rsp/vabs.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
//
// VABS: per lane, gives vt the sign of vs.
//
//   vs  > 0 : result =  vt
//   vs == 0 : result =  0
//   vs  < 0 : result = -vt
//
// The returned value saturates, while *acc_lo receives the wrapped value.
// The two differ only for vs < 0 with vt = 0x8000: the return value is
// 0x7FFF and *acc_lo is 0x8000.
//
static inline __m128i rsp_vabs(__m128i vs, __m128i vt, __m128i *acc_lo) {
  const __m128i vs_is_zero = _mm_cmpeq_epi16(vs, _mm_setzero_si128());
  const __m128i vs_is_negative = _mm_srai_epi16(vs, 15);

  // Zero the lanes where vs == 0, then take the one's complement where
  // vs < 0. Subtracting the all-ones mask afterwards adds the missing 1
  // to complete the two's-complement negation.
  const __m128i selected = _mm_andnot_si128(vs_is_zero, vt);
  const __m128i complemented = _mm_xor_si128(selected, vs_is_negative);

  *acc_lo = _mm_sub_epi16(complemented, vs_is_negative);
  return _mm_subs_epi16(complemented, vs_is_negative);
}

23
arch/x86_64/rsp/vadd.h Normal file
View File

@ -0,0 +1,23 @@
//
// arch/x86_64/rsp/vadd.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
//
// VADD: per-lane signed add with carry-in.
//
// `carry` is a lane mask (0xFFFF where a carry-in of 1 applies, else 0).
// *acc_lo receives the wrapped sum vs + vt + carry-in; the return value is
// the same sum with signed saturation.
//
static inline __m128i rsp_vadd(__m128i vs, __m128i vt,
  __m128i carry, __m128i *acc_lo) {
  const __m128i smaller = _mm_min_epi16(vs, vt);
  const __m128i larger = _mm_max_epi16(vs, vt);

  // Subtracting the all-ones mask adds 1 where the carry is set.
  *acc_lo = _mm_sub_epi16(_mm_add_epi16(vs, vt), carry);

  // Saturating the sum of three terms: fold the carry into the smaller
  // operand first, then add the larger operand.
  return _mm_adds_epi16(_mm_subs_epi16(smaller, carry), larger);
}

20
arch/x86_64/rsp/vaddc.h Normal file
View File

@ -0,0 +1,20 @@
//
// arch/x86_64/rsp/vaddc.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
//
// VADDC: per-lane unsigned add that reports the carry-out.
//
// Returns the wrapped 16-bit sum vs + vt. *sn is set to 0xFFFF in every lane
// where the unsigned addition overflowed and to 0x0000 elsewhere. `zero` is
// expected to be an all-zero vector.
//
static inline __m128i rsp_vaddc(__m128i vs, __m128i vt,
  __m128i zero, __m128i *sn) {
  const __m128i wrapped = _mm_add_epi16(vs, vt);

  // The saturated and wrapped sums agree exactly when no carry occurred.
  const __m128i no_carry = _mm_cmpeq_epi16(_mm_adds_epu16(vs, vt), wrapped);

  // Invert the mask by comparing it against zero.
  *sn = _mm_cmpeq_epi16(no_carry, zero);
  return wrapped;
}

16
arch/x86_64/rsp/vand.h Normal file
View File

@ -0,0 +1,16 @@
//
// arch/x86_64/rsp/vand.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// VAND: bitwise AND of the two source vectors.
static inline __m128i rsp_vand(__m128i vs, __m128i vt) {
  const __m128i conjunction = _mm_and_si128(vs, vt);

  return conjunction;
}
// VNAND: bitwise complement of (vs AND vt).
static inline __m128i rsp_vnand(__m128i vs, __m128i vt) {
  const __m128i all_ones = _mm_set1_epi16(-1);

  // XOR with all-ones flips every bit of the conjunction.
  return _mm_xor_si128(_mm_and_si128(vs, vt), all_ones);
}

68
arch/x86_64/rsp/vch.h Normal file
View File

@ -0,0 +1,68 @@
//
// arch/x86_64/rsp/vch.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
//
// VCH: per-lane clip of vs against +/-vt, producing the flag masks as a
// side effect. All masks are per 16-bit lane, 0xFFFF = set, 0x0000 = clear.
//
// Inputs:
//   vs, vt - source vectors.
//   zero   - expected to be an all-zero vector.
// Outputs:
//   *sign  - set where vs and vt have different signs.
//   *ge    - sign ? (vt < 0)             : (vs - vt >= 0)
//   *le    - sign ? (vs + vt <= 0)       : (vt < 0)
//   *vce   - set where sign is set and vs + vt == -1.
//   *eq    - despite the name this is the "not equal" mask: set where the
//            difference is non-zero and *vce is clear.
// Returns:
//   sign_negvt (vt, or -vt where sign is set) in lanes where the relevant
//   flag is set (le when sign is set, ge otherwise), and vs elsewhere.
//
// Both preprocessor variants compute the same values; the SSE4.1 path uses
// blends where the SSE2 path uses and/andnot/or.
//
static inline __m128i rsp_vch(__m128i vs, __m128i vt, __m128i zero,
__m128i *ge, __m128i *le, __m128i *eq, __m128i *sign, __m128i *vce) {
__m128i sign_negvt, vt_neg;
__m128i diff, diff_zero, diff_sel_mask;
__m128i diff_gez, diff_lez;
// sign = (vs ^ vt) < 0
*sign = _mm_xor_si128(vs, vt);
*sign = _mm_cmplt_epi16(*sign, zero);
// sign_negvt = sign ? -vt : vt
sign_negvt = _mm_xor_si128(vt, *sign);
sign_negvt = _mm_sub_epi16(sign_negvt, *sign);
// Compute diff, diff_zero:
// (diff is vs + vt where sign is set, vs - vt otherwise.)
diff = _mm_sub_epi16(vs, sign_negvt);
diff_zero = _mm_cmpeq_epi16(diff, zero);
// Compute le/ge:
// diff_lez temporarily holds (diff > 0); it is inverted two lines further
// down so that it finally means (diff <= 0).
vt_neg = _mm_cmplt_epi16(vt, zero);
diff_lez = _mm_cmpgt_epi16(diff, zero);
diff_gez = _mm_or_si128(diff_lez, diff_zero);
diff_lez = _mm_cmpeq_epi16(zero, diff_lez);
#ifdef __SSE4_1__
*ge = _mm_blendv_epi8(diff_gez, vt_neg, *sign);
*le = _mm_blendv_epi8(vt_neg, diff_lez, *sign);
#else
*ge = _mm_and_si128(*sign, vt_neg);
diff_gez = _mm_andnot_si128(*sign, diff_gez);
*ge = _mm_or_si128(*ge, diff_gez);
*le = _mm_and_si128(*sign, diff_lez);
diff_lez = _mm_andnot_si128(*sign, vt_neg);
*le = _mm_or_si128(*le, diff_lez);
#endif
// Compute vce:
// (diff == sign can only hold with sign set when diff is 0xFFFF, i.e. -1.)
*vce = _mm_cmpeq_epi16(diff, *sign);
*vce = _mm_and_si128(*vce, *sign);
// Compute !eq:
*eq = _mm_or_si128(diff_zero, *vce);
*eq = _mm_cmpeq_epi16(*eq, zero);
// Compute result:
#ifdef __SSE4_1__
diff_sel_mask = _mm_blendv_epi8(*ge, *le, *sign);
return _mm_blendv_epi8(vs, sign_negvt, diff_sel_mask);
#else
// diff_lez / diff_gez are reused here purely as scratch registers.
diff_lez = _mm_and_si128(*sign, *le);
diff_gez = _mm_andnot_si128(*sign, *ge);
diff_sel_mask = _mm_or_si128(diff_lez, diff_gez);
diff_lez = _mm_and_si128(diff_sel_mask, sign_negvt);
diff_gez = _mm_andnot_si128(diff_sel_mask, vs);
return _mm_or_si128(diff_lez, diff_gez);
#endif
}

75
arch/x86_64/rsp/vcl.h Normal file
View File

@ -0,0 +1,75 @@
//
// arch/x86_64/rsp/vcl.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Computes the rsp_vcl result vector and updates the ge/le masks in place.
//
// eq, sign and vce are per-lane masks passed by value.
// NOTE(review): they presumably carry the flags produced by an earlier
// rsp_vch-style operation, and `zero` is presumably all-zero -- confirm
// at the call sites.
//
// Per 16-bit lane:
//   *le is replaced only where (sign && !eq); elsewhere it keeps its value.
//   *ge is replaced only where (!sign && !eq); elsewhere it keeps its value.
//   The return value is (sign ? *le : *ge) ? (sign ? -vt : vt) : vs,
//   using the updated masks.
static inline __m128i rsp_vcl(__m128i vs, __m128i vt, __m128i zero,
__m128i *ge, __m128i *le, __m128i eq, __m128i sign, __m128i vce) {
__m128i sign_negvt, diff, ncarry, nvce, diff_zero;
__m128i le_case1, le_case2, le_eq, do_le;
__m128i ge_eq, do_ge, mux_mask;
// sign_negvt = sign ? -vt : vt
sign_negvt = _mm_xor_si128(vt, sign);
sign_negvt = _mm_sub_epi16(sign_negvt, sign);
// Compute diff, diff_zero, ncarry, and nvce:
// Note: diff = sign ? (vs + vt) : (vs - vt).
diff = _mm_sub_epi16(vs, sign_negvt);
// ncarry: set where diff equals the saturating unsigned sum vs + vt.
ncarry = _mm_adds_epu16(vs, vt);
ncarry = _mm_cmpeq_epi16(diff, ncarry);
nvce = _mm_cmpeq_epi16(vce, zero);
diff_zero = _mm_cmpeq_epi16(diff, zero);
// Compute results for if (sign && ne):
//   le_eq = (!vce && diff_zero && ncarry) || (vce && (diff_zero || ncarry))
le_case1 = _mm_and_si128(diff_zero, ncarry);
le_case1 = _mm_and_si128(nvce, le_case1);
le_case2 = _mm_or_si128(diff_zero, ncarry);
le_case2 = _mm_and_si128(vce, le_case2);
le_eq = _mm_or_si128(le_case1, le_case2);
// Compute results for if (!sign && ne):
// The saturating unsigned vt - vs is zero exactly when vt <= vs (unsigned).
ge_eq = _mm_subs_epu16(vt, vs);
ge_eq = _mm_cmpeq_epi16(ge_eq, zero);
// Blend everything together. Caveat: we don't update
// the results of ge/le if ne is false, so be careful.
do_le = _mm_andnot_si128(eq, sign);
#ifdef __SSE4_1__
*le = _mm_blendv_epi8(*le, le_eq, do_le);
#else
le_eq = _mm_and_si128(do_le, le_eq);
*le = _mm_andnot_si128(do_le, *le);
*le = _mm_or_si128(le_eq, *le);
#endif
// do_ge marks the lanes where *ge must be KEPT (sign || eq).
do_ge = _mm_or_si128(sign, eq);
#ifdef __SSE4_1__
*ge = _mm_blendv_epi8(ge_eq, *ge, do_ge);
#else
*ge = _mm_and_si128(do_ge, *ge);
ge_eq = _mm_andnot_si128(do_ge, ge_eq);
*ge = _mm_or_si128(ge_eq, *ge);
#endif
// Mux the result based on the value of sign.
#ifdef __SSE4_1__
mux_mask = _mm_blendv_epi8(*ge, *le, sign);
#else
// do_le/do_ge are reused as scratch registers here.
do_le = _mm_and_si128(sign, *le);
do_ge = _mm_andnot_si128(sign, *ge);
mux_mask = _mm_or_si128(do_le, do_ge);
#endif
#ifdef __SSE4_1__
return _mm_blendv_epi8(vs, sign_negvt, mux_mask);
#else
sign_negvt = _mm_and_si128(mux_mask, sign_negvt);
vs = _mm_andnot_si128(mux_mask, vs);
return _mm_or_si128(sign_negvt, vs);
#endif
}

85
arch/x86_64/rsp/vcmp.h Normal file
View File

@ -0,0 +1,85 @@
//
// arch/x86_64/rsp/vcmp.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Equality compare-and-select.
//
// *le is set (all-ones) in lanes where vs == vt and the incoming `eq`
// mask is clear. The return value takes vs in lanes where *le is set and
// vt elsewhere. `zero` and `sign` are part of the signature but are not
// read by this function.
static inline __m128i rsp_veq(__m128i vs, __m128i vt,
  __m128i zero, __m128i *le, __m128i eq, __m128i sign) {
  const __m128i pick_vs = _mm_andnot_si128(eq, _mm_cmpeq_epi16(vs, vt));

  *le = pick_vs;
#ifdef __SSE4_1__
  return _mm_blendv_epi8(vt, vs, pick_vs);
#else
  return _mm_or_si128(_mm_and_si128(pick_vs, vs),
                      _mm_andnot_si128(pick_vs, vt));
#endif
}
// Signed greater-or-equal compare-and-select.
//
// *le is set in lanes where vs > vt (signed), or where vs == vt unless
// both the `eq` and `sign` masks are set for that lane. The return value
// takes vs where *le is set and vt elsewhere. `zero` is not read.
static inline __m128i rsp_vge(__m128i vs, __m128i vt,
  __m128i zero, __m128i *le, __m128i eq, __m128i sign) {
  // A tie only counts when (eq && sign) does not veto it.
  const __m128i tie_veto = _mm_and_si128(eq, sign);
  const __m128i tie = _mm_andnot_si128(tie_veto, _mm_cmpeq_epi16(vs, vt));
  const __m128i pick_vs = _mm_or_si128(_mm_cmpgt_epi16(vs, vt), tie);

  *le = pick_vs;
#ifdef __SSE4_1__
  return _mm_blendv_epi8(vt, vs, pick_vs);
#else
  return _mm_or_si128(_mm_and_si128(pick_vs, vs),
                      _mm_andnot_si128(pick_vs, vt));
#endif
}
// Signed less-than compare-and-select.
//
// *le is set in lanes where vs < vt (signed), or where vs == vt and both
// the `eq` and `sign` masks are set for that lane. The return value takes
// vs where *le is set and vt elsewhere. `zero` is not read.
static inline __m128i rsp_vlt(__m128i vs, __m128i vt,
  __m128i zero, __m128i *le, __m128i eq, __m128i sign) {
  // A tie is promoted to "less than" only when eq and sign are both set.
  const __m128i tie =
      _mm_and_si128(sign, _mm_and_si128(eq, _mm_cmpeq_epi16(vs, vt)));
  const __m128i pick_vs = _mm_or_si128(_mm_cmplt_epi16(vs, vt), tie);

  *le = pick_vs;
#ifdef __SSE4_1__
  return _mm_blendv_epi8(vt, vs, pick_vs);
#else
  return _mm_or_si128(_mm_and_si128(pick_vs, vs),
                      _mm_andnot_si128(pick_vs, vt));
#endif
}
// Inequality compare-and-select.
//
// *le is set in lanes where vs != vt, or where vs == vt and the incoming
// `eq` mask is set. The return value takes vs where *le is set and vt
// elsewhere. `sign` is not read.
// NOTE(review): `zero` is used to invert the equality mask, so it is
// presumably an all-zero vector -- confirm at the call sites.
static inline __m128i rsp_vne(__m128i vs, __m128i vt,
  __m128i zero, __m128i *le, __m128i eq, __m128i sign) {
  const __m128i same = _mm_cmpeq_epi16(vs, vt);
  const __m128i differ = _mm_cmpeq_epi16(same, zero);
  const __m128i pick_vs = _mm_or_si128(_mm_and_si128(eq, same), differ);

  *le = pick_vs;
#ifdef INTENSE_DEBUG
  // Dump both operands lane by lane.
  for (unsigned i = 0; i < 8; i++)
    fprintf(stderr, "VS[%d] = %d\n", i,
            reinterpret_cast<int16_t*>(&vs)[i]);
  for (unsigned i = 0; i < 8; i++)
    fprintf(stderr, "VT[%d] = %d\n", i,
            reinterpret_cast<int16_t*>(&vt)[i]);
#endif
#ifdef __SSE4_1__
  return _mm_blendv_epi8(vt, vs, pick_vs);
#else
  return _mm_or_si128(_mm_and_si128(pick_vs, vs),
                      _mm_andnot_si128(pick_vs, vt));
#endif
}

54
arch/x86_64/rsp/vcr.h Normal file
View File

@ -0,0 +1,54 @@
//
// arch/x86_64/rsp/vcr.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Computes the rsp_vcr result vector and its ge/le masks.
//
// Per 16-bit lane, with sign = ((vs ^ vt) < 0) as a 0 / -1 mask:
//   *le: sign-bit mask of ((vs & sign) + vt).
//   *ge: set where vt <= (vs | sign) in signed comparison.
//   Returns (sign ? *le : *ge) ? (vt ^ sign) : vs, i.e. the alternative
//   to vs is vt, or its one's complement ~vt when the signs differ.
//
// `zero` is not read as an input; the non-SSE4.1 path overwrites it and
// uses it purely as a scratch register.
static inline __m128i rsp_vcr(__m128i vs, __m128i vt,
__m128i zero, __m128i *ge, __m128i *le) {
__m128i diff_sel_mask, diff_gez, diff_lez;
__m128i sign, sign_notvt;
#ifdef INTENSE_DEBUG
for (unsigned i = 0; i < 8; i++)
fprintf(stderr, "VS[%d] = %d\n", i,
reinterpret_cast<int16_t*>(&vs)[i]);
for (unsigned i = 0; i < 8; i++)
fprintf(stderr, "VT[%d] = %d\n", i,
reinterpret_cast<int16_t*>(&vt)[i]);
#endif
// sign = (vs ^ vt) < 0
sign = _mm_xor_si128(vs, vt);
sign = _mm_srai_epi16(sign, 15);
// Compute le
diff_lez = _mm_and_si128(vs, sign);
diff_lez = _mm_add_epi16(diff_lez, vt);
*le = _mm_srai_epi16(diff_lez, 15);
// Compute ge
// min(vs | sign, vt) == vt holds exactly when vt <= (vs | sign).
diff_gez = _mm_or_si128(vs, sign);
diff_gez = _mm_min_epi16(diff_gez, vt);
*ge = _mm_cmpeq_epi16(diff_gez, vt);
// sign_notvt = sn ? ~vt : vt
sign_notvt = _mm_xor_si128(vt, sign);
// Compute result:
#ifdef __SSE4_1__
diff_sel_mask = _mm_blendv_epi8(*ge, *le, sign);
return _mm_blendv_epi8(vs, sign_notvt, diff_sel_mask);
#else
// Arithmetic form of the two selects above:
//   mask   = ge + ((le - ge) & sign)
//   result = vs + ((sign_notvt - vs) & mask)
diff_sel_mask = _mm_sub_epi16(*le, *ge);
diff_sel_mask = _mm_and_si128(diff_sel_mask, sign);
diff_sel_mask = _mm_add_epi16(diff_sel_mask, *ge);
zero = _mm_sub_epi16(sign_notvt, vs);
zero = _mm_and_si128(zero, diff_sel_mask);
return _mm_add_epi16(zero, vs);
#endif
}

18
arch/x86_64/rsp/vdivh.h Normal file
View File

@ -0,0 +1,18 @@
//
// arch/x86_64/rsp/vdivh.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Latches one element of register `src` into cp2.div_in and stores the
// current cp2.div_out into one element of register `dest`.
//
// e / de select the source / destination element; both are reduced
// modulo 8. Returns register `dest` reloaded through
// rsp_vect_load_unshuffled_operand after the store.
inline __m128i rsp_vdivh(RSP::CPUState *rsp,
  unsigned src, unsigned e, unsigned dest, unsigned de) {
  const unsigned src_lane = e & 0x7;
  const unsigned dst_lane = de & 0x7;
  auto &cp2 = rsp->cp2;

  // Read the input first: src and dest may name the same register.
  cp2.div_in = cp2.regs[src].e[src_lane];
  cp2.regs[dest].e[dst_lane] = cp2.div_out;
  return rsp_vect_load_unshuffled_operand(cp2.regs[dest].e);
}

66
arch/x86_64/rsp/vmac.h Normal file
View File

@ -0,0 +1,66 @@
//
// arch/x86_64/rsp/vmac.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Multiply-accumulate: adds twice the signed product vs * vt into the
// three-part accumulator (*acc_lo, *acc_md, *acc_hi), 16 bits per part
// per lane, propagating carries between the parts.
//
// Template parameter VMACU selects how the return value is clamped:
//   VMACU == true:  0 where *acc_hi is negative, 0xFFFF where *acc_hi is
//                   positive or *acc_md has its top bit set, else *acc_md.
//   VMACU == false: rsp_sclamp_acc_tomd(*acc_md, *acc_hi).
//
// NOTE(review): `zero` is used to invert masks and test for zero, so it
// is presumably an all-zero vector -- confirm at the call sites.
template <bool VMACU>
static inline __m128i rsp_vmacf_vmacu(__m128i vs, __m128i vt,
__m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) {
__m128i overflow_hi_mask, overflow_md_mask;
__m128i lo, md, hi, carry, overflow_mask;
// Get the product and shift it over
// being sure to save the carries.
// (lo, md, hi) becomes the 32-bit product shifted left by one bit:
// md takes the bit shifted out of lo, hi takes the product's sign.
lo = _mm_mullo_epi16(vs, vt);
hi = _mm_mulhi_epi16(vs, vt);
md = _mm_slli_epi16(hi, 1);
carry = _mm_srli_epi16(lo, 15);
hi = _mm_srai_epi16(hi, 15);
md = _mm_or_si128(md, carry);
lo = _mm_slli_epi16(lo, 1);
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
// A lane carried out iff the saturating unsigned sum differs from the
// wrapping sum; overflow_mask ends up -1 in exactly those lanes.
overflow_mask = _mm_adds_epu16(*acc_lo, lo);
*acc_lo = _mm_add_epi16(*acc_lo, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Add in the carry. If the middle portion is
// already 0xFFFF and we have a carry, we have
// to carry all the way up to hi.
// (Subtracting a -1 mask adds 1.)
md = _mm_sub_epi16(md, overflow_mask);
carry = _mm_cmpeq_epi16(md, zero);
carry = _mm_and_si128(carry, overflow_mask);
hi = _mm_sub_epi16(hi, carry);
// Accumulate the middle portion.
overflow_mask = _mm_adds_epu16(*acc_md, md);
*acc_md = _mm_add_epi16(*acc_md, md);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
*acc_hi = _mm_add_epi16(*acc_hi, hi);
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
// VMACU
if (VMACU) {
overflow_hi_mask = _mm_srai_epi16(*acc_hi, 15);
overflow_md_mask = _mm_srai_epi16(*acc_md, 15);
// md | sign(md): saturate to 0xFFFF when acc_md's top bit is set.
md = _mm_or_si128(overflow_md_mask, *acc_md);
overflow_mask = _mm_cmpgt_epi16(*acc_hi, zero);
// Negative acc_hi forces the lane to 0 ...
md = _mm_andnot_si128(overflow_hi_mask, md);
// ... and positive acc_hi forces it to 0xFFFF.
return _mm_or_si128(overflow_mask, md);
}
// VMACF
else
return rsp_sclamp_acc_tomd(*acc_md, *acc_hi);
}

19
arch/x86_64/rsp/vmov.h Normal file
View File

@ -0,0 +1,19 @@
//
// arch/x86_64/rsp/vmov.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Copies a single element from register `src` into register `dest`.
//
// e / de select the source / destination element; both are reduced
// modulo 8. Returns register `dest` reloaded through
// rsp_vect_load_unshuffled_operand after the store.
inline __m128i rsp_vmov(RSP::CPUState *rsp,
  unsigned src, unsigned e, unsigned dest, unsigned de) {
  const unsigned src_lane = e & 0x7;
  const unsigned dst_lane = de & 0x7;

  // Fetch before storing: src and dest may name the same register.
  const uint16_t data = rsp->cp2.regs[src].e[src_lane];
  rsp->cp2.regs[dest].e[dst_lane] = data;
  return rsp_vect_load_unshuffled_operand(rsp->cp2.regs[dest].e);
}

17
arch/x86_64/rsp/vmrg.h Normal file
View File

@ -0,0 +1,17 @@
//
// arch/x86_64/rsp/vmrg.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Per-lane merge: takes vs where the `le` mask is set and vt elsewhere.
// The SSE4.1 path selects on the top bit of each mask byte, the fallback
// selects bit by bit; the two agree when every lane of `le` is all-ones
// or all-zero.
static inline __m128i rsp_vmrg(__m128i vs, __m128i vt, __m128i le) {
#ifdef __SSE4_1__
  return _mm_blendv_epi8(vt, vs, le);
#else
  const __m128i from_vs = _mm_and_si128(le, vs);
  const __m128i from_vt = _mm_andnot_si128(le, vt);
  return _mm_or_si128(from_vs, from_vt);
#endif
}

15
arch/x86_64/rsp/vmudh.h Normal file
View File

@ -0,0 +1,15 @@
//
// arch/x86_64/rsp/vmudh.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Signed 16x16 multiply whose 32-bit product replaces the middle and
// high accumulator parts: *acc_md receives the low 16 bits and *acc_hi
// the high 16 bits of vs * vt. Returns
// rsp_sclamp_acc_tomd(*acc_md, *acc_hi).
static inline __m128i rsp_vmudh(__m128i vs, __m128i vt,
  __m128i *acc_md, __m128i *acc_hi) {
  const __m128i product_lo = _mm_mullo_epi16(vs, vt);
  const __m128i product_hi = _mm_mulhi_epi16(vs, vt);

  *acc_md = product_lo;
  *acc_hi = product_hi;
  return rsp_sclamp_acc_tomd(*acc_md, *acc_hi);
}

47
arch/x86_64/rsp/vmul.h Normal file
View File

@ -0,0 +1,47 @@
//
// arch/x86_64/rsp/vmul.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
//
// TODO: CHECK ME.
//
// Multiply with rounding: loads the accumulator with twice the signed
// product vs * vt plus 0x8000, and returns a clamped middle part.
//
// Per 16-bit lane:
//   *acc_lo = 2 * lo16(vs * vt) + 0x8000 (wrapping)
//   *acc_md = 2 * hi16(vs * vt) + the carries out of the two acc_lo steps
//   *acc_hi = sign mask of *acc_md, cleared where vs == vt
// Return value:
//   VMULU == true:  0 where *acc_hi is set, 0xFFFF where *acc_md is
//                   negative and vs == vt, else *acc_md.
//   VMULU == false: *acc_md, minus one where *acc_md is negative and
//                   vs == vt.
// NOTE(review): the vs == vt special case presumably targets the
// -32768 * -32768 overflow -- confirm. `zero` is only used to build an
// all-ones vector via cmpeq(zero, zero).
template <bool VMULU>
static inline __m128i rsp_vmulf_vmulu(__m128i vs, __m128i vt,
__m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) {
__m128i lo, hi, round, sign1, sign2, eq, neq, neg;
lo = _mm_mullo_epi16(vs, vt);
// round = all-ones here; shifted to 0x8000 per lane below.
round = _mm_cmpeq_epi16(zero, zero);
// sign1: the bit shifted out when lo is doubled.
sign1 = _mm_srli_epi16(lo, 15);
lo = _mm_add_epi16(lo, lo);
round = _mm_slli_epi16(round, 15);
hi = _mm_mulhi_epi16(vs, vt);
// sign2: top bit of the doubled lo, i.e. the carry out of adding 0x8000.
sign2 = _mm_srli_epi16(lo, 15);
*acc_lo = _mm_add_epi16(round, lo);
sign1 = _mm_add_epi16(sign1, sign2);
hi = _mm_slli_epi16(hi, 1);
neq = eq = _mm_cmpeq_epi16(vs, vt);
*acc_md = _mm_add_epi16(hi, sign1);
neg = _mm_srai_epi16(*acc_md, 15);
// VMULU
if (VMULU) {
*acc_hi = _mm_andnot_si128(eq, neg);
hi =_mm_or_si128(*acc_md, neg);
return _mm_andnot_si128(*acc_hi, hi);
}
// VMULF
else {
// Despite its name, neq still holds the (vs == vt) mask, so acc_hi is
// neg & ~(vs == vt), the same value as in the VMULU branch.
eq = _mm_and_si128(eq, neg);
*acc_hi = _mm_andnot_si128(neq, neg);
// Adding the -1 mask subtracts one in the (vs == vt && neg) lanes.
return _mm_add_epi16(*acc_md, eq);
}
}

39
arch/x86_64/rsp/vmulh.h Normal file
View File

@ -0,0 +1,39 @@
//
// arch/x86_64/rsp/vmulh.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Signed 16x16 multiply applied to the middle/high accumulator parts.
//
//   VMADH == true:  adds the 32-bit product to (*acc_hi:*acc_md), carrying
//                   from the middle into the high part; *acc_lo is left
//                   untouched.
//   VMADH == false: overwrites the accumulator: *acc_lo = zero,
//                   *acc_md = low 16 bits, *acc_hi = high 16 bits.
// Returns rsp_sclamp_acc_tomd(*acc_md, *acc_hi) in both cases.
// NOTE(review): `zero` is presumably an all-zero vector -- confirm at
// the call sites.
template <bool VMADH>
static inline __m128i rsp_vmadh_vmudh(__m128i vs, __m128i vt,
__m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) {
__m128i lo, hi, overflow_mask;
lo = _mm_mullo_epi16(vs, vt);
hi = _mm_mulhi_epi16(vs, vt);
// VMADH
if (VMADH) {
// Tricky part: start accumulate everything.
// Get/keep the carry as we'll add it in later.
// overflow_mask becomes -1 where the unsigned add carried out, i.e.
// where the saturating and wrapping sums differ.
overflow_mask = _mm_adds_epu16(*acc_md, lo);
*acc_md = _mm_add_epi16(*acc_md, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Subtracting the -1 mask adds the carry into hi.
hi = _mm_sub_epi16(hi, overflow_mask);
*acc_hi = _mm_add_epi16(*acc_hi, hi);
}
// VMUDH
else {
*acc_lo = zero;
*acc_md = lo;
*acc_hi = hi;
}
return rsp_sclamp_acc_tomd(*acc_md, *acc_hi);
}

54
arch/x86_64/rsp/vmull.h Normal file
View File

@ -0,0 +1,54 @@
//
// arch/x86_64/rsp/vmull.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Unsigned 16x16 multiply that keeps only the HIGH 16 bits of each
// product and applies them to the low accumulator part.
//
//   VMADL == true:  adds that value to *acc_lo and ripples the carry
//                   through *acc_md into *acc_hi; returns
//                   rsp_uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero).
//   VMADL == false: overwrites the accumulator with (*acc_lo = value,
//                   *acc_md = zero, *acc_hi = zero) and returns the value.
// NOTE(review): `zero` is presumably an all-zero vector -- confirm at
// the call sites.
template <bool VMADL>
static inline __m128i rsp_vmadl_vmudl(__m128i vs, __m128i vt,
__m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) {
__m128i hi, overflow_mask;
hi = _mm_mulhi_epu16(vs, vt);
// VMADL
if (VMADL) {
// Tricky part: start accumulate everything.
// Get/keep the carry as we'll add it in later.
// overflow_mask becomes -1 where the unsigned add carried out.
overflow_mask = _mm_adds_epu16(*acc_lo, hi);
*acc_lo = _mm_add_epi16(*acc_lo, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// hi is reused from here on as the 0/1 carry into the middle part.
hi = _mm_sub_epi16(zero, overflow_mask);
// Check for overflow of the upper sum.
//
// TODO: Since hi can only be {0,1}, we should
// be able to generalize this for performance.
overflow_mask = _mm_adds_epu16(*acc_md, hi);
*acc_md = _mm_add_epi16(*acc_md, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
// Since the product was unsigned, only worry about
// positive overflow (i.e.: borrowing not possible).
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
return rsp_uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero);
}
// VMUDL
else {
*acc_lo = hi;
*acc_md = zero;
*acc_hi = zero;
return hi;
}
}

65
arch/x86_64/rsp/vmulm.h Normal file
View File

@ -0,0 +1,65 @@
//
// arch/x86_64/rsp/vmulm.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Mixed-sign 16x16 multiply: vs is treated as signed, vt as unsigned.
// (lo, hi) is the 32-bit product; its sign extension forms the third
// accumulator part.
//
//   VMADM == true:  adds the product to (*acc_hi:*acc_md:*acc_lo) with
//                   carry propagation and returns
//                   rsp_sclamp_acc_tomd(*acc_md, *acc_hi).
//   VMADM == false: overwrites the accumulator with the product and
//                   returns its high 16 bits.
// NOTE(review): `zero` is presumably an all-zero vector -- confirm at
// the call sites.
template <bool VMADM>
static inline __m128i rsp_vmadm_vmudm(__m128i vs, __m128i vt,
__m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) {
__m128i lo, hi, sign, overflow_mask;
lo = _mm_mullo_epi16(vs, vt);
hi = _mm_mulhi_epu16(vs, vt);
// What we really want to do is signed vs * unsigned vt.
// However, we have no such instruction to do so.
//
// There's a trick to "fix" an unsigned product, though:
// If vs was negative, take the upper 16 bits of the product
// and subtract vt.
sign = _mm_srai_epi16(vs, 15);
vt = _mm_and_si128(vt, sign);
hi = _mm_sub_epi16(hi, vt);
// VMADM
if (VMADM) {
// Tricky part: start accumulate everything.
// Get/keep the carry as we'll add it in later.
// overflow_mask becomes -1 where the unsigned add carried out.
overflow_mask = _mm_adds_epu16(*acc_lo, lo);
*acc_lo = _mm_add_epi16(*acc_lo, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// This is REALLY clever. Since the product results from
// two 16-bit components, one positive and one negative,
// we don't have to worry about carrying the 1 (we can
// only borrow) past 32-bits. So we can just add it here.
hi = _mm_sub_epi16(hi, overflow_mask);
// Check for overflow of the upper sum.
overflow_mask = _mm_adds_epu16(*acc_md, hi);
*acc_md = _mm_add_epi16(*acc_md, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
// The sign extension of hi is the product's contribution to acc_hi.
*acc_hi = _mm_add_epi16(*acc_hi, _mm_srai_epi16(hi, 15));
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
return rsp_sclamp_acc_tomd(*acc_md, *acc_hi);
}
// VMUDM
else {
*acc_lo = lo;
*acc_md = hi;
*acc_hi = _mm_srai_epi16(hi, 15);
return hi;
}
}

87
arch/x86_64/rsp/vmuln.h Normal file
View File

@ -0,0 +1,87 @@
//
// arch/x86_64/rsp/vmuln.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Mixed-sign 16x16 multiply: vs is treated as unsigned, vt as signed.
// (lo, hi) is the 32-bit product; its sign extension forms the third
// accumulator part.
//
//   VMADN == true:  adds the product to (*acc_hi:*acc_md:*acc_lo) with
//                   carry propagation and returns
//                   rsp_uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero).
//   VMADN == false: overwrites the accumulator with the product and
//                   returns its low 16 bits.
// With INTENSE_DEBUG defined, the VMADN path dumps the accumulator, both
// operands and the result to stderr.
// NOTE(review): `zero` is presumably an all-zero vector -- confirm at
// the call sites.
template <bool VMADN>
static inline __m128i rsp_vmadn_vmudn(__m128i vs, __m128i vt,
__m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) {
__m128i lo, hi, sign, overflow_mask;
#ifdef INTENSE_DEBUG
if (VMADN)
{
for (unsigned i = 0; i < 8; i++)
fprintf(stderr, "ACC LO[%u] = %d\n", i, reinterpret_cast<int16_t*>(acc_lo)[i]);
for (unsigned i = 0; i < 8; i++)
fprintf(stderr, "ACC MD[%u] = %d\n", i, reinterpret_cast<int16_t*>(acc_md)[i]);
for (unsigned i = 0; i < 8; i++)
fprintf(stderr, "ACC HI[%u] = %d\n", i, reinterpret_cast<int16_t*>(acc_hi)[i]);
for (unsigned i = 0; i < 8; i++)
fprintf(stderr, "VS[%u] = %d\n", i, reinterpret_cast<int16_t*>(&vs)[i]);
for (unsigned i = 0; i < 8; i++)
fprintf(stderr, "VT[%u] = %d\n", i, reinterpret_cast<int16_t*>(&vt)[i]);
}
#endif
lo = _mm_mullo_epi16(vs, vt);
hi = _mm_mulhi_epu16(vs, vt);
// What we really want to do is unsigned vs * signed vt.
// However, we have no such instruction to do so.
//
// There's a trick to "fix" an unsigned product, though:
// If vt was negative, take the upper 16 bits of the product
// and subtract vs.
sign = _mm_srai_epi16(vt, 15);
vs = _mm_and_si128(vs, sign);
hi = _mm_sub_epi16(hi, vs);
// VMADN
if (VMADN) {
// Tricky part: start accumulate everything.
// Get/keep the carry as we'll add it in later.
// overflow_mask becomes -1 where the unsigned add carried out.
overflow_mask = _mm_adds_epu16(*acc_lo, lo);
*acc_lo = _mm_add_epi16(*acc_lo, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// This is REALLY clever. Since the product results from
// two 16-bit components, one positive and one negative,
// we don't have to worry about carrying the 1 (we can
// only borrow) past 32-bits. So we can just add it here.
hi = _mm_sub_epi16(hi, overflow_mask);
// Check for overflow of the upper sum.
overflow_mask = _mm_adds_epu16(*acc_md, hi);
*acc_md = _mm_add_epi16(*acc_md, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
// The sign extension of hi is the product's contribution to acc_hi.
*acc_hi = _mm_add_epi16(*acc_hi, _mm_srai_epi16(hi, 15));
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
#ifdef INTENSE_DEBUG
auto ret = rsp_uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero);
for (unsigned i = 0; i < 8; i++)
fprintf(stderr, "VD[%u] = %d\n", i, reinterpret_cast<int16_t*>(&ret)[i]);
return ret;
#else
return rsp_uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero);
#endif
}
// VMUDN
else {
*acc_lo = lo;
*acc_md = hi;
*acc_hi = _mm_srai_epi16(hi, 15);
return lo;
}
}

16
arch/x86_64/rsp/vor.h Normal file
View File

@ -0,0 +1,16 @@
//
// arch/x86_64/rsp/vor.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Bitwise OR of the two source vectors.
static inline __m128i rsp_vor(__m128i vs, __m128i vt) {
  const __m128i disjunction = _mm_or_si128(vs, vt);
  return disjunction;
}
// Bitwise NOR: the complement of (vs | vt), formed by XOR with all-ones.
static inline __m128i rsp_vnor(__m128i vs, __m128i vt) {
  const __m128i all_ones = _mm_set1_epi32(-1);
  return _mm_xor_si128(_mm_or_si128(vs, vt), all_ones);
}

79
arch/x86_64/rsp/vrcpsq.h Normal file
View File

@ -0,0 +1,79 @@
//
// arch/x86_64/rsp/vrcpsq.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#include "../../../rsp/reciprocal.h"
// Table-driven reciprocal (VRSQ == false) or reciprocal square root
// (VRSQ == true) of one element of register `src`.
//
// dp:      non-zero selects the 32-bit input (cp2.div_in << 16) | element;
//          zero selects the sign-extended 16-bit element alone.
// src/e:   source register and element (e is reduced modulo 8).
// dest/de: destination register and element (de is reduced modulo 8).
//
// The low 16 bits of the 32-bit result are stored to the destination
// element and the high 16 bits to cp2.div_out. Returns register `dest`
// reloaded through rsp_vect_load_unshuffled_operand after the store.
template <bool VRSQ>
inline __m128i rsp_vrcp_vrsq(RSP::CPUState *rsp, int dp,
unsigned src, unsigned e, unsigned dest, unsigned de) {
uint32_t dp_input, sp_input;
int32_t input, result;
int16_t vt;
int32_t input_mask, data;
unsigned shift, idx;
// Get the element from VT.
vt = rsp->cp2.regs[src].e[e & 0x7];
dp_input = ((uint32_t) rsp->cp2.div_in << 16) | (uint16_t) vt;
sp_input = vt;
input = (dp) ? dp_input : sp_input;
// input_mask is 0 for non-negative input and -1 for negative input.
input_mask = input >> 31;
// data = |input| for input > -32768; for smaller inputs only the
// one's complement is taken (the +1 is skipped).
data = input ^ input_mask;
if (input > -32768)
data -= input_mask;
// Handle edge cases.
if (data == 0)
result = 0x7fffFFFFU;
else if (input == -32768)
result = 0xffff0000U;
// Main case: compute the reciprocal.
else {
// TODO: Clean this up.
// shift = number of leading zero bits in data (data != 0 here).
#ifdef _MSC_VER
unsigned long bsf_index;
_BitScanReverse(&bsf_index, data);
shift = 31 - bsf_index;
#else
shift = __builtin_clz(data);
#endif
// VRSQ
if (VRSQ) {
// Index with the 9 bits below the leading one, force bit 9, and put
// the parity of shift in bit 0.
idx = (((unsigned long long) data << shift) & 0x7FC00000U) >> 22;
idx = ((idx | 0x200) & 0x3FE) | (shift % 2);
result = rsp_reciprocal_rom[idx];
result = ((0x10000 | result) << 14) >> ((31 - shift) >> 1);
}
// VRCP
else {
// Index with the 9 bits below the leading one of the normalized input.
idx = (((unsigned long long) data << shift) & 0x7FC00000U) >> 22;
result = rsp_reciprocal_rom[idx];
result = ((0x10000 | result) << 14) >> (31 - shift);
}
// Bitwise-complement the result for negative inputs.
result = result ^ input_mask;
}
// Write out the results.
rsp->cp2.div_out = result >> 16;
rsp->cp2.regs[dest].e[de & 0x7] = result;
return rsp_vect_load_unshuffled_operand(rsp->cp2.regs[dest].e);
}

97
arch/x86_64/rsp/vrsq.h Normal file
View File

@ -0,0 +1,97 @@
//
// arch/x86_64/rsp/vrsq.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#include "../../../rsp/reciprocal.h"
// Mask table for vrsq(LH) functions.
// Row i is a vector mask with only 16-bit lane i set to 0xffff.
// rsp_vrsqh loads a row with an aligned 128-bit load (hence alignas(16);
// each row is 16 bytes) to splice a single lane into a register.
alignas(16) static const uint16_t vrsq_mask_table[8][8] = {
{0xffff, 0, 0, 0, 0, 0, 0, 0},
{0, 0xffff, 0, 0, 0, 0, 0, 0},
{0, 0, 0xffff, 0, 0, 0, 0, 0},
{0, 0, 0, 0xffff, 0, 0, 0, 0},
{0, 0, 0, 0, 0xffff, 0, 0, 0},
{0, 0, 0, 0, 0, 0xffff, 0, 0},
{0, 0, 0, 0, 0, 0, 0xffff, 0},
{0, 0, 0, 0, 0, 0, 0, 0xffff}
};
// Table-driven reciprocal square root of one element of register `src`.
//
// dp:      non-zero selects the 32-bit input (cp2.div_in << 16) | element;
//          zero selects the sign-extended 16-bit element alone.
// src/e:   source register and element (e is reduced modulo 8).
// dest/de: destination register and element (de is reduced modulo 8).
//
// The low 16 bits of the 32-bit result are stored to the destination
// element and the high 16 bits to cp2.div_out. Returns register `dest`
// reloaded through rsp_vect_load_unshuffled_operand after the store.
inline __m128i rsp_vrsq(RSP::CPUState *rsp, int dp,
unsigned src, unsigned e, unsigned dest, unsigned de) {
uint32_t dp_input, sp_input;
int32_t input, result;
int16_t vt;
int32_t input_mask, data;
unsigned shift, idx;
// Get the element from VT.
vt = rsp->cp2.regs[src].e[e & 0x7];
dp_input = ((uint32_t) rsp->cp2.div_in << 16) | (uint16_t) vt;
sp_input = vt;
input = (dp) ? dp_input : sp_input;
// input_mask is 0 for non-negative input and -1 for negative input.
input_mask = input >> 31;
// data = |input| for input > -32768; for smaller inputs only the
// one's complement is taken (the +1 is skipped).
data = input ^ input_mask;
if (input > -32768)
data -= input_mask;
// Handle edge cases.
if (data == 0)
result = 0x7fffFFFFU;
else if (input == -32768)
result = 0xffff0000U;
// Main case: compute the reciprocal.
else {
//TODO: Clean this up.
// shift = number of leading zero bits in data (data != 0 here).
#ifdef _MSC_VER
unsigned long bsf_index;
_BitScanReverse(&bsf_index, data);
shift = 31 - bsf_index;
#else
shift = __builtin_clz(data);
#endif
// Index with the 9 bits below the leading one, force bit 9, and put
// the parity of shift in bit 0.
idx = (((unsigned long long) data << shift) & 0x7FC00000U) >> 22;
idx = ((idx | 0x200) & 0x3FE) | (shift % 2);
result = rsp_reciprocal_rom[idx];
result = ((0x10000 | result) << 14) >> ((31 - shift) >> 1);
// Bitwise-complement the result for negative inputs.
result = result ^ input_mask;
}
// Write out the results.
rsp->cp2.div_out = result >> 16;
rsp->cp2.regs[dest].e[de & 0x7] = result;
return rsp_vect_load_unshuffled_operand(rsp->cp2.regs[dest].e);
}
// Latches element `e` of register `src` into cp2.div_in and returns
// register `dest` with lane `de` replaced by the current cp2.div_out.
//
// Unlike rsp_vdivh, the destination register in `rsp` is NOT written;
// only the merged vector is returned.
//
// e and de are reduced modulo 8, matching rsp_vdivh / rsp_vmov /
// rsp_vrsq. Without the masking, an out-of-range index would read past
// the 8-entry `elements` array or the 8-row vrsq_mask_table.
// NOTE(review): the aligned load of the destination register assumes
// rsp->cp2.regs entries are 16-byte aligned -- confirm against the
// CPUState definition.
inline __m128i rsp_vrsqh(RSP::CPUState *rsp,
  unsigned src, unsigned e, unsigned dest, unsigned de) {
  __m128i vd, vd_mask, b_result;
  int16_t elements[8];

  e &= 0x7;
  de &= 0x7;

  // Get the element from VT.
  memcpy(elements, rsp->cp2.regs + src, sizeof(elements));
  rsp->cp2.div_in = elements[e];

  // Splice div_out into lane `de` of the destination register:
  // keep every other lane of vd, take lane `de` from the broadcast.
  vd_mask = _mm_load_si128((const __m128i *) vrsq_mask_table[de]);
  vd = _mm_load_si128((__m128i *) (rsp->cp2.regs + dest));
  vd = _mm_andnot_si128(vd_mask, vd);
  b_result = _mm_set1_epi16(rsp->cp2.div_out);
  b_result = _mm_and_si128(vd_mask, b_result);
  return _mm_or_si128(b_result, vd);
}

24
arch/x86_64/rsp/vsub.h Normal file
View File

@ -0,0 +1,24 @@
//
// arch/x86_64/rsp/vsub.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Lane-wise signed 16-bit subtract with borrow.
//
// `carry` is subtracted from vt before vt is subtracted from vs.
// *acc_lo receives the WRAPPING difference vs - (vt - carry); the return
// value is the same difference with signed saturation.
// NOTE(review): `carry` is presumably a per-lane 0 / 0xFFFF mask, so that
// subtracting it adds the borrow to vt -- confirm at the call sites.
static inline __m128i rsp_vsub(__m128i vs, __m128i vt,
  __m128i carry, __m128i *acc_lo) {
  // vt adjusted by the carry, once wrapping and once saturating.
  const __m128i subtrahend_wrap = _mm_sub_epi16(vt, carry);
  const __m128i subtrahend_sat = _mm_subs_epi16(vt, carry);

  *acc_lo = _mm_sub_epi16(vs, subtrahend_wrap);

  // If the adjustment saturated, the clamped subtrahend is greater than
  // the wrapped one; add that -1 mask (with saturation) to make up for
  // the amount lost to the clamp.
  const __m128i clamped_diff = _mm_subs_epi16(vs, subtrahend_sat);
  const __m128i fixup = _mm_cmpgt_epi16(subtrahend_sat, subtrahend_wrap);
  return _mm_adds_epi16(clamped_diff, fixup);
}

21
arch/x86_64/rsp/vsubc.h Normal file
View File

@ -0,0 +1,21 @@
//
// arch/x86_64/rsp/vsubc.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Lane-wise 16-bit subtract that also reports "not equal" and "borrow".
//
// Returns the wrapping difference vs - vt.
//   *eq: set where vs != vt (note: the inverse of what the name suggests).
//   *sn: set where vs < vt as unsigned values, i.e. the subtract borrowed.
// NOTE(review): `zero` is used for zero tests and mask inversion, so it
// is presumably an all-zero vector -- confirm at the call sites.
static inline __m128i rsp_vsubc(__m128i vs, __m128i vt,
  __m128i zero, __m128i *eq, __m128i *sn) {
  const __m128i same = _mm_cmpeq_epi16(vs, vt);
  // The saturating unsigned vs - vt is zero exactly when vs <= vt.
  const __m128i vs_le_vt = _mm_cmpeq_epi16(_mm_subs_epu16(vs, vt), zero);

  *eq = _mm_cmpeq_epi16(same, zero);
  *sn = _mm_andnot_si128(same, vs_le_vt);
  return _mm_sub_epi16(vs, vt);
}

16
arch/x86_64/rsp/vxor.h Normal file
View File

@ -0,0 +1,16 @@
//
// arch/x86_64/rsp/vxor.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// Bitwise XOR of the two source vectors.
static inline __m128i rsp_vxor(__m128i vs, __m128i vt) {
  const __m128i parity = _mm_xor_si128(vs, vt);
  return parity;
}
// Bitwise XNOR: the complement of (vs ^ vt), formed by XOR with all-ones.
static inline __m128i rsp_vnxor(__m128i vs, __m128i vt) {
  const __m128i all_ones = _mm_set1_epi32(-1);
  return _mm_xor_si128(_mm_xor_si128(vs, vt), all_ones);
}

4
debug-toolchain/Makefile Normal file
View File

@ -0,0 +1,4 @@
# Base name of the program to build.
PROGRAM = test
# Object files that make up the program itself.
MIPS_OBJ = main.o
# Pull in the build rules; they are expected to consume the two
# variables set above.
include Makefile.mips

View File

@ -0,0 +1,38 @@
####
## Basic Makefile for RSP-MIPS
##
######
# Expects PROGRAM (output base name) and MIPS_OBJ (program object files)
# to be defined before this point.
# Output files, all derived from $(PROGRAM).
TARGET_ELF = $(PROGRAM).elf
TARGET_BIN = $(PROGRAM).bin
TARGET_GLOBAL_BIN = $(PROGRAM).global.bin
MIPS_LD_SCRIPT = rsp-mips.ld
# Cross toolchain (mipsel-linux-gnu prefix; every tool is invoked with -EB).
MIPS_OBJCOPY = mipsel-linux-gnu-objcopy
MIPS_CC = mipsel-linux-gnu-gcc
MIPS_AS = mipsel-linux-gnu-as
MIPS_LD = mipsel-linux-gnu-ld
# Startup/runtime objects, linked ahead of the program objects.
CRT_OBJ = start.o rsp-mips.o
all: $(TARGET_BIN) $(TARGET_GLOBAL_BIN)
# Raw binary image of the .text section only.
$(TARGET_BIN): $(TARGET_ELF)
$(MIPS_OBJCOPY) -j .text $< $(TARGET_BIN) -O binary
# Raw binary image of the .data section only.
$(TARGET_GLOBAL_BIN): $(TARGET_ELF)
$(MIPS_OBJCOPY) -j .data $< $(TARGET_GLOBAL_BIN) -O binary
# Link with the custom linker script.
$(TARGET_ELF): $(MIPS_OBJ) $(CRT_OBJ)
$(MIPS_LD) -T $(MIPS_LD_SCRIPT) -o $@ $(CRT_OBJ) $(MIPS_OBJ) -EB
%.o: %.s
$(MIPS_AS) -o $@ $< -EB -mabi=eabi -march=mips1
# Every C object also depends on rsp-mips.h.
%.o: %.c rsp-mips.h
$(MIPS_CC) -c -o $@ $< -Os -EB -march=mips1 -mabi=eabi -mno-abicalls -std=gnu99 -nostdlib
# NOTE(review): TARGET_HEX and TARGET_GLOBAL_HEX are not defined in this
# file, so they expand to nothing here; CRT_OBJ is not removed.
clean:
rm -f $(MIPS_OBJ) $(TARGET_ELF) $(TARGET_HEX) $(TARGET_GLOBAL_HEX) $(TARGET_BIN) $(TARGET_GLOBAL_BIN)
# NOTE(review): `tools` and `clean-tools` have no rules in this file.
.PHONY: all clean tools clean-tools

8
debug-toolchain/main.c Normal file
View File

@ -0,0 +1,8 @@
#include "rsp-mips.h"
/* Four sample words handed to the debug break. */
u32 data[4] = { 0x10, 0x20, 0x30, 0x40 };

/* Passes the four sample words to rsp_debug_break(), then returns 0. */
int main(void)
{
	const u32 *words = data;

	rsp_debug_break(words[0], words[1], words[2], words[3]);
	return 0;
}

View File

@ -0,0 +1,17 @@
/* Minimal freestanding support header for the RSP-MIPS test programs:
 * fixed-width integer aliases plus the two break entry points
 * (implemented in assembly). */
#ifndef __RSP_MIPS_H
#define __RSP_MIPS_H
/* Integer aliases. NOTE(review): the widths implied by the names assume
 * the target ABI has 8-bit char, 16-bit short, 32-bit int and 64-bit
 * long long -- confirm for the toolchain in use. */
typedef signed char s8;
typedef unsigned char u8;
typedef signed short s16;
typedef unsigned short u16;
typedef signed int s32;
typedef unsigned int u32;
typedef signed long long s64;
typedef unsigned long long u64;
/* Local size_t, since no standard headers are included here. */
typedef u32 size_t;
/* Executes a `break` instruction and returns. */
void rsp_break(void);
/* Executes a `break` instruction with a..d passed as ordinary arguments.
 * NOTE(review): presumably whatever handles the break inspects them --
 * confirm. */
void rsp_debug_break(u32 a, u32 b, u32 c, u32 d);
#endif

View File

@ -0,0 +1,9 @@
/* Linker script for the RSP-MIPS test programs: code at 0x00400000,
 * data and bss from 0x00700000, entry point rsp_mips_start. */
SECTIONS {
ENTRY(rsp_mips_start)
/* Code: start.o's .text is placed first, then all other .text input. */
. = 0x00400000;
.text : { start.o (.text); * (.text*); }
/* Data: read-only data and the small bss/common sections are folded
 * into .data. */
. = 0x00700000;
.data : { * (.data); * (.rodata*); * (.sbss); * (.scommon); }
.bss : { * (.bss); }
}

View File

@ -0,0 +1,20 @@
.text
.section .text
# void rsp_break(void): executes a `break` instruction, then returns to
# the caller.
.global rsp_break
.ent rsp_break
.type rsp_break, @function
rsp_break:
break
jr $ra
.end rsp_break
.size rsp_break, .-rsp_break
# rsp_debug_break: same instruction sequence as rsp_break (a `break`,
# then return). No argument register is read or modified here.
# NOTE(review): presumably the arguments are left in place for whatever
# handles the break to inspect -- confirm.
.global rsp_debug_break
.ent rsp_debug_break
.type rsp_debug_break, @function
rsp_debug_break:
break
jr $ra
.end rsp_debug_break
.size rsp_debug_break, .-rsp_debug_break

28
debug-toolchain/start.s Normal file
View File

@ -0,0 +1,28 @@
###
##
# Entry point and setup for our RSP
##
# Used below to place the initial stack pointer RAM_SIZE - 4 bytes above
# 0x00700000.
.equ RAM_SIZE, 4096
.text
.align 2
.section .text
.global rsp_mips_start
.extern main
.ent rsp_mips_start
.type rsp_mips_start, @function
# rsp_mips_start: sets argc/argv to zero, resets the stack pointer and
# calls main; when main returns, the whole sequence is repeated forever.
rsp_mips_start:
nop
redo:
li $a0, 0 # argc = 0, it will never be used anyways on this stuff. :D
li $a1, 0 # argv = 0
li $sp, (0x00700000 + RAM_SIZE - 4) # Set up stack.
jal main
# main returned: start over with a fresh stack pointer.
j redo
.end rsp_mips_start
.size rsp_mips_start, .-rsp_mips_start

81
debug_jit.cpp Normal file
View File

@ -0,0 +1,81 @@
#include "debug_jit.hpp"
#include <stdio.h>
#include <stdlib.h>
#include <dlfcn.h>
#include <string>
using namespace std;
namespace JIT
{
// Private state of one DebugBlock: the generated source/library paths,
// the dlopen() handle and the resolved entry point. Move construction
// and move assignment are deleted.
struct DebugBlock::Impl
{
Impl() = default;
Impl(Impl&&) = delete;
void operator=(Impl&&) = delete;
// Closes the library handle (if any) and removes the .so file.
~Impl();
// Handle returned by dlopen(); nullptr until compile() has loaded the library.
void *dylib = nullptr;
// Address of "block_entry" inside the loaded library.
Func block = nullptr;
// name: path of the generated .c file; soname: path of the built .so file.
string name, soname;
// Writes `source` to disk, builds it into a shared library, loads it and
// resolves the entry point. Returns false if any step fails.
bool compile(uint64_t hash, const std::string &source);
};
// The symbol table is accepted but not used by this implementation.
DebugBlock::DebugBlock(const unordered_map<string, uint64_t> & /* symbol_table */)
{
}
// Defined in this file, where Impl is a complete type, so that the
// unique_ptr<Impl> member can be destroyed.
DebugBlock::~DebugBlock() = default;
// Unloads the shared library, if one was loaded, and removes the .so
// file. The generated .c file is kept; its removal is commented out.
DebugBlock::Impl::~Impl()
{
	if (dylib != nullptr)
	{
		dlclose(dylib);
	}

	remove(soname.c_str());
	//remove(name.c_str());
}
// Replaces any previous implementation object with a fresh one and
// compiles `source` into it. On success the resolved entry point is
// cached in `block` and true is returned; on failure `block` is left
// unchanged and false is returned.
bool DebugBlock::compile(uint64_t hash, const std::string &source)
{
	impl.reset(new Impl);

	if (!impl->compile(hash, source))
		return false;

	block = impl->block;
	return true;
}
bool DebugBlock::Impl::compile(uint64_t hash, const std::string &source)
{
name = "/tmp/";
name += to_string(hash);
soname = name;
name += ".c";
soname += ".so";
FILE *file = fopen(name.c_str(), "w");
if (!file)
return false;
fputs(source.c_str(), file);
fclose(file);
char command[256];
sprintf(command, "gcc -o %s %s -shared -fpic -O0 -g -std=c99 -Wl,--unresolved-symbols=ignore-all", soname.c_str(), name.c_str());
int ret = system(command);
if (ret != 0)
return false;
dylib = dlopen(soname.c_str(), RTLD_LOCAL | RTLD_LAZY);
if (!dylib)
return false;
block = reinterpret_cast<Func>(dlsym(dylib, "block_entry"));
if (!dylib)
return false;
return true;
}
}

28
debug_jit.hpp Normal file
View File

@ -0,0 +1,28 @@
#ifndef DEBUG_JIT_HPP__
#define DEBUG_JIT_HPP__
#include <memory>
#include <stdint.h>
#include <string>
#include <unordered_map>
namespace JIT
{
// Signature of a compiled block's entry point: two opaque pointers in,
// nothing returned.
using Func = void (*)(void *, void *);
// Compiles a block of source text into a callable function; the
// implementation details are hidden behind Impl.
class DebugBlock
{
public:
// symbol_table is part of the interface; the implementation in
// debug_jit.cpp does not use it.
DebugBlock(const std::unordered_map<std::string, uint64_t> &symbol_table);
~DebugBlock();
// Compiles `source`; `hash` is used to name the generated files.
// Returns true on success, after which get_func() returns the entry point.
bool compile(uint64_t hash, const std::string &source);
// Entry point from the last successful compile(), or nullptr if none.
Func get_func() const { return block; }
private:
struct Impl;
std::unique_ptr<Impl> impl;
Func block = nullptr;
};
}
#endif

214
llvm_jit.cpp Normal file
View File

@ -0,0 +1,214 @@
#include "llvm_jit.hpp"
#include <clang/CodeGen/CodeGenAction.h>
#include <clang/Driver/Compilation.h>
#include <clang/Driver/Driver.h>
#include <clang/Driver/Tool.h>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Frontend/CompilerInvocation.h>
#include <clang/Frontend/TextDiagnosticPrinter.h>
#include <clang/Lex/PreprocessorOptions.h>
#include <llvm/ADT/SmallString.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
#include <llvm/ExecutionEngine/MCJIT.h>
#include <llvm/ExecutionEngine/JITSymbol.h>
#include <llvm/ExecutionEngine/ObjectCache.h>
#include <llvm/ExecutionEngine/SectionMemoryManager.h>
#include <llvm/ExecutionEngine/RuntimeDyld.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Support/FileSystem.h>
#include <llvm/Support/Host.h>
#include <llvm/Support/ManagedStatic.h>
#include <llvm/Support/Path.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/ExecutionEngine/Orc/CompileUtils.h>
#include <llvm/ExecutionEngine/Orc/Core.h>
#include <llvm/ExecutionEngine/Orc/ExecutionUtils.h>
#include <llvm/ExecutionEngine/Orc/IRCompileLayer.h>
#include <llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h>
#include <stdio.h>
using namespace clang;
using namespace std;
namespace JIT
{
// Private state of one Block: a reference to the engine it was created
// with, plus the compiled entry point and its size.
struct Block::Impl
{
Impl(LLVMEngine &engine_)
: engine(engine_)
{}
// Stored as a reference, so the engine must outlive this object.
LLVMEngine &engine;
// Both remain at their defaults until something assigns them.
Func block = nullptr;
size_t block_size = 0;
bool compile(const std::string &source);
};
// Creates the private implementation, bound to the engine that will compile this block.
// The engine is captured by reference and must outlive the Block.
Block::Block(LLVMEngine &engine)
{
	impl.reset(new Impl(engine));
}
// Intentionally empty, but defined in this file: Impl is only forward-declared in the
// header, so std::unique_ptr<Impl> must be destroyed where Impl is a complete type.
Block::~Block()
{
}
// RAII wrapper around LLVM's global setup/teardown. LLVMEngine::Impl keeps a single
// function-local static instance, so the constructor runs once on first use and the
// destructor runs during static destruction at program exit.
struct LLVMHolder
{
LLVMHolder()
{
// Register the host target plus its assembly printer and parser.
llvm::InitializeNativeTarget();
llvm::InitializeNativeTargetAsmPrinter();
llvm::InitializeNativeTargetAsmParser();
}
~LLVMHolder()
{
llvm::llvm_shutdown();
}
};
// Private implementation of LLVMEngine: compiles C source to LLVM IR with clang, then
// JIT-compiles that IR through ORC's legacy object-linking and IR-compile layers.
struct LLVMEngine::Impl
{
// symbol_table_ maps external symbol names to the values handed back to the JIT linker
// by findSymbol(). It is stored by reference, so it must outlive this object.
Impl(const std::unordered_map<std::string, uint64_t> &symbol_table_)
: symbol_table(symbol_table_)
{
// One-time process-wide LLVM initialisation (see LLVMHolder).
static LLVMHolder llvm_holder;
execution_session = llvm::make_unique<llvm::orc::ExecutionSession>();
execution_session->setErrorReporter([](llvm::Error error) {
if (error)
llvm::errs() << "Error: " << error << "\n";
});
llvm::orc::LegacyRTDyldObjectLinkingLayer::Resources resources;
resources.MemMgr = llvm::make_unique<llvm::SectionMemoryManager>();
// Unresolved symbols in JIT-ed code are looked up through findSymbol().
// The second callback discards any resolver error.
resources.Resolver = llvm::orc::createLegacyLookupResolver(
*execution_session,
[this](const std::string &name) -> llvm::JITSymbol {
return findSymbol(name);
},
[](llvm::Error) {});
// The [=] lambda returns a copy of the same `resources` for every module key.
object_layer = llvm::make_unique<llvm::orc::LegacyRTDyldObjectLinkingLayer>(*execution_session,
[=](llvm::orc::VModuleKey) { return resources; });
// NOTE(review): `host` is dereferenced below without checking that detectHost() succeeded.
auto host = llvm::orc::JITTargetMachineBuilder::detectHost();
target_machine = llvm::cantFail(host->createTargetMachine());
target_machine->setOptLevel(llvm::CodeGenOpt::Level::Default);
// NOTE(review): the result of getDefaultDataLayoutForTarget() is dereferenced unchecked as well.
data_layout = llvm::make_unique<llvm::DataLayout>(std::move(*host->getDefaultDataLayoutForTarget()));
compile_layer = llvm::make_unique<llvm::orc::LegacyIRCompileLayer<
llvm::orc::LegacyRTDyldObjectLinkingLayer, llvm::orc::SimpleCompiler>>(*object_layer, llvm::orc::SimpleCompiler(*target_machine));
}
// Runs clang in-process on `source`, presented as the virtual file "__block.c"
// (C99, -O2), and returns the action that owns the generated LLVM module.
// Returns an empty pointer if ExecuteAction fails.
std::unique_ptr<EmitLLVMOnlyAction> compile_c(const std::string &source)
{
llvm::SmallVector<const char *, 4> args;
args.push_back("__block.c");
args.push_back("-std=c99");
args.push_back("-O2");
// Diagnostics from argument parsing are printed into string_buffer, which is never
// read back in this function.
std::string string_buffer;
llvm::raw_string_ostream ss(string_buffer);
IntrusiveRefCntPtr<DiagnosticOptions> diag_opts = new DiagnosticOptions();
TextDiagnosticPrinter *diag_client = new TextDiagnosticPrinter(ss, &*diag_opts);
IntrusiveRefCntPtr<DiagnosticIDs> diag_id(new DiagnosticIDs());
DiagnosticsEngine diags(diag_id, &*diag_opts, diag_client);
auto CI = llvm::make_unique<CompilerInvocation>();
// Keep a raw pointer: ownership of CI moves into the CompilerInstance below, but the
// preprocessor options are still edited afterwards.
auto *invocation = CI.get();
CompilerInvocation::CreateFromArgs(*CI, args.data(), args.data() + args.size(), diags);
auto clang = llvm::make_unique<CompilerInstance>();
clang->setInvocation(std::move(CI));
clang->createDiagnostics();
auto act = llvm::make_unique<EmitLLVMOnlyAction>();
StringRef code_data(source);
auto buffer = llvm::MemoryBuffer::getMemBufferCopy(code_data);
// Map the in-memory copy of the source onto the "__block.c" file name given in args.
invocation->getPreprocessorOpts().clearRemappedFiles();
invocation->getPreprocessorOpts().addRemappedFile("__block.c", buffer.release());
if (!clang->ExecuteAction(*act))
{
llvm::errs() << "ExecuteAction failed.\n";
return {};
}
return act;
}
// Compiles `source` and returns the address of its "block_entry" symbol as a Func.
// Returns nullptr if the C compile or adding the module fails.
// NOTE(review): the lookup result goes straight into llvm::cantFail(), so a missing
// "block_entry" symbol is treated as a fatal error rather than returning nullptr -- confirm intended.
Func compile(const std::string &source)
{
auto act = compile_c(source);
if (!act)
return nullptr;
auto K = execution_session->allocateVModule();
auto error = compile_layer->addModule(K, act->takeModule());
if (error)
return nullptr;
auto entry_point = compile_layer->findSymbolIn(K, "block_entry", true);
auto block = reinterpret_cast<Func>(llvm::cantFail(entry_point.getAddress()));
return block;
}
// Resolver callback: returns the symbol_table entry for `name`, or a null JITSymbol
// when the name is not in the table.
llvm::JITSymbol findSymbol(const std::string &name)
{
auto itr = symbol_table.find(name);
if (itr != symbol_table.end())
return llvm::JITSymbol(itr->second, llvm::JITSymbolFlags::None);
else
return llvm::JITSymbol(nullptr);
}
const std::unordered_map<std::string, uint64_t> &symbol_table;
// Not referenced by any code in this struct.
llvm::LLVMContext context;
std::unique_ptr<llvm::orc::ExecutionSession> execution_session;
std::unique_ptr<llvm::orc::LegacyRTDyldObjectLinkingLayer> object_layer;
std::unique_ptr<llvm::orc::LegacyIRCompileLayer<
llvm::orc::LegacyRTDyldObjectLinkingLayer,
llvm::orc::SimpleCompiler>> compile_layer;
std::unique_ptr<llvm::TargetMachine> target_machine;
// Never assigned in this struct.
std::unique_ptr<llvm::orc::MangleAndInterner> mangler;
// Assigned in the constructor but not read afterwards in this struct.
std::unique_ptr<llvm::DataLayout> data_layout;
};
// Builds the JIT backend. The symbol table is kept by reference inside Impl, so the
// caller must keep it alive for as long as the engine exists.
LLVMEngine::LLVMEngine(const std::unordered_map<std::string, uint64_t> &symbol_table)
{
	impl = std::unique_ptr<Impl>(new Impl(symbol_table));
}
// Intentionally empty, but defined in this file so that std::unique_ptr<Impl> is
// destroyed where Impl is a complete type (the header only forward-declares it).
LLVMEngine::~LLVMEngine()
{
}
// Compiles `source` into this block. The hash argument is accepted for interface
// compatibility but not used. On success the entry point and size are copied out of the
// implementation object; on failure the previously stored values are left untouched.
bool Block::compile(uint64_t, const std::string &source)
{
	if (!impl->compile(source))
		return false;

	block = impl->block;
	block_size = impl->block_size;
	return true;
}
bool Block::Impl::compile(const std::string &source)
{
block = engine.impl->compile(source);
return block != nullptr;
}
}

39
llvm_jit.hpp Normal file
View File

@ -0,0 +1,39 @@
#ifndef JIT_HPP
#define JIT_HPP
#include <memory>
#include <unordered_map>
#include <string>
namespace JIT
{
// Owner of the LLVM/clang JIT backend. Block (a friend) reaches into `impl` to compile
// through it; everything LLVM-specific is hidden behind the pimpl.
class LLVMEngine
{
public:
// symbol_table maps external symbol names to 64-bit values used to resolve references
// from JIT-compiled code. The implementation stores a reference to it, so the table
// must outlive the engine.
LLVMEngine(const std::unordered_map<std::string, uint64_t> &symbol_table);
~LLVMEngine();
private:
friend class Block;
struct Impl;
std::unique_ptr<Impl> impl;
};
// Signature of a compiled block entry point: two opaque pointers in, nothing returned.
using Func = void (*)(void *, void *);
// One JIT-compiled unit of code, produced through an LLVMEngine.
class Block
{
public:
// The engine is referenced, not copied; it must outlive the Block.
Block(LLVMEngine &engine);
~Block();
// Compiles `source`; returns true when an entry point was produced.
// The implementation ignores `hash`.
bool compile(uint64_t hash, const std::string &source);
// Entry point of the last successful compile(), or nullptr if there was none.
Func get_func() const { return block; }
private:
struct Impl;
std::unique_ptr<Impl> impl;
Func block = nullptr;
// Copied from the implementation after a successful compile(); the implementation
// never sets it, so it remains 0.
size_t block_size = 0;
};
}
#endif

330
main.cpp Normal file
View File

@ -0,0 +1,330 @@
#include "rsp.hpp"
#include <vector>
#include <stdio.h>
using namespace std;
// Reverses the byte order of a 32-bit word (0xAABBCCDD -> 0xDDCCBBAA).
static inline uint32_t flip_endian(uint32_t v)
{
	const uint32_t to_top = v << 24;                    // byte 0 -> byte 3
	const uint32_t to_upper_mid = (v << 8) & 0x00ff0000; // byte 1 -> byte 2
	const uint32_t to_lower_mid = (v >> 8) & 0x0000ff00; // byte 2 -> byte 1
	const uint32_t to_bottom = v >> 24;                 // byte 3 -> byte 0
	return to_top | to_upper_mid | to_lower_mid | to_bottom;
}
static vector<uint32_t> read_binary(const char *path, bool flip)
{
FILE *f = fopen(path, "rb");
if (!f)
return {};
fseek(f, 0, SEEK_END);
long len = ftell(f);
rewind(f);
vector<uint32_t> v(len / 4);
fread(v.data(), sizeof(uint32_t), v.size(), f);
fclose(f);
if (flip)
for (auto &value : v)
value = flip_endian(value);
return v;
}
// Reads the next 8-byte tag from `file` and compares it with `tag`.
//
// Returns true when the tag matches, false when the end-of-file tag is found instead.
// Throws runtime_error if 8 bytes cannot be read or the tag is neither of the two.
static bool read_tag_validate(FILE *file, const char *tag)
{
	// 8 tag bytes; the ninth byte stays zero so the buffer is always NUL-terminated.
	char header[9] = {};
	const size_t bytes_read = fread(header, 1, 8, file);
	if (bytes_read != 8)
		throw runtime_error("Failed to read tag.");

	const bool is_eof_tag = strcmp(header, "EOF ") == 0;
	if (is_eof_tag)
		return false;

	const bool matches = strcmp(header, tag) == 0;
	if (!matches)
		throw runtime_error("Unexpected tag.");
	return true;
}
// Reads one tagged block: an 8-byte tag, a 32-bit payload size, then the payload.
//
// Returns false if the end-of-file tag was found in place of `tag`; otherwise fills
// `buffer` with exactly `size` bytes and returns true.
// Throws runtime_error on an unexpected tag, a size that differs from `size`, or a
// truncated file.
static bool read_block(FILE *file, const char *tag, void *buffer, size_t size)
{
	const bool tag_present = read_tag_validate(file, tag);
	if (!tag_present)
		return false;

	uint32_t stored_size;
	const size_t size_fields = fread(&stored_size, sizeof(stored_size), 1, file);
	if (size_fields != 1)
		throw runtime_error("EOF");

	// The caller's buffer is fixed-size, so only an exact match is acceptable.
	if (stored_size != size)
		throw runtime_error("Unexpected size");

	const size_t payloads = fread(buffer, size, 1, file);
	if (payloads != 1)
		throw runtime_error("EOF");
	return true;
}
// Reads one DMA record from the trace and applies it to RSP memory.
//
// A record is an 8-byte tag followed, for a poke, by a 32-bit offset, a 32-bit length
// and `length` payload bytes. Offsets 0x0000-0x0fff address DMEM, 0x1000-0x1fff IMEM.
//
// Returns false when the end-of-DMA tag is read, true after a poke has been applied.
// Throws runtime_error on a truncated file, an unknown tag, or a poke that does not
// fit inside the 8 KiB DMEM+IMEM window (the values come straight from the file and
// would otherwise write past the memory buffers).
static bool read_poke(FILE *file, RSP::CPU &cpu)
{
	char tmp[9] = {};
	if (fread(tmp, 1, 8, file) != 8)
		throw runtime_error("Failed to read tag.");
	if (strcmp(tmp, "ENDDMA ") == 0)
		return false;
	if (strcmp(tmp, "POKE "))
		throw runtime_error("Unexpected tag.");

	uint32_t offset;
	uint32_t len;
	if (fread(&offset, sizeof(offset), 1, file) != 1)
		throw runtime_error("Wrong EOF");
	if (fread(&len, sizeof(len), 1, file) != 1)
		throw runtime_error("Wrong EOF");

	const uint32_t bank_size = 0x1000;
	const uint32_t window_size = 2 * bank_size;
	if (offset >= window_size || len > window_size - offset)
		throw runtime_error("POKE out of range.");

	uint8_t *dmem = reinterpret_cast<uint8_t *>(cpu.get_state().dmem);
	uint8_t *imem = reinterpret_cast<uint8_t *>(cpu.get_state().imem);

	if (offset >= bank_size)
	{
		if (fread(imem + offset - bank_size, len, 1, file) != 1)
			throw runtime_error("Wrong EOF");
	}
	else
	{
		// DMEM and IMEM are separate host buffers, so a poke that runs past the end of
		// DMEM continues at the start of IMEM instead of overflowing DMEM.
		const uint32_t dmem_room = bank_size - offset;
		const uint32_t dmem_len = len < dmem_room ? len : dmem_room;
		if (fread(dmem + offset, dmem_len, 1, file) != 1)
			throw runtime_error("Wrong EOF");
		if (len > dmem_len && fread(imem, len - dmem_len, 1, file) != 1)
			throw runtime_error("Wrong EOF");
	}
	return true;
}
// Replays every trace stored in the dump file at `path` and compares the resulting
// RSP state with the state recorded in the dump.
//
// For each trace: the initial DMEM, IMEM, scalar/vector registers, accumulator, PC and
// flags are loaded, the CPU is run until it reports MODE_BREAK (feeding recorded DMA
// pokes whenever it reports MODE_DMA_READ), and the final state is checked word by
// word. Mismatches are reported on stderr along with a per-trace summary.
//
// Throws runtime_error if the file cannot be opened. Format errors inside the file are
// caught here and reported on stderr.
// Note: the CPU keeps pointing at this function's local DMEM/IMEM buffers after it
// returns, so the CPU must not be run again without new buffers being set.
static void validate_trace(RSP::CPU &cpu, const char *path)
{
	auto &state = cpu.get_state();
	uint32_t dmem[1024];
	uint32_t imem[1024];
	cpu.set_dmem(dmem);
	cpu.set_imem(imem);

	FILE *file = fopen(path, "rb");
	if (!file)
		throw runtime_error("Failed to load trace.");

	try
	{
		read_tag_validate(file, "RSPDUMP1");
		unsigned index = 0;
		while (read_tag_validate(file, "BEGIN "))
		{
			// Initial state.
			read_block(file, "DMEM ", state.dmem, 0x1000);
			read_block(file, "IMEM ", state.imem, 0x1000);
			read_block(file, "SR32 ", state.sr, sizeof(state.sr));
			read_block(file, "VR32 ", state.cp2.regs, sizeof(state.cp2.regs));
			read_block(file, "VLO ", state.cp2.acc.e + RSP::RSP_ACC_LO, sizeof(uint16_t) * 8);
			read_block(file, "VMD ", state.cp2.acc.e + RSP::RSP_ACC_MD, sizeof(uint16_t) * 8);
			read_block(file, "VHI ", state.cp2.acc.e + RSP::RSP_ACC_HI, sizeof(uint16_t) * 8);
			read_block(file, "PC ", &state.pc, sizeof(state.pc));

			int16_t VCO, VCC, VCE;
			read_block(file, "VCO ", &VCO, sizeof(VCO));
			read_block(file, "VCC ", &VCC, sizeof(VCC));
			read_block(file, "VCE ", &VCE, sizeof(VCE));
			rsp_set_flags(state.cp2.flags[RSP::RSP_VCO].e, VCO);
			rsp_set_flags(state.cp2.flags[RSP::RSP_VCC].e, VCC);
			rsp_set_flags(state.cp2.flags[RSP::RSP_VCE].e, VCE);

			RSP::ReturnMode mode = RSP::MODE_CONTINUE;
			do
			{
				*state.cp0.cr[RSP::CP0_REGISTER_SP_STATUS] = 0;
				cpu.invalidate_imem();

				// Run till break.
				mode = cpu.run();
				if (mode == RSP::MODE_DMA_READ)
				{
					if (!read_tag_validate(file, "BEGINDMA"))
						throw runtime_error("Expected BEGINDMA.");
					while (read_poke(file, cpu));
				}
			} while (mode != RSP::MODE_BREAK);

			// Expected final state as recorded in the dump.
			uint32_t expected_dmem[0x1000 >> 2];
			uint32_t expected_imem[0x1000 >> 2];
			uint32_t sr[32];
			uint16_t vr[32 * 8];
			uint16_t vlo[8];
			uint16_t vmd[8];
			uint16_t vhi[8];
			read_block(file, "DMEM END", expected_dmem, sizeof(expected_dmem));
			read_block(file, "IMEM END", expected_imem, sizeof(expected_imem));
			read_block(file, "SR32 END", sr, sizeof(sr));
			read_block(file, "VR32 END", vr, sizeof(vr));
			read_block(file, "VLO END", vlo, sizeof(vlo));
			read_block(file, "VMD END", vmd, sizeof(vmd));
			read_block(file, "VHI END", vhi, sizeof(vhi));
			read_block(file, "VCO END", &VCO, sizeof(VCO));
			read_block(file, "VCC END", &VCC, sizeof(VCC));
			read_block(file, "VCE END", &VCE, sizeof(VCE));

			unsigned errors = 0;
			fprintf(stderr, "==== Trace #%u ====\n", index);

			// Validate DMEM
			for (unsigned i = 0; i < (0x1000 >> 2); i++)
			{
				if (state.dmem[i] != expected_dmem[i])
				{
					fprintf(stderr, "DMEM32[0x%03x] fault. Expected 0x%08x, got 0x%08x!\n",
					        i, expected_dmem[i], state.dmem[i]);
					errors++;
				}
			}

			// Validate IMEM (in case of DMA)
			for (unsigned i = 0; i < (0x1000 >> 2); i++)
			{
				if (state.imem[i] != expected_imem[i])
				{
					// Report the IMEM words that were compared, not the DMEM ones.
					fprintf(stderr, "IMEM32[0x%03x] fault. Expected 0x%08x, got 0x%08x!\n",
					        i, expected_imem[i], state.imem[i]);
					errors++;
				}
			}

			// Validate SR
			for (unsigned i = 0; i < 32; i++)
			{
				if (sr[i] != state.sr[i])
				{
					fprintf(stderr, "SR[%02u] fault. Expected 0x%08x, got 0x%08x!\n",
					        i, sr[i], state.sr[i]);
					errors++;
				}
			}

			// Validate VR: every lane of every register read into vr (32 x 8).
			for (unsigned i = 0; i < sizeof(vr) / sizeof(vr[0]); i++)
			{
				if (vr[i] != state.cp2.regs[i >> 3].e[i & 7])
				{
					fprintf(stderr, "VR[%02u][%u] fault. Expected 0x%04x, got 0x%04x!\n",
					        i >> 3, i & 7, vr[i], state.cp2.regs[i >> 3].e[i & 7]);
					errors++;
				}
			}

			// Validate VLO
			for (unsigned i = 0; i < 8; i++)
			{
				if (vlo[i] != state.cp2.acc.e[RSP::RSP_ACC_LO + i])
				{
					fprintf(stderr, "VLO[%u] fault. Expected 0x%04x, got 0x%04x!\n",
					        i, vlo[i], state.cp2.acc.e[RSP::RSP_ACC_LO + i]);
					errors++;
				}
			}

			// Validate VMD
			for (unsigned i = 0; i < 8; i++)
			{
				if (vmd[i] != state.cp2.acc.e[RSP::RSP_ACC_MD + i])
				{
					fprintf(stderr, "VMD[%u] fault. Expected 0x%04x, got 0x%04x!\n",
					        i, vmd[i], state.cp2.acc.e[RSP::RSP_ACC_MD + i]);
					errors++;
				}
			}

			// Validate VHI
			for (unsigned i = 0; i < 8; i++)
			{
				if (vhi[i] != state.cp2.acc.e[RSP::RSP_ACC_HI + i])
				{
					fprintf(stderr, "VHI[%u] fault. Expected 0x%04x, got 0x%04x!\n",
					        i, vhi[i], state.cp2.acc.e[RSP::RSP_ACC_HI + i]);
					errors++;
				}
			}

			// Validate flags
			if (VCO != rsp_get_flags(state.cp2.flags[RSP::RSP_VCO].e))
			{
				fprintf(stderr, "VCO fault. Expected 0x%04x, got 0x%04x!\n",
				        VCO, rsp_get_flags(state.cp2.flags[RSP::RSP_VCO].e));
				errors++;
			}
			if (VCC != rsp_get_flags(state.cp2.flags[RSP::RSP_VCC].e))
			{
				fprintf(stderr, "VCC fault. Expected 0x%04x, got 0x%04x!\n",
				        VCC, rsp_get_flags(state.cp2.flags[RSP::RSP_VCC].e));
				errors++;
			}
			if (VCE != rsp_get_flags(state.cp2.flags[RSP::RSP_VCE].e))
			{
				fprintf(stderr, "VCE fault. Expected 0x%04x, got 0x%04x!\n",
				        VCE, rsp_get_flags(state.cp2.flags[RSP::RSP_VCE].e));
				errors++;
			}

			read_tag_validate(file, "END ");

			if (errors == 0)
				fprintf(stderr, "SUCCESS! :D\n");
			else
				fprintf(stderr, "%u ERRORS! :{\n", errors);
			fprintf(stderr, "======================\n\n");
			index++;
		}
	}
	catch (const std::exception &e)
	{
		fprintf(stderr, "Exception: %s\n", e.what());
	}
	fclose(file);
}
// Test runner entry point.
//
//   <dmem.bin> <imem.bin>  load both images (byte-swapped) and run the CPU once
//   <trace>                replay and validate a recorded trace file
//
// Returns 1 on wrong usage or when the IMEM image cannot be loaded, 0 otherwise.
int main(int argc, char *argv[])
{
	RSP::CPU cpu;
	auto &state = cpu.get_state();

	// Back the 16 CP0 register pointers with local storage.
	uint32_t cr[16] = {};
	for (unsigned reg = 0; reg < 16; reg++)
		state.cp0.cr[reg] = &cr[reg];

	switch (argc)
	{
	case 3:
	{
		auto dmem = read_binary(argv[1], true);
		auto imem = read_binary(argv[2], true);
		if (imem.empty())
			return 1;

		dmem.resize(0x1000);
		imem.resize(0x1000);
		cpu.set_dmem(dmem.data());
		cpu.set_imem(imem.data());

		const unsigned runs = 1;
		for (unsigned run = 0; run < runs; run++)
		{
			cpu.invalidate_imem();
			cr[RSP::CP0_REGISTER_SP_STATUS] = 0;
			cpu.run();
		}
		break;
	}

	case 2:
		validate_trace(cpu, argv[1]);
		break;

	default:
		return 1;
	}

	return 0;
}

146
parallel.cpp Normal file
View File

@ -0,0 +1,146 @@
#include <stdint.h>
#include "rsp.hpp"
#include "rsp_1.1.h"
#include "m64p_plugin.h"
#define RSP_PARALLEL_VERSION 0x0101
#define RSP_PLUGIN_API_VERSION 0x020000
// Plugin-wide state shared with rsp/cp0.cpp, which declares rsp, MFC0_count and
// SP_STATUS_TIMEOUT as extern when PARALLEL_INTEGRATION is defined.
namespace RSP
{
// Host-provided register/memory pointers and callbacks, copied in parallelRSPInitiateRSP().
RSP_INFO rsp;
// The single emulated RSP instance driven by this plugin.
CPU cpu;
// Per-register counters of SP_STATUS reads via MFC0; cleared at the start of every
// parallelRSPDoRspCycles() call.
short MFC0_count[32];
// Number of SP_STATUS reads after which RSP_MFC0 halts the RSP and returns to the host.
int SP_STATUS_TIMEOUT;
}
extern "C" {
#ifdef INTENSE_DEBUG
// Cheap 64-bit hash (multiply by a prime, then XOR in the next byte) over `size` bytes
// at `data`. An empty range yields the initial seed value.
static uint64_t hash_imem(const uint8_t *data, size_t size)
{
	uint64_t hash = 0xcbf29ce484222325ull;
	while (size-- > 0)
	{
		hash *= 0x100000001b3ull;
		hash ^= *data++;
	}
	return hash;
}
// Debug helper: prints a hash of the full 4 KiB IMEM and DMEM contents to stderr so
// memory states can be compared between runs.
void log_rsp_mem_parallel(void)
{
	// uint64_t is not `unsigned long long` on every platform; cast so the argument
	// type matches the %llx conversion.
	fprintf(stderr, "IMEM HASH: 0x%016llx\n",
	        static_cast<unsigned long long>(hash_imem(RSP::rsp.IMEM, 0x1000)));
	fprintf(stderr, "DMEM HASH: 0x%016llx\n",
	        static_cast<unsigned long long>(hash_imem(RSP::rsp.DMEM, 0x1000)));
}
#endif
// Plugin entry point: runs the RSP until it halts or an interrupt must be delivered.
//
// Returns 0 without doing anything if SP_STATUS already has HALT or BROKE set;
// otherwise returns `cycles` unchanged (no cycle counting is done here).
EXPORT unsigned int CALL parallelRSPDoRspCycles(unsigned int cycles)
{
if (*RSP::rsp.SP_STATUS_REG & (SP_STATUS_HALT | SP_STATUS_BROKE))
return 0;
// We don't know if Mupen from the outside invalidated our IMEM.
RSP::cpu.invalidate_imem();
// Run CPU until we either break or we need to fire an IRQ.
// Only the low 12 bits of the host PC register are used.
RSP::cpu.get_state().pc = *RSP::rsp.SP_PC_REG & 0xfff;
#ifdef INTENSE_DEBUG
fprintf(stderr, "RUN TASK: %u\n", RSP::cpu.get_state().pc);
log_rsp_mem_parallel();
#endif
// Restart the SP_STATUS polling counters used by RSP_MFC0.
for (auto &count : RSP::MFC0_count)
count = 0;
while (!(*RSP::rsp.SP_STATUS_REG & SP_STATUS_HALT))
{
auto mode = RSP::cpu.run();
// Leave early when the core asks for a flag check and the IRQ bit is set.
if (mode == RSP::MODE_CHECK_FLAGS && (*RSP::cpu.get_state().cp0.irq & 1))
break;
}
// Write the PC back, word-aligned, with 0x04001000 OR-ed in.
*RSP::rsp.SP_PC_REG = 0x04001000 | (RSP::cpu.get_state().pc & 0xffc);
// From CXD4.
if (*RSP::rsp.SP_STATUS_REG & SP_STATUS_BROKE)
return cycles;
else if (*RSP::cpu.get_state().cp0.irq & 1)
RSP::rsp.CheckInterrupts();
else if (*RSP::rsp.SP_SEMAPHORE_REG != 0) // Semaphore lock fixes.
{}
else
RSP::SP_STATUS_TIMEOUT = 16; // From now on, wait 16 times, not 0x7fff
// CPU restarts with the correct SIGs.
*RSP::rsp.SP_STATUS_REG &= ~SP_STATUS_HALT;
return cycles;
}
// Plugin entry point: reports plugin type, version, API version, name and capabilities.
// Every output pointer is optional; null pointers are skipped.
// Always returns M64ERR_SUCCESS.
EXPORT m64p_error CALL parallelRSPPluginGetVersion(m64p_plugin_type *PluginType, int *PluginVersion, int *APIVersion, const char **PluginNamePtr, int *Capabilities)
{
	/* set version info */
	if (PluginType != NULL)
		*PluginType = M64PLUGIN_RSP;
	if (PluginVersion != NULL)
		*PluginVersion = RSP_PARALLEL_VERSION;
	if (APIVersion != NULL)
		*APIVersion = RSP_PLUGIN_API_VERSION;
	// The name output used to be left unwritten, so a caller passing a pointer read
	// back whatever it happened to contain. The string literal has static storage.
	if (PluginNamePtr != NULL)
		*PluginNamePtr = "ParaLLEl RSP";
	if (Capabilities != NULL)
		*Capabilities = 0;
	return M64ERR_SUCCESS;
}
// Plugin entry point: resets the RSP program counter register when a ROM is closed.
EXPORT void CALL parallelRSPRomClosed(void)
{
	// RSP::rsp is zero-initialised and parallelRSPInitiateRSP() returns early for dummy
	// RSP data without filling it in, so the register pointer may still be null here.
	if (RSP::rsp.SP_PC_REG != NULL)
		*RSP::rsp.SP_PC_REG = 0x00000000;
}
// Plugin entry point: binds the emulated CPU to the host's registers and memory.
//
// Rsp_Info:   host-provided pointers to RSP/RDP registers, DMEM, IMEM, RDRAM and callbacks.
// CycleCount: optional output, set to 0 when non-null.
//
// When DMEM and IMEM are the same pointer nothing is bound and RSP::rsp is left untouched.
EXPORT void CALL parallelRSPInitiateRSP(RSP_INFO Rsp_Info, unsigned int *CycleCount)
{
if (CycleCount)
*CycleCount = 0;
if (Rsp_Info.DMEM == Rsp_Info.IMEM) /* usually dummy RSP data for testing */
return; /* DMA is not executed just because plugin initiates. */
RSP::rsp = Rsp_Info;
// The masked constant evaluates to 0, i.e. the PC register is cleared.
*RSP::rsp.SP_PC_REG = 0x04001000 & 0x00000FFF; /* task init bug on Mupen64 */
// Point the 16 CP0 register slots at the host registers: 0x0-0x7 are the SP
// registers, 0x8-0xF the DPC registers.
auto **cr = RSP::cpu.get_state().cp0.cr;
cr[0x0] = RSP::rsp.SP_MEM_ADDR_REG;
cr[0x1] = RSP::rsp.SP_DRAM_ADDR_REG;
cr[0x2] = RSP::rsp.SP_RD_LEN_REG;
cr[0x3] = RSP::rsp.SP_WR_LEN_REG;
cr[0x4] = RSP::rsp.SP_STATUS_REG;
cr[0x5] = RSP::rsp.SP_DMA_FULL_REG;
cr[0x6] = RSP::rsp.SP_DMA_BUSY_REG;
cr[0x7] = RSP::rsp.SP_SEMAPHORE_REG;
cr[0x8] = RSP::rsp.DPC_START_REG;
cr[0x9] = RSP::rsp.DPC_END_REG;
cr[0xA] = RSP::rsp.DPC_CURRENT_REG;
cr[0xB] = RSP::rsp.DPC_STATUS_REG;
cr[0xC] = RSP::rsp.DPC_CLOCK_REG;
cr[0xD] = RSP::rsp.DPC_BUFBUSY_REG;
cr[0xE] = RSP::rsp.DPC_PIPEBUSY_REG;
cr[0xF] = RSP::rsp.DPC_TMEM_REG;
// Start halted: SP_STATUS is overwritten with only the HALT bit set.
*cr[RSP::CP0_REGISTER_SP_STATUS] = SP_STATUS_HALT;
RSP::cpu.get_state().cp0.irq = RSP::rsp.MI_INTR_REG;
// From CXD4.
RSP::SP_STATUS_TIMEOUT = 0x7fff;
// The CPU works directly on the host buffers; nothing is copied.
RSP::cpu.set_dmem(reinterpret_cast<uint32_t*>(Rsp_Info.DMEM));
RSP::cpu.set_imem(reinterpret_cast<uint32_t*>(Rsp_Info.IMEM));
RSP::cpu.set_rdram(reinterpret_cast<uint32_t*>(Rsp_Info.RDRAM));
}
}

1425
rsp.cpp Normal file

File diff suppressed because it is too large Load Diff

113
rsp.hpp Normal file
View File

@ -0,0 +1,113 @@
#ifndef RSP_HPP__
#define RSP_HPP__
#include <stdint.h>
#include <string.h>
#include <unordered_map>
#include <memory>
#include <string>
#include "state.hpp"
#include "llvm_jit.hpp"
#include "debug_jit.hpp"
#include "rsp_op.hpp"
#include <setjmp.h>
namespace RSP
{
// Select the block implementation at compile time: JIT::DebugBlock when DEBUG_JIT is
// defined, otherwise the LLVM-backed JIT::Block.
#ifdef DEBUG_JIT
using Block = JIT::DebugBlock;
#else
using Block = JIT::Block;
#endif
// Entry-point type of a compiled block, shared by both implementations.
using Func = JIT::Func;
// Status codes passed between the emulated core, its helper functions and the caller
// of CPU::run().
// NOTE(review): the per-value notes below are derived from call sites in main.cpp,
// parallel.cpp and rsp/cp0.cpp only -- confirm against rsp.cpp.
enum ReturnMode
{
MODE_ENTER = 0,
// Returned by CP0 helpers when execution can simply go on.
MODE_CONTINUE = 1,
// The trace runner stops running a trace when run() returns this.
MODE_BREAK = 2,
// Returned by RSP_MTC0 for a DMA read length write when the DMA is not performed
// in-process; the trace runner then applies recorded pokes.
MODE_DMA_READ = 3,
// Returned when status/IRQ state changed (or IMEM was dirtied by DMA) and the caller
// should re-check it.
MODE_CHECK_FLAGS = 4
};
// The emulated RSP core: owns the architectural state and the cache of compiled code
// blocks. Aligned to 64 bytes; moving is explicitly disabled.
// NOTE(review): the member functions are implemented in rsp.cpp, which is not visible
// here; the notes on them are limited to what the declarations and call sites show.
class alignas(64) CPU
{
public:
CPU();
~CPU();
CPU(CPU&&) = delete;
void operator=(CPU&&) = delete;
// The three setters store the caller's pointer in the state; the memory is not copied,
// so the buffers must stay valid while the CPU uses them.
void set_dmem(uint32_t *dmem)
{
state.dmem = dmem;
}
void set_imem(uint32_t *imem)
{
state.imem = imem;
}
void set_rdram(uint32_t *rdram)
{
state.rdram = rdram;
}
// Called by the front-ends before run() whenever IMEM may have been changed from outside.
void invalidate_imem();
// Direct mutable access to the architectural state.
CPUState &get_state()
{
return state;
}
// Executes code and returns a ReturnMode telling the caller why execution stopped.
ReturnMode run();
void enter(uint32_t pc);
void call(uint32_t target, uint32_t ret);
int ret(uint32_t pc);
void exit(ReturnMode mode);
void print_registers();
private:
CPUState state;
// One entry-point slot per IMEM word.
Func blocks[IMEM_WORDS] = {};
// Name -> 64-bit value table handed to the JIT backend.
std::unordered_map<std::string, uint64_t> symbol_table;
#ifndef DEBUG_JIT
// Only the LLVM-backed Block needs an engine; DebugBlock is constructed without one.
JIT::LLVMEngine jit_engine;
#endif
// Per IMEM word: compiled blocks keyed by a 64-bit hash.
std::unordered_map<uint64_t, std::unique_ptr<Block>> cached_blocks[IMEM_WORDS];
void invalidate_code();
uint64_t hash_imem(unsigned pc, unsigned count) const;
Func jit_region(uint64_t hash, unsigned pc, unsigned count);
std::string full_code;
std::string body;
void init_symbol_table();
// 64-byte-aligned array of IMEM_WORDS words.
// NOTE(review): presumably a snapshot of IMEM used to detect modified code -- confirm in rsp.cpp.
alignas(64) uint32_t cached_imem[IMEM_WORDS] = {};
// Platform specific.
#ifdef __GNUC__
// Jump buffer storage for the setjmp/longjmp mechanism mentioned in the #error text below.
intptr_t env[64];
// We're reading this after setjmp returns so need to make sure the read happens when we expect it to.
volatile ReturnMode return_mode;
#else
#error "Need __builtin_setjmp/longjmp support alternative for other compilers ..."
#endif
// Fixed-size array of 32 entries plus an index into it.
// NOTE(review): presumably return addresses for call()/ret() -- confirm in rsp.cpp.
#define CALL_STACK_SIZE 32
uint32_t call_stack[CALL_STACK_SIZE] = {};
unsigned call_stack_ptr = 0;
unsigned analyze_static_end(unsigned pc, unsigned end);
};
}
#endif

331
rsp/cp0.cpp Normal file
View File

@ -0,0 +1,331 @@
#include "../rsp.hpp"
#include "../state.hpp"
#ifdef PARALLEL_INTEGRATION
// Plugin interface declarations. The header is the project's rsp_1.1.h (the name used
// by parallel.cpp and the build files); the previous "Rsp_#1.1.h" spelling does not
// match that file name.
#include "../rsp_1.1.h"
#include "m64p_plugin.h"
namespace RSP
{
// Defined in parallel.cpp.
extern RSP_INFO rsp;
extern short MFC0_count[32];
extern int SP_STATUS_TIMEOUT;
}
#endif
using namespace RSP;
extern "C" {
#ifdef INTENSE_DEBUG
void log_rsp_mem_parallel(void);
#endif
// MFC0: copies CP0 register `rd` (only the low 4 bits are used) into scalar register `rt`.
// Scalar register 0 is never written.
//
// Returns MODE_CONTINUE, except in plugin builds (PARALLEL_INTEGRATION) when SP_STATUS
// has been polled too often: then HALT is set and MODE_CHECK_FLAGS is returned.
int RSP_MFC0(RSP::CPUState *rsp, unsigned rt, unsigned rd)
{
rd &= 15;
uint32_t res = *rsp->cp0.cr[rd];
if (rt)
rsp->sr[rt] = res;
// CFG_MEND_SEMAPHORE_LOCK == 0 by default,
// so don't bother implementing semaphores.
// It makes Mario Golf run terribly for some reason.
#ifdef PARALLEL_INTEGRATION
// WAIT_FOR_CPU_HOST. From CXD4.
if (rd == CP0_REGISTER_SP_STATUS)
{
// Count SP_STATUS reads per destination register. MFC0_count has 32 entries,
// so rt is assumed to be a valid 5-bit register index -- TODO confirm at call sites.
RSP::MFC0_count[rt] += 1;
if (RSP::MFC0_count[rt] >= RSP::SP_STATUS_TIMEOUT)
{
*RSP::rsp.SP_STATUS_REG |= SP_STATUS_HALT;
return MODE_CHECK_FLAGS;
}
}
#endif
//if (rd == 4) // SP_STATUS_REG
// fprintf(stderr, "READING STATUS REG!\n");
return MODE_CONTINUE;
}
// Applies a write to SP_STATUS. The written value `rt` is a set of command bits: for
// each status flag there is a "clear" command and (except for BROKE) a "set" command;
// when both are present the clear command wins. The interrupt commands act on the IRQ
// register instead of SP_STATUS.
//
// Returns MODE_CHECK_FLAGS if afterwards the IRQ bit is set or the RSP is halted,
// MODE_CONTINUE otherwise.
static inline int rsp_status_write(RSP::CPUState *rsp, uint32_t rt)
{
	//fprintf(stderr, "Writing 0x%x to status reg!\n", rt);
	uint32_t status = *rsp->cp0.cr[CP0_REGISTER_SP_STATUS];

	// Clear `bit` if the clear command is present, otherwise set it if the set command is.
	const auto apply = [&status, rt](uint32_t clear_cmd, uint32_t set_cmd, uint32_t bit) {
		if (rt & clear_cmd)
			status &= ~bit;
		else if (rt & set_cmd)
			status |= bit;
	};

	apply(SP_CLR_HALT, SP_SET_HALT, SP_STATUS_HALT);

	// BROKE can only be cleared through this register.
	if (rt & SP_CLR_BROKE)
		status &= ~SP_STATUS_BROKE;

	// Interrupt commands toggle bit 0 of the IRQ register.
	if (rt & SP_CLR_INTR)
		*rsp->cp0.irq &= ~1;
	else if (rt & SP_SET_INTR)
		*rsp->cp0.irq |= 1;

	apply(SP_CLR_SSTEP, SP_SET_SSTEP, SP_STATUS_SSTEP);
	apply(SP_CLR_INTR_BREAK, SP_SET_INTR_BREAK, SP_STATUS_INTR_BREAK);
	apply(SP_CLR_SIG0, SP_SET_SIG0, SP_STATUS_SIG0);
	apply(SP_CLR_SIG1, SP_SET_SIG1, SP_STATUS_SIG1);
	apply(SP_CLR_SIG2, SP_SET_SIG2, SP_STATUS_SIG2);
	apply(SP_CLR_SIG3, SP_SET_SIG3, SP_STATUS_SIG3);
	apply(SP_CLR_SIG4, SP_SET_SIG4, SP_STATUS_SIG4);
	apply(SP_CLR_SIG5, SP_SET_SIG5, SP_STATUS_SIG5);
	apply(SP_CLR_SIG6, SP_SET_SIG6, SP_STATUS_SIG6);
	apply(SP_CLR_SIG7, SP_SET_SIG7, SP_STATUS_SIG7);

	*rsp->cp0.cr[CP0_REGISTER_SP_STATUS] = status;

	if (*rsp->cp0.irq & 1)
		return MODE_CHECK_FLAGS;
	if (status & SP_STATUS_HALT)
		return MODE_CHECK_FLAGS;
	return MODE_CONTINUE;
}
#ifdef PARALLEL_INTEGRATION
// Performs a DMA from RDRAM into DMEM/IMEM as described by the DMA_READ_LENGTH,
// DMA_CACHE and DMA_DRAM registers, copying one 32-bit word at a time.
//
// Returns MODE_CHECK_FLAGS if any IMEM block is marked dirty afterwards, else MODE_CONTINUE.
static int rsp_dma_read(RSP::CPUState *rsp)
{
// Length register layout as decoded here: bits 0-11 length-1, bits 12-19 count-1,
// bits 20-31 skip.
uint32_t length_reg = *rsp->cp0.cr[CP0_REGISTER_DMA_READ_LENGTH];
uint32_t length = (length_reg & 0xFFF) + 1;
uint32_t skip = (length_reg >> 20) & 0xFFF;
unsigned count = (length_reg >> 12) & 0xFF;
// Force alignment.
length = (length + 0x7) & ~0x7;
*rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] &= ~0x3;
*rsp->cp0.cr[CP0_REGISTER_DMA_DRAM] &= ~0x7;
// Check length.
// A row may not run past the end of the 4 KiB bank it starts in.
if (((*rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] & 0xFFF) + length) > 0x1000)
length = 0x1000 - (*rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] & 0xFFF);
unsigned i = 0;
uint32_t source = *rsp->cp0.cr[CP0_REGISTER_DMA_DRAM];
uint32_t dest = *rsp->cp0.cr[CP0_REGISTER_DMA_CACHE];
#ifdef INTENSE_DEBUG
fprintf(stderr, "DMA READ: (0x%x <- 0x%x) len %u, count %u, skip %u\n",
dest & 0x1ffc, source & 0x7ffffc,
length, count + 1, skip);
#endif
// Outer loop: count + 1 rows. Inner loop: one row of `length` bytes, word by word.
do
{
unsigned j = 0;
do
{
// Addresses wrap inside RDRAM (0x7FFFFC mask) and inside the 8 KiB DMEM+IMEM window.
uint32_t source_addr = (source + j) & 0x7FFFFC;
uint32_t dest_addr = (dest + j) & 0x1FFC;
uint32_t word = rsp->rdram[source_addr >> 2];
// Bit 12 of the destination selects IMEM.
if (dest_addr & 0x1000)
{
// Invalidate IMEM.
unsigned block = (dest_addr & 0xfff) / CODE_BLOCK_SIZE;
// Sets the dirty bit of this block and of the block before it (for block 0
// only bit 0 is set).
// NOTE(review): 0x3 is an int; if `block` can reach 30 or more the shift
// overflows a signed int -- confirm the range implied by CODE_BLOCK_SIZE.
rsp->dirty_blocks |= (0x3 << block) >> 1;
//rsp->dirty_blocks = ~0u;
rsp->imem[(dest_addr & 0xfff) >> 2] = word;
}
else
rsp->dmem[dest_addr >> 2] = word;
j += 4;
} while (j < length);
// Only the RDRAM side skips between rows.
source += length + skip;
dest += length;
} while (++i <= count);
// Write the advanced addresses back to the DMA registers.
*rsp->cp0.cr[CP0_REGISTER_DMA_DRAM] = source;
*rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] = dest;
#ifdef INTENSE_DEBUG
log_rsp_mem_parallel();
#endif
return rsp->dirty_blocks ? MODE_CHECK_FLAGS : MODE_CONTINUE;
}
// Performs a DMA from DMEM/IMEM into RDRAM as described by the DMA_WRITE_LENGTH,
// DMA_CACHE and DMA_DRAM registers, copying one 32-bit word at a time.
static void rsp_dma_write(RSP::CPUState *rsp)
{
// Length register layout as decoded here: bits 0-11 length-1, bits 12-19 count-1,
// bits 20-31 skip.
uint32_t length_reg = *rsp->cp0.cr[CP0_REGISTER_DMA_WRITE_LENGTH];
uint32_t length = (length_reg & 0xFFF) + 1;
uint32_t skip = (length_reg >> 20) & 0xFFF;
unsigned count = (length_reg >> 12) & 0xFF;
// Force alignment.
length = (length + 0x7) & ~0x7;
*rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] &= ~0x3;
*rsp->cp0.cr[CP0_REGISTER_DMA_DRAM] &= ~0x7;
// Check length.
// A row may not run past the end of the 4 KiB bank it starts in.
if (((*rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] & 0xFFF) + length) > 0x1000)
length = 0x1000 - (*rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] & 0xFFF);
uint32_t dest = *rsp->cp0.cr[CP0_REGISTER_DMA_DRAM];
uint32_t source = *rsp->cp0.cr[CP0_REGISTER_DMA_CACHE];
#ifdef INTENSE_DEBUG
fprintf(stderr, "DMA WRITE: (0x%x <- 0x%x) len %u, count %u, skip %u\n",
dest & 0x7ffffc, source & 0x1ffc,
length, count + 1, skip);
#endif
unsigned i = 0;
// Outer loop: count + 1 rows. Inner loop: one row of `length` bytes, word by word.
do
{
unsigned j = 0;
do
{
uint32_t source_addr = (source + j) & 0x1FFC;
uint32_t dest_addr = (dest + j) & 0x7FFFFC;
// Bit 12 of the source address selects IMEM, otherwise DMEM.
rsp->rdram[dest_addr >> 2] = (source_addr & 0x1000) ?
rsp->imem[(source_addr & 0xfff) >> 2] :
rsp->dmem[source_addr >> 2];
j += 4;
} while (j < length);
// Only the RDRAM side skips between rows.
source += length;
dest += length + skip;
} while (++i <= count);
// Write the advanced addresses back to the DMA registers.
*rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] = source;
*rsp->cp0.cr[CP0_REGISTER_DMA_DRAM] = dest;
#ifdef INTENSE_DEBUG
log_rsp_mem_parallel();
#endif
}
#endif
// MTC0: writes scalar register `rt` to CP0 register `rd` (only the low 4 bits of rd are
// used), applying the side effects each register has.
//
// Returns MODE_CONTINUE unless the write needs the caller's attention: DMA read length
// writes return the result of rsp_dma_read() (plugin builds) or MODE_DMA_READ, and
// SP_STATUS writes return the result of rsp_status_write().
int RSP_MTC0(RSP::CPUState *rsp, unsigned rd, unsigned rt)
{
uint32_t val = rsp->sr[rt];
switch (static_cast<CP0Registers>(rd & 15))
{
case CP0_REGISTER_DMA_CACHE:
// 13 address bits: the DMEM+IMEM window.
*rsp->cp0.cr[CP0_REGISTER_DMA_CACHE] = val & 0x1fff;
break;
case CP0_REGISTER_DMA_DRAM:
// 24 address bits.
*rsp->cp0.cr[CP0_REGISTER_DMA_DRAM] = val & 0xffffff;
break;
case CP0_REGISTER_DMA_READ_LENGTH:
// Writing the length starts the transfer.
*rsp->cp0.cr[CP0_REGISTER_DMA_READ_LENGTH] = val;
#ifdef PARALLEL_INTEGRATION
return rsp_dma_read(rsp);
#else
return MODE_DMA_READ;
#endif
case CP0_REGISTER_DMA_WRITE_LENGTH:
*rsp->cp0.cr[CP0_REGISTER_DMA_WRITE_LENGTH] = val;
#ifdef PARALLEL_INTEGRATION
rsp_dma_write(rsp);
#endif
break;
case CP0_REGISTER_SP_STATUS:
return rsp_status_write(rsp, val);
case CP0_REGISTER_SP_RESERVED:
// CXD4 forces this to 0.
*rsp->cp0.cr[CP0_REGISTER_SP_RESERVED] = 0;
break;
case CP0_REGISTER_CMD_START:
#ifdef INTENSE_DEBUG
fprintf(stderr, "CMD_START 0x%x\n", val & 0xfffffff8u);
#endif
// START, CURRENT and END all receive the same 8-byte-aligned address.
*rsp->cp0.cr[CP0_REGISTER_CMD_START] =
*rsp->cp0.cr[CP0_REGISTER_CMD_CURRENT] =
*rsp->cp0.cr[CP0_REGISTER_CMD_END] =
val & 0xfffffff8u;
break;
case CP0_REGISTER_CMD_END:
#ifdef INTENSE_DEBUG
fprintf(stderr, "CMD_END 0x%x\n", val & 0xfffffff8u);
#endif
*rsp->cp0.cr[CP0_REGISTER_CMD_END] = val & 0xfffffff8u;
#ifdef PARALLEL_INTEGRATION
// Hand the command list to the host's RDP callback.
RSP::rsp.ProcessRdpList();
#endif
break;
case CP0_REGISTER_CMD_CLOCK:
// NOTE(review): unlike the other traces in this function this print is not guarded
// by INTENSE_DEBUG and has no trailing newline -- confirm it is intentional.
fprintf(stderr, "CMD_CLOCK");
*rsp->cp0.cr[CP0_REGISTER_CMD_CLOCK] = val;
break;
case CP0_REGISTER_CMD_STATUS:
// Pairs of command bits in `val` clear (even bit) or set (odd bit) status bits 0, 1 and 2.
*rsp->cp0.cr[CP0_REGISTER_CMD_STATUS] &= ~(!!(val & 0x1) << 0);
*rsp->cp0.cr[CP0_REGISTER_CMD_STATUS] |= (!!(val & 0x2) << 0);
*rsp->cp0.cr[CP0_REGISTER_CMD_STATUS] &= ~(!!(val & 0x4) << 1);
*rsp->cp0.cr[CP0_REGISTER_CMD_STATUS] |= (!!(val & 0x8) << 1);
*rsp->cp0.cr[CP0_REGISTER_CMD_STATUS] &= ~(!!(val & 0x10) << 2);
*rsp->cp0.cr[CP0_REGISTER_CMD_STATUS] |= (!!(val & 0x20) << 2);
// `!(x) * -1` is all ones when the command bit is absent (register kept) and 0 when
// it is present (register cleared).
*rsp->cp0.cr[CP0_REGISTER_CMD_TMEM_BUSY] &= !(val & 0x40) * -1;
*rsp->cp0.cr[CP0_REGISTER_CMD_CLOCK] &= !(val & 0x200) * -1;
break;
// Writes to these registers are ignored.
case CP0_REGISTER_CMD_CURRENT:
case CP0_REGISTER_CMD_BUSY:
case CP0_REGISTER_CMD_PIPE_BUSY:
case CP0_REGISTER_CMD_TMEM_BUSY:
break;
default:
*rsp->cp0.cr[rd & 15] = val;
break;
}
return MODE_CONTINUE;
}
}

71
rsp/cp2.cpp Normal file
View File

@ -0,0 +1,71 @@
#include "../rsp.hpp"
extern "C" {
// CFC2: reads vector control (flag) register `rd` into scalar register `rt`, going
// through a 16-bit signed value. Only the low two bits of rd are used, and index 3
// selects the same flag set as index 2. Scalar register 0 is never written.
void RSP_CFC2(RSP::CPUState *rsp, unsigned rt, unsigned rd)
{
	const unsigned index = rd & 3;
	const unsigned src = (index == 3) ? 2 : index;

	const int16_t res = rsp_get_flags(rsp->cp2.flags[src].e);
	if (rt != 0)
		rsp->sr[rt] = res;
}
// CTC2: writes the low 16 bits of scalar register `rt` to vector control (flag)
// register `rd`. Only the low two bits of rd are used; indices 2 and 3 both select
// flag set 2, which only receives the low 8 bits.
void RSP_CTC2(RSP::CPUState *rsp, unsigned rt, unsigned rd)
{
	unsigned value = rsp->sr[rt] & 0xffff;
	unsigned dst = rd & 3;

	const bool narrow_flag_set = dst >= 2;
	if (narrow_flag_set)
	{
		value &= 0xff;
		dst = 2;
	}

	rsp_set_flags(rsp->cp2.flags[dst].e, value);
}
// MTC2: stores the low 16 bits of scalar register `rt` into vector register `rd`,
// starting at byte position `element` (0-15).
//
// An even element replaces one whole 16-bit lane. An odd element straddles two lanes:
// the value's high byte goes into the low byte of lane element/2 and its low byte into
// the high byte of the following lane. For element 15 there is no following lane, so
// only the first byte is stored.
void RSP_MTC2(RSP::CPUState *rsp, unsigned rt, unsigned rd, unsigned element)
{
	uint16_t *e = rsp->cp2.regs[rd].e;
#ifdef INTENSE_DEBUG
	fprintf(stderr, "MTC2, rt = %u, [rt] = 0x%x, rd = %u, e = %u\n",
	        rt, rsp->sr[rt], rd, element);
#endif
	unsigned lo = element >> 1;
	rt = rsp->sr[rt];
	if (element & 1)
	{
		unsigned hi = (element + 1) >> 1;
		e[lo] = (e[lo] & 0xff00) | ((rt >> 8) & 0xff);
		// Keep the low byte of the *second* lane (it used to be replaced with the low
		// byte of the first lane), and never index past the last lane.
		if (hi < 8)
			e[hi] = (e[hi] & 0x00ff) | ((rt & 0xff) << 8);
	}
	else
		e[lo] = rt;
}
// MFC2: reads 16 bits from vector register `rd`, starting at byte position `element`,
// and stores them sign-extended into scalar register `rt`. Scalar register 0 is never
// written. For an odd element the value is assembled from the low byte of lane
// element/2 and the high byte of the next lane (wrapping from lane 7 to lane 0).
void RSP_MFC2(RSP::CPUState *rsp, unsigned rt, unsigned rd, unsigned element)
{
	if (!rt)
		return;

	const uint16_t *e = rsp->cp2.regs[rd].e;
	const unsigned lo = element >> 1;

	uint16_t value;
	if ((element & 1) != 0)
	{
		const unsigned hi = ((element + 1) >> 1) & 7;
		const uint16_t upper = e[lo] << 8;
		const uint8_t lower = e[hi] >> 8;
		value = upper | lower;
	}
	else
	{
		value = e[lo];
	}

	rsp->sr[rt] = int16_t(value);
}
}

380
rsp/ls.cpp Normal file
View File

@ -0,0 +1,380 @@
#include "../rsp.hpp"
extern "C" {
// Using mostly CXD4 implementation as a base here since it's easier to follow.
// CEN64's implementation seems much better, but takes more effort to port for now.
// Reading wide words together with SSE4 blend, SSSE3 pshufb, etc should make this much faster.
// LBV: loads one byte from DMEM at (sr[base] + offset), wrapped to 12 bits, into byte
// position `e` of vector register `rt`.
void RSP_LBV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	const unsigned addr = (rsp->sr[base] + offset * 1) & 0xfff;
	uint8_t *bytes = reinterpret_cast<uint8_t*>(rsp->cp2.regs[rt].e);
	bytes[MES(e)] = READ_MEM_U8(rsp->dmem, addr);
}
// SBV: stores byte position `e` of vector register `rt` to DMEM at (sr[base] + offset),
// wrapped to 12 bits.
void RSP_SBV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	const unsigned addr = (rsp->sr[base] + offset * 1) & 0xfff;
	const uint8_t *bytes = reinterpret_cast<uint8_t*>(rsp->cp2.regs[rt].e);
	const uint8_t v = bytes[MES(e)];
#ifdef INTENSE_DEBUG
	fprintf(stderr, "SBV: 0x%x (0x%x)\n", addr, v);
#endif
	WRITE_MEM_U8(rsp->dmem, addr, v);
}
// Load 16-bit
// LSV: loads one 16-bit value from DMEM at (sr[base] + offset * 2), wrapped to 12 bits,
// into lane e/2 of vector register `rt`. Odd byte positions are not handled: the
// instruction is silently ignored.
void RSP_LSV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
if (e & 1)
return;
unsigned addr = (rsp->sr[base] + offset * 2) & 0xfff;
// Position of the halfword inside its 32-bit word.
unsigned correction = addr & 3;
// A halfword straddling two 32-bit words is not handled; nothing is loaded.
if (correction == 3)
return;
uint16_t result;
// Halfword in the middle of a word: assemble it from two byte reads, first byte high.
if (correction == 1)
result = (READ_MEM_U8(rsp->dmem, addr + 0) << 8) | (READ_MEM_U8(rsp->dmem, addr + 1) << 0);
else
result = READ_MEM_U16(rsp->dmem, addr);
rsp->cp2.regs[rt].e[e >> 1] = result;
}
// Store 16-bit
// SSV: stores two consecutive bytes of vector register `rt`, starting at byte position
// `e` (wrapping from 15 to 0), to DMEM at (sr[base] + offset * 2). Both the first
// address and the following one are wrapped to 12 bits.
void RSP_SSV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
unsigned addr = (rsp->sr[base] + offset * 2) & 0xfff;
uint8_t v0 = reinterpret_cast<uint8_t*>(rsp->cp2.regs[rt].e)[MES(e)];
uint8_t v1 = reinterpret_cast<uint8_t*>(rsp->cp2.regs[rt].e)[MES((e + 1) & 0xf)];
#ifdef INTENSE_DEBUG
fprintf(stderr, "SSV: 0x%x (0x%x, 0x%x)\n", addr, v0, v1);
#endif
// Written byte by byte, so any alignment works.
WRITE_MEM_U8(rsp->dmem, addr, v0);
WRITE_MEM_U8(rsp->dmem, (addr + 1) & 0xfff, v1);
}
// LLV: loads 32 bits from DMEM at (sr[base] + offset * 4), wrapped to 12 bits, into two
// consecutive 16-bit lanes of vector register `rt` starting at lane e/2 (the second
// lane wraps from 7 to 0). Odd byte positions and odd addresses are not handled: the
// instruction is silently ignored.
void RSP_LLV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	const unsigned addr = (rsp->sr[base] + offset * 4) & 0xfff;
	if ((e & 1) || (addr & 1))
		return;

	const unsigned first = e >> 1;
	const unsigned second = (first + 1) & 7;
	rsp->cp2.regs[rt].e[first] = READ_MEM_U16(rsp->dmem, addr);
	rsp->cp2.regs[rt].e[second] = READ_MEM_U16(rsp->dmem, (addr + 2) & 0xfff);
}
// Store 32-bit
// SLV: stores two consecutive 16-bit lanes of vector register `rt`, starting at lane
// e/2, to DMEM at (sr[base] + offset * 4), wrapped to 12 bits. Odd byte positions,
// positions above 0xc (the second lane would not exist) and odd addresses are not
// handled: the instruction is silently ignored.
void RSP_SLV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
if ((e & 1) || (e > 0xc))
return;
unsigned addr = (rsp->sr[base] + offset * 4) & 0xfff;
#ifdef INTENSE_DEBUG
fprintf(stderr, "SLV 0x%x, e = %u\n", addr, e);
#endif
if (addr & 1)
return;
// From here on `e` is a lane index (0-6), so e + 1 is always a valid lane.
e >>= 1;
uint16_t v0 = rsp->cp2.regs[rt].e[e];
uint16_t v1 = rsp->cp2.regs[rt].e[e + 1];
WRITE_MEM_U16(rsp->dmem, addr, v0);
WRITE_MEM_U16(rsp->dmem, (addr + 2) & 0xfff, v1);
}
// Load 64-bit
// LDV: loads up to four 16-bit lanes of vector register `rt`, starting at lane e/2,
// from DMEM at (sr[base] + offset * 8), wrapped to 12 bits. Odd byte positions are not
// handled: the instruction is silently ignored.
//
// Lanes that would fall past the end of the register (element > 8) are not loaded;
// previously they were written beyond the register's eight lanes. Every address,
// including the individual bytes of the odd-address path, wraps inside the 4 KiB DMEM.
void RSP_LDV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if (e & 1)
		return;

	unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff;
	auto *reg = rsp->cp2.regs[rt].e;
	e >>= 1;

	const bool odd_address = (addr & 1) != 0;
	for (unsigned i = 0; i < 4 && e + i < 8; i++)
	{
		const unsigned lane_addr = (addr + 2 * i) & 0xfff;
		if (odd_address)
		{
			// Halfwords are not 16-bit aligned: assemble each from two byte reads,
			// first byte high.
			reg[e + i] = (READ_MEM_U8(rsp->dmem, lane_addr) << 8) |
			             READ_MEM_U8(rsp->dmem, (lane_addr + 1) & 0xfff);
		}
		else
			reg[e + i] = READ_MEM_U16(rsp->dmem, lane_addr);
	}
}
// Store 64-bit
// SDV: stores 8 bytes of vector register `rt`, starting at byte position `e`, to DMEM
// at (sr[base] + offset * 8). All addresses are wrapped to 12 bits.
void RSP_SDV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff;
#ifdef INTENSE_DEBUG
fprintf(stderr, "SDV 0x%x, e = %u\n", addr, e);
#endif
// Handle illegal scenario.
// Slow path for odd positions, odd addresses, or a range running off the end of the
// register: copy byte by byte, wrapping the byte position from 15 back to 0.
if ((e > 8) || (e & 1) || (addr & 1))
{
for (unsigned i = 0; i < 8; i++)
{
WRITE_MEM_U8(rsp->dmem, (addr + i) & 0xfff,
reinterpret_cast<const uint8_t*>(rsp->cp2.regs[rt].e)[MES((e + i) & 0xf)]);
}
}
else
{
// Fast path: four whole 16-bit lanes. `e` becomes a lane index of at most 4 here,
// so e + i stays within the eight lanes.
e >>= 1;
for (unsigned i = 0; i < 4; i++)
{
WRITE_MEM_U16(rsp->dmem, (addr + 2 * i) & 0xfff,
rsp->cp2.regs[rt].e[e + i]);
}
}
}
// LPV: loads 8 consecutive bytes from DMEM at (sr[base] + offset * 8), each into the
// high byte of one lane of vector register `rt` (low byte cleared). Addresses wrap to
// 12 bits. Only element 0 is handled; anything else is silently ignored.
void RSP_LPV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if (e == 0)
	{
		const unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff;
		auto *reg = rsp->cp2.regs[rt].e;
		unsigned lane = 0;
		while (lane < 8)
		{
			reg[lane] = READ_MEM_U8(rsp->dmem, (addr + lane) & 0xfff) << 8;
			++lane;
		}
	}
}
// SPV: stores the high byte of each of the 8 lanes of vector register `rt` (lane value
// arithmetically shifted right by 8) to 8 consecutive DMEM bytes at
// (sr[base] + offset * 8). Addresses wrap to 12 bits. Only element 0 is handled;
// anything else is silently ignored.
void RSP_SPV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if (e == 0)
	{
		const unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff;
		auto *reg = rsp->cp2.regs[rt].e;
		unsigned lane = 0;
		while (lane < 8)
		{
			WRITE_MEM_U8(rsp->dmem, (addr + lane) & 0xfff, int16_t(reg[lane]) >> 8);
			++lane;
		}
	}
}
// Load 8x8-bit into high bits, but shift by 7 instead of 8.
// Was probably used for certain fixed point algorithms to get more headroom without
// saturation, but weird nonetheless.
// LUV: loads 8 bytes from DMEM starting at (sr[base] + offset * 8), wrapped to 12 bits,
// and stores each one shifted left by 7 into a lane of vector register `rt`.
void RSP_LUV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff;
auto *reg = rsp->cp2.regs[rt].e;
if (e != 0)
{
// Special path for Mia Hamm soccer.
// Start (-e & 0xf) bytes further on, then read consecutive bytes; `e` counts down and
// at the step where it reaches zero the address additionally steps back by 16.
// NOTE(review): addr is not wrapped to 0xfff in this path -- confirm that
// READ_MEM_U8 tolerates addresses past the end of DMEM.
addr += -e & 0xf;
for (unsigned b = 0; b < 8; b++)
{
reg[b] = READ_MEM_U8(rsp->dmem, addr) << 7;
--e;
addr -= e ? 0 : 16;
++addr;
}
}
else
{
// Common case: 8 consecutive bytes, each address wrapped to 12 bits.
for (unsigned i = 0; i < 8; i++)
reg[i] = READ_MEM_U8(rsp->dmem, (addr + i) & 0xfff) << 7;
}
}
// SUV: store each of the eight lanes of register rt as one byte, taking the signed
// lane shifted right by 7. Bytes go to consecutive addresses from
// sr[base] + offset * 8 (wrapped to 0xfff). Only element 0 is handled.
void RSP_SUV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if (e == 0)
	{
		const unsigned addr = (rsp->sr[base] + offset * 8) & 0xfff;
		auto *lanes = rsp->cp2.regs[rt].e;
		unsigned lane = 0;
		while (lane < 8)
		{
			WRITE_MEM_U8(rsp->dmem, (addr + lane) & 0xfff, int16_t(lanes[lane]) >> 7);
			lane++;
		}
	}
}
// LHV: load eight bytes into the eight lanes of register rt, each shifted left by 7.
// Unlike LUV the bytes are taken from every other address (stride 2).
// Nothing is loaded unless the element is 0 and bits 1..3 of the address are clear.
// The address is sr[base] + offset * 16, wrapped to 0xfff.
void RSP_LHV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if (e == 0)
	{
		const unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff;
		if ((addr & 0xe) == 0)
		{
			auto *lanes = rsp->cp2.regs[rt].e;
			for (unsigned lane = 0; lane < 8; lane++)
				lanes[lane] = READ_MEM_U8(rsp->dmem, addr + 2 * lane) << 7;
		}
	}
}
// SHV: store each lane of register rt as one byte (signed lane >> 7) to every other
// byte address starting at sr[base] + offset * 16. Addresses wrap to 0xfff.
// Only element 0 is handled.
void RSP_SHV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if (e == 0)
	{
		const unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff;
		auto *lanes = rsp->cp2.regs[rt].e;
		unsigned lane = 0;
		while (lane < 8)
		{
			WRITE_MEM_U8(rsp->dmem, (addr + 2 * lane) & 0xfff, int16_t(lanes[lane]) >> 7);
			lane++;
		}
	}
}
// SFV: store four lanes of register rt as bytes (signed lane >> 7) at a stride of
// four bytes. Element 0 stores lanes 0..3, element 8 stores lanes 4..7; every other
// element stores nothing. The base address is masked with 0xff3.
// The purpose of the instruction is not known.
void RSP_SFV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	const unsigned addr = (rsp->sr[base] + offset * 16) & 0xff3;
	auto *lanes = rsp->cp2.regs[rt].e;

	unsigned first_lane;
	if (e == 0)
		first_lane = 0;
	else if (e == 8)
		first_lane = 4;
	else
		return;

	for (unsigned k = 0; k < 4; k++)
		WRITE_MEM_U8(rsp->dmem, (addr + 4 * k) & 0xfff, int16_t(lanes[first_lane + k]) >> 7);
}
// LQV: load 16-bit lanes from DMEM into register rt, starting at the given address
// and stopping at the end of the 16-byte line that contains it. Unaligned addresses
// therefore load only part of the register.
//   rt     - destination vector register
//   e      - starting byte element; odd elements are ignored
//   offset - signed immediate, scaled by 16
//   base   - scalar register supplying the base address
// Odd addresses are ignored as well.
void RSP_LQV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if (e & 1)
		return;

	unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff;

#ifdef INTENSE_DEBUG
	fprintf(stderr, "LQV: 0x%x, e = %u, vt = %u, base = %u\n", addr, e, rt, base);
#endif

	if (addr & 1)
		return;

	// b = index of the first 16-bit word inside the 16-byte line.
	unsigned b = (addr & 0xf) >> 1;
	e >>= 1;

	auto *reg = rsp->cp2.regs[rt].e;
	// Stop at the end of the line (i < 8) AND at the end of the register (e < 8).
	// Without the second bound a non-zero element writes past lane 7 and corrupts
	// whatever follows the register in memory.
	for (unsigned i = b; i < 8 && e < 8; i++, e++, addr += 2)
		reg[e] = READ_MEM_U16(rsp->dmem, addr & 0xfff);
}
// SQV: store register rt to DMEM from the given address up to the end of the
// 16-byte line that contains it. Odd addresses are ignored.
// With element 0, whole 16-bit lanes are stored starting at lane 0. With a non-zero
// element the store is done byte-wise, starting at byte e and wrapping around the
// 16-byte register.
void RSP_SQV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	const unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff;
	if (addr & 1)
		return;

	const unsigned line_off = addr & 0xf;
	auto *reg = rsp->cp2.regs[rt].e;

	if (e == 0)
	{
		const unsigned lane_count = 8 - (line_off >> 1);
		for (unsigned lane = 0; lane < lane_count; lane++)
			WRITE_MEM_U16(rsp->dmem, (addr + 2 * lane) & 0xfff, reg[lane]);
		return;
	}

	// Mia Hamm Soccer
	const auto *bytes = reinterpret_cast<const uint8_t *>(reg);
	const unsigned byte_count = 16 - line_off;
	for (unsigned n = 0; n < byte_count; n++)
		WRITE_MEM_U8(rsp->dmem, (addr + n) & 0xfff, bytes[MES((e + n) & 0xf)]);
}
// LRV: load the part of a 16-byte line that lies BEFORE the given address into the
// top lanes of register rt (appears to complement LQV). With b = word offset of the
// address inside its line, the b words from the start of the line are written to
// lanes 8-b .. 7. Only element 0 and even addresses are handled.
void RSP_LRV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if (e != 0)
		return;

	const unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff;
	if (addr & 1)
		return;

	const unsigned words = (addr & 0xf) >> 1;
	const unsigned line = addr & ~0xfu;
	auto *reg = rsp->cp2.regs[rt].e;

	for (unsigned k = 0; k < words; k++)
		reg[8 - words + k] = READ_MEM_U16(rsp->dmem, (line + 2 * k) & 0xfff);
}
// SRV: store the top lanes of register rt to the part of a 16-byte line that lies
// BEFORE the given address. With b = word offset of the address inside its line,
// lanes 8-b .. 7 are written to the first b words of the line.
// Only element 0 and even addresses are handled.
void RSP_SRV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if (e != 0)
		return;

	const unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff;
	if (addr & 1)
		return;

	const unsigned words = (addr & 0xf) >> 1;
	const unsigned line = addr & ~0xfu;
	auto *reg = rsp->cp2.regs[rt].e;

	for (unsigned k = 0; k < words; k++)
		WRITE_MEM_U16(rsp->dmem, (line + 2 * k) & 0xfff, reg[8 - words + k]);
}
// LTV: transposed load. Reads eight 16-bit words from a 16-byte aligned line and
// scatters them over the eight registers rt .. rt+7: word i goes to register rt+i,
// lane (i - e/2) mod 8.
// Ignored unless e is even, rt is a multiple of 8 and the address is 16-byte aligned.
void RSP_LTV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if ((e & 1) || (rt & 7))
		return;

	const unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff;
	if (addr & 0xf)
		return;

	const unsigned rot = e >> 1;
	for (unsigned word = 0; word < 8; word++)
	{
		const unsigned lane = (word - rot) & 7;
		rsp->cp2.regs[rt + word].e[lane] = READ_MEM_U16(rsp->dmem, addr + 2 * word);
	}
}
// STV: transposed store. Writes eight 16-bit words to a 16-byte aligned line; word i
// is lane i of register rt + ((e/2 + i) mod 8).
// Ignored unless e is even, rt is a multiple of 8 and the address is 16-byte aligned.
void RSP_STV(RSP::CPUState *rsp, unsigned rt, unsigned e, int offset, unsigned base)
{
	if ((e & 1) || (rt & 7))
		return;

	const unsigned addr = (rsp->sr[base] + offset * 16) & 0xfff;
	if (addr & 0xf)
		return;

	const unsigned rot = e / 2;
	for (unsigned word = 0; word < 8; word++)
	{
		const unsigned src = rt + ((rot + word) & 7);
		WRITE_MEM_U16(rsp->dmem, addr + 2 * word, rsp->cp2.regs[src].e[word]);
	}
}
}

103
rsp/pipeline.h Normal file
View File

@ -0,0 +1,103 @@
//
// rsp/pipeline.h: RSP processor pipeline.
//
// CEN64: Cycle-Accurate Nintendo 64 Emulator.
// Copyright (C) 2015, Tyler J. Stachecki.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#ifndef __rsp_pipeline_h__
#define __rsp_pipeline_h__
#include "rsp/decoder.h"
#include "rsp/cp2.h"
#include "rsp/rsp.h"
struct rsp;
// Kind of memory access described by a struct rsp_mem_request (stored in its
// `type` field). RSP_MEM_REQUEST_NONE is the first enumerator and therefore 0.
enum rsp_mem_request_type {
RSP_MEM_REQUEST_NONE,
RSP_MEM_REQUEST_INT_MEM,
RSP_MEM_REQUEST_VECTOR,
RSP_MEM_REQUEST_FOURTH,
RSP_MEM_REQUEST_HALF,
RSP_MEM_REQUEST_PACK,
RSP_MEM_REQUEST_QUAD,
RSP_MEM_REQUEST_REST,
RSP_MEM_REQUEST_UPACK
};
// Request payload used through the `p_int` member of union rsp_mem_packet.
// NOTE(review): rdqm/wdqm look like read/write data masks and rshift like a
// right-shift amount applied to the data — confirm against the code that consumes
// this packet.
struct rsp_int_mem_packet {
uint32_t data;
uint32_t rdqm;
uint32_t wdqm;
unsigned rshift;
};
// Request payload used through the `p_vect` member of union rsp_mem_packet.
// vldst_func is a callback that receives the RSP state, an address, an element
// index, a pointer to 16-bit register storage and two vector values (reg, dqm).
// NOTE(review): `dest` is presumably the destination vector register index and
// `vdqm` the mask later passed as `dqm` — confirm against the callers.
struct rsp_vect_mem_packet {
union aligned_rsp_1vect_t vdqm;
void (*vldst_func)(struct rsp *rsp, uint32_t addr, unsigned element,
uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm);
unsigned element;
unsigned dest;
};
// Payload of a memory request: either the integer form or the vector form.
// The two members share storage, so only one is meaningful at a time.
union rsp_mem_packet {
struct rsp_int_mem_packet p_int;
struct rsp_vect_mem_packet p_vect;
};
// A memory request: target address, request kind and the kind-specific payload.
struct rsp_mem_request {
uint32_t addr;
enum rsp_mem_request_type type;
union rsp_mem_packet packet;
};
// State shared by every pipeline latch (embedded as `common` in each of them).
struct rsp_latch {
uint32_t pc;
};
// A 32-bit result value together with its destination.
// NOTE(review): `dest` is presumably a scalar register index — confirm.
struct rsp_result {
uint32_t result;
unsigned dest;
};
// Latch holding a decoded opcode, a pc and an instruction word (iw).
// NOTE(review): the name suggests the IF -> RD stage boundary — confirm. Note that
// this struct carries its own `pc` in addition to `common.pc`.
struct rsp_ifrd_latch {
struct rsp_latch common;
struct rsp_opcode opcode;
uint32_t pc, iw;
};
// Latch holding a decoded opcode and an instruction word (iw).
// NOTE(review): the name suggests the RD -> EX stage boundary — confirm.
struct rsp_rdex_latch {
struct rsp_latch common;
struct rsp_opcode opcode;
uint32_t iw;
};
// Latch holding a result and a pending memory request.
// NOTE(review): the name suggests the EX -> DF stage boundary — confirm.
struct rsp_exdf_latch {
struct rsp_latch common;
struct rsp_result result;
struct rsp_mem_request request;
};
// Latch holding a result only.
// NOTE(review): the name suggests the DF -> WB stage boundary — confirm.
struct rsp_dfwb_latch {
struct rsp_latch common;
struct rsp_result result;
};
// Complete pipeline state: one instance of each of the four latches.
struct rsp_pipeline {
struct rsp_dfwb_latch dfwb_latch;
struct rsp_exdf_latch exdf_latch;
struct rsp_rdex_latch rdex_latch;
struct rsp_ifrd_latch ifrd_latch;
};
// Declared here, defined elsewhere.
// NOTE(review): presumably puts *pipeline into its initial state — confirm against
// the definition.
cen64_cold void rsp_pipeline_init(struct rsp_pipeline *pipeline);
#endif

143
rsp/reciprocal.cpp Normal file
View File

@ -0,0 +1,143 @@
//
// common/reciprocal.c: RSP reciprocal ROM contents.
//
// CEN64: Cycle-Accurate Nintendo 64 Emulator.
// Copyright (C) 2015, Tyler J. Stachecki.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#include "reciprocal.h"
// RSP reciprocal ROM contents: 1024 16-bit entries, 64-byte aligned.
// This is raw table data; do not edit or reformat individual values by hand.
// Layout as visible in the data: the first 512 entries form one table that starts
// at 0xFFFF and ends at 0x0040; the remaining 512 entries interleave two sequences
// (even positions start at 0x6A09, odd positions at 0xFFFF).
// NOTE(review): presumably the first half serves reciprocal lookups and the second
// half reciprocal-square-root lookups — confirm against the code that indexes it.
alignas(64) const uint16_t rsp_reciprocal_rom[1024] = {
0xFFFF, 0xFF00, 0xFE01, 0xFD04, 0xFC07, 0xFB0C, 0xFA11, 0xF918,
0xF81F, 0xF727, 0xF631, 0xF53B, 0xF446, 0xF352, 0xF25F, 0xF16D,
0xF07C, 0xEF8B, 0xEE9C, 0xEDAE, 0xECC0, 0xEBD3, 0xEAE8, 0xE9FD,
0xE913, 0xE829, 0xE741, 0xE65A, 0xE573, 0xE48D, 0xE3A9, 0xE2C5,
0xE1E1, 0xE0FF, 0xE01E, 0xDF3D, 0xDE5D, 0xDD7E, 0xDCA0, 0xDBC2,
0xDAE6, 0xDA0A, 0xD92F, 0xD854, 0xD77B, 0xD6A2, 0xD5CA, 0xD4F3,
0xD41D, 0xD347, 0xD272, 0xD19E, 0xD0CB, 0xCFF8, 0xCF26, 0xCE55,
0xCD85, 0xCCB5, 0xCBE6, 0xCB18, 0xCA4B, 0xC97E, 0xC8B2, 0xC7E7,
0xC71C, 0xC652, 0xC589, 0xC4C0, 0xC3F8, 0xC331, 0xC26B, 0xC1A5,
0xC0E0, 0xC01C, 0xBF58, 0xBE95, 0xBDD2, 0xBD10, 0xBC4F, 0xBB8F,
0xBACF, 0xBA10, 0xB951, 0xB894, 0xB7D6, 0xB71A, 0xB65E, 0xB5A2,
0xB4E8, 0xB42E, 0xB374, 0xB2BB, 0xB203, 0xB14B, 0xB094, 0xAFDE,
0xAF28, 0xAE73, 0xADBE, 0xAD0A, 0xAC57, 0xABA4, 0xAAF1, 0xAA40,
0xA98E, 0xA8DE, 0xA82E, 0xA77E, 0xA6D0, 0xA621, 0xA574, 0xA4C6,
0xA41A, 0xA36E, 0xA2C2, 0xA217, 0xA16D, 0xA0C3, 0xA01A, 0x9F71,
0x9EC8, 0x9E21, 0x9D79, 0x9CD3, 0x9C2D, 0x9B87, 0x9AE2, 0x9A3D,
0x9999, 0x98F6, 0x9852, 0x97B0, 0x970E, 0x966C, 0x95CB, 0x952B,
0x948B, 0x93EB, 0x934C, 0x92AD, 0x920F, 0x9172, 0x90D4, 0x9038,
0x8F9C, 0x8F00, 0x8E65, 0x8DCA, 0x8D30, 0x8C96, 0x8BFC, 0x8B64,
0x8ACB, 0x8A33, 0x899C, 0x8904, 0x886E, 0x87D8, 0x8742, 0x86AD,
0x8618, 0x8583, 0x84F0, 0x845C, 0x83C9, 0x8336, 0x82A4, 0x8212,
0x8181, 0x80F0, 0x8060, 0x7FD0, 0x7F40, 0x7EB1, 0x7E22, 0x7D93,
0x7D05, 0x7C78, 0x7BEB, 0x7B5E, 0x7AD2, 0x7A46, 0x79BA, 0x792F,
0x78A4, 0x781A, 0x7790, 0x7706, 0x767D, 0x75F5, 0x756C, 0x74E4,
0x745D, 0x73D5, 0x734F, 0x72C8, 0x7242, 0x71BC, 0x7137, 0x70B2,
0x702E, 0x6FA9, 0x6F26, 0x6EA2, 0x6E1F, 0x6D9C, 0x6D1A, 0x6C98,
0x6C16, 0x6B95, 0x6B14, 0x6A94, 0x6A13, 0x6993, 0x6914, 0x6895,
0x6816, 0x6798, 0x6719, 0x669C, 0x661E, 0x65A1, 0x6524, 0x64A8,
0x642C, 0x63B0, 0x6335, 0x62BA, 0x623F, 0x61C5, 0x614B, 0x60D1,
0x6058, 0x5FDF, 0x5F66, 0x5EED, 0x5E75, 0x5DFD, 0x5D86, 0x5D0F,
0x5C98, 0x5C22, 0x5BAB, 0x5B35, 0x5AC0, 0x5A4B, 0x59D6, 0x5961,
0x58ED, 0x5879, 0x5805, 0x5791, 0x571E, 0x56AC, 0x5639, 0x55C7,
0x5555, 0x54E3, 0x5472, 0x5401, 0x5390, 0x5320, 0x52AF, 0x5240,
0x51D0, 0x5161, 0x50F2, 0x5083, 0x5015, 0x4FA6, 0x4F38, 0x4ECB,
0x4E5E, 0x4DF1, 0x4D84, 0x4D17, 0x4CAB, 0x4C3F, 0x4BD3, 0x4B68,
0x4AFD, 0x4A92, 0x4A27, 0x49BD, 0x4953, 0x48E9, 0x4880, 0x4817,
0x47AE, 0x4745, 0x46DC, 0x4674, 0x460C, 0x45A5, 0x453D, 0x44D6,
0x446F, 0x4408, 0x43A2, 0x433C, 0x42D6, 0x4270, 0x420B, 0x41A6,
0x4141, 0x40DC, 0x4078, 0x4014, 0x3FB0, 0x3F4C, 0x3EE8, 0x3E85,
0x3E22, 0x3DC0, 0x3D5D, 0x3CFB, 0x3C99, 0x3C37, 0x3BD6, 0x3B74,
0x3B13, 0x3AB2, 0x3A52, 0x39F1, 0x3991, 0x3931, 0x38D2, 0x3872,
0x3813, 0x37B4, 0x3755, 0x36F7, 0x3698, 0x363A, 0x35DC, 0x357F,
0x3521, 0x34C4, 0x3467, 0x340A, 0x33AE, 0x3351, 0x32F5, 0x3299,
0x323E, 0x31E2, 0x3187, 0x312C, 0x30D1, 0x3076, 0x301C, 0x2FC2,
0x2F68, 0x2F0E, 0x2EB4, 0x2E5B, 0x2E02, 0x2DA9, 0x2D50, 0x2CF8,
0x2C9F, 0x2C47, 0x2BEF, 0x2B97, 0x2B40, 0x2AE8, 0x2A91, 0x2A3A,
0x29E4, 0x298D, 0x2937, 0x28E0, 0x288B, 0x2835, 0x27DF, 0x278A,
0x2735, 0x26E0, 0x268B, 0x2636, 0x25E2, 0x258D, 0x2539, 0x24E5,
0x2492, 0x243E, 0x23EB, 0x2398, 0x2345, 0x22F2, 0x22A0, 0x224D,
0x21FB, 0x21A9, 0x2157, 0x2105, 0x20B4, 0x2063, 0x2012, 0x1FC1,
0x1F70, 0x1F1F, 0x1ECF, 0x1E7F, 0x1E2E, 0x1DDF, 0x1D8F, 0x1D3F,
0x1CF0, 0x1CA1, 0x1C52, 0x1C03, 0x1BB4, 0x1B66, 0x1B17, 0x1AC9,
0x1A7B, 0x1A2D, 0x19E0, 0x1992, 0x1945, 0x18F8, 0x18AB, 0x185E,
0x1811, 0x17C4, 0x1778, 0x172C, 0x16E0, 0x1694, 0x1648, 0x15FD,
0x15B1, 0x1566, 0x151B, 0x14D0, 0x1485, 0x143B, 0x13F0, 0x13A6,
0x135C, 0x1312, 0x12C8, 0x127F, 0x1235, 0x11EC, 0x11A3, 0x1159,
0x1111, 0x10C8, 0x107F, 0x1037, 0x0FEF, 0x0FA6, 0x0F5E, 0x0F17,
0x0ECF, 0x0E87, 0x0E40, 0x0DF9, 0x0DB2, 0x0D6B, 0x0D24, 0x0CDD,
0x0C97, 0x0C50, 0x0C0A, 0x0BC4, 0x0B7E, 0x0B38, 0x0AF2, 0x0AAD,
0x0A68, 0x0A22, 0x09DD, 0x0998, 0x0953, 0x090F, 0x08CA, 0x0886,
0x0842, 0x07FD, 0x07B9, 0x0776, 0x0732, 0x06EE, 0x06AB, 0x0668,
0x0624, 0x05E1, 0x059E, 0x055C, 0x0519, 0x04D6, 0x0494, 0x0452,
0x0410, 0x03CE, 0x038C, 0x034A, 0x0309, 0x02C7, 0x0286, 0x0245,
0x0204, 0x01C3, 0x0182, 0x0141, 0x0101, 0x00C0, 0x0080, 0x0040,
0x6A09, 0xFFFF, 0x6955, 0xFF00, 0x68A1, 0xFE02, 0x67EF, 0xFD06,
0x673E, 0xFC0B, 0x668D, 0xFB12, 0x65DE, 0xFA1A, 0x6530, 0xF923,
0x6482, 0xF82E, 0x63D6, 0xF73B, 0x632B, 0xF648, 0x6280, 0xF557,
0x61D7, 0xF467, 0x612E, 0xF379, 0x6087, 0xF28C, 0x5FE0, 0xF1A0,
0x5F3A, 0xF0B6, 0x5E95, 0xEFCD, 0x5DF1, 0xEEE5, 0x5D4E, 0xEDFF,
0x5CAC, 0xED19, 0x5C0B, 0xEC35, 0x5B6B, 0xEB52, 0x5ACB, 0xEA71,
0x5A2C, 0xE990, 0x598F, 0xE8B1, 0x58F2, 0xE7D3, 0x5855, 0xE6F6,
0x57BA, 0xE61B, 0x5720, 0xE540, 0x5686, 0xE467, 0x55ED, 0xE38E,
0x5555, 0xE2B7, 0x54BE, 0xE1E1, 0x5427, 0xE10D, 0x5391, 0xE039,
0x52FC, 0xDF66, 0x5268, 0xDE94, 0x51D5, 0xDDC4, 0x5142, 0xDCF4,
0x50B0, 0xDC26, 0x501F, 0xDB59, 0x4F8E, 0xDA8C, 0x4EFE, 0xD9C1,
0x4E6F, 0xD8F7, 0x4DE1, 0xD82D, 0x4D53, 0xD765, 0x4CC6, 0xD69E,
0x4C3A, 0xD5D7, 0x4BAF, 0xD512, 0x4B24, 0xD44E, 0x4A9A, 0xD38A,
0x4A10, 0xD2C8, 0x4987, 0xD206, 0x48FF, 0xD146, 0x4878, 0xD086,
0x47F1, 0xCFC7, 0x476B, 0xCF0A, 0x46E5, 0xCE4D, 0x4660, 0xCD91,
0x45DC, 0xCCD6, 0x4558, 0xCC1B, 0x44D5, 0xCB62, 0x4453, 0xCAA9,
0x43D1, 0xC9F2, 0x434F, 0xC93B, 0x42CF, 0xC885, 0x424F, 0xC7D0,
0x41CF, 0xC71C, 0x4151, 0xC669, 0x40D2, 0xC5B6, 0x4055, 0xC504,
0x3FD8, 0xC453, 0x3F5B, 0xC3A3, 0x3EDF, 0xC2F4, 0x3E64, 0xC245,
0x3DE9, 0xC198, 0x3D6E, 0xC0EB, 0x3CF5, 0xC03F, 0x3C7C, 0xBF93,
0x3C03, 0xBEE9, 0x3B8B, 0xBE3F, 0x3B13, 0xBD96, 0x3A9C, 0xBCED,
0x3A26, 0xBC46, 0x39B0, 0xBB9F, 0x393A, 0xBAF8, 0x38C5, 0xBA53,
0x3851, 0xB9AE, 0x37DD, 0xB90A, 0x3769, 0xB867, 0x36F6, 0xB7C5,
0x3684, 0xB723, 0x3612, 0xB681, 0x35A0, 0xB5E1, 0x352F, 0xB541,
0x34BF, 0xB4A2, 0x344F, 0xB404, 0x33DF, 0xB366, 0x3370, 0xB2C9,
0x3302, 0xB22C, 0x3293, 0xB191, 0x3226, 0xB0F5, 0x31B9, 0xB05B,
0x314C, 0xAFC1, 0x30DF, 0xAF28, 0x3074, 0xAE8F, 0x3008, 0xADF7,
0x2F9D, 0xAD60, 0x2F33, 0xACC9, 0x2EC8, 0xAC33, 0x2E5F, 0xAB9E,
0x2DF6, 0xAB09, 0x2D8D, 0xAA75, 0x2D24, 0xA9E1, 0x2CBC, 0xA94E,
0x2C55, 0xA8BC, 0x2BEE, 0xA82A, 0x2B87, 0xA799, 0x2B21, 0xA708,
0x2ABB, 0xA678, 0x2A55, 0xA5E8, 0x29F0, 0xA559, 0x298B, 0xA4CB,
0x2927, 0xA43D, 0x28C3, 0xA3B0, 0x2860, 0xA323, 0x27FD, 0xA297,
0x279A, 0xA20B, 0x2738, 0xA180, 0x26D6, 0xA0F6, 0x2674, 0xA06C,
0x2613, 0x9FE2, 0x25B2, 0x9F59, 0x2552, 0x9ED1, 0x24F2, 0x9E49,
0x2492, 0x9DC2, 0x2432, 0x9D3B, 0x23D3, 0x9CB4, 0x2375, 0x9C2F,
0x2317, 0x9BA9, 0x22B9, 0x9B25, 0x225B, 0x9AA0, 0x21FE, 0x9A1C,
0x21A1, 0x9999, 0x2145, 0x9916, 0x20E8, 0x9894, 0x208D, 0x9812,
0x2031, 0x9791, 0x1FD6, 0x9710, 0x1F7B, 0x968F, 0x1F21, 0x960F,
0x1EC7, 0x9590, 0x1E6D, 0x9511, 0x1E13, 0x9492, 0x1DBA, 0x9414,
0x1D61, 0x9397, 0x1D09, 0x931A, 0x1CB1, 0x929D, 0x1C59, 0x9221,
0x1C01, 0x91A5, 0x1BAA, 0x9129, 0x1B53, 0x90AF, 0x1AFC, 0x9034,
0x1AA6, 0x8FBA, 0x1A50, 0x8F40, 0x19FA, 0x8EC7, 0x19A5, 0x8E4F,
0x1950, 0x8DD6, 0x18FB, 0x8D5E, 0x18A7, 0x8CE7, 0x1853, 0x8C70,
0x17FF, 0x8BF9, 0x17AB, 0x8B83, 0x1758, 0x8B0D, 0x1705, 0x8A98,
0x16B2, 0x8A23, 0x1660, 0x89AE, 0x160D, 0x893A, 0x15BC, 0x88C6,
0x156A, 0x8853, 0x1519, 0x87E0, 0x14C8, 0x876D, 0x1477, 0x86FB,
0x1426, 0x8689, 0x13D6, 0x8618, 0x1386, 0x85A7, 0x1337, 0x8536,
0x12E7, 0x84C6, 0x1298, 0x8456, 0x1249, 0x83E7, 0x11FB, 0x8377,
0x11AC, 0x8309, 0x115E, 0x829A, 0x1111, 0x822C, 0x10C3, 0x81BF,
0x1076, 0x8151, 0x1029, 0x80E4, 0x0FDC, 0x8078, 0x0F8F, 0x800C,
0x0F43, 0x7FA0, 0x0EF7, 0x7F34, 0x0EAB, 0x7EC9, 0x0E60, 0x7E5E,
0x0E15, 0x7DF4, 0x0DCA, 0x7D8A, 0x0D7F, 0x7D20, 0x0D34, 0x7CB6,
0x0CEA, 0x7C4D, 0x0CA0, 0x7BE5, 0x0C56, 0x7B7C, 0x0C0C, 0x7B14,
0x0BC3, 0x7AAC, 0x0B7A, 0x7A45, 0x0B31, 0x79DE, 0x0AE8, 0x7977,
0x0AA0, 0x7911, 0x0A58, 0x78AB, 0x0A10, 0x7845, 0x09C8, 0x77DF,
0x0981, 0x777A, 0x0939, 0x7715, 0x08F2, 0x76B1, 0x08AB, 0x764D,
0x0865, 0x75E9, 0x081E, 0x7585, 0x07D8, 0x7522, 0x0792, 0x74BF,
0x074D, 0x745D, 0x0707, 0x73FA, 0x06C2, 0x7398, 0x067D, 0x7337,
0x0638, 0x72D5, 0x05F3, 0x7274, 0x05AF, 0x7213, 0x056A, 0x71B3,
0x0526, 0x7152, 0x04E2, 0x70F2, 0x049F, 0x7093, 0x045B, 0x7033,
0x0418, 0x6FD4, 0x03D5, 0x6F76, 0x0392, 0x6F17, 0x0350, 0x6EB9,
0x030D, 0x6E5B, 0x02CB, 0x6DFD, 0x0289, 0x6DA0, 0x0247, 0x6D43,
0x0206, 0x6CE6, 0x01C4, 0x6C8A, 0x0183, 0x6C2D, 0x0142, 0x6BD1,
0x0101, 0x6B76, 0x00C0, 0x6B1A, 0x0080, 0x6ABF, 0x0040, 0x6A64
};

18
rsp/reciprocal.h Normal file
View File

@ -0,0 +1,18 @@
//
// common/reciprocal.h: RSP reciprocal ROM contents.
//
// CEN64: Cycle-Accurate Nintendo 64 Emulator.
// Copyright (C) 2015, Tyler J. Stachecki.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#ifndef __common_reciprocal_h__
#define __common_reciprocal_h__
#include <stdint.h>
// RSP reciprocal ROM contents: a read-only table of 1024 16-bit entries.
extern const uint16_t rsp_reciprocal_rom[1024];
#endif

34
rsp/registers.md Normal file
View File

@ -0,0 +1,34 @@
//
// rsp/registers.md: RSP register enumerations.
//
// CEN64: Cycle-Accurate Nintendo 64 Emulator.
// Copyright (C) 2015, Tyler J. Stachecki.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
// X-macro list of the RSP register names, in order.
// X is not defined in this file, so the including file has to define X(name)
// before the #include. The list macro itself is defined only once (guarded by
// #ifndef), but the bare SP_REGISTER_LIST on the last line expands it on every
// inclusion, applying whatever X currently is to each name.
#ifndef SP_REGISTER_LIST
#define SP_REGISTER_LIST \
X(SP_MEM_ADDR_REG) \
X(SP_DRAM_ADDR_REG) \
X(SP_RD_LEN_REG) \
X(SP_WR_LEN_REG) \
X(SP_STATUS_REG) \
X(SP_DMA_FULL_REG) \
X(SP_DMA_BUSY_REG) \
X(SP_SEMAPHORE_REG) \
X(CMD_START) \
X(CMD_END) \
X(CMD_CURRENT) \
X(CMD_STATUS) \
X(CMD_CLOCK) \
X(CMD_BUSY) \
X(CMD_PIPE_BUSY) \
X(CMD_TMEM_BUSY) \
X(SP_PC_REG) \
X(SP_IBIST_REG)
#endif
SP_REGISTER_LIST

703
rsp/vfunctions.cpp Normal file
View File

@ -0,0 +1,703 @@
//
// rsp/vfunctions.c: RSP vector execution functions.
//
// CEN64: Cycle-Accurate Nintendo 64 Emulator.
// Copyright (C) 2015, Tyler J. Stachecki.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#include "../rsp.hpp"
#include "rsp_impl.h"
#include "../rsp_op.hpp"
// Operand helpers for the RSP_V* wrappers in this file. They expand in the caller's
// scope and depend on parameters/locals with these exact names:
//   LOAD_VS()      - uses `rsp` and `vs`; loads register vs unshuffled
//   LOAD_VT()      - uses `rsp`, `vt` and `e`; loads register vt shuffled by e
//   STORE_RESULT() - uses `rsp`, `vd` and a local named `result`; writes it to vd
#define LOAD_VS() rsp_vect_load_unshuffled_operand(rsp->cp2.regs[vs].e)
#define LOAD_VT() rsp_vect_load_and_shuffle_operand(rsp->cp2.regs[vt].e, e)
#define STORE_RESULT() rsp_vect_write_operand(rsp->cp2.regs[vd].e, result)
extern "C" {
//
// VABS
//
// Delegates to rsp_vabs(); the value it returns through its out-parameter becomes
// the low accumulator, the returned vector is written to vd.
//
void RSP_VABS(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	rsp_vect_t low;
	const rsp_vect_t vs_operand = LOAD_VS();
	const rsp_vect_t vt_operand = LOAD_VT();
	const rsp_vect_t result = rsp_vabs(vs_operand, vt_operand, &low);

	write_acc_lo(rsp->cp2.acc.e, low);
	STORE_RESULT();
}
//
// VADD
//
// Delegates to rsp_vadd() with the low half of VCO as carry input. Afterwards both
// halves of VCO are cleared, the low accumulator receives the value returned through
// the out-parameter and the result goes to vd.
//
void RSP_VADD(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	auto *vco = rsp->cp2.flags[RSP::RSP_VCO].e;

	rsp_vect_t low;
	const rsp_vect_t carry_in = read_vco_lo(vco);
	const rsp_vect_t result = rsp_vadd(LOAD_VS(), LOAD_VT(), carry_in, &low);

	write_vco_hi(vco, rsp_vzero());
	write_vco_lo(vco, rsp_vzero());
	write_acc_lo(rsp->cp2.acc.e, low);
	STORE_RESULT();
}
//
// VADDC
//
// Delegates to rsp_vaddc(). The value returned through the out-parameter is stored
// in the low half of VCO, the high half is cleared. The result is written to both
// the low accumulator and vd.
//
void RSP_VADDC(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	auto *vco = rsp->cp2.flags[RSP::RSP_VCO].e;

	rsp_vect_t carry_out;
	const rsp_vect_t result = rsp_vaddc(LOAD_VS(), LOAD_VT(), rsp_vzero(), &carry_out);

	write_vco_hi(vco, rsp_vzero()); // TODO: Confirm.
	write_vco_lo(vco, carry_out);
	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
//
// VAND
// VNAND
//
// VAND: result of rsp_vand(vs, vt) goes to the low accumulator and to vd.
//
void RSP_VAND(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	const rsp_vect_t lhs = LOAD_VS();
	const rsp_vect_t rhs = LOAD_VT();
	const rsp_vect_t result = rsp_vand(lhs, rhs);

	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
// VNAND: result of rsp_vnand(vs, vt) goes to the low accumulator and to vd.
void RSP_VNAND(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	const rsp_vect_t lhs = LOAD_VS();
	const rsp_vect_t rhs = LOAD_VT();
	const rsp_vect_t result = rsp_vnand(lhs, rhs);

	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
//
// VCH
//
// Delegates to rsp_vch(), which produces five flag vectors in addition to the
// result: ge/le go to VCC high/low, eq/sign go to VCO high/low, vce goes to VCE.
// The result is written to the low accumulator and to vd.
//
void RSP_VCH(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	auto *vcc = rsp->cp2.flags[RSP::RSP_VCC].e;
	auto *vco = rsp->cp2.flags[RSP::RSP_VCO].e;
	auto *vce_flags = rsp->cp2.flags[RSP::RSP_VCE].e;

	rsp_vect_t ge, le, eq, sign, vce;
	const rsp_vect_t result =
	    rsp_vch(LOAD_VS(), LOAD_VT(), rsp_vzero(), &ge, &le, &eq, &sign, &vce);

	write_vcc_hi(vcc, ge);
	write_vcc_lo(vcc, le);
	write_vco_hi(vco, eq);
	write_vco_lo(vco, sign);
	write_vce(vce_flags, vce);
	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
//
// VCL
//
// Delegates to rsp_vcl(), feeding it the current VCC (ge/le, updated in place),
// VCO (eq/sign) and VCE. Afterwards VCC holds the updated ge/le while VCO and VCE
// are cleared. The result is written to the low accumulator and to vd.
//
void RSP_VCL(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	auto *vcc = rsp->cp2.flags[RSP::RSP_VCC].e;
	auto *vco = rsp->cp2.flags[RSP::RSP_VCO].e;
	auto *vce_flags = rsp->cp2.flags[RSP::RSP_VCE].e;

	rsp_vect_t ge = read_vcc_hi(vcc);
	rsp_vect_t le = read_vcc_lo(vcc);
	const rsp_vect_t eq = read_vco_hi(vco);
	const rsp_vect_t sign = read_vco_lo(vco);
	const rsp_vect_t vce = read_vce(vce_flags);

	const rsp_vect_t result =
	    rsp_vcl(LOAD_VS(), LOAD_VT(), rsp_vzero(), &ge, &le, eq, sign, vce);

	write_vcc_hi(vcc, ge);
	write_vcc_lo(vcc, le);
	write_vco_hi(vco, rsp_vzero());
	write_vco_lo(vco, rsp_vzero());
	write_vce(vce_flags, rsp_vzero());
	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
//
// VCR
//
// Delegates to rsp_vcr(), which returns ge/le for VCC. VCO and VCE are cleared.
// The result is written to the low accumulator and to vd.
//
void RSP_VCR(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	auto *vcc = rsp->cp2.flags[RSP::RSP_VCC].e;
	auto *vco = rsp->cp2.flags[RSP::RSP_VCO].e;
	auto *vce_flags = rsp->cp2.flags[RSP::RSP_VCE].e;

	rsp_vect_t ge, le;
	rsp_vect_t result = rsp_vcr(LOAD_VS(), LOAD_VT(), rsp_vzero(), &ge, &le);

#ifdef INTENSE_DEBUG
	for (unsigned lane = 0; lane < 8; lane++)
		fprintf(stderr, "VD[%d] = %d\n", lane,
		        reinterpret_cast<int16_t*>(&result)[lane]);
#endif

	write_vcc_hi(vcc, ge);
	write_vcc_lo(vcc, le);
	write_vco_hi(vco, rsp_vzero());
	write_vco_lo(vco, rsp_vzero());
	write_vce(vce_flags, rsp_vzero());
	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
//
// VEQ
// VGE
// VLT
// VNE
//
// VEQ: delegates to rsp_veq() with the current VCO halves (eq/sign) as inputs.
// The returned `le` mask goes to VCC low; VCC high and both VCO halves are cleared.
// The result is written to the low accumulator and to vd.
//
void RSP_VEQ(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	auto *vcc = rsp->cp2.flags[RSP::RSP_VCC].e;
	auto *vco = rsp->cp2.flags[RSP::RSP_VCO].e;

	const rsp_vect_t eq = read_vco_hi(vco);
	const rsp_vect_t sign = read_vco_lo(vco);
	rsp_vect_t le;
	const rsp_vect_t result = rsp_veq(LOAD_VS(), LOAD_VT(), rsp_vzero(), &le, eq, sign);

	write_vcc_hi(vcc, rsp_vzero());
	write_vcc_lo(vcc, le);
	write_vco_hi(vco, rsp_vzero());
	write_vco_lo(vco, rsp_vzero());
	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
// VGE: delegates to rsp_vge() with the current VCO halves (eq/sign) as inputs.
// The returned `le` mask goes to VCC low; VCC high and both VCO halves are cleared.
// The result is written to the low accumulator and to vd.
void RSP_VGE(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	auto *vcc = rsp->cp2.flags[RSP::RSP_VCC].e;
	auto *vco = rsp->cp2.flags[RSP::RSP_VCO].e;

	const rsp_vect_t eq = read_vco_hi(vco);
	const rsp_vect_t sign = read_vco_lo(vco);
	rsp_vect_t le;
	const rsp_vect_t result = rsp_vge(LOAD_VS(), LOAD_VT(), rsp_vzero(), &le, eq, sign);

	write_vcc_hi(vcc, rsp_vzero());
	write_vcc_lo(vcc, le);
	write_vco_hi(vco, rsp_vzero());
	write_vco_lo(vco, rsp_vzero());
	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
// VLT: delegates to rsp_vlt() with the current VCO halves (eq/sign) as inputs.
// The returned `le` mask goes to VCC low; VCC high and both VCO halves are cleared.
// The result is written to the low accumulator and to vd.
void RSP_VLT(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	auto *vcc = rsp->cp2.flags[RSP::RSP_VCC].e;
	auto *vco = rsp->cp2.flags[RSP::RSP_VCO].e;

	const rsp_vect_t eq = read_vco_hi(vco);
	const rsp_vect_t sign = read_vco_lo(vco);
	rsp_vect_t le;
	const rsp_vect_t result = rsp_vlt(LOAD_VS(), LOAD_VT(), rsp_vzero(), &le, eq, sign);

	write_vcc_hi(vcc, rsp_vzero());
	write_vcc_lo(vcc, le);
	write_vco_hi(vco, rsp_vzero());
	write_vco_lo(vco, rsp_vzero());
	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
// VNE: delegates to rsp_vne() with the current VCO halves (eq/sign) as inputs.
// The returned `le` mask goes to VCC low; VCC high and both VCO halves are cleared.
// The result is written to the low accumulator and to vd.
void RSP_VNE(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	auto *vcc = rsp->cp2.flags[RSP::RSP_VCC].e;
	auto *vco = rsp->cp2.flags[RSP::RSP_VCO].e;

	const rsp_vect_t eq = read_vco_hi(vco);
	const rsp_vect_t sign = read_vco_lo(vco);
	rsp_vect_t le;
	rsp_vect_t result = rsp_vne(LOAD_VS(), LOAD_VT(), rsp_vzero(), &le, eq, sign);

#ifdef INTENSE_DEBUG
	for (unsigned lane = 0; lane < 8; lane++)
		fprintf(stderr, "VD[%d] = %d\n", lane,
		        reinterpret_cast<int16_t*>(&result)[lane]);
#endif

	write_vcc_hi(vcc, rsp_vzero());
	write_vcc_lo(vcc, le);
	write_vco_hi(vco, rsp_vzero());
	write_vco_lo(vco, rsp_vzero());
	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
//
// VINVALID
//
// Handler for vector opcodes that are not implemented: prints a diagnostic to
// stderr and leaves the CPU state untouched.
//
void RSP_VINVALID(RSP::CPUState *, unsigned, unsigned, unsigned, unsigned)
{
	fputs("Unimplemented ...\n", stderr);
}
//
// VMACF
// VMACU
//
// VMACF: reads the three accumulator slices, passes them by pointer to
// rsp_vmacf_vmacu<false>() and writes the updated slices back. Result goes to vd.
//
void RSP_VMACF(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo = read_acc_lo(acc);
	rsp_vect_t md = read_acc_md(acc);
	rsp_vect_t hi = read_acc_hi(acc);

	const rsp_vect_t result =
	    rsp_vmacf_vmacu<false>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
// VMACU: same flow as VMACF, but instantiates rsp_vmacf_vmacu<true>().
void RSP_VMACU(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo = read_acc_lo(acc);
	rsp_vect_t md = read_acc_md(acc);
	rsp_vect_t hi = read_acc_hi(acc);

	const rsp_vect_t result =
	    rsp_vmacf_vmacu<true>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
//
// VMADH
// VMUDH
//
// VMADH: reads the three accumulator slices, passes them by pointer to
// rsp_vmadh_vmudh<true>() and writes the updated slices back. Result goes to vd.
//
void RSP_VMADH(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo = read_acc_lo(acc);
	rsp_vect_t md = read_acc_md(acc);
	rsp_vect_t hi = read_acc_hi(acc);

	const rsp_vect_t result =
	    rsp_vmadh_vmudh<true>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
// VMUDH: same flow as VMADH, but instantiates rsp_vmadh_vmudh<false>().
void RSP_VMUDH(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo = read_acc_lo(acc);
	rsp_vect_t md = read_acc_md(acc);
	rsp_vect_t hi = read_acc_hi(acc);

	const rsp_vect_t result =
	    rsp_vmadh_vmudh<false>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
//
// VMADL
// VMUDL
//
// VMADL: reads the three accumulator slices, passes them by pointer to
// rsp_vmadl_vmudl<true>() and writes the updated slices back. Result goes to vd.
//
void RSP_VMADL(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo = read_acc_lo(acc);
	rsp_vect_t md = read_acc_md(acc);
	rsp_vect_t hi = read_acc_hi(acc);

	const rsp_vect_t result =
	    rsp_vmadl_vmudl<true>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
// VMUDL: same flow as VMADL, but instantiates rsp_vmadl_vmudl<false>().
void RSP_VMUDL(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo = read_acc_lo(acc);
	rsp_vect_t md = read_acc_md(acc);
	rsp_vect_t hi = read_acc_hi(acc);

	const rsp_vect_t result =
	    rsp_vmadl_vmudl<false>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
//
// VMADM
// VMUDM
//
// VMADM: reads the three accumulator slices, passes them by pointer to
// rsp_vmadm_vmudm<true>() and writes the updated slices back. Result goes to vd.
//
void RSP_VMADM(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo = read_acc_lo(acc);
	rsp_vect_t md = read_acc_md(acc);
	rsp_vect_t hi = read_acc_hi(acc);

	const rsp_vect_t result =
	    rsp_vmadm_vmudm<true>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
// VMUDM: same flow as VMADM, but instantiates rsp_vmadm_vmudm<false>().
void RSP_VMUDM(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo = read_acc_lo(acc);
	rsp_vect_t md = read_acc_md(acc);
	rsp_vect_t hi = read_acc_hi(acc);

	const rsp_vect_t result =
	    rsp_vmadm_vmudm<false>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
//
// VMADN
// VMUDN
//
// VMADN: reads the three accumulator slices, passes them by pointer to
// rsp_vmadn_vmudn<true>() and writes the updated slices back. Result goes to vd.
//
void RSP_VMADN(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo = read_acc_lo(acc);
	rsp_vect_t md = read_acc_md(acc);
	rsp_vect_t hi = read_acc_hi(acc);

	const rsp_vect_t result =
	    rsp_vmadn_vmudn<true>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
// VMUDN: same flow as VMADN, but instantiates rsp_vmadn_vmudn<false>().
void RSP_VMUDN(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo = read_acc_lo(acc);
	rsp_vect_t md = read_acc_md(acc);
	rsp_vect_t hi = read_acc_hi(acc);

	const rsp_vect_t result =
	    rsp_vmadn_vmudn<false>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
//
// VMOV
//
// Writes the shuffled VT operand to the low accumulator, then lets rsp_vmov()
// compute the new contents of vd. The destination element index is taken from the
// low three bits of the vs field.
//
void RSP_VMOV(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *acc = rsp->cp2.acc.e;
	unsigned de = vs & 0x7;

	// The accumulator update must happen before rsp_vmov(), matching the order used
	// by the other single-element ops in this file.
	write_acc_lo(acc, LOAD_VT());

	// Use the rsp_vect_t abstraction like every other wrapper here instead of
	// naming the SSE type directly.
	rsp_vect_t result = rsp_vmov(rsp, vt, e, vd, de);
	STORE_RESULT();
}
//
// VMRG
//
// Delegates to rsp_vmrg() with the low half of VCC as the selection mask.
// Both VCO halves are cleared; the result goes to the low accumulator and to vd.
//
void RSP_VMRG(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	auto *vco = rsp->cp2.flags[RSP::RSP_VCO].e;

	const rsp_vect_t select = read_vcc_lo(rsp->cp2.flags[RSP::RSP_VCC].e);
	const rsp_vect_t result = rsp_vmrg(LOAD_VS(), LOAD_VT(), select);

	write_vco_hi(vco, rsp_vzero());
	write_vco_lo(vco, rsp_vzero());
	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
//
// VMULF
// VMULU
//
// VMULF: delegates to rsp_vmulf_vmulu<false>(), which fills all three accumulator
// slices through its out-parameters (the previous accumulator is not read here).
// Result goes to vd.
//
void RSP_VMULF(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo, md, hi;
	const rsp_vect_t result =
	    rsp_vmulf_vmulu<false>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
// VMULU: same flow as VMULF, but instantiates rsp_vmulf_vmulu<true>().
void RSP_VMULU(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	uint16_t *const acc = rsp->cp2.acc.e;

	rsp_vect_t lo, md, hi;
	const rsp_vect_t result =
	    rsp_vmulf_vmulu<true>(LOAD_VS(), LOAD_VT(), rsp_vzero(), &lo, &md, &hi);

	write_acc_lo(acc, lo);
	write_acc_md(acc, md);
	write_acc_hi(acc, hi);
	STORE_RESULT();
}
//
// VNOP
//
// Vector no-op: all arguments are ignored and no state is modified.
//
void RSP_VNOP(RSP::CPUState *, unsigned, unsigned, unsigned, unsigned)
{
}
//
// VOR
// VNOR
//
// VOR: result of rsp_vor(vs, vt) goes to the low accumulator and to vd.
//
void RSP_VOR(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	const rsp_vect_t lhs = LOAD_VS();
	const rsp_vect_t rhs = LOAD_VT();
	const rsp_vect_t result = rsp_vor(lhs, rhs);

	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
// VNOR: result of rsp_vnor(vs, vt) goes to the low accumulator and to vd.
void RSP_VNOR(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	const rsp_vect_t lhs = LOAD_VS();
	const rsp_vect_t rhs = LOAD_VT();
	const rsp_vect_t result = rsp_vnor(lhs, rhs);

	write_acc_lo(rsp->cp2.acc.e, result);
	STORE_RESULT();
}
//
// VRCP
// VRCPL
// VRSQ
// VRSQL
//
// VRCP: single-precision reciprocal via rsp_vrcp_vrsq<false>() with dp = 0.
// The destination element comes from the low three bits of vs. Note that e is
// reduced to 0..7 BEFORE LOAD_VT(), so the low accumulator receives VT shuffled
// with the reduced element. dp_flag is cleared before the divide helper runs.
//
void RSP_VRCP(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	const unsigned de = vs & 0x7;
	e &= 0x7;

	write_acc_lo(rsp->cp2.acc.e, LOAD_VT());
	rsp->cp2.dp_flag = 0;

	const rsp_vect_t result = rsp_vrcp_vrsq<false>(rsp, 0, vt, e, vd, de);
	STORE_RESULT();
}
// VRCPL: like VRCP, but passes bit 0 of the current dp_flag to
// rsp_vrcp_vrsq<false>() as the dp argument. dp_flag is sampled first and then
// cleared before the helper runs. e is reduced to 0..7 before LOAD_VT().
void RSP_VRCPL(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	const unsigned de = vs & 0x7;
	e &= 0x7;

	write_acc_lo(rsp->cp2.acc.e, LOAD_VT());

	const int dp = rsp->cp2.dp_flag & 1;
	rsp->cp2.dp_flag = 0;

	const rsp_vect_t result = rsp_vrcp_vrsq<false>(rsp, dp, vt, e, vd, de);
	STORE_RESULT();
}
// VRSQ: same flow as VRCP, but instantiates rsp_vrcp_vrsq<true>() (dp = 0).
// e is reduced to 0..7 before LOAD_VT(); dp_flag is cleared before the helper runs.
void RSP_VRSQ(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	const unsigned de = vs & 0x7;
	e &= 0x7;

	write_acc_lo(rsp->cp2.acc.e, LOAD_VT());
	rsp->cp2.dp_flag = 0;

	const rsp_vect_t result = rsp_vrcp_vrsq<true>(rsp, 0, vt, e, vd, de);
	STORE_RESULT();
}
// VRSQL: same flow as VRCPL, but instantiates rsp_vrcp_vrsq<true>(). Bit 0 of
// dp_flag is sampled as the dp argument, then dp_flag is cleared before the helper
// runs. e is reduced to 0..7 before LOAD_VT().
void RSP_VRSQL(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	const unsigned de = vs & 0x7;
	e &= 0x7;

	write_acc_lo(rsp->cp2.acc.e, LOAD_VT());

	const int dp = rsp->cp2.dp_flag & 1;
	rsp->cp2.dp_flag = 0;

	const rsp_vect_t result = rsp_vrcp_vrsq<true>(rsp, dp, vt, e, vd, de);
	STORE_RESULT();
}
//
// VRCPH
// VRSQH
//
// VRCPH: writes the shuffled VT operand (e reduced to 0..7 first) to the low
// accumulator, sets dp_flag and lets rsp_vdivh() compute the new contents of vd.
//
void RSP_VRCPH(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	const unsigned de = vs & 0x7;
	e &= 0x7;

	write_acc_lo(rsp->cp2.acc.e, LOAD_VT());

	// Specify double-precision for VRCPL on the next pass.
	rsp->cp2.dp_flag = 1;

	const rsp_vect_t result = rsp_vdivh(rsp, vt, e, vd, de);
	STORE_RESULT();
}
// VRSQH: identical flow to VRCPH — low accumulator gets the shuffled VT operand
// (e reduced to 0..7 first), dp_flag is set and rsp_vdivh() produces vd.
void RSP_VRSQH(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
	const unsigned de = vs & 0x7;
	e &= 0x7;

	write_acc_lo(rsp->cp2.acc.e, LOAD_VT());

	// Request double precision for the next low-half op; both VRSQL and VRCPL
	// sample dp_flag.
	rsp->cp2.dp_flag = 1;

	const rsp_vect_t result = rsp_vdivh(rsp, vt, e, vd, de);
	STORE_RESULT();
}
//
// VSAR
//
// VSAR handler: reads one slice of the accumulator, selected by the element
// field. The accumulator itself is only read here, never written, and the
// vs / vt fields are not used.
//   e == 8  -> high slice
//   e == 9  -> middle slice
//   e == 10 -> low slice
//   other   -> all-zero vector
void RSP_VSAR(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
uint16_t *acc = rsp->cp2.acc.e;
rsp_vect_t result;
// e is compared unmasked, so values outside 8..10 fall through to the zero case.
switch (e) {
case 8: result = read_acc_hi(acc); break;
case 9: result = read_acc_md(acc); break;
case 10: result = read_acc_lo(acc); break;
default: result = rsp_vzero(); break;
}
// STORE_RESULT() is defined outside this view; presumably it writes "result" to vd.
STORE_RESULT();
}
//
// VSUB
//
// VSUB handler: subtracts using the low half of VCO as the incoming carry,
// then clears both halves of VCO.
void RSP_VSUB(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
uint16_t *acc = rsp->cp2.acc.e;
rsp_vect_t carry, acc_lo;
// The carry must be read before VCO is zeroed further down.
carry = read_vco_lo(rsp->cp2.flags[RSP::RSP_VCO].e);
// rsp_vsub() returns the value for vd and reports the low accumulator slice
// separately through its last (output) argument.
rsp_vect_t result = rsp_vsub(LOAD_VS(), LOAD_VT(), carry, &acc_lo);
write_vco_hi(rsp->cp2.flags[RSP::RSP_VCO].e, rsp_vzero());
write_vco_lo(rsp->cp2.flags[RSP::RSP_VCO].e, rsp_vzero());
write_acc_lo(acc, acc_lo);
// STORE_RESULT() is defined outside this view; presumably it writes "result" to vd.
STORE_RESULT();
}
//
// VSUBC
//
// VSUBC handler: subtracts and records the outcome in VCO. The helper's two
// output vectors, "eq" and "sn", are stored in the high and low halves of VCO
// respectively; the result goes to both the low accumulator slice and vd.
void RSP_VSUBC(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
uint16_t *acc = rsp->cp2.acc.e;
rsp_vect_t eq, sn;
// A zero vector is passed as the helper's third argument.
rsp_vect_t result = rsp_vsubc(LOAD_VS(), LOAD_VT(), rsp_vzero(), &eq, &sn);
write_vco_hi(rsp->cp2.flags[RSP::RSP_VCO].e, eq);
write_vco_lo(rsp->cp2.flags[RSP::RSP_VCO].e, sn);
write_acc_lo(acc, result);
// STORE_RESULT() is defined outside this view; presumably it writes "result" to vd.
STORE_RESULT();
}
//
// VXOR
// VNXOR
//
// VXOR handler: combines the VS and VT operands with rsp_vxor(); the result is
// written to the low accumulator slice and stored via STORE_RESULT().
void RSP_VXOR(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
uint16_t *acc = rsp->cp2.acc.e;
rsp_vect_t result = rsp_vxor(LOAD_VS(), LOAD_VT());
write_acc_lo(acc, result);
// STORE_RESULT() is defined outside this view; presumably it writes "result" to vd.
STORE_RESULT();
}
// VNXOR handler: same shape as RSP_VXOR but uses rsp_vnxor(); the result is
// written to the low accumulator slice and stored via STORE_RESULT().
void RSP_VNXOR(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
{
uint16_t *acc = rsp->cp2.acc.e;
rsp_vect_t result = rsp_vnxor(LOAD_VS(), LOAD_VT());
write_acc_lo(acc, result);
// STORE_RESULT() is defined outside this view; presumably it writes "result" to vd.
STORE_RESULT();
}
// RESERVED
// RESERVED
// Handler for reserved opcodes: stores an all-zero vector and does not touch
// the accumulator or the flags. The vs / vt / e fields are ignored, hence the
// unnamed parameters.
void RSP_RESERVED(RSP::CPUState *rsp, unsigned vd, unsigned, unsigned, unsigned)
{
rsp_vect_t result = rsp_vzero();
// STORE_RESULT() is defined outside this view; presumably it writes "result" to vd.
STORE_RESULT();
}
}

226
rsp_1.1.h Normal file
View File

@ -0,0 +1,226 @@
#ifndef __RSP_1_1_H__
#define __RSP_1_1_H__
#if defined(__cplusplus)
extern "C" {
#endif
/* Plugin kinds; an RSP plugin reports PLUGIN_TYPE_RSP in PLUGIN_INFO.Type. */
#define PLUGIN_TYPE_RSP 1
#define PLUGIN_TYPE_GFX 2
#define PLUGIN_TYPE_AUDIO 3
#define PLUGIN_TYPE_CONTROLLER 4
#if !defined(M64P_PLUGIN_API)
/*
 * Slight changes to zilmar's spec file for portability.
 *
 * The raw plugin spec headers by zilmar required WIN32 definitions.
 *
 * Here, sufficient ANSI approximations are given so that this header
 * can be used without the Windows headers: each handle type is a pointer
 * to a distinct dummy struct, so the handle types stay mutually incompatible.
 * These stand-ins are only declared when M64P_PLUGIN_API is not defined.
 */
struct HWND__ {int unused;};
typedef struct HWND__ *HWND; /* window handle */
struct HINSTANCE__ {int unused;};
typedef struct HINSTANCE__ *HINSTANCE; /* module instance handle */
struct HMENU__ {int unused;};
typedef struct HMENU__ *HMENU; /* menu handle */
struct HDC__ {int unused;};
typedef struct HDC__ *HDC; /* device-context handle */
#endif
/*
 * RCPREG: the integer type used for the register pointers in RSP_INFO.
 *
 * uint32_t is used when <stdint.h> is known to have been included (detected
 * through the _STDINT_H guard macro, which not every C library defines) or
 * when building against the Mupen64Plus API. Otherwise plain `unsigned int`
 * is used. The `unsigned long` branch is permanently disabled by `#elif (0)`.
 */
#if defined(_STDINT_H) || defined(M64P_PLUGIN_API)
typedef uint32_t RCPREG;
#elif (0)
typedef unsigned long RCPREG; /* necessary for 16-bit targets */
#else
typedef unsigned int RCPREG; /* ANSI approximation of 32-bit size */
#endif
/* Plugin description; the plugin fills it in from GetDllInfo(). */
typedef struct {
unsigned short Version; /* Should be set to 0x0101 */
unsigned short Type; /* Set to PLUGIN_TYPE_RSP */
char Name[100]; /* Name of the DLL */
/* Memory options: set each one to TRUE if the DLL supports that memory
   layout, or to FALSE if it does not. */
int NormalMemory; /* a normal BYTE array */
int MemoryBswaped; /* a normal BYTE array where the memory has been pre-
byte-swapped on a DWORD (32 bits) boundary */
} PLUGIN_INFO;
#if !defined(M64P_PLUGIN_API)
/*
 * Emulator state handed to the plugin through InitiateRSP(): pointers to the
 * emulated memories and registers, plus callbacks into the emulator.
 * Only declared here when M64P_PLUGIN_API is not defined.
 * NOTE: the member order is part of the plugin interface; do not reorder.
 */
typedef struct {
HINSTANCE hInst;
int MemoryBswaped; /* If this is set to TRUE, then the memory has been
pre-byte-swapped on a DWORD (32 bits) boundary */
/* Emulated memories. */
unsigned char *RDRAM;
unsigned char *DMEM;
unsigned char *IMEM;
/* Interrupt register. */
RCPREG *MI_INTR_REG;
/* SP registers. */
RCPREG *SP_MEM_ADDR_REG;
RCPREG *SP_DRAM_ADDR_REG;
RCPREG *SP_RD_LEN_REG;
RCPREG *SP_WR_LEN_REG;
RCPREG *SP_STATUS_REG;
RCPREG *SP_DMA_FULL_REG;
RCPREG *SP_DMA_BUSY_REG;
RCPREG *SP_PC_REG; /* This was SUPPOSED to be defined after the next. */
RCPREG *SP_SEMAPHORE_REG;
/** RCPREG *SP_PC_REG; // CPU-mapped between SP and DP command buffer regs **/
/* DP command (DPC) registers. */
RCPREG *DPC_START_REG;
RCPREG *DPC_END_REG;
RCPREG *DPC_CURRENT_REG;
RCPREG *DPC_STATUS_REG;
RCPREG *DPC_CLOCK_REG;
RCPREG *DPC_BUFBUSY_REG;
RCPREG *DPC_PIPEBUSY_REG;
RCPREG *DPC_TMEM_REG;
/* Callbacks into the emulator; none take arguments or return a value. */
void (*CheckInterrupts)(void);
void (*ProcessDList)(void);
void (*ProcessAList)(void);
void (*ProcessRdpList)(void);
void (*ShowCFB)(void);
} RSP_INFO;
#endif
/*
 * Table of debugger callbacks. No function declared in this header takes or
 * returns a DEBUG_INFO; the comments further down explain that the debugger
 * entry points of the original spec were dropped.
 */
typedef struct {
void (*UpdateBreakPoints)(void);
void (*UpdateMemory)(void);
void (*UpdateR4300iRegisters)(void);
void (*Enter_BPoint_Window)(void);
void (*Enter_R4300i_Commands_Window)(void);
void (*Enter_R4300i_Register_Window)(void);
void (*Enter_RSP_Commands_Window)(void);
void (*Enter_Memory_Window)(void);
} DEBUG_INFO;
/*
 * EXPORT / CALL: export and calling-convention decoration for the plugin
 * entry points declared below.
 *
 * When building against the Mupen64Plus API, its own headers are pulled in
 * and are expected to provide these macros. Otherwise they are defined here.
 */
#if defined(M64P_PLUGIN_API)
#define M64P_PLUGIN_PROTOTYPES 1
#include "m64p_types.h"
#include "m64p_common.h"
#include "m64p_plugin.h"
#include "m64p_config.h"
#else
/*
 * _WIN32 is the macro the compiler itself predefines for Windows targets;
 * WIN32 is only set by <windows.h> or by the build system, so testing it
 * alone can wrongly select the GCC-style attribute on a Windows build.
 */
#if defined(_WIN32) || defined(WIN32)
#define EXPORT __declspec(dllexport)
#define CALL __cdecl
#else
#define EXPORT __attribute__((visibility("default")))
#define CALL
#endif
#endif
/* The four entry points below are only declared when M64P_PLUGIN_API is not
   defined. */
#if !defined(M64P_PLUGIN_API)
/******************************************************************
Function: CloseDLL
Purpose: This function is called when the emulator is closing
down, allowing the DLL to de-initialise.
input: none
output: none
*******************************************************************/
EXPORT void CALL CloseDLL(void);
/******************************************************************
Function: DllAbout
Purpose: This optional function is provided to give further
information about the DLL.
input: a handle to the window that calls this function
output: none
*******************************************************************/
EXPORT void CALL DllAbout(HWND hParent);
/******************************************************************
Function: DllConfig
Purpose: This optional function is provided to allow the user
to configure the DLL.
input: a handle to the window that calls this function
output: none
*******************************************************************/
EXPORT void CALL DllConfig(HWND hParent);
/******************************************************************
Function: DllTest
Purpose: This optional function is provided to allow the user
to test the DLL.
input: a handle to the window that calls this function
output: none
*******************************************************************/
EXPORT void CALL DllTest(HWND hParent);
#endif
/******************************************************************
Function: DoRspCycles
Purpose: This function allows the RSP to run in parallel with
the r4300, switching control back to the r4300 once
the function ends.
input: The number of cycles that are meant to be executed
output: The number of cycles that were executed. This value can
be greater than the number of cycles that the RSP
should have performed.
(this value is ignored if the RSP is stopped)
*******************************************************************/
EXPORT unsigned int CALL DoRspCycles(unsigned int Cycles);
/******************************************************************
Function: GetDllInfo
Purpose: This function allows the emulator to gather information
about the DLL by filling in the PluginInfo structure.
input: a pointer to a PLUGIN_INFO structure that needs to be
filled in by the function (see the definition above)
output: none (the result is returned through PluginInfo)
*******************************************************************/
EXPORT void CALL GetDllInfo(PLUGIN_INFO *PluginInfo);
/*
* `GetRspDebugInfo` -- customarily deprecated by cxd4
*
* It was extraordinarily easy to re-invent debug facilities without
* depending on the Microsoft-Windows-themed debug functions from this spec.
*
* What's more? No emulators supporting RSP plugins require this function.
* It can be safely ignored as a non-portable custom extension to the spec.
*/
/******************************************************************
Function: InitiateRSP
Purpose: This function is called when the DLL is started, to pass
the information from the emulator that the N64 RSP
interface needs.
input: Rsp_Info is passed to this function by value; it is
defined above.
CycleCount is the number of cycles between switching
control between the RSP and r4300i core.
output: none
*******************************************************************/
EXPORT void CALL InitiateRSP(RSP_INFO Rsp_Info, unsigned int *CycleCount);
/*
* `InitiateRSPDebugger` -- customarily deprecated by cxd4
*
* Here, again, nothing about the full features of debugging this RSP
* emulator needed to depend on any WIN32 fixations in this plugin spec.
*
* Also, again, as with the case of `GetRspDebugInfo`, the test of time has
* passed the conclusion that no emulators require the RSP plugin to export
* this procedure's symbol to be considered a valid RSP plugin.
*/
/******************************************************************
Function: RomClosed
Purpose: This function is called when a ROM is closed.
input: none
output: none
*******************************************************************/
EXPORT void CALL RomClosed(void);
#if defined(__cplusplus)
}
#endif
#endif

91
rsp_op.hpp Normal file
View File

@ -0,0 +1,91 @@
#ifndef RSP_OP_HPP__
#define RSP_OP_HPP__
// Prototypes of the RSP operation handlers. They are given C linkage, so their
// symbol names are not mangled.
// NOTE(review): this header uses RSP::CPUState without including its
// definition; it presumably relies on state.hpp being included first -- confirm.
extern "C" {
// Coprocessor 0 moves. Both return an int; its meaning is not visible here.
int RSP_MFC0(RSP::CPUState *rsp, unsigned rt, unsigned rd);
int RSP_MTC0(RSP::CPUState *rsp, unsigned rd, unsigned rt);
// Coprocessor 2 register moves (MTC2 / MFC2) and control-register moves (CFC2 / CTC2).
void RSP_MTC2(RSP::CPUState *rsp, unsigned rt, unsigned vd, unsigned e);
void RSP_MFC2(RSP::CPUState *rsp, unsigned rt, unsigned vs, unsigned e);
void RSP_CFC2(RSP::CPUState *rsp, unsigned rt, unsigned rd);
void RSP_CTC2(RSP::CPUState *rsp, unsigned rt, unsigned rd);
// Control-flow hooks. They take an untyped context pointer instead of a
// CPUState; what it points to is not visible from this header.
void RSP_CALL(void *opaque, unsigned target, unsigned ret);
void RSP_RETURN(void *opaque, unsigned pc);
void RSP_EXIT(void *opaque, int mode);
// Declares the handler RSP_<op> for a vector load/store instruction.
// All load/store handlers share this one signature.
#define DECL_LS(op) \
void RSP_##op(RSP::CPUState *rsp, unsigned rt, unsigned element, int offset, unsigned base)
// Vector loads.
DECL_LS(LBV);
DECL_LS(LSV);
DECL_LS(LLV);
DECL_LS(LDV);
DECL_LS(LQV);
DECL_LS(LRV);
DECL_LS(LPV);
DECL_LS(LUV);
DECL_LS(LHV);
DECL_LS(LFV);
DECL_LS(LTV);
// Vector stores.
DECL_LS(SBV);
DECL_LS(SSV);
DECL_LS(SLV);
DECL_LS(SDV);
DECL_LS(SQV);
DECL_LS(SRV);
DECL_LS(SPV);
DECL_LS(SUV);
DECL_LS(SHV);
DECL_LS(SFV);
DECL_LS(STV);
// Declares the handler RSP_<op> for a vector-unit computational instruction.
// All such handlers share this one signature.
#define DECL_COP2(op) \
void RSP_##op(RSP::CPUState *rsp, unsigned vd, unsigned vs, unsigned vt, unsigned e)
// Multiplies.
DECL_COP2(VMULF);
DECL_COP2(VMULU);
DECL_COP2(VMUDL);
DECL_COP2(VMUDM);
DECL_COP2(VMUDN);
DECL_COP2(VMUDH);
// Multiply-accumulates.
DECL_COP2(VMACF);
DECL_COP2(VMACU);
DECL_COP2(VMACQ);
DECL_COP2(VMADL);
DECL_COP2(VMADM);
DECL_COP2(VMADN);
DECL_COP2(VMADH);
// Add / subtract / absolute value and accumulator read.
DECL_COP2(VADD);
DECL_COP2(VSUB);
DECL_COP2(VABS);
DECL_COP2(VADDC);
DECL_COP2(VSUBC);
DECL_COP2(VSAR);
// Compares, clips and merge.
DECL_COP2(VLT);
DECL_COP2(VEQ);
DECL_COP2(VNE);
DECL_COP2(VGE);
DECL_COP2(VCL);
DECL_COP2(VCH);
DECL_COP2(VCR);
DECL_COP2(VMRG);
// Bitwise logic.
DECL_COP2(VAND);
DECL_COP2(VNAND);
DECL_COP2(VOR);
DECL_COP2(VNOR);
DECL_COP2(VXOR);
DECL_COP2(VNXOR);
// Reciprocal / reciprocal square root family and element move.
DECL_COP2(VRCP);
DECL_COP2(VRCPL);
DECL_COP2(VRCPH);
DECL_COP2(VMOV);
DECL_COP2(VRSQ);
DECL_COP2(VRSQL);
DECL_COP2(VRSQH);
// No-op and reserved opcodes.
DECL_COP2(VNOP);
DECL_COP2(RESERVED);
}
#endif

136
state.hpp Normal file
View File

@ -0,0 +1,136 @@
#ifndef STATE_HPP__
#define STATE_HPP__
#include "rsp.h"
// Sizes of the data and instruction memories, in bytes.
#define DMEM_SIZE (4 * 1024)
#define IMEM_SIZE (4 * 1024)
// The same sizes counted in 4-byte words. Each count is derived from its own
// memory's size so the two stay correct if the sizes ever diverge.
#define DMEM_WORDS (DMEM_SIZE / 4)
#define IMEM_WORDS (IMEM_SIZE / 4)
// IMEM is divided into fixed-size code blocks. CODE_BLOCK_SIZE_LOG2 must stay
// equal to log2(CODE_BLOCK_SIZE).
#define CODE_BLOCK_SIZE (256)
#define CODE_BLOCK_WORDS (CODE_BLOCK_SIZE / 4)
#define CODE_BLOCK_SIZE_LOG2 (8)
// Number of code blocks in IMEM.
#define CODE_BLOCKS (IMEM_SIZE / CODE_BLOCK_SIZE)
namespace RSP
{
// Indices into CP2::flags[], one per vector flag register
// (used e.g. as rsp->cp2.flags[RSP::RSP_VCO]).
enum RSPFlags
{
RSP_VCO = 0,
RSP_VCC = 1,
RSP_VCE = 2
};
// Offsets of the three accumulator slices. The values are multiples of 8 and
// CP2::acc holds 3 * 8 elements, so these are presumably element offsets into
// CP2::acc.e (high slice first) -- TODO confirm against the read_acc_* /
// write_acc_* helpers.
enum RSPAccumulator
{
RSP_ACC_LO = 16,
RSP_ACC_MD = 8,
RSP_ACC_HI = 0
};
// Coprocessor 0 register numbers. There are 16 of them (0..15), matching the
// size of CP0::cr[], so they presumably index that array -- TODO confirm.
enum CP0Registers
{
// DMA and SP status registers.
CP0_REGISTER_DMA_CACHE = 0,
CP0_REGISTER_DMA_DRAM = 1,
CP0_REGISTER_DMA_READ_LENGTH = 2,
CP0_REGISTER_DMA_WRITE_LENGTH = 3,
CP0_REGISTER_SP_STATUS = 4,
CP0_REGISTER_DMA_FULL = 5,
CP0_REGISTER_DMA_BUSY = 6,
CP0_REGISTER_SP_RESERVED = 7,
// Command (CMD) registers.
CP0_REGISTER_CMD_START = 8,
CP0_REGISTER_CMD_END = 9,
CP0_REGISTER_CMD_CURRENT = 10,
CP0_REGISTER_CMD_STATUS = 11,
CP0_REGISTER_CMD_CLOCK = 12,
CP0_REGISTER_CMD_BUSY = 13,
CP0_REGISTER_CMD_PIPE_BUSY = 14,
CP0_REGISTER_CMD_TMEM_BUSY = 15,
};
// NOTE: the macros below are written inside namespace RSP, but preprocessor
// macros are not scoped by namespaces; they are visible globally after this
// header is included.
//
// SP_STATUS read bits.
// One mask per status bit; each constant has exactly one bit set.
#define SP_STATUS_HALT 0x0001
#define SP_STATUS_BROKE 0x0002
#define SP_STATUS_DMA_BUSY 0x0004
#define SP_STATUS_DMA_FULL 0x0008
#define SP_STATUS_IO_FULL 0x0010
#define SP_STATUS_SSTEP 0x0020
#define SP_STATUS_INTR_BREAK 0x0040
#define SP_STATUS_SIG0 0x0080
#define SP_STATUS_SIG1 0x0100
#define SP_STATUS_SIG2 0x0200
#define SP_STATUS_SIG3 0x0400
#define SP_STATUS_SIG4 0x0800
#define SP_STATUS_SIG5 0x1000
#define SP_STATUS_SIG6 0x2000
#define SP_STATUS_SIG7 0x4000
// SP_STATUS write bits.
// Apart from SP_CLR_BROKE, which has no SET counterpart, the write bits come
// in CLR / SET pairs, so their positions differ from the read bits above.
#define SP_CLR_HALT 0x00000001
#define SP_SET_HALT 0x00000002
#define SP_CLR_BROKE 0x00000004
#define SP_CLR_INTR 0x00000008
#define SP_SET_INTR 0x00000010
#define SP_CLR_SSTEP 0x00000020
#define SP_SET_SSTEP 0x00000040
#define SP_CLR_INTR_BREAK 0x00000080
#define SP_SET_INTR_BREAK 0x00000100
#define SP_CLR_SIG0 0x00000200
#define SP_SET_SIG0 0x00000400
#define SP_CLR_SIG1 0x00000800
#define SP_SET_SIG1 0x00001000
#define SP_CLR_SIG2 0x00002000
#define SP_SET_SIG2 0x00004000
#define SP_CLR_SIG3 0x00008000
#define SP_SET_SIG3 0x00010000
#define SP_CLR_SIG4 0x00020000
#define SP_SET_SIG4 0x00040000
#define SP_CLR_SIG5 0x00080000
#define SP_SET_SIG5 0x00100000
#define SP_CLR_SIG6 0x00200000
#define SP_SET_SIG6 0x00400000
#define SP_CLR_SIG7 0x00800000
#define SP_SET_SIG7 0x01000000
// Storage for N consecutive vectors of eight 16-bit elements each, aligned as
// strictly as rsp_vect_t. Code elsewhere reaches the raw elements through ".e".
template<int N>
struct alignas(rsp_vect_t) AlignedRSPVector
{
uint16_t e[8 * N];
};
// Coprocessor 0 state: pointers to sixteen 32-bit registers plus one pointer
// for interrupts. All pointers start out null; presumably they are pointed at
// registers owned by the host emulator during initialisation -- TODO confirm.
struct CP0
{
uint32_t *cr[16] = {};
uint32_t *irq = nullptr;
};
// Coprocessor 2 (vector unit) state, aligned to 64 bytes.
// NOTE(review): the member order may be relied on by generated code; confirm
// before reordering.
struct alignas(64) CP2
{
// 32 vector registers of eight 16-bit elements each.
AlignedRSPVector<1> regs[32];
// Flag registers, indexed by RSPFlags; each holds two vectors' worth of
// elements (accessed as separate "hi" and "lo" halves by the handlers).
AlignedRSPVector<2> flags[3];
// Accumulator: three vectors' worth of elements (high, middle, low slices).
AlignedRSPVector<3> acc;
// Divide unit state; the code that reads and writes these two fields is
// outside this view.
int16_t div_out;
int16_t div_in;
// Double-precision flag: set to 1 by the VRCPH / VRSQH handlers and cleared
// by the VRCP / VRCPL / VRSQ / VRSQL handlers (the "L" forms read it first).
int8_t dp_flag;
};
// Complete RSP processor state as passed to every operation handler.
struct CPUState
{
uint32_t pc = 0;
// Presumably one bit per IMEM code block that needs attention; the
// static_assert below guarantees that all blocks fit into 32 bits.
uint32_t dirty_blocks = 0;
static_assert(CODE_BLOCKS <= 32, "Code blocks must fit in 32-bit register.");
// Branch bookkeeping; the code that uses these two fields is outside this view.
uint32_t has_delay_slot = 0;
uint32_t branch_target = 0;
// The 32 scalar registers.
uint32_t sr[32] = {};
// Memories, viewed as arrays of 32-bit words. They start out null and must
// be set before use.
uint32_t *dmem = nullptr;
uint32_t *imem = nullptr;
uint32_t *rdram = nullptr;
// Vector unit state, zero-initialised.
CP2 cp2 = {};
// Coprocessor 0 register pointers.
CP0 cp0;
};
}
#endif