Add CXD4. ParallelRSP Preparations

2024-11-23 08:10:05 +00:00 · 2020-02-14 11:54:03 +01:00 · 2020-02-14 11:54:03 +01:00 · a3dcc7bfc8
commit a3dcc7bfc8
parent def603f06c
28 changed files with 9436 additions and 8 deletions
--- a/mupen64plus-core/src/device/r4300/r4300_core.c
+++ b/mupen64plus-core/src/device/r4300/r4300_core.c
@ -50,7 +50,7 @@ void init_r4300(struct r4300_core* r4300, struct memory* mem, struct mi_controll
        NULL;
 #endif

-    r4300->emumode = emumode;
+    r4300->emumode = EMUMODE_DYNAREC;
    init_cp0(&r4300->cp0, count_per_op, new_dynarec_hot_state, interrupt_handlers);
    init_cp1(&r4300->cp1, new_dynarec_hot_state);

--- a/mupen64plus-core/src/plugin/plugin.c
+++ b/mupen64plus-core/src/plugin/plugin.c
@ -36,7 +36,6 @@
 #include "device/rcp/vi/vi_controller.h"
 #include "dummy_audio.h"
 #include "dummy_input.h"
-#include "dummy_video.h"
 #include "main/main.h"
 #include "main/rom.h"
 #include "main/version.h"
@ -46,8 +45,6 @@
 #include <stdio.h>

 CONTROL Controls[4];
-/* global function pointers - initialized on core startup */
-

 /* local data structures and functions */
 #define DEFINE_GFX(X) \
@ -92,6 +89,9 @@ CONTROL Controls[4];
    }

 DEFINE_GFX(gln64);
+#if defined(HAVE_THR_AL)
+DEFINE_GFX(angrylion);
+#endif

 gfx_plugin_functions gfx;
 GFX_INFO gfx_info;
@ -169,10 +169,11 @@ static void EmptyFunc(void)
        X##RomClosed \
    }

-DEFINE_RSP(hle);
-#ifndef VC
-//DEFINE_RSP(lle);
+DEFINE_RSP(parallelRSP);
+#if defined(HAVE_LLE)
+DEFINE_RSP(cxd4);
 #endif
+
 static void                     (*l_mainRenderCallback)(int) = NULL;
 static ptr_SetRenderingCallback   l_old1SetRenderingCallback = NULL;

@ -189,6 +190,8 @@ static void backcompat_setRenderCallbackIntercept(void (*callback)(int))

 m64p_error plugin_start_gfx(void)
 {
+    printf("plugin_start_gfx\n");
+
    uint8_t media = *((uint8_t*)mem_base_u32(g_mem_base, MM_CART_ROM) + (0x3b ^ S8));

    /* Here we feed 64DD IPL ROM header to GFX plugin if 64DD is present.
@ -376,7 +379,7 @@ void plugin_connect_all()
    l_GfxAttached = 1;
    plugin_start_gfx();

-    rsp = rsp_hle;
+    rsp = rsp_parallelRSP;
    l_RspAttached = 1;
    plugin_start_rsp();

--- a/mupen64plus-rsp-cxd4/COPYING
+++ b/mupen64plus-rsp-cxd4/COPYING
@ -0,0 +1,121 @@
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
--- a/mupen64plus-rsp-cxd4/README.md
+++ b/mupen64plus-rsp-cxd4/README.md
@ -0,0 +1,191 @@
+# Vector Technology as Implemented for Use with a RISC and SIMD Technology Signal Processor
+A vector processor uses long registers addressable by segment-precision, where each segment is _n_ bits wide.  The power of a vector processor is that many complex matrix operations, whose algorithms take many scalar CPU instructions and clock cycles to emulate on a regular, personal computer processor, can oftentimes formulate and transfer the correct result in less than a single clock cycle.  The impossibility of replicating this precise behavior has paved the way for vendor businesses to protect their systems against hardware emulation since the introduction of display devices rendering three-dimensional graphics.  The Nintendo 64 was the first video game system to employ this convenience to its advantage.
+***
+
+## Project Reality's Signal Processor
+
+In the engineering make-up of the Nintendo 64 (original codename:  Project Reality) is a modified MIPS family revision 4000 co-processor called the "Reality Coprocessor" (RCP).  More importantly, the signal processor in this component is responsible for all vector memory operations and transactions, which are almost all impossible to emulate with full accuracy on a scalar, personal computer processor.  The vector technology implemented into this design is that accepted from Silicon Graphics, Inc.
+
+### RSP Vector Operation Matrices
+
+Here, the entire MIPS R4000 instruction set was modified for very fast, exception-free processing flow, and operation definitions for each instruction do not fall within the scope of this section.  Presented instead are layouts of the new instructions added to the scalar unit (those under `LWC2` and `SWC2`, even though they do interface with the vector unit) and the vector unit (essentially, any instruction under `COP2` whose mnemonic starts with a 'V').  Information on how pre-existing MIPS R4000 instructions were modified or which ones were removed is left as an adventure for the MIPS programmer to research.
+
+`C2` _vd_, _vs_, _vt_[_element_] `/* exceptions:  scalar divide reads */`
+
+|  COP2  | element |  vs1  |  vs2  |  vt   |  func  |
+| ------ |:-------:| ----- | ----- | ----- | ------ |
+|`010010`| `1eeee` |`ttttt`|`sssss`|`ddddd`|`??????`|
+
+The major types of VU computational instructions are _multiply,_ _add,_ _select,_ _logical,_ and _divide._
+
+Multiply instructions are the most frequent and classifiable as follows:
+
+* If `a == 0`, then round the product loaded to the accumulator (`VMUL*` and `VMUD*`).
+* If `a == 1`, then the product is added to an accumulator element (`VMAC*` and `VMAD*`).
+* If `(format & 0b100) == 0`, then the operation is single-precision (`VMUL*` and `VMAC*`).
+* If `(format & 0b100) != 0`, then the operation is double-precision (`VMUD*` and `VMAD*`).
+
+|_op-code_|   Type   |
+| -------:| -------- |
+| `00axxx`| multiply |
+| `01xxxx`| add      |
+| `100xxx`| select   |
+| `101xxx`| logical  |
+| `110xxx`| divide   |
+
+* `00 (VMULF)` Vector Multiply Signed Fractions
+* `01 (VMULU)` Vector Multiply Unsigned Fractions
+* `02 reserved` `VRNDP` was intended for MPEG DCT rounding but omitted.
+* `03 reserved` `VMULQ` was intended for MPEG inverse quantization but omitted.
+* `04 (VMUDL)` Vector Multiply Low Partial Products
+* `05 (VMUDM)` Vector Multiply Mid Partial Products
+* `06 (VMUDN)` Vector Multiply Mid Partial Products
+* `07 (VMUDH)` Vector Multiply High Partial Products
+* `10 (VMACF)` Vector Multiply-Accumulate Signed Fractions
+* `11 (VMACU)` Vector Multiply-Accumulate Unsigned Fractions
+* `12 reserved` `VRNDN` was intended for MPEG DCT rounding but omitted.
+* `13 (VMACQ)` Vector Accumulator Oddification
+* `14 (VMADL)` Vector Multiply-Accumulate Low Partial Products
+* `15 (VMADM)` Vector Multiply-Accumulate Mid Partial Products
+* `16 (VMADN)` Vector Multiply-Accumulate Mid Partial Products
+* `17 (VMADH)` Vector Multiply-Accumulate High Partial Products
+* `20 (VADD)` Vector Add Short Elements
+* `21 (VSUB)` Vector Subtract Short Elements
+* `22 reserved`
+* `23 (VABS)` Vector Absolute Value of Short Elements
+* `24 (VADDC)` Vector Add Short Elements with Carry
+* `25 (VSUBC)` Vector Subtract Short Elements with Carry
+* `26 reserved`
+* `27 reserved`
+* `30 reserved`
+* `31 reserved`
+* `32 reserved`
+* `33 reserved`
+* `34 reserved`
+* `35 (VSAR)` Vector Accumulator Read
+* `36 reserved`
+* `37 reserved`
+* `40 (VLT)` Vector Select Less Than
+* `41 (VEQ)` Vector Select Equal
+* `42 (VNE)` Vector Select Not Equal
+* `43 (VGE)` Vector Select Greater Than or Equal
+* `44 (VCL)` Vector Select Clip Test Low
+* `45 (VCH)` Vector Select Clip Test High
+* `46 (VCR)` Vector Select Clip Test Low (single-precision)
+* `47 (VMRG)` Vector Select Merge
+* `50 (VAND)` Vector AND Short Elements
+* `51 (VNAND)` Vector NAND Short Elements
+* `52 (VOR)` Vector OR Short Elements
+* `53 (VNOR)` Vector NOR Short Elements
+* `54 (VXOR)` Vector XOR Short Elements
+* `55 (VNXOR)` Vector NXOR Short Elements
+* `56 reserved`
+* `57 reserved`
+* `60 (VRCP)` Vector Element Scalar Reciprocal (single-precision)
+* `61 (VRCPL)` Vector Element Scalar Reciprocal Low
+* `62 (VRCPH)` Vector Element Scalar Reciprocal High
+* `63 (VMOV)` Vector Element Scalar Move
+* `64 (VRSQ)` Vector Element Scalar SQRT Reciprocal (single-precision)
+* `65 (VRSQL)` Vector Element Scalar SQRT Reciprocal Low
+* `66 (VRSQH)` Vector Element Scalar SQRT Reciprocal High
+* `67 (VNOP)` Vector Null Instruction
+* `70 reserved`
+* `71 reserved`
+* `72 reserved`
+* `73 reserved`
+* `74 reserved`
+* `75 reserved`
+* `76 reserved`
+* `77 reserved`
+
+### RSP Vector Load Transfers
+
+The VR-DMEM transaction instruction cycles are still processed by the scalar unit, not the vector unit.  In the modern implementations accepted by most vector unit communications systems today, the transfer instructions are classifiable under five groups:
+
+1.  BV, SV, LV, DV
+2.  PV, UV, XV, ZV
+3.  HV, FV, AV
+4.  QV, RV
+5.  TV, WV
+
+Not all of those instructions were implemented as of the time of the Nintendo 64's RCP, however.  Additionally, their ordering in the opcode matrix was a little skewed to what is seen below.  At this time, it is better to use only three categories of instructions:
+* _normal_:  Anything under Group I or Group IV is normal type.  Only the element must be aligned; `addr & 1` may resolve true.
+* _packed_:  Anything under Group II or Group III.  Useful for working with specially mapped data, such as pixels.
+* _transposed_:  `LTV`, *LTWV,* `STV`, and `SWV` can be found in heaps of 16 instructions, all dedicated to matrix transposition through eight diagonals of halfword elements.
+
+`LWC2` _vt_[_element_], _offset_(_base_)
+
+|  LWC2  | base  |  vt   |  rd   | element |  offset  |
+| ------ | ----- | ----- | ----- |:-------:| -------- |
+|`110010`|`sssss`|`ttttt`|`?????`| `eeee`  | `Xxxxxxx`|
+
+* `00 (LBV)` Load Byte to Vector Unit
+* `01 (LSV)` Load Shortword to Vector Unit
+* `02 (LLV)` Load Longword to Vector Unit
+* `03 (LDV)` Load Doubleword to Vector Unit
+* `04 (LQV)` Load Quadword to Vector Unit
+* `05 (LRV)` Load Rest to Vector Unit
+* `06 (LPV)` Load Packed Signed to Vector Unit
+* `07 (LUV)` Load Packed Unsigned to Vector Unit
+* `10 (LHV)` Load Alternate Bytes to Vector Unit
+* `11 (LFV)` Load Alternate Fourths to Vector Unit
+* `12 reserved` *LTWV*
+* `13 (LTV)` Load Transposed to Vector Unit
+* `14 reserved`
+* `15 reserved`
+* `16 reserved`
+* `17 reserved`
+
+`SWC2` _vt_[_element_], _offset_(_base_)
+
+|  SWC2  | base  |  vt   |  rd   | element |  offset  |
+| ------ | ----- | ----- | ----- |:-------:| -------- |
+|`111010`|`sssss`|`ttttt`|`?????`| `eeee`  | `Xxxxxxx`|
+
+* `00 (SBV)` Store Byte from Vector Unit
+* `01 (SSV)` Store Shortword from Vector Unit
+* `02 (SLV)` Store Longword from Vector Unit
+* `03 (SDV)` Store Doubleword from Vector Unit
+* `04 (SQV)` Store Quadword from Vector Unit
+* `05 (SRV)` Store Rest from Vector Unit
+* `06 (SPV)` Store Packed Signed from Vector Unit
+* `07 (SUV)` Store Packed Unsigned from Vector Unit
+* `10 (SHV)` Store Alternate Bytes from Vector Unit
+* `11 (SFV)` Store Alternate Fourths from Vector Unit
+* `12 (SWV)` Store Transposed Wrapped from Vector Unit
+* `13 (STV)` Store Transposed from Vector Unit
+* `14 reserved`
+* `15 reserved`
+* `16 reserved`
+* `17 reserved`
+
+If, by any chance, the opcode specifier is greater than 17 [oct], it was probably meant to execute the extended counterparts to the above loads and stores, which were questionably obsolete and remain reserved.
+
+## Informational References for Vector Processor Architecture
+
+_Instruction Methods for Performing Data Formatting While Moving Data Between Memory and a Vector Register File_
+United States patent no. 5,812,147
+Timothy J. Van Hook
+*Silicon Graphics, Inc.*
+
+_Method and System for Efficient Matrix Multiplication in a SIMD Processor Architecture_
+United States patent no. 7,873,812
+Tibet Mimar
+
+_Efficient Handling of Vector High-Level Language Constructs in a SIMD Processor_
+United States patent no. 7,793,084
+Tibet Mimar
+
+_Flexible Vector Modes of Operation for SIMD Processor_
+patent pending?
+Tibet Mimar
+
+_Programming a Vector Processor and Parallel Programming of an Asymmetric Dual Multiprocessor Comprised of a Vector Processor and a RISC Processor_
+United States patent no. 6,016,395
+Moataz Ali Mohamed
+*Samsung Electronics Co., Ltd.*
+
+_Execution Unit for Processing a Data Stream Independently and in Parallel_
+United States patent no. 6,401,194
+Le Trong Nguyen
+*Samsung Electronics Co., Ltd.*
--- a/mupen64plus-rsp-cxd4/config.h
+++ b/mupen64plus-rsp-cxd4/config.h
@ -0,0 +1,66 @@
+/******************************************************************************\
+* Authors:  Iconoclast                                                         *
+* Release:  2013.12.04                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+#ifndef _CXD4_CONFIG_H
+#define _CXD4_CONFIG_H
+
+/*
+ * Raw 32-byte settings store.  Every CFG_* accessor below reads or writes a
+ * fixed offset inside this array; the array itself is defined elsewhere.
+ */
+extern unsigned char rsp_conf[32];
+
+/*
+ * Compile-time feature switches.  EXTERN_COMMAND_LIST_GBI,
+ * EXTERN_COMMAND_LIST_ABI and WAIT_FOR_CPU_HOST are tested with #ifdef in
+ * module.c; the remaining switches are presumably consumed by other
+ * translation units -- TODO confirm.
+ */
+#define MINIMUM_MESSAGE_PRIORITY    1
+#define EXTERN_COMMAND_LIST_GBI
+#define EXTERN_COMMAND_LIST_ABI
+#define SEMAPHORE_LOCK_CORRECTIONS
+#define WAIT_FOR_CPU_HOST
+#define EMULATE_STATIC_PC
+
+/*
+ * The config file used to be a 32-byte EEPROM with binary settings storage.
+ * It was found necessary for user and contributor convenience to replace.
+ *
+ * The current configuration system now uses Garteal's CFG text definitions.
+ */
+
+/* Single-byte flags occupying the first four bytes of rsp_conf. */
+#define CFG_HLE_GFX     (rsp_conf[0x00])
+#define CFG_HLE_AUD     (rsp_conf[0x01])
+#define CFG_HLE_VID     (rsp_conf[0x02]) /* reserved/unused */
+#define CFG_HLE_JPG     (rsp_conf[0x03]) /* unused */
+/*
+ * Most of the point behind this config system is to let users use HLE video
+ * or audio plug-ins.  The other task types are used less than 1% of the time
+ * and only in a few games.  They require simulation from within the RSP
+ * internally, which I have no intention to ever support.  Some good research
+ * on a few of these special task types was done by Hacktarux in the MUPEN64
+ * HLE RSP plug-in, so consider using that instead for complete HLE.
+ */
+
+/*
+ * Schedule binary dump exports to the DllConfig schedule delay queue.
+ *
+ * Each accessor below reinterprets the bytes starting at the given offset as
+ * an `int`.  NOTE(review): the offsets are spaced four bytes apart, so this
+ * layout assumes sizeof(int) == 4 and that rsp_conf is suitably aligned for
+ * int access -- confirm for every supported target.
+ */
+#define CFG_QUEUE_E_DRAM    (*(int *)(rsp_conf + 0x04))
+#define CFG_QUEUE_E_DMEM    (*(int *)(rsp_conf + 0x08))
+#define CFG_QUEUE_E_IMEM    (*(int *)(rsp_conf + 0x0C))
+/*
+ * Note:  This never actually made it into the configuration system.
+ * Instead, DMEM and IMEM are always exported on every call to DllConfig().
+ */
+
+/*
+ * Special switches.
+ * (generally for correcting RSP clock behavior on Project64 2.x)
+ * Also includes RSP register states debugger.
+ *
+ * Same int-at-offset access pattern as the CFG_QUEUE_E_* accessors above.
+ */
+#define CFG_WAIT_FOR_CPU_HOST       (*(int *)(rsp_conf + 0x10))
+#define CFG_MEND_SEMAPHORE_LOCK     (*(int *)(rsp_conf + 0x14))
+#define CFG_TRACE_RSP_REGISTERS     (*(int *)(rsp_conf + 0x18))
+
+#endif /* _CXD4_CONFIG_H */
--- a/mupen64plus-rsp-cxd4/module.c
+++ b/mupen64plus-rsp-cxd4/module.c
@ -0,0 +1,496 @@
+/******************************************************************************\
+* Project:  Module Subsystem Interface to SP Interpreter Core                  *
+* Authors:  Iconoclast                                                         *
+* Release:  2016.03.23                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef WIN32
+#include <windows.h>
+#endif
+
+#include "module.h"
+#include "main/rom.h"
+#include "su.h"
+
+RSP_INFO RSP_INFO_NAME;
+
+#define RSP_CXD4_VERSION 0x0101
+
+#include <stdarg.h>
+
+#define RSP_PLUGIN_API_VERSION 0x020000
+#define CONFIG_API_VERSION       0x020100
+#define CONFIG_PARAM_VERSION     1.00
+
+static void (*l_DebugCallback)(void *, int, const char *) = NULL;
+static void *l_DebugCallContext = NULL;
+static int l_PluginInit = 0;
+static m64p_handle l_ConfigRsp;
+
+#define VERSION_PRINTF_SPLIT(x) (((x) >> 16) & 0xffff), (((x) >> 8) & 0xff), ((x) & 0xff)
+
+#define API_PREFIX(x) cxd4##x
+
+/*
+ * Resets the plugin configuration to all-zero and then applies the per-game
+ * graphics override:  when the loaded ROM's header name contains one of the
+ * listed fragments, CFG_HLE_GFX is set to 0.
+ *
+ * `source` is accepted for interface compatibility and is not read.
+ *
+ * NOTE(review):  the buffer cleared here is `conf`, while the CFG_* accessors
+ * in config.h index `rsp_conf` -- confirm both names refer to the same
+ * storage.
+ */
+NOINLINE void update_conf(const char* source)
+{
+    /* ROM header name fragments that select CFG_HLE_GFX = 0. */
+    static const char* const lle_gfx_titles[] = {
+        "WORLD DRIVER CHAMP",
+        "Indiana Jones",
+        "Rogue Squadron",
+        "rogue squadron",
+        "Battle for Naboo",
+        "Stunt Racer 64",
+        "GAUNTLET LEGENDS"
+    };
+    unsigned int idx;
+
+    memset(conf, 0, sizeof(conf));
+#if 0
+#ifndef __LIBRETRO__
+    //yomoma: do not override config settings if running under libretro
+    CFG_HLE_GFX = ConfigGetParamBool(l_ConfigRsp, "DisplayListToGraphicsPlugin");
+    CFG_HLE_AUD = ConfigGetParamBool(l_ConfigRsp, "AudioListToAudioPlugin");
+    CFG_WAIT_FOR_CPU_HOST = ConfigGetParamBool(l_ConfigRsp, "WaitForCPUHost");
+    CFG_MEND_SEMAPHORE_LOCK = ConfigGetParamBool(l_ConfigRsp, "SupportCPUSemaphoreLock");
+#endif
+#endif
+
+    /* First matching fragment wins; every match applies the same override. */
+    for (idx = 0; idx < sizeof(lle_gfx_titles) / sizeof(lle_gfx_titles[0]); idx++)
+    {
+        if (strstr((char*)ROM_HEADER.Name, lle_gfx_titles[idx]) == NULL)
+            continue;
+        CFG_HLE_GFX = 0;
+        break;
+    }
+}
+
+extern void DebugMessage(int level, const char *message, ...);
+
+/*
+ * Runs one RSP task.
+ *
+ * Flow, as written below:
+ *   1. If either of the two low bits of SP_STATUS_REG (mask 0x3) is set, log
+ *      "SP_STATUS_HALT" and return 0 without doing any work.
+ *   2. Read the 32-bit task type stored at DMEM offset 0xFC0.
+ *   3. Graphics and audio tasks may be handed to the callbacks held in the
+ *      RSP_INFO record (ProcessDlistList / ProcessAlistList) when the
+ *      matching CFG_HLE_* flag is non-zero; those paths return 0.
+ *   4. Everything else falls through to run_task(), after which the SP
+ *      status/interrupt state decides how to finish.
+ *
+ * Returns 0 on the early-exit and HLE paths, otherwise the `cycles` argument
+ * unchanged.
+ */
+EXPORT u32 CALL API_PREFIX(DoRspCycles)(u32 cycles)
+{
+    OSTask_type task_type;
+    register unsigned int i;
+
+    if (GET_RCP_REG(SP_STATUS_REG) & 0x00000003)
+    {
+        message("SP_STATUS_HALT");
+        return 0x00000000;
+    }
+
+    /*
+     * Task type word at DMEM + 0xFC0:  read directly when USE_CLIENT_ENDIAN
+     * is defined, otherwise assembled byte by byte, most significant first.
+     */
+    task_type = 0x00000000
+#ifdef USE_CLIENT_ENDIAN
+      | *((pi32)(DMEM + 0x000FC0U))
+#else
+      | (u32)DMEM[0xFC0] << 24
+      | (u32)DMEM[0xFC1] << 16
+      | (u32)DMEM[0xFC2] <<  8
+      | (u32)DMEM[0xFC3] <<  0
+#endif
+    ;
+    /* A `break` from any case below means "interpret the task in run_task()". */
+    switch (task_type) {
+#ifdef EXTERN_COMMAND_LIST_GBI
+    case M_GFXTASK:
+        if (CFG_HLE_GFX == 0)
+            break;
+
+        if (*(pi32)(DMEM + 0xFF0) == 0x00000000)
+            break; /* Resident Evil 2, null task pointers */
+        if (GET_RSP_INFO(ProcessDlistList) == NULL)
+            { /* branch */ }
+        else
+            GET_RSP_INFO(ProcessDlistList)();
+
+        /* Mark the task as finished, exactly as a BREAK would leave it. */
+        GET_RCP_REG(SP_STATUS_REG) |=
+            SP_STATUS_SIG2 | SP_STATUS_BROKE | SP_STATUS_HALT
+        ;
+        /* Raise MI interrupt bit 0 only when interrupt-on-break is enabled. */
+        if (GET_RCP_REG(SP_STATUS_REG) & SP_STATUS_INTR_BREAK) {
+            GET_RCP_REG(MI_INTR_REG) |= 0x00000001;
+            GET_RSP_INFO(CheckInterrupts)();
+        }
+        GET_RCP_REG(DPC_STATUS_REG) &= ~0x00000002ul; /* DPC_STATUS_FREEZE */
+        return 0;
+#endif
+#ifdef EXTERN_COMMAND_LIST_ABI
+    case M_AUDTASK:
+        if (CFG_HLE_AUD == 0)
+            break;
+
+        if (GET_RSP_INFO(ProcessAlistList) == NULL)
+            { /* branch */ }
+        else
+            GET_RSP_INFO(ProcessAlistList)();
+
+        /* Same completion sequence as the graphics path, minus the DPC bit. */
+        GET_RCP_REG(SP_STATUS_REG) |=
+            SP_STATUS_SIG2 | SP_STATUS_BROKE | SP_STATUS_HALT
+        ;
+        if (GET_RCP_REG(SP_STATUS_REG) & SP_STATUS_INTR_BREAK) {
+            GET_RCP_REG(MI_INTR_REG) |= 0x00000001;
+            GET_RSP_INFO(CheckInterrupts)();
+        }
+        return 0;
+#endif
+    case M_VIDTASK:
+        message("M_VIDTASK");
+        break;
+    case M_NJPEGTASK:
+        break; /* Zelda, Pokemon, others */
+    case M_NULTASK:
+        message("M_NULTASK");
+        break;
+    case M_HVQTASK:
+        message("M_HVQTASK");
+        break;
+    case M_HVQMTASK:
+        if (GET_RSP_INFO(ShowCFB) == NULL) /* Gfx #1.2 or older specs */
+            break;
+        GET_RSP_INFO(ShowCFB)(); /* forced FB refresh in case gfx plugin skip */
+        break;
+    }
+
+#ifdef WAIT_FOR_CPU_HOST
+    /* Clear all 32 MFC0_count entries before interpreting the task. */
+    for (i = 0; i < 32; i++)
+        MFC0_count[i] = 0;
+#endif
+    run_task();
+
+#if 0
+/*
+ * An optional EMMS when compiling with Intel SIMD or MMX support.
+ *
+ * Whether or not MMX has been executed in this emulator, here is a good time
+ * to finally empty the MM state, at the end of a long interpreter loop.
+ */
+#ifdef ARCH_MIN_SSE2
+    _mm_empty();
+#endif
+#endif
+
+    /*
+     * Exit classification.  Only the first branch returns immediately; the
+     * other branches fall through to the HALT-clearing statement at the end
+     * (except the non-WAIT_FOR_CPU_HOST `else`, which returns on its own).
+     */
+    if (*CR[0x4] & SP_STATUS_BROKE) /* normal exit, from executing BREAK */
+        return (cycles);
+    else if (GET_RCP_REG(MI_INTR_REG) & 1) /* interrupt set by MTC0 to break */
+        GET_RSP_INFO(CheckInterrupts)();
+    else if (*CR[0x7] != 0x00000000) /* semaphore lock fixes */
+        {}
+#ifdef WAIT_FOR_CPU_HOST
+    else
+        MF_SP_STATUS_TIMEOUT = 16; /* From now on, wait 16 times, not 32767. */
+#else
+    else { /* ??? unknown, possibly external intervention from CPU memory map */
+        message("SP_SET_HALT");
+        return (cycles);
+    }
+#endif
+    *CR[0x4] &= ~SP_STATUS_HALT; /* CPU restarts with the correct SIGs. */
+    return (cycles);
+}
+
+/*
+ * Describes this plugin to the host by filling in the caller-supplied
+ * PLUGIN_INFO record:  API version, plugin type, display name and the two
+ * memory-layout flags.
+ */
+EXPORT void CALL API_PREFIX(GetDllInfo)(PLUGIN_INFO *PluginInfo)
+{
+    /* Identification. */
+    PluginInfo->Type = PLUGIN_TYPE_RSP;
+    PluginInfo->Version = PLUGIN_API_VERSION;
+    my_strcpy(PluginInfo->Name, "Static Interpreter");
+
+    /* Memory layout as seen by this plugin. */
+    PluginInfo->MemoryBswaped = USE_CLIENT_ENDIAN;
+    PluginInfo->NormalMemory = 0;
+}
+
+/*
+ * Reports plugin type, plugin version, API version, name and capability
+ * flags.  Every output pointer is optional:  a NULL pointer is skipped.
+ * Always returns M64ERR_SUCCESS.
+ */
+EXPORT m64p_error CALL API_PREFIX(PluginGetVersion)(m64p_plugin_type *PluginType, int *PluginVersion, int *APIVersion, const char **PluginNamePtr, int *Capabilities)
+{
+    if (PluginType)
+        *PluginType = M64PLUGIN_RSP;
+    if (PluginVersion)
+        *PluginVersion = RSP_CXD4_VERSION;
+    if (APIVersion)
+        *APIVersion = RSP_PLUGIN_API_VERSION;
+    if (PluginNamePtr)
+        *PluginNamePtr = "Static Interpreter";
+    if (Capabilities)
+        *Capabilities = 0; /* no optional capabilities advertised */
+
+    return M64ERR_SUCCESS;
+}
+
+
+
+/*
+ * Binds the interpreter to the host-provided RSP_INFO record.
+ *
+ * Rsp_Info   - host description of RDRAM, DMEM, IMEM and the RCP registers;
+ *              copied by value into the global RSP_INFO_NAME.
+ * CycleCount - optional output; set to 0 when non-NULL.
+ *
+ * When Rsp_Info.DMEM and Rsp_Info.IMEM are the same pointer the function
+ * returns right after storing DRAM, leaving DMEM, IMEM, the CR[] table and
+ * MF_SP_STATUS_TIMEOUT untouched.
+ */
+EXPORT void CALL API_PREFIX(InitiateRSP)(RSP_INFO Rsp_Info, pu32 CycleCount)
+{
+    if (CycleCount != NULL) /* cycle-accuracy not doable with today's hosts */
+        *CycleCount = 0;
+    /* Configuration is rebuilt on every initiation. */
+    update_conf(CFG_FILE);
+
+    /* The GET_RSP_INFO / GET_RCP_REG macros below are used after this copy. */
+    RSP_INFO_NAME = Rsp_Info;
+    DRAM = GET_RSP_INFO(RDRAM);
+    if (Rsp_Info.DMEM == Rsp_Info.IMEM) /* usually dummy RSP data for testing */
+        return; /* DMA is not executed just because plugin initiates. */
+    DMEM = GET_RSP_INFO(DMEM);
+    IMEM = GET_RSP_INFO(IMEM);
+
+    /* CR[0x0..0x7]:  pointers to the SP register set. */
+    CR[0x0] = &GET_RCP_REG(SP_MEM_ADDR_REG);
+    CR[0x1] = &GET_RCP_REG(SP_DRAM_ADDR_REG);
+    CR[0x2] = &GET_RCP_REG(SP_RD_LEN_REG);
+    CR[0x3] = &GET_RCP_REG(SP_WR_LEN_REG);
+    CR[0x4] = &GET_RCP_REG(SP_STATUS_REG);
+    CR[0x5] = &GET_RCP_REG(SP_DMA_FULL_REG);
+    CR[0x6] = &GET_RCP_REG(SP_DMA_BUSY_REG);
+    CR[0x7] = &GET_RCP_REG(SP_SEMAPHORE_REG);
+    GET_RCP_REG(SP_PC_REG) = 0x04001000;
+    /* CR[0x8..0xF]:  pointers to the DPC register set. */
+    CR[0x8] = &GET_RCP_REG(DPC_START_REG);
+    CR[0x9] = &GET_RCP_REG(DPC_END_REG);
+    CR[0xA] = &GET_RCP_REG(DPC_CURRENT_REG);
+    CR[0xB] = &GET_RCP_REG(DPC_STATUS_REG);
+    CR[0xC] = &GET_RCP_REG(DPC_CLOCK_REG);
+    CR[0xD] = &GET_RCP_REG(DPC_BUFBUSY_REG);
+    CR[0xE] = &GET_RCP_REG(DPC_PIPEBUSY_REG);
+    CR[0xF] = &GET_RCP_REG(DPC_TMEM_REG);
+
+    MF_SP_STATUS_TIMEOUT = 32767;
+#if 1
+    /* Keeps only the low 12 bits, so the 0x04001000 stored above becomes 0. */
+    GET_RCP_REG(SP_PC_REG) &= 0x00000FFFu; /* hack to fix Mupen64 */
+#endif
+    return;
+}
+
+/*
+ * ROM-closed notification:  stores 0x04001000 back into SP_PC_REG, the same
+ * value InitiateRSP writes before applying its 12-bit mask.
+ */
+EXPORT void CALL API_PREFIX(RomClosed)(void)
+{
+    GET_RCP_REG(SP_PC_REG) = 0x04001000;
+}
+
+/*
+ * Emits one diagnostic line.
+ *
+ * With M64P_PLUGIN_API defined the text is forwarded to DebugMessage() at
+ * M64MSG_ERROR level; otherwise it is printed to stdout followed by a
+ * newline.
+ *
+ * body - NUL-terminated text to report.  It is always treated as plain text.
+ */
+NOINLINE void message(const char* body)
+{
+#if defined(M64P_PLUGIN_API)
+    /*
+     * DebugMessage() is variadic, so `body` must be passed as an argument
+     * rather than as the format string:  a '%' inside the text would
+     * otherwise be interpreted as a conversion specification.
+     */
+    DebugMessage(M64MSG_ERROR, "%s", body);
+#else
+    printf("%s\n", body);
+#endif
+}
+
+#ifdef SP_EXECUTE_LOG
+/*
+ * Execution-trace hook, compiled only with SP_EXECUTE_LOG.
+ *
+ * Does nothing when output_log is NULL.  Otherwise reports the low 12 bits
+ * of SP_PC_REG (3 hex digits) and the instruction word (8 hex digits) through
+ * message(), then appends the instruction to output_log as four bytes, most
+ * significant byte first.
+ */
+void step_SP_commands(uint32_t inst)
+{
+    unsigned char be_bytes[4];
+    char line[256];
+    char pc_hex[4] = "";
+    char inst_hex[9] = "";
+    unsigned int k;
+
+    if (output_log == NULL)
+        return;
+
+    /* Most significant byte first:  shifts of 24, 16, 8, 0. */
+    for (k = 0; k < 4; k++)
+        be_bytes[k] = (u8)((inst >> (24 - 8 * k)) & 0xFF);
+
+    sprintf(pc_hex, "%03X", GET_RCP_REG(SP_PC_REG) & 0xFFF);
+    sprintf(inst_hex, "%08X", inst);
+
+    /* "<pc>\n<instruction>" */
+    strcpy(line, pc_hex);
+    my_strcat(line, "\n");
+    my_strcat(line, inst_hex);
+    message(line); /* PC offset, MIPS hex. */
+
+    if (output_log != NULL)
+        my_fwrite(be_bytes, 4, 1, output_log);
+}
+#endif
+
+/*
+ * Write the 4096 bytes of DMEM to the file "rcpcache.dhex".
+ *
+ * Each output byte i is taken from DMEM[BES(i)], i.e. the byte-swizzle
+ * applied by BES() is undone before writing.  If the temporary buffer
+ * cannot be allocated or the file cannot be opened, a diagnostic is sent
+ * to message() and nothing is written.
+ */
+NOINLINE void export_data_cache(void)
+{
+    pu8 DMEM_swapped;
+    FILE * out;
+    register int i;
+
+    DMEM_swapped = my_calloc(4096, 1);
+    if (DMEM_swapped == NULL)
+    {
+        message("Failed to allocate the DMEM export buffer.");
+        return;
+    }
+    for (i = 0; i < 4096; i++)
+        DMEM_swapped[i] = DMEM[BES(i)];
+
+    out = my_fopen("rcpcache.dhex", "wb");
+    if (out != NULL)
+    {
+        my_fwrite(DMEM_swapped, 16, 4096 / 16, out);
+        my_fclose(out);
+    }
+    else
+        message("Failed to open rcpcache.dhex for writing.");
+
+    my_free(DMEM_swapped); /* released on both the success and error path */
+    return;
+}
+/*
+ * Write the 4096 bytes of IMEM to the file "rcpcache.ihex".
+ *
+ * Each output byte i is taken from IMEM[BES(i)], i.e. the byte-swizzle
+ * applied by BES() is undone before writing.  If the temporary buffer
+ * cannot be allocated or the file cannot be opened, a diagnostic is sent
+ * to message() and nothing is written.
+ */
+NOINLINE void export_instruction_cache(void)
+{
+    pu8 IMEM_swapped;
+    FILE * out;
+    register int i;
+
+    IMEM_swapped = my_calloc(4096, 1);
+    if (IMEM_swapped == NULL)
+    {
+        message("Failed to allocate the IMEM export buffer.");
+        return;
+    }
+    for (i = 0; i < 4096; i++)
+        IMEM_swapped[i] = IMEM[BES(i)];
+
+    out = my_fopen("rcpcache.ihex", "wb");
+    if (out != NULL)
+    {
+        my_fwrite(IMEM_swapped, 16, 4096 / 16, out);
+        my_fclose(out);
+    }
+    else
+        message("Failed to open rcpcache.ihex for writing.");
+
+    my_free(IMEM_swapped); /* released on both the success and error path */
+    return;
+}
+/*
+ * Export both SP memories:  first the data cache, then the instruction
+ * cache, each through its own export_*_cache() helper.
+ */
+void export_SP_memory(void)
+{
+    export_data_cache();
+    export_instruction_cache();
+}
+
+/*
+ * Microsoft linker defaults to an entry point of `_DllMainCRTStartup',
+ * which attaches several CRT dependencies.  To eliminate CRT dependencies,
+ * we direct the linker to move the entry point to the lower-level
+ * `DllMain' symbol or, alternatively, link with /NOENTRY for no entry point.
+ */
+#ifdef WIN32
+/*
+ * Minimal DLL entry point:  takes no action for any notification and
+ * always reports success.
+ */
+BOOL WINAPI DllMain(
+    HINSTANCE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
+{
+    /* Neither parameter is used; clear the local copies. */
+    lpReserved = NULL;
+    hModule = NULL;
+
+    switch (ul_reason_for_call)
+    {
+    case 0: /* DLL_PROCESS_DETACH */
+    case 1: /* DLL_PROCESS_ATTACH */
+    case 2: /* DLL_THREAD_ATTACH */
+    case 3: /* DLL_THREAD_DETACH */
+    default:
+        break;
+    }
+    return 1; /* TRUE */
+}
+#endif
+
+/*
+ * low-level recreations of the C standard library functions for operating
+ * systems that define a C run-time or dependency on top of fixed OS calls
+ *
+ * Currently, this only addresses Microsoft Windows.
+ *
+ * None of these are meant to out-perform the original functions, by the way
+ * (especially with better intrinsic compiler support for stuff like memcpy),
+ * just to cut down on I-cache use for performance-irrelevant code sections
+ * and to avoid std. lib run-time dependencies on certain operating systems.
+ */
+
+/*
+ * Allocate zero-initialized storage for `count` objects of `size` bytes.
+ *
+ * Returns a pointer to the storage, or NULL on failure.  The Windows path
+ * uses GlobalAlloc(GPTR, ...), which takes a single byte total, so a
+ * count * size product that would wrap around is rejected up front, as
+ * calloc() itself does on the other path.
+ */
+NOINLINE p_void my_calloc(size_t count, size_t size)
+{
+#ifdef WIN32
+    if (size != 0 && count > (size_t)-1 / size)
+        return NULL; /* count * size does not fit in size_t */
+    return GlobalAlloc(GPTR, size * count);
+#else
+    return calloc(count, size);
+#endif
+}
+
+/*
+ * Release storage obtained from my_calloc().  A NULL pointer is ignored.
+ *
+ * On Windows, GlobalFree() returns NULL on success and the handle itself
+ * on failure.  A failure is reported once through message(); the call is
+ * not retried, because retrying on a handle that cannot be freed would
+ * loop forever.
+ */
+NOINLINE void my_free(p_void ptr)
+{
+#ifdef WIN32
+    if (ptr == NULL)
+        return;
+    if (GlobalFree(ptr) != NULL)
+        message("GlobalFree() failure");
+#else
+    free(ptr);
+#endif
+    return;
+}
+
+/*
+ * Count the characters in `str` that precede its terminating '\0'.
+ */
+NOINLINE size_t my_strlen(const char* str)
+{
+    const char* cursor = str;
+
+    while (*cursor != '\0')
+        ++cursor;
+    return (size_t)(cursor - str);
+}
+
+/*
+ * Copy `source`, including its terminating '\0', into `destination`.
+ * The length is measured once before any byte is written.
+ * Returns `destination`.
+ */
+NOINLINE char* my_strcpy(char* destination, const char* source)
+{
+    const size_t total = my_strlen(source) + 1; /* text plus terminator */
+    size_t pos = 0;
+
+    while (pos < total)
+    {
+        destination[pos] = source[pos];
+        ++pos;
+    }
+    return (destination);
+}
+
+/*
+ * Append `source` to the string already held in `destination` by copying
+ * it over the existing terminator.  Returns `destination`.
+ */
+NOINLINE char* my_strcat(char* destination, const char* source)
+{
+    char* const tail = destination + my_strlen(destination);
+
+    my_strcpy(tail, source);
+    return (destination);
+}
+
+/*
+ * Open `filename` and return a stream handle, or NULL on failure.
+ *
+ * Only the first character of `mode` is examined on Windows:  'r' opens an
+ * existing file for reading, anything else creates or truncates the file
+ * for write-through writing.  The returned value is then a Win32 HANDLE
+ * cast to FILE* and must only be passed to the my_f*() wrappers.
+ *
+ * CreateFileA() signals failure with INVALID_HANDLE_VALUE rather than
+ * NULL, so that value is translated to NULL to match fopen().
+ */
+NOINLINE FILE* my_fopen(const char * filename, const char* mode)
+{
+#ifdef WIN32
+#if _MSC_VER >= 1400 && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+    return NULL;
+#else
+    const int reading = (mode[0] == 'r');
+    HANDLE file;
+
+    file = CreateFileA(
+        filename,
+        reading ? GENERIC_READ : GENERIC_WRITE,
+        reading ? FILE_SHARE_READ : FILE_SHARE_WRITE,
+        NULL,
+        reading ? OPEN_EXISTING : CREATE_ALWAYS,
+        reading ? FILE_ATTRIBUTE_NORMAL : FILE_FLAG_WRITE_THROUGH,
+        NULL
+    );
+    if (file == INVALID_HANDLE_VALUE)
+        return NULL;
+    return (FILE *)file;
+#endif
+#else
+    return fopen(filename, mode);
+#endif
+}
+
+/*
+ * Close a stream obtained from my_fopen().
+ * Windows:  returns 0 when CloseHandle() succeeds, 1 otherwise.
+ * Elsewhere:  returns the result of fclose().
+ */
+NOINLINE int my_fclose(FILE* stream)
+{
+#ifdef WIN32
+    return !CloseHandle((HANDLE)stream);
+#else
+    return fclose(stream);
+#endif
+}
+
+/*
+ * Read up to `count` items of `size` bytes each from `stream` into `ptr`.
+ *
+ * Returns the number of complete items read on every platform.  The
+ * Windows path converts the byte count reported by ReadFile() into items
+ * so that the result matches fread(); a zero `size` or `count` yields 0.
+ */
+NOINLINE size_t my_fread(p_void ptr, size_t size, size_t count, FILE* stream)
+{
+#ifdef WIN32
+    DWORD bytes_read = 0;
+
+    if (size == 0 || count == 0)
+        return 0;
+    ReadFile((HANDLE)stream, ptr, (DWORD)(size * count), &bytes_read, NULL);
+    return (size_t)bytes_read / size;
+#else
+    return fread(ptr, size, count, stream);
+#endif
+}
+
+/*
+ * Write `count` items of `size` bytes each from `ptr` to `stream`.
+ *
+ * Returns the number of complete items written on every platform.  The
+ * Windows path converts the byte count reported by WriteFile() into items
+ * so that the result matches fwrite(); a zero `size` or `count` yields 0.
+ */
+NOINLINE size_t my_fwrite(p_void ptr, size_t size, size_t count, FILE* stream)
+{
+#ifdef WIN32
+    DWORD bytes_written = 0;
+
+    if (size == 0 || count == 0)
+        return 0;
+    WriteFile(
+        (HANDLE)stream, ptr, (DWORD)(size * count), &bytes_written, NULL);
+    return (size_t)bytes_written / size;
+#else
+    return fwrite(ptr, size, count, stream);
+#endif
+}
--- a/mupen64plus-rsp-cxd4/module.h
+++ b/mupen64plus-rsp-cxd4/module.h
@ -0,0 +1,100 @@
+/******************************************************************************\
+* Project:  Module Subsystem Interface to SP Interpreter Core                  *
+* Authors:  Iconoclast                                                         *
+* Release:  2015.11.14                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#ifndef _MODULE_H_
+#define _MODULE_H_
+
+#include <stdio.h>
+#include "rsp.h"
+
+/*
+ * Numeric identifiers for the task kinds named by the M_*TASK constants.
+ * The values are explicit and run from 1 to 7; 0 is not assigned.
+ */
+typedef enum {
+    M_GFXTASK   = 1,
+    M_AUDTASK   = 2,
+    M_VIDTASK   = 3,
+    M_NJPEGTASK = 4,
+    M_NULTASK   = 5,
+    M_HVQTASK   = 6,
+    M_HVQMTASK  = 7
+} OSTask_type;
+
+#define CFG_FILE    "rsp_conf.bin"
+
+/*
+ * Most of the point behind this config system is to let users use HLE video
+ * or audio plug-ins.  The other task types are used less than 1% of the time
+ * and only in a few games.  They require simulation from within the RSP
+ * internally, which I have no intention to ever support.  Some good research
+ * on a few of these special task types was done by Hacktarux in the MUPEN64
+ * HLE RSP plug-in, so consider using that instead for complete HLE.
+ */
+/*
+ * One element of `conf` per flag, at indices 0 through 3.  `conf` itself
+ * is not declared in this header.
+ */
+#define CFG_HLE_GFX     (conf[0x00])
+#define CFG_HLE_AUD     (conf[0x01])
+#define CFG_HLE_VID     (conf[0x02]) /* reserved/unused */
+#define CFG_HLE_JPG     (conf[0x03]) /* unused */
+
+/*
+ * Schedule binary dump exports to the DllConfig schedule delay queue.
+ */
+/*
+ * Each macro below reads or writes an i32 through a pointer formed by
+ * casting `conf + offset` to pi32.
+ * NOTE(review):  this presumes `conf` is suitably aligned for i32 access
+ * and that the aliasing is acceptable to the compiler in use -- confirm.
+ */
+#define CFG_QUEUE_E_DRAM    (*(pi32)(conf + 0x04))
+#define CFG_QUEUE_E_DMEM    (*(pi32)(conf + 0x08))
+#define CFG_QUEUE_E_IMEM    (*(pi32)(conf + 0x0C))
+/*
+ * Note:  This never actually made it into the configuration system.
+ * Instead, DMEM and IMEM are always exported on every call to DllConfig().
+ */
+
+/*
+ * Special switches.
+ * (generally for correcting RSP clock behavior on Project64 2.x)
+ * Also includes RSP register states debugger.
+ */
+#define CFG_WAIT_FOR_CPU_HOST       (*(pi32)(conf + 0x10))
+#define CFG_MEND_SEMAPHORE_LOCK     (*(pi32)(conf + 0x14))
+#define CFG_TRACE_RSP_REGISTERS     (*(pi32)(conf + 0x18))
+
+/*
+ * Update RSP configuration memory from local file resource.
+ */
+#define CHARACTERS_PER_LINE     (80)
+/* typical standard DOS text file limit per line */
+
+/* Reload the configuration; `source` names the file (see CFG_FILE). */
+NOINLINE extern void update_conf(const char* source);
+
+/* Dump DMEM / IMEM to "rcpcache.dhex" / "rcpcache.ihex" respectively. */
+NOINLINE extern void export_data_cache(void);
+NOINLINE extern void export_instruction_cache(void);
+
+#ifdef SP_EXECUTE_LOG
+/*
+ * NOTE(review):  `static` in a header gives every translation unit that
+ * includes it a private copy of output_log -- confirm this header is only
+ * ever compiled into a single translation unit.
+ */
+static FILE *output_log;
+/* Trace one instruction word; a no-op while output_log is NULL. */
+extern void step_SP_commands(u32 inst);
+#endif
+/* Calls export_data_cache() followed by export_instruction_cache(). */
+extern void export_SP_memory(void);
+
+/*
+ * low-level recreations of the C standard library functions for operating
+ * systems that provide an inconvenient C run-time ecosystem, like Windows
+ */
+/* Memory:  zero-initialized allocation and its matching release. */
+NOINLINE extern p_void my_calloc(size_t count, size_t size);
+NOINLINE extern void my_free(p_void ptr);
+/* Strings:  same argument order and return value as strlen/strcpy/strcat. */
+NOINLINE extern size_t my_strlen(const char* str);
+NOINLINE extern char* my_strcpy(char* destination, const char* source);
+NOINLINE extern char* my_strcat(char* destination, const char* source);
+/*
+ * Files:  a FILE* returned by my_fopen() may wrap a Win32 HANDLE, so it
+ * must only be handed to my_fclose(), my_fread() and my_fwrite().
+ */
+NOINLINE extern FILE* my_fopen(const char * filename, const char* mode);
+NOINLINE extern int my_fclose(FILE* stream);
+NOINLINE extern size_t my_fread(
+    p_void ptr, size_t size, size_t count, FILE* stream);
+NOINLINE extern size_t my_fwrite(
+    p_void ptr, size_t size, size_t count, FILE* stream);
+
+#endif
--- a/mupen64plus-rsp-cxd4/my_types.h
+++ b/mupen64plus-rsp-cxd4/my_types.h
@ -0,0 +1,505 @@
+/*
+ * minimum data types for MIPS and the Ultra64 RCP
+ *
+ * No copyright is intended on this file. :)
+ *
+ * To work with features of the RCP hardware, we need at the very least:
+ *     1.  a 64-bit type or a type which can encompass 64-bit operations
+ *     2.  signed and unsigned 32-or-more-bit types (s32, u32)
+ *     3.  signed and unsigned 16-or-more-bit types (s16, u16)
+ *     4.  signed and unsigned 8-or-more-bit types (s8, u8)
+ *
+ * This tends to coincide with the regulations of <stdint.h> and even most of
+ * what is guaranteed by simple preprocessor logic and the C89 standard, so
+ * the deduction of RCP hardware types will have the following priority:
+ *     1.  compiler implementation of the <stdint.h> extension
+ *     2.  64-bit ABI detection by the preprocessor with help from <limits.h>
+ *     3.  preprocessor derivation of literal integer interpretation
+ *     4.  the presumption of C89 conformance for 8-, 16-, and 32-bit types
+ *         and the presumption of `long long` support for 64-bit types
+ */
+
+/*
+ * Rather than call it "n64_types.h" or "my_stdint.h", the idea is that this
+ * header should be maintainable to any independent implementation's needs,
+ * especially in the event that one decides that type requirements should be
+ * mandated by the user and not permanently merged into the C specifications.
+ *
+ * Custom, collision-free type definitions are also useful in that they can
+ * be tested for cross-ABI portability by changing a custom type like `u32`
+ * from `unsigned long` to `unsigned short` or vice-versa.
+ */
+#ifndef _MY_TYPES_H_
+#define _MY_TYPES_H_
+
+/*
+ * This is the only method we really need to care about for defining types.
+ *
+ * All concerns of absolute plausibility are addressed with minimum-width
+ * types; we do not require fixed-width types or any C99 dependency.
+ */
+#include <limits.h>
+
+/*
+ * Until proven otherwise, there are no standard integer types.
+ */
+#undef HAVE_STANDARD_INTEGER_TYPES
+
+/*
+ * an optional facility which could be used as an external alternative to
+ * deducing minimum-width types (if the compiler agrees to rely on this level
+ * of the language specifications to have it)
+ *
+ * Because no standard system is required to have any exact-width type, the
+ * C99 enforcement of <stdint.h> is more of an early initiative (as in,
+ * "better early than late" or "better early than never at all") rather than
+ * a fully portable resource available or even possible all of the time.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+#include <stdint.h>
+#endif
+
+/*
+ * Even with -std=c89 or -ansi, modern GCC will tend to supply <stdint.h>.
+ * That isn't needed--here--for LP64 ABI, however, so why assume it exists?
+ */
+#if defined(__GNUC__) && !defined(__LP64__)
+#include <stdint.h>
+#endif
+
+/*
+ * With or without external or internal support for <stdint.h>, we need to
+ * confirm the level of support for RCP data types on the Nintendo 64.
+ *
+ * We only need minimum-width data types, not exact-width types.
+ * Systems on which there is no 16- or 32-bit type, for example, can easily
+ * be accounted for by the code itself using optimizable AND bit-masks.
+ */
+/*
+ * For each width (8, 16, 32, 64) and each flavor (exact, fast, least), a
+ * HAVE_INTn_* macro is defined when both the matching *_MIN and *_MAX
+ * limit macros are visible at this point.
+ */
+#if defined(INT8_MIN) && defined(INT8_MAX)
+#define HAVE_INT8_EXACT
+#endif
+#if defined(INT_FAST8_MIN) && defined(INT_FAST8_MAX)
+#define HAVE_INT8_FAST
+#endif
+#if defined(INT_LEAST8_MIN) && defined(INT_LEAST8_MAX)
+#define HAVE_INT8_MINIMUM
+#endif
+#if defined(INT16_MIN) && defined(INT16_MAX)
+#define HAVE_INT16_EXACT
+#endif
+#if defined(INT_FAST16_MIN) && defined(INT_FAST16_MAX)
+#define HAVE_INT16_FAST
+#endif
+#if defined(INT_LEAST16_MIN) && defined(INT_LEAST16_MAX)
+#define HAVE_INT16_MINIMUM
+#endif
+#if defined(INT32_MIN) && defined(INT32_MAX)
+#define HAVE_INT32_EXACT
+#endif
+#if defined(INT_FAST32_MIN) && defined(INT_FAST32_MAX)
+#define HAVE_INT32_FAST
+#endif
+#if defined(INT_LEAST32_MIN) && defined(INT_LEAST32_MAX)
+#define HAVE_INT32_MINIMUM
+#endif
+#if defined(INT64_MIN) && defined(INT64_MAX)
+#define HAVE_INT64_EXACT
+#endif
+#if defined(INT_FAST64_MIN) && defined(INT_FAST64_MAX)
+#define HAVE_INT64_FAST
+#endif
+#if defined(INT_LEAST64_MIN) && defined(INT_LEAST64_MAX)
+#define HAVE_INT64_MINIMUM
+#endif
+
+/* HAVE_INTn:  at least one flavor of an n-bit type was detected above. */
+#if defined(HAVE_INT8_EXACT)\
+ || defined(HAVE_INT8_FAST) \
+ || defined(HAVE_INT8_MINIMUM)
+#define HAVE_INT8
+#endif
+#if defined(HAVE_INT16_EXACT)\
+ || defined(HAVE_INT16_FAST) \
+ || defined(HAVE_INT16_MINIMUM)
+#define HAVE_INT16
+#endif
+#if defined(HAVE_INT32_EXACT)\
+ || defined(HAVE_INT32_FAST) \
+ || defined(HAVE_INT32_MINIMUM)
+#define HAVE_INT32
+#endif
+#if defined(HAVE_INT64_EXACT)\
+ || defined(HAVE_INT64_FAST) \
+ || defined(HAVE_INT64_MINIMUM)
+#define HAVE_INT64
+#endif
+
+/*
+ * This determines whether or not it is possible to use the evolution of the
+ * C standards for compiler advice on how to define the types or whether we
+ * will instead rely on preprocessor logic and ABI detection or C89 rules to
+ * define each of the types.
+ */
+/* Set only when all four widths were detected. */
+#if defined(HAVE_INT8) \
+ && defined(HAVE_INT16)\
+ && defined(HAVE_INT32)\
+ && defined(HAVE_INT64)
+#define HAVE_STANDARD_INTEGER_TYPES
+#endif
+
+/*
+ * Since the Microsoft Windows API frequently uses `long` instead of `int` to
+ * guarantee 32-bit DWORD types, they were forced to propose a "LLP64" ABI.
+ */
+/* MICROSOFT_ABI selects the __intN fallbacks in the type ladders below. */
+#if defined(_MSC_VER)
+#define MICROSOFT_ABI
+#endif
+
+/*
+ * 8-bit types.  First match wins:  exact-width, fast, least, Microsoft
+ * __int8, then the plain character types if `signed char` covers at least
+ * -127..+127; otherwise compilation stops.  In the character fallback `i8`
+ * is plain `char`, whose signedness is left to the implementation; in the
+ * other branches it is a signed type.
+ */
+#if defined(HAVE_INT8_EXACT)
+typedef int8_t                  s8;
+typedef uint8_t                 u8;
+typedef s8                      i8;
+#elif defined(HAVE_INT8_FAST)
+typedef int_fast8_t             s8;
+typedef uint_fast8_t            u8;
+typedef s8                      i8;
+#elif defined(HAVE_INT8_MINIMUM)
+typedef int_least8_t            s8;
+typedef uint_least8_t           u8;
+typedef s8                      i8;
+#elif defined(MICROSOFT_ABI)
+typedef signed __int8           s8;
+typedef unsigned __int8         u8;
+typedef __int8                  i8;
+
+#elif (SCHAR_MIN < -127 && SCHAR_MAX >= +127)
+typedef signed char             s8;
+typedef unsigned char           u8;
+typedef char                    i8;
+#else
+#error Non-ANSI-conformant `char` size.
+#endif
+
+/*
+ * 16-bit types.  First match wins:  exact-width, fast, least, Microsoft
+ * __int16, `char` if it happens to span 16 bits, and finally `short`.
+ */
+#if defined(HAVE_INT16_EXACT)
+typedef int16_t                 s16;
+typedef uint16_t                u16;
+#elif defined(HAVE_INT16_FAST)
+typedef int_fast16_t            s16;
+typedef uint_fast16_t           u16;
+#elif defined(HAVE_INT16_MINIMUM)
+typedef int_least16_t           s16;
+typedef uint_least16_t          u16;
+#elif defined(MICROSOFT_ABI)
+typedef signed __int16          s16;
+typedef unsigned __int16        u16;
+
+#elif (SCHAR_MIN < -32767 && SCHAR_MAX >= +32767)
+typedef signed char             s16;
+typedef unsigned char           u16;
+#else
+typedef signed short            s16;
+typedef unsigned short          u16;
+#endif
+
+/*
+ * 32-bit types.  First match wins:  exact-width, fast, least, Microsoft
+ * __int32, then the narrowest of char / short / int whose range covers
+ * 32 bits according to <limits.h>, and finally `long`.
+ */
+#if defined(HAVE_INT32_EXACT)
+typedef int32_t                 s32;
+typedef uint32_t                u32;
+#elif defined(HAVE_INT32_FAST)
+typedef int_fast32_t            s32;
+typedef uint_fast32_t           u32;
+#elif defined(HAVE_INT32_MINIMUM)
+typedef int_least32_t           s32;
+typedef uint_least32_t          u32;
+#elif defined(MICROSOFT_ABI)
+typedef signed __int32          s32;
+typedef unsigned __int32        u32;
+
+#elif (SCHAR_MIN < -2147483647L && SCHAR_MAX >= +2147483647L)
+typedef signed char             s32;
+typedef unsigned char           u32;
+#elif (SHRT_MIN < -2147483647L && SHRT_MAX >= +2147483647L)
+typedef signed short            s32;
+typedef unsigned short          u32;
+#elif (INT_MIN < -2147483647L && INT_MAX >= +2147483647L)
+typedef signed int              s32;
+typedef unsigned int            u32;
+#else
+typedef signed long             s32;
+typedef unsigned long           u32;
+#endif
+
+/*
+ * 64-bit types.  First match wins:  exact-width, fast, least, Microsoft
+ * __int64, `long` when __LP64__ is defined or <limits.h> shows it spans
+ * 64 bits, and finally `long long`.
+ */
+#if defined(HAVE_INT64_EXACT)
+typedef int64_t                 s64;
+typedef uint64_t                u64;
+#elif defined(HAVE_INT64_FAST)
+typedef int_fast64_t            s64;
+typedef uint_fast64_t           u64;
+#elif defined(HAVE_INT64_MINIMUM)
+typedef int_least64_t           s64;
+typedef uint_least64_t          u64;
+#elif defined(MICROSOFT_ABI)
+typedef signed __int64          s64;
+typedef unsigned __int64        u64;
+
+#elif defined(__LP64__) && (0x00000000FFFFFFFFUL < ~0UL)
+typedef signed long             s64;
+typedef unsigned long           u64;
+#elif (LONG_MIN < -9223372036854775807L && LONG_MAX >= +9223372036854775807L)
+typedef signed long             s64;
+typedef unsigned long           u64;
+#else
+typedef signed long long        s64;
+typedef unsigned long long      u64;
+#endif
+
+/*
+ * Although most types are signed by default, using `int' instead of `signed
+ * int' and `i32' instead of `s32' can be preferable to denote cases where
+ * the signedness of something operated on is irrelevant to the algorithm.
+ */
+/* The "sign-agnostic" names are plain aliases of the signed types. */
+typedef s16                     i16;
+typedef s32                     i32;
+typedef s64                     i64;
+
+/*
+ * If <stdint.h> was unavailable or not included (should be included before
+ * "my_types.h" if it is ever to be included), then perhaps this is the
+ * right opportunity to try defining the <stdint.h> types ourselves.
+ *
+ * Due to sole popularity, code can sometimes be easier to read when saying
+ * things like "int8_t" instead of "i8", just because more people are more
+ * likely to understand the <stdint.h> type names in generic C code.  To be
+ * as neutral as possible, people will have every right to sometimes prefer
+ * saying "uint32_t" instead of "u32" for the sake of modern standards.
+ *
+ * The below macro just means whether or not we had access to <stdint.h>
+ * material to deduce any of our 8-, 16-, 32-, or 64-bit type definitions.
+ */
+/*
+ * Taken whenever any one of the four widths went undetected above; all
+ * eight <stdint.h> names are then declared here from the local types.
+ * NOTE(review):  if <stdint.h> was included but only partly detected, these
+ * typedefs could clash with its own declarations -- confirm.
+ */
+#ifndef HAVE_STANDARD_INTEGER_TYPES
+typedef s8      int8_t;
+typedef u8      uint8_t;
+typedef s16     int16_t;
+typedef u16     uint16_t;
+typedef s32     int32_t;
+typedef u32     uint32_t;
+typedef s64     int64_t;
+typedef u64     uint64_t;
+#define HAVE_STANDARD_INTEGER_TYPES
+#endif
+
+/*
+ * MIPS-native types are `float' for f32, `double' for f64.
+ * These type requirements are based on the MIPS manuals on the FPU.
+ */
+#include <float.h>
+
+/*
+ * f32:  the first of float / double / long double offering at least a
+ * 24-digit mantissa and a maximum exponent above 127 per <float.h>.  If
+ * none qualifies, f32 becomes a struct of separate fraction, exponent and
+ * sign fields instead of an arithmetic type.
+ */
+#if (FLT_MANT_DIG >= 24) && (FLT_MAX_EXP > 127)
+typedef float                   f32;
+#elif (DBL_MANT_DIG >= 24) && (DBL_MAX_EXP > 127)
+typedef double                  f32;
+#elif (LDBL_MANT_DIG >= 24) && (LDBL_MAX_EXP > 127)
+typedef long double             f32;
+#else
+typedef struct {
+#if (UINT_MAX >= (0x00000001UL << 23) - 1UL)
+    unsigned f:  23; /* mantissa fraction */
+#else
+    unsigned long f;
+#endif
+    unsigned e:   8; /* biased exponent, from -126 to +127 for generic values */
+    unsigned s:   1; /* mantissa sign bit */
+} f32;
+#endif
+
+/*
+ * f64:  double or long double with at least a 53-digit mantissa and a
+ * maximum exponent above 1023; otherwise a struct whose fraction is a full
+ * uint64_t member (the 52-bit field width is commented out).
+ */
+#if (DBL_MANT_DIG >= 53) && (DBL_MAX_EXP > 1023)
+typedef double                  f64;
+#elif (LDBL_MANT_DIG >= 53) && (LDBL_MAX_EXP > 1023)
+typedef long double             f64;
+#else
+typedef struct {
+    uint64_t f/*:  52*/;
+    unsigned e:  11;
+    unsigned s:   1;
+} f64;
+#endif
+
+/*
+ * Pointer types, serving as the memory reference address to the actual type.
+ * I thought this was useful to have due to the various reasons for declaring
+ * or using variable pointers in various styles and complex scenarios.
+ *     ex) i32* pointer;
+ *     ex) i32 * pointer;
+ *     ex) i32 *a, *b, *c;
+ *     neutral:  `pi32 pointer;' or `pi32 a, b, c;'
+ */
+/* Pointer aliases:  `pT` is `T*` for each integer and float type above. */
+typedef i8*                     pi8;
+typedef i16*                    pi16;
+typedef i32*                    pi32;
+typedef i64*                    pi64;
+
+typedef s8*                     ps8;
+typedef s16*                    ps16;
+typedef s32*                    ps32;
+typedef s64*                    ps64;
+
+typedef u8*                     pu8;
+typedef u16*                    pu16;
+typedef u32*                    pu32;
+typedef u64*                    pu64;
+
+typedef f32*                    pf32;
+typedef f64*                    pf64;
+/* Generic object pointer, and pointer to a function taking/returning void. */
+typedef void*                   p_void;
+typedef void(*p_func)(void);
+
+/*
+ * helper macros with exporting functions for shared objects or dynamically
+ * loaded libraries
+ */
+/*
+ * With M64P_PLUGIN_API, EXPORT and CALL are not defined in this header;
+ * presumably they come from the m64p_* headers included here -- confirm.
+ * Without it they are defined locally per platform.
+ */
+#if defined(M64P_PLUGIN_API)
+#define M64P_PLUGIN_PROTOTYPES 1
+#include "m64p_common.h"
+#include "m64p_config.h"
+#include "m64p_plugin.h"
+#include "m64p_types.h"
+#if !defined(LIBRETRO)
+#include "osal_dynamiclib.h"
+#endif
+#else
+#if defined(_WIN32)
+#define EXPORT      __declspec(dllexport)
+#define CALL        __cdecl
+#else
+/* NOTE(review):  __attribute__ is used for every non-Windows compiler. */
+#define EXPORT      __attribute__((visibility("default")))
+#define CALL
+#endif
+#endif
+
+
+/*
+ * Commonly, Ultra64 will refer to these common symbols.
+ * They seem to be fairly widely used outside of just <windows.h>.
+ */
+/*
+ * Each constant is guarded on its own, so that a header which supplied
+ * only one of the two names still ends up with both defined.
+ */
+#ifndef FALSE
+#define FALSE       0
+#endif
+#ifndef TRUE
+#define TRUE        1
+#endif
+
+/*
+ * Optimizing compilers aren't necessarily perfect compilers, but they do
+ * have that extra chance of supporting explicit [anti-]inline instructions.
+ */
+/*
+ * INLINE   -- request inlining
+ * NOINLINE -- forbid inlining
+ * ALIGNED  -- request 16-byte alignment
+ * All three expand to nothing on compilers other than MSVC and GNU C.
+ */
+#ifdef _MSC_VER
+#define INLINE      __inline
+#define NOINLINE    __declspec(noinline)
+#define ALIGNED     __declspec(align(16))
+#elif defined(__GNUC__)
+#define INLINE      inline
+#define NOINLINE    __attribute__((noinline))
+#define ALIGNED     __attribute__((aligned(16)))
+#else
+#define INLINE
+#define NOINLINE
+#define ALIGNED
+#endif
+
+/*
+ * aliasing helpers
+ * Strictly put, this may be unspecified behavior, but it's nice to have!
+ */
+/* 16-bit value viewed either as two bytes or as one 16-bit integer. */
+typedef union {
+    u8 B[2];
+    s8 SB[2];
+
+    i16 W;
+    u16 UW;
+    s16 SW; /* Here, again, explicitly writing "signed" may help clarity. */
+} word_16;
+/* 32-bit value viewed as four bytes, two halfwords (H) or one word (W). */
+typedef union {
+    u8 B[4];
+    s8 SB[4];
+
+    i16 H[2];
+    u16 UH[2];
+    s16 SH[2];
+
+    i32 W;
+    u32 UW;
+    s32 SW;
+} word_32;
+/*
+ * 64-bit value viewed as eight bytes, four 16-bit quarters (Q), two 32-bit
+ * halves (H) or one 64-bit word (W).  Note that H means 32 bits here but
+ * 16 bits in word_32.
+ */
+typedef union {
+    u8 B[8];
+    s8 SB[8];
+
+    i16 Q[4];
+    u16 UQ[4];
+    s16 SQ[4];
+
+    i32 H[2];
+    u32 UH[2];
+    s32 SH[2];
+
+    i64 W;
+    u64 UW;
+    s64 SW;
+} word_64;
+
+/*
+ * helper macros for indexing memory in the above unions
+ * EEP!  Currently concentrates mostly on 32-bit endianness.
+ */
+/*
+ * ENDIAN_M is an all-zero mask on big-endian hosts and an all-one mask
+ * otherwise; it may be predefined by the build to override detection.
+ * Big-endian hosts are recognized by __BIG_ENDIAN__ or, where the compiler
+ * provides them, by the __BYTE_ORDER__ / __ORDER_BIG_ENDIAN__ macros.
+ */
+#ifndef ENDIAN_M
+#if defined(__BIG_ENDIAN__) \
+ || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) \
+  && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))
+#define ENDIAN_M    ( 0U)
+#else
+#define ENDIAN_M    (~0U)
+#endif
+#endif
+
+/*
+ * XOR masks applied to an address.  With ENDIAN_M all-ones they evaluate
+ * to 3 (BYTE), 2 (HALF), 1 (BIMI) and 0 (WORD); with ENDIAN_M zero they
+ * are all 0, so the address passes through unchanged.
+ */
+#define ENDIAN_SWAP_BYTE    (ENDIAN_M & 7U & 3U)
+#define ENDIAN_SWAP_HALF    (ENDIAN_M & 6U & 2U)
+#define ENDIAN_SWAP_BIMI    (ENDIAN_M & 5U & 1U)
+#define ENDIAN_SWAP_WORD    (ENDIAN_M & 4U & 0U)
+
+/* The masks are unsigned, so each result has an unsigned type. */
+#define BES(address)    ((address) ^ ENDIAN_SWAP_BYTE)
+#define HES(address)    ((address) ^ ENDIAN_SWAP_HALF)
+#define MES(address)    ((address) ^ ENDIAN_SWAP_BIMI)
+#define WES(address)    ((address) ^ ENDIAN_SWAP_WORD)
+
+/*
+ * extra types of encoding for the well-known MIPS RISC architecture
+ * Possibly implement other machine types in future versions of this header.
+ */
+/*
+ * Field widths of the three MIPS instruction formats; each format totals
+ * 32 bits.
+ * NOTE(review):  the order in which a compiler packs bit-fields is
+ * implementation-defined, so overlaying these structs on a raw
+ * instruction word may not be portable -- confirm how they are used.
+ */
+typedef struct {
+    unsigned opcode   :   6;
+    unsigned rs       :   5;
+    unsigned rt       :   5;
+    unsigned rd       :   5;
+    unsigned sa       :   5;
+    unsigned function :   6;
+} MIPS_type_R;
+typedef struct {
+    unsigned opcode   :   6;
+    unsigned rs       :   5;
+    unsigned rt       :   5;
+    unsigned immediate:  16;
+} MIPS_type_I;
+
+/* The 26-bit target is a bit-field only when `unsigned` can hold 26 bits. */
+#if (UINT_MAX >= (0x00000001UL << 26) - 1UL)
+typedef struct {
+    unsigned opcode   :   6;
+    unsigned target   :  26;
+} MIPS_type_J;
+#else
+typedef struct {
+    unsigned opcode   :   6;
+    unsigned long target; /* If `int' can't store 26 bits, `long' can. */
+} MIPS_type_J;
+#endif
+
+/*
+ * COMPILER_FENCE() is an empty asm statement with a "memory" clobber on
+ * ARM builds with GNU C, and expands to nothing everywhere else.
+ */
+#if defined(__arm__) && defined(__GNUC__)
+#define COMPILER_FENCE()     __asm__ __volatile__("":::"memory")
+#else
+#define COMPILER_FENCE()
+#endif
+
+#endif
--- a/mupen64plus-rsp-cxd4/rsp.c
+++ b/mupen64plus-rsp-cxd4/rsp.c
@ -0,0 +1,32 @@
+/******************************************************************************\
+* Project:  Module Subsystem Interface to SP Interpreter Core                  *
+* Authors:  Iconoclast                                                         *
+* Release:  2016.03.23                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+/*
+ * Guarded so that a build which already passes -DLIBRETRO does not hit an
+ * incompatible macro redefinition here.
+ */
+#ifndef LIBRETRO
+#define LIBRETRO
+#endif
+
+#if defined(USE_SSE2NEON) && defined(__ARM_NEON__)
+#include "sse2neon/SSE2NEON.h"
+#define ARCH_MIN_SSE2
+#endif
+
+#include "vu/add.c"
+#include "vu/divide.c"
+#include "vu/logical.c"
+#include "vu/multiply.c"
+#include "vu/select.c"
+#include "vu/vu.c"
+#include "su.c"
+#include "module.c"
+
+/*
+ * 32 bytes of configuration storage.
+ * NOTE(review):  presumably the array the CFG_* macros reach through the
+ * name `conf` -- confirm where that mapping is made.
+ */
+unsigned char rsp_conf[32];
--- a/mupen64plus-rsp-cxd4/rsp.h
+++ b/mupen64plus-rsp-cxd4/rsp.h
@ -0,0 +1,276 @@
+/*******************************************************************************
+* Common RSP plugin specifications:  version #1.2 created by zilmar            *
+* Revised 2014 by Iconoclast for more compliance, portability and readability. *
+*                                                                              *
+* All questions or suggestions should go through the EmuTalk plugin forum.     *
+* http://www.emutalk.net/forums/showforum.php?f=31                             *
+*******************************************************************************/
+
+#ifndef _RSP_H_INCLUDED__
+#define _RSP_H_INCLUDED__
+
+#include "my_types.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Values for the `Type` member of PLUGIN_INFO. */
+#define PLUGIN_TYPE_RSP             1
+#define PLUGIN_TYPE_GFX             2
+#define PLUGIN_TYPE_AUDIO           3
+#define PLUGIN_TYPE_CONTROLLER      4
+
+/* May be predefined by the build; defaults to 0x0102. */
+#ifndef PLUGIN_API_VERSION
+#define PLUGIN_API_VERSION      0x0102
+#endif
+
+/* old names from the original specification file */
+/*
+ * These are textual replacements:  every later use of `hInst` or
+ * `MemorySwapped`, including the struct members declared below, is
+ * compiled under the name on the right.
+ */
+#define hInst               hinst
+#define MemorySwapped       MemoryBswaped
+
+/*
+ * Declare RSP_INFO structure instance as:  `RSP_INFO RSP_INFO_NAME;'
+ * ... for the ability to use the below convenience macros.
+ *
+ * Doing the traditional `RSP_INFO rsp_info' declaration has also worked but
+ * requires accessing the RCP registers in a less portable way, for example:
+ * `*(rsp_info).MI_INTR_REG |= MI_INTR_MASK_SP;'
+ * versus
+ * `GET_RCP_REG(MI_INTR_REG) |= MI_INTR_MASK_SP;'.
+ */
+/*
+ * RSP_INFO_NAME may be predefined to choose the name of the RSP_INFO
+ * instance; otherwise it defaults per plugin API.
+ */
+#ifndef RSP_INFO_NAME
+#ifdef M64P_PLUGIN_API
+#define RSP_INFO_NAME           RSP_info
+#else
+#define RSP_INFO_NAME           RCP_info_SP
+#endif
+#endif
+
+/*
+ * The accessors are provided whether or not RSP_INFO_NAME was predefined:
+ * GET_RSP_INFO(member) yields the member itself, GET_RCP_REG(member)
+ * dereferences a pointer member.
+ */
+#ifndef GET_RSP_INFO
+#define GET_RSP_INFO(member)    ((RSP_INFO_NAME).member)
+#endif
+#ifndef GET_RCP_REG
+#define GET_RCP_REG(member)     (*(RSP_INFO_NAME).member)
+#endif
+
+/* Rectangle given by four i32 edges; used by the debugger structs below. */
+typedef struct {
+    i32 left;
+    i32 top;
+    i32 right;
+    i32 bottom;
+} winapi_rect;
+
+/*
+ * Paint information passed by value to RSPDEBUG_INFO.PaintBPPanel.
+ * NOTE(review):  the member names suggest a mirror of the Win32
+ * PAINTSTRUCT -- confirm the layout matches if it is ever exchanged with
+ * the Windows API.
+ */
+typedef struct {
+    p_void hdc;
+    int fErase;
+    winapi_rect rcPaint;
+    int fRestore;
+    int fIncUpdate;
+    u8 rgbReserved[32];
+} winapi_paintstruct;
+
+/*
+ * Plugin description filled in through GetDllInfo().  Because of the
+ * `MemorySwapped` macro defined earlier in this header, the last member
+ * is actually compiled under the name `MemoryBswaped`.
+ */
+typedef struct {
+    u16 Version;        /* Set to PLUGIN_API_VERSION. */
+    u16 Type;           /* Set to PLUGIN_TYPE_RSP. */
+    char Name[100];     /* plugin title, to help the user select plugins */
+
+    /* If the plugin supports these memory options, then set them to true. */
+    int NormalMemory;   /* a normal byte array */
+    int MemorySwapped;  /* a normal byte array choosing the client-side,
+                           native hardware's endian over the MIPS target's */
+} PLUGIN_INFO;
+
+/*
+ * Memory pointers, register pointers and callbacks handed to the plugin.
+ * Declared here only when M64P_PLUGIN_API is not defined; otherwise the
+ * type must come from elsewhere.
+ *
+ * NOTE(review):  the "high 4K" / "low 4K" labels on DMEM and IMEM look
+ * swapped relative to the 0x04001000 base that the plugin stores into
+ * SP_PC_REG -- confirm against the hardware memory map.
+ */
+#if !defined(M64P_PLUGIN_API)
+typedef struct {
+    p_void hInst;
+    int MemorySwapped;
+
+    pu8 RDRAM; /* CPU-RCP dynamic RAM (sensitive to MemorySwapped flag) */
+    pu8 DMEM; /* high 4K of SP cache memory (sensitive to MemorySwapped flag) */
+    pu8 IMEM; /* low 4K of SP cache memory (sensitive to MemorySwapped flag) */
+
+    pu32 MI_INTR_REG;
+
+    pu32 SP_MEM_ADDR_REG;
+    pu32 SP_DRAM_ADDR_REG;
+    pu32 SP_RD_LEN_REG;
+    pu32 SP_WR_LEN_REG;
+    pu32 SP_STATUS_REG;
+    pu32 SP_DMA_FULL_REG;
+    pu32 SP_DMA_BUSY_REG;
+    pu32 SP_PC_REG; /* This was supposed to be defined AFTER semaphore. */
+    pu32 SP_SEMAPHORE_REG;
+#if 0
+    pu32 SP_PC_REG; /* CPU-mapped between SP and DP command buffer regs */
+#endif
+    pu32 DPC_START_REG;
+    pu32 DPC_END_REG;
+    pu32 DPC_CURRENT_REG;
+    pu32 DPC_STATUS_REG;
+    pu32 DPC_CLOCK_REG;
+    pu32 DPC_BUFBUSY_REG;
+    pu32 DPC_PIPEBUSY_REG;
+    pu32 DPC_TMEM_REG;
+
+    /* Callbacks taking no arguments and returning nothing. */
+    p_func CheckInterrupts;
+    p_func ProcessDList;
+    p_func ProcessAList;
+    p_func ProcessRdpList;
+    p_func ShowCFB;
+} RSP_INFO;
+#endif
+
+/*
+ * Debugger hooks the RSP plugin can offer:  a menu, a break-point panel
+ * and a command window.
+ *
+ * NOTE(review):  ShowBPPanel and RemoveAllBpoint are declared as p_void
+ * (object pointers) although their neighbours are function pointers --
+ * confirm whether p_func was intended.
+ */
+typedef struct {
+    /* menu */
+    /* Items should have an ID between 5001 and 5100. */
+    p_void hRSPMenu;
+    void (*ProcessMenuItem)(int ID);
+
+    /* break points */
+    int UseBPoints;
+    char BPPanelName[20];
+    p_func Add_BPoint;
+    void (*CreateBPPanel)(p_void hDlg, winapi_rect rcBox);
+    p_func HideBPPanel;
+    void (*PaintBPPanel)(winapi_paintstruct ps);
+    p_void ShowBPPanel;
+    void (*RefreshBpoints)(p_void hList);
+    void (*RemoveBpoint)(p_void hList, int index);
+    p_void RemoveAllBpoint;
+
+    /* RSP command window */
+    p_func Enter_RSP_Commands_Window;
+} RSPDEBUG_INFO;
+
+/*
+ * Table of eight debugger callbacks, every one a function taking no
+ * arguments and returning nothing.
+ */
+typedef struct {
+    p_func UpdateBreakPoints;
+    p_func UpdateMemory;
+    p_func UpdateR4300iRegisters;
+    p_func Enter_BPoint_Window;
+    p_func Enter_R4300i_Commands_Window;
+    p_func Enter_R4300i_Register_Window;
+    p_func Enter_RSP_Commands_Window;
+    p_func Enter_Memory_Window;
+} DEBUG_INFO;
+
+/******************************************************************************
+* name     :  CloseDLL
+* optional :  no
+* call time:  when the emulator is shutting down or chooses to free memory
+* input    :  none
+* output   :  none
+*******************************************************************************/
+EXPORT void CALL CloseDLL(void);
+
+/******************************************************************************
+* name     :  DllAbout
+* optional :  yes
+* call time:  upon a request to see information about the plugin (e.g., authors)
+* input    :  a pointer to the window that called this function
+* output   :  none
+*******************************************************************************/
+EXPORT void CALL DllAbout(p_void hParent);
+
+/******************************************************************************
+* name     :  DllConfig
+* optional :  yes
+* call time:  upon a request to configure the plugin (e.g., change settings)
+* input    :  a pointer to the window that called this function
+* output   :  none
+*******************************************************************************/
+EXPORT void CALL DllConfig(p_void hParent);
+
+/******************************************************************************
+* name     :  DllTest
+* optional :  yes
+* call time:  upon a request to test the plugin (e.g., system capabilities)
+* input    :  a pointer to the window that called this function
+* output   :  none
+*******************************************************************************/
+EXPORT void CALL DllTest(p_void hParent);
+
+/******************************************************************************
+* name     :  DoRspCycles
+* optional :  no
+* call time:  when the R4300 CPU alternates control to execute on the RSP
+* input    :  number of cycles meant to be executed (for segmented execution)
+* output   :  The number of cycles executed also was intended for cycle-timing
+*             attempts, much like Project64 itself originally was, and requires
+*             individual experiment.  This value is ignored if the RSP CPU flow
+*             was halted when the function completed.  In-depth debate:
+*             http://www.emutalk.net/showthread.php?t=43088
+*******************************************************************************/
+EXPORT u32 CALL DoRspCycles(u32 Cycles);
+
+/******************************************************************************
+* name     :  GetDllInfo
+* optional :  no
+* call time:  during the enumeration of valid plugins the emulator can load
+* input    :  a pointer to a PLUGIN_INFO structure used to determine support
+* output   :  none
+*******************************************************************************/
+EXPORT void CALL GetDllInfo(PLUGIN_INFO * PluginInfo);
+
+/******************************************************************************
+* name     :  GetRspDebugInfo
+* optional :  yes
+* call time:  when the emulator requests information about what the RSP plugin
+*             is and is not programmed to debug
+* input    :  a pointer to a RSPDEBUG_INFO structure to determine capabilities
+* output   :  none
+*******************************************************************************/
+EXPORT void CALL GetRspDebugInfo(RSPDEBUG_INFO * RSPDebugInfo);
+
+/******************************************************************************
+* name     :  InitiateRSP
+* optional :  no
+* call time:  after the emulator has successfully loaded the plugin but needs
+*             more information about it before proceeding to start emulation
+* input    :  a RSP_INFO structure mostly for setting up the RCP memory map
+* output   :  none
+*******************************************************************************/
+EXPORT void CALL InitiateRSP(RSP_INFO Rsp_Info, pu32 CycleCount);
+
+/******************************************************************************
+* name     :  InitiateRSPDebugger
+* optional :  yes
+* call time:  after plugin load, when the emulator is ready to supply an
+*             informational structure useful to the RSP plugin for integrating
+*             its debugger, if any, with the rest of the emulator
+* input    :  a DEBUG_INFO structure offering debugger integration information
+* output   :  none
+*******************************************************************************/
+EXPORT void CALL InitiateRSPDebugger(DEBUG_INFO DebugInfo);
+
+/******************************************************************************
+* name     :  RomClosed
+* optional :  no
+* call time:  when unloading the ROM (sometimes when emulation ends)
+* input    :  none
+* output   :  none
+*******************************************************************************/
+EXPORT void CALL RomClosed(void);
+
+/*
+ * required?? in version #1.2 of the RSP plugin spec
+ * Have not tested a #1.2 implementation yet so shouldn't document them yet.
+ *
+ * Most of these functions were made to inhibit private plugin distribution
+ * from Project64 in its commercial state, and there is no documentation of
+ * these in the source to Project64 2.x as of yet.
+ */
+#if (PLUGIN_API_VERSION >= 0x0102) && !defined(M64P_PLUGIN_API)
+EXPORT void CALL RomOpen(void);
+EXPORT void CALL EnableDebugging(int Enabled);
+EXPORT void CALL PluginLoaded(void);
+#endif
+
+/************ profiling **************/
+#define Default_ProfilingOn         0
+#define Default_IndvidualBlock      0
+#define Default_ShowErrors          0
+#define Default_AudioHle            0
+
+#define InterpreterCPU      0
+#define RecompilerCPU       1
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/mupen64plus-rsp-cxd4/rsp_dump.cpp
+++ b/mupen64plus-rsp-cxd4/rsp_dump.cpp
@ -0,0 +1,90 @@
+#include "rsp_dump.h"
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+static FILE *file;
+static bool in_trace;
+
+/*
+ * Start a new trace file at `path` and write the 8-byte "RSPDUMP1" signature.
+ *
+ * A trace that is still open is finished through rsp_close_trace() first so
+ * its handle is not leaked.  If the file cannot be created, `file` stays
+ * null and every other rsp_dump_* call remains a no-op.
+ */
+void rsp_open_trace(const char *path)
+{
+   if (file)
+      rsp_close_trace();
+
+   file = fopen(path, "wb");
+   if (!file)
+      return; /* could not create the trace; leave tracing disabled */
+
+   fwrite("RSPDUMP1", 1, 8, file);
+}
+
+/*
+ * Finish the current trace:  append the "EOF     " marker, close the file
+ * and forget the handle.  Safe to call when no trace is open.
+ *
+ * The recording flag is cleared as well, so rsp_dump_recording_trace() does
+ * not keep reporting an active trace after its file is gone.
+ */
+void rsp_close_trace(void)
+{
+   if (file)
+   {
+      fwrite("EOF     ", 1, 8, file);
+      fclose(file);
+   }
+
+   file = nullptr;
+   in_trace = false;
+}
+
+/*
+ * Write the "BEGIN   " marker and raise the recording flag.
+ * Nothing happens while no trace file is open.
+ */
+void rsp_dump_begin_trace(void)
+{
+   if (file != nullptr)
+   {
+      fwrite("BEGIN   ", 1, 8, file);
+      in_trace = true;
+   }
+}
+
+/* Report the recording flag as an int:  1 while set, otherwise 0. */
+int rsp_dump_recording_trace(void)
+{
+   return in_trace ? 1 : 0;
+}
+
+/*
+ * Write the "END     " marker and lower the recording flag.
+ * Nothing happens while no trace file is open.
+ */
+void rsp_dump_end_trace(void)
+{
+   if (file != nullptr)
+   {
+      fwrite("END     ", 1, 8, file);
+      in_trace = false;
+   }
+}
+
+/*
+ * Append one tagged record to the trace:
+ *    8 bytes   tag
+ *    4 bytes   payload size (uint32_t, host byte order)
+ *    size      payload bytes
+ *
+ * `tag` is expected to be exactly 8 characters (asserted in debug builds).
+ * The tag field is always written as 8 bytes -- a shorter tag is padded
+ * with spaces, like the built-in markers, and a longer one is cut -- so the
+ * record framing stays intact even when assert() is compiled out.
+ * Nothing is written while no trace file is open.
+ */
+void rsp_dump_block(const char *tag, const void *data, size_t size)
+{
+   if (!file)
+      return;
+
+   char tag_field[8];
+   const size_t tag_len = strlen(tag);
+   const uint32_t size_data = (uint32_t)size;
+
+   assert(tag_len == sizeof(tag_field));
+   memset(tag_field, ' ', sizeof(tag_field));
+   memcpy(tag_field, tag,
+         tag_len < sizeof(tag_field) ? tag_len : sizeof(tag_field));
+
+   fwrite(tag_field, 1, sizeof(tag_field), file);
+   fwrite(&size_data, sizeof(size_data), 1, file);
+   if (size != 0) /* nothing to write, and `data` may be null then */
+      fwrite(data, size, 1, file);
+}
+
+/* Write the "BEGINDMA" marker; does nothing while no trace file is open. */
+void rsp_dump_begin_read_dma(void)
+{
+   if (file != nullptr)
+      fwrite("BEGINDMA", 1, 8, file);
+}
+
+/*
+ * Append a "POKE    " record:  the marker, `base` and `size` as two 32-bit
+ * values in host byte order, then `size` bytes taken from `data`.
+ * Does nothing while no trace file is open.
+ */
+void rsp_dump_poke_mem(unsigned base, const void *data, size_t size)
+{
+   if (file == nullptr)
+      return;
+
+   uint32_t length_field = size;
+   uint32_t address_field = base;
+
+   fwrite("POKE    ", 1, 8, file);
+   fwrite(&address_field, sizeof(address_field), 1, file);
+   fwrite(&length_field, sizeof(length_field), 1, file);
+   fwrite(data, size, 1, file);
+}
+
+/* Write the "ENDDMA  " marker; does nothing while no trace file is open. */
+void rsp_dump_end_read_dma(void)
+{
+   if (file != nullptr)
+      fwrite("ENDDMA  ", 1, 8, file);
+}
--- a/mupen64plus-rsp-cxd4/rsp_dump.h
+++ b/mupen64plus-rsp-cxd4/rsp_dump.h
@ -0,0 +1,28 @@
+#ifndef RSP_DUMP_H__
+#define RSP_DUMP_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Open `path` for binary writing and write the "RSPDUMP1" signature. */
+void rsp_open_trace(const char *path);
+/* Write the "EOF     " marker and close the trace, if one is open. */
+void rsp_close_trace(void);
+
+/*
+ * Write the "BEGIN   " / "END     " marker and set / clear the recording
+ * flag.  Both do nothing while no trace file is open.
+ */
+void rsp_dump_begin_trace(void);
+void rsp_dump_end_trace(void);
+
+/*
+ * Write one record:  the 8-character `tag`, `size` as a 32-bit value, then
+ * `size` bytes from `data`.  Does nothing while no trace file is open.
+ */
+void rsp_dump_block(const char *tag, const void *data, size_t size);
+
+/*
+ * DMA read bracket:  "BEGINDMA" and "ENDDMA  " markers, with "POKE    "
+ * records (32-bit base, 32-bit size, payload) available in between.
+ * All three do nothing while no trace file is open.
+ */
+void rsp_dump_begin_read_dma(void);
+void rsp_dump_poke_mem(unsigned base, const void *data, size_t size);
+void rsp_dump_end_read_dma(void);
+
+/* Current value of the recording flag driven by begin/end_trace. */
+int rsp_dump_recording_trace(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/mupen64plus-rsp-cxd4/sse2neon/SSE2NEON.h
+++ b/mupen64plus-rsp-cxd4/sse2neon/SSE2NEON.h
--- a/mupen64plus-rsp-cxd4/su.c
+++ b/mupen64plus-rsp-cxd4/su.c
--- a/mupen64plus-rsp-cxd4/su.h
+++ b/mupen64plus-rsp-cxd4/su.h
@ -0,0 +1,362 @@
+/******************************************************************************\
+* Project:  Basic MIPS R4000 Instruction Set for Scalar Unit Operations        *
+* Authors:  Iconoclast                                                         *
+* Release:  2016.03.23                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#ifndef _SU_H_
+#define _SU_H_
+
+#include <limits.h>
+#include <stdio.h>
+
+#include "my_types.h"
+#include "rsp.h"
+
+#define EXTERN_COMMAND_LIST_GBI
+#define EXTERN_COMMAND_LIST_ABI
+#define SEMAPHORE_LOCK_CORRECTIONS
+#define WAIT_FOR_CPU_HOST
+
+#if (0)
+#define SP_EXECUTE_LOG
+#define VU_EMULATE_SCALAR_ACCUMULATOR_READ
+#endif
+
+/*
+ * Currently, the plugin system this module is written for doesn't notify us
+ * of how much RDRAM is installed to the system, so we have to presume 8 MiB.
+ */
+#define MAX_DRAM_ADDR           0x007FFFFFul
+#define MAX_DRAM_DMA_ADDR       (MAX_DRAM_ADDR & ~7)
+
+/*
+ * Interact with memory using server-side byte order (MIPS big-endian) or
+ * client-side (VM host's) native byte order on a 32-bit boundary.
+ *
+ * Unfortunately, most op-codes are optimized to require this to be TRUE.
+ */
+#if (ENDIAN_M == 0)
+#define USE_CLIENT_ENDIAN       0
+#else
+#define USE_CLIENT_ENDIAN       1
+#endif
+
+/*
+ * Always keep this enabled for faster interpreter CPU.
+ *
+ * If you disable this, the branch delay slot algorithm will match the
+ * documentation found in the MIPS manuals (which is not entirely accurate).
+ *
+ * Enabled:
+ *     while (CPU_running) {
+ *         PC = static_delay_slot_adjustments();
+ *         switch (opcode) { ... continue; }
+ * Disabled:
+ *     while (CPU_running) {
+ *         switch (opcode) { ... break; }
+ *         PC = documented_branch_delay_slot();
+ *         continue;
+ */
+#if 1
+#define EMULATE_STATIC_PC
+#endif
+
+/*
+ * Symbolic names for scalar register numbers.
+ *
+ * Only zero, at, sp, fp (alias S8) and ra are always available; the other
+ * conventional MIPS names (v0-v1, a0-a3, t0-t9, s0-s7, k0-k1, gp) are
+ * compiled in only when TRUE_MIPS_AND_NOT_JUST_THE_RSP_SUBSET is defined.
+ * Every enumerator equals the register number it names.
+ */
+typedef enum {
+    zero = 0,
+    at =  1,
+#ifdef TRUE_MIPS_AND_NOT_JUST_THE_RSP_SUBSET
+    v0 =  2,
+    v1 =  3,
+
+    a0 =  4,
+    a1 =  5,
+    a2 =  6,
+    a3 =  7,
+
+    t0 =  8,
+    t1 =  9,
+    t2 = 10,
+    t3 = 11,
+    t4 = 12,
+    t5 = 13,
+    t6 = 14,
+    t7 = 15,
+    t8 = 24,
+    t9 = 25,
+
+    s0 = 16,
+    s1 = 17,
+    s2 = 18,
+    s3 = 19,
+    s4 = 20,
+    s5 = 21,
+    s6 = 22,
+    s7 = 23,
+
+    k0 = 26,
+    k1 = 27,
+
+    gp = 28,
+#endif
+    sp = 29,
+    fp = 30, /* new, official MIPS name for it:  "frame pointer" */
+    ra = 31,
+    S8 = fp
+} GPR_specifier;
+
+extern RSP_INFO RSP_INFO_NAME;
+extern pu8 DRAM;
+extern pu8 DMEM;
+extern pu8 IMEM;
+
+extern u8 conf[32];
+
+/*
+ * general-purpose scalar registers
+ *
+ * based on the MIPS instruction set architecture but without most of the
+ * original register names (for example, no kernel-reserved registers)
+ */
+extern u32 SR[32];
+
+#define FIT_IMEM(PC)    ((PC) & 0xFFFu & 0xFFCu)
+
+#ifdef EMULATE_STATIC_PC
+#define JUMP        goto set_branch_delay
+#else
+#define JUMP        break
+#endif
+
+#ifdef EMULATE_STATIC_PC
+#define BASE_OFF    0x000
+#else
+#define BASE_OFF    0x004
+#endif
+
+#ifndef EMULATE_STATIC_PC
+/*
+ * NOTE(review):  this is a definition rather than an `extern` declaration,
+ * so every translation unit that includes this header defines its own
+ * `stage` once EMULATE_STATIC_PC is disabled -- confirm that this still
+ * links (e.g. with -fno-common) before turning EMULATE_STATIC_PC off.
+ */
+int stage;
+#endif
+
+extern int temp_PC;
+#ifdef WAIT_FOR_CPU_HOST
+extern short MFC0_count[32];
+/* Keep one C0 MF status read count for each scalar register. */
+#endif
+
+/*
+ * The number of times to tolerate executing `MFC0    $at, $c4`.
+ * Replace $at with any register--the timeout limit is per each.
+ *
+ * Set to a higher value to avoid prematurely quitting the interpreter.
+ * Set to a lower value for speed...you could get away with 10 sometimes.
+ */
+extern int MF_SP_STATUS_TIMEOUT;
+
+#define SLOT_OFF    ((BASE_OFF) + 0x000)
+#define LINK_OFF    ((BASE_OFF) + 0x004)
+extern void set_PC(unsigned int address);
+
+/*
+ * If the client CPU's shift amount is exactly 5 bits for a 32-bit source,
+ * then omit emulating (sa & 31) in the SLL/SRL/SRA interpreter steps.
+ * (Additionally, omit doing (GPR[rs] & 31) in SLLV/SRLV/SRAV.)
+ *
+ * As C pre-processor logic seems incapable of interpreting type storage,
+ * stuff like #if (1U << 31 == 1U << ~0U) will generally just fail.
+ */
+#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
+#define MASK_SA(sa) (sa)
+#define IW_RD(inst) ((u16)(inst) >> 11)
+#else
+#define MASK_SA(sa) ((sa) & 31)
+#define IW_RD(inst) (u8)(((inst) >> 11) % (1 << 5))
+#endif
+
+/*
+ * If primary op-code is SPECIAL (000000), we could skip ANDing the rs shift.
+ * Shifts losing precision are undefined, so don't assume that (1 >> 1 == 0).
+ */
+#if (0xFFFFFFFFul >> 31 != 0x000000001ul) || defined(_DEBUG)
+#define SPECIAL_DECODE_RS(inst)     (((inst) & 0x03E00000UL) >> 21)
+#else
+#define SPECIAL_DECODE_RS(inst)     ((inst) >> 21)
+#endif
+
+/*
+ * Try to stick to (unsigned char) to conform to strict aliasing rules.
+ *
+ * Do not say `u8`.  My custom type definitions are minimum-size types.
+ * Do not say `uint8_t`.  Exact-width types are not portable/universal.
+ */
+#if (CHAR_BIT != 8)
+#error Non-POSIX-compliant (char) storage width.
+#endif
+
+/*
+ * RSP general-purpose registers (GPRs) are always 32-bit scalars (SRs).
+ * SR_B(gpr, 0) is SR[gpr]31..24, and SR_B(gpr, 3) is SR[gpr]7..0.
+ */
+#define SR_B(scalar, i)         *((unsigned char *)&(SR[scalar]) + BES(i))
+
+/*
+ * Universal byte-access macro for 8-element vectors of 16-bit halfwords.
+ * Use this macro if you are not sure whether the element is odd or even.
+ *
+ * Maybe a typedef union{} can be better, but it's less readable for RSP
+ * vector registers.  Only 16-bit element computations exist, so the correct
+ * allocation of the register file is int16_t v[32][8], not a_union v[32].
+ *
+ * Either method--dynamic union reads or special aliasing--is undefined
+ * behavior and will not truly be portable code anyway, so it hardly matters.
+ */
+#define VR_B(vt, element)       *((unsigned char *)&(VR[vt][0]) + MES(element))
+
+/*
+ * Optimized byte-access macros for the vector registers.
+ * Use these ONLY if you know the element is even (VR_A) or odd (VR_U).
+ *
+ * They are faster because LEA PTR [offset +/- 1] means fewer CPU
+ * instructions generated than (offset ^ 1) does, in most cases.
+ */
+/* `e` is parenthesized so a compound argument is applied as one offset. */
+#define VR_A(vt, e)             *((unsigned char *)&(VR[vt][0]) + (e) + MES(0))
+#define VR_U(vt, e)             *((unsigned char *)&(VR[vt][0]) + (e) - MES(0))
+
+/*
+ * Use this ONLY if you know the element is even, not odd.
+ *
+ * This is only provided for purposes of consistency with VR_B() and friends.
+ * Saying `VR[vt][1] = x;` instead of `VR_S(vt, 2) = x` works as well.
+ */
+/* `element` is a byte offset; parenthesized so compound arguments bind whole. */
+#define VR_S(vt, element)       *(pi16)((unsigned char *)&(VR[vt][0]) + (element))
+
+/*** Scalar, Coprocessor Operations (system control) ***/
+#define SP_STATUS_HALT          (0x00000001ul <<  0)
+#define SP_STATUS_BROKE         (0x00000001ul <<  1)
+#define SP_STATUS_DMA_BUSY      (0x00000001ul <<  2)
+#define SP_STATUS_DMA_FULL      (0x00000001ul <<  3)
+#define SP_STATUS_IO_FULL       (0x00000001ul <<  4)
+#define SP_STATUS_SSTEP         (0x00000001ul <<  5)
+#define SP_STATUS_INTR_BREAK    (0x00000001ul <<  6)
+#define SP_STATUS_SIG0          (0x00000001ul <<  7)
+#define SP_STATUS_SIG1          (0x00000001ul <<  8)
+#define SP_STATUS_SIG2          (0x00000001ul <<  9)
+#define SP_STATUS_SIG3          (0x00000001ul << 10)
+#define SP_STATUS_SIG4          (0x00000001ul << 11)
+#define SP_STATUS_SIG5          (0x00000001ul << 12)
+#define SP_STATUS_SIG6          (0x00000001ul << 13)
+#define SP_STATUS_SIG7          (0x00000001ul << 14)
+
+#define NUMBER_OF_CP0_REGISTERS         16
+extern pu32 CR[NUMBER_OF_CP0_REGISTERS];
+
+extern void SP_DMA_READ(void);
+extern void SP_DMA_WRITE(void);
+
+extern u16 rwR_VCE(void);
+extern void rwW_VCE(u16 VCE);
+
+extern void MFC2(unsigned int rt, unsigned int vs, unsigned int e);
+extern void MTC2(unsigned int rt, unsigned int vd, unsigned int e);
+extern void CFC2(unsigned int rt, unsigned int rd);
+extern void CTC2(unsigned int rt, unsigned int rd);
+
+/*** Modern pseudo-operations (not real instructions, but nice shortcuts) ***/
+extern void ULW(unsigned int rd, u32 addr);
+extern void USW(unsigned int rs, u32 addr);
+
+/*
+ * The scalar unit controls the primary R4000 operations implementation,
+ * which inherently includes interfacing with the vector unit under COP2.
+ *
+ * Although no scalar unit operations are computational vector operations,
+ * several of them will access machine states shared with the vector unit.
+ *
+ * We will need access to the vector unit's vector register file and its
+ * vector control register file used mainly for vector select instructions.
+ */
+#include "vu/select.h"
+
+NOINLINE extern void res_S(void);
+
+extern void SP_CP0_MF(unsigned int rt, unsigned int rd);
+
+/*
+ * example syntax (basically the same for all LWC2/SWC2 ops):
+ * LTWV    $v0[0], -64($at)
+ * SBV     $v0[9], 0xFFE($0)
+ */
+typedef void(*mwc2_func)(
+    unsigned int vt,
+    unsigned int element,
+    signed int offset,
+    unsigned int base
+);
+
+extern mwc2_func LWC2[2 * 8*2];
+extern mwc2_func SWC2[2 * 8*2];
+
+extern void res_lsw(
+    unsigned int vt,
+    unsigned int element,
+    signed int offset,
+    unsigned int base
+);
+
+/*** Scalar, Coprocessor Operations (vector unit, scalar cache transfers) ***/
+extern void LBV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void LSV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void LLV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void LDV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SBV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SSV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SLV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SDV(unsigned vt, unsigned element, signed offset, unsigned base);
+
+/*
+ * Group II vector loads and stores:
+ * PV and UV (As of RCP implementation, XV and ZV are reserved opcodes.)
+ */
+extern void LPV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void LUV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SPV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SUV(unsigned vt, unsigned element, signed offset, unsigned base);
+
+/*
+ * Group III vector loads and stores:
+ * HV, FV, and AV (As of RCP implementation, AV opcodes are reserved.)
+ */
+extern void LHV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void LFV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SHV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SFV(unsigned vt, unsigned element, signed offset, unsigned base);
+
+/*
+ * Group IV vector loads and stores:
+ * QV and RV
+ */
+extern void LQV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void LRV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SQV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SRV(unsigned vt, unsigned element, signed offset, unsigned base);
+
+/*
+ * Group V vector loads and stores
+ * TV and SWV (As of RCP implementation, LTWV opcode was undesired.)
+ */
+extern void LTV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void SWV(unsigned vt, unsigned element, signed offset, unsigned base);
+extern void STV(unsigned vt, unsigned element, signed offset, unsigned base);
+
+NOINLINE extern void run_task(void);
+
+#endif
--- a/mupen64plus-rsp-cxd4/vu/add.c
+++ b/mupen64plus-rsp-cxd4/vu/add.c
@ -0,0 +1,367 @@
+/******************************************************************************\
+* Project:  MSP Simulation Layer for Vector Unit Computational Adds            *
+* Authors:  Iconoclast                                                         *
+* Release:  2016.03.23                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#include "add.h"
+
+#ifdef ARCH_MIN_SSE2
+/*
+ * SSE2 path:  VD = saturate(VS + VT + cf_co), one signed 16-bit lane each.
+ *
+ * Both additions saturate, so the order matters:  the carry-in flags are
+ * added to the LESSER of the two operands first, and only then is the
+ * greater operand added, so that the first (premature) clamp cannot skew
+ * the final sum.
+ */
+static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
+{
+    v16 lhs, rhs, carry_in;
+    v16 greater, lesser;
+
+    lhs      = _mm_load_si128((v16 *)VS);
+    rhs      = _mm_load_si128((v16 *)VT);
+    carry_in = _mm_load_si128((v16 *)cf_co);
+
+    greater = _mm_max_epi16(rhs, lhs);
+    lesser  = _mm_min_epi16(rhs, lhs);
+
+    lesser  = _mm_adds_epi16(lesser, carry_in);
+    greater = _mm_adds_epi16(greater, lesser);
+    _mm_store_si128((v16 *)VD, greater);
+}
+/*
+ * SSE2 path:  clamped subtraction of VT and the cf_co flags from VS.
+ *
+ * `res` starts as the saturating difference VS - VT.  The cf_co flags are
+ * then subtracted from it, again with saturation, except in the lanes
+ * selected by the mask built in `xmm`; the result is stored to VD.
+ * The mask steps depend on each other, so their order must be kept.
+ */
+static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT)
+{
+    v16 dst, src, vco;
+    v16 dif, res, xmm;
+
+    src = _mm_load_si128((v16 *)VS);
+    dst = _mm_load_si128((v16 *)VT);
+    vco = _mm_load_si128((v16 *)cf_co);
+
+    res = _mm_subs_epi16(src, dst);
+
+/*
+ * Due to premature clamps in-between subtracting two of the three operands,
+ * we must be careful not to offset the result accidentally when subtracting
+ * the corresponding VCO flag AFTER the saturation from doing (VS - VT).
+ */
+    dif = _mm_add_epi16(res, vco);
+    dif = _mm_xor_si128(dif, res); /* Adding one suddenly inverts the sign? */
+    dif = _mm_and_si128(dif, dst); /* Sign change due to subtracting a neg. */
+    xmm = _mm_sub_epi16(src, dst);
+    src = _mm_andnot_si128(src, dif); /* VS must be >= 0x0000 for overflow. */
+    xmm = _mm_and_si128(xmm, src); /* VS + VT != INT16_MIN; VS + VT >= +32768 */
+    xmm = _mm_srli_epi16(xmm, 15); /* src = (INT16_MAX + 1 === INT16_MIN) ? */
+
+    xmm = _mm_andnot_si128(xmm, vco); /* If it's NOT overflow, keep flag. */
+    res = _mm_subs_epi16(res, xmm);
+    _mm_store_si128((v16 *)VD, res);
+    return;
+}
+#else
+/*
+ * Portable path:  VD = VACC_L with out-of-range lanes replaced by the
+ * signed 16-bit limits.
+ *
+ * The full-width sum VS + VT + cf_co is used only to detect overflow:
+ * `under` / `over` become all-ones masks for lanes below -32768 / above
+ * +32767.  An underflowing lane is forced to 0x8000 and an overflowing
+ * lane to 0x7FFF; all others keep the value copied from VACC_L, which the
+ * caller has filled in beforehand.
+ */
+static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
+{
+    i32 total[N];
+    i16 over[N], under[N];
+    int k;
+
+    for (k = 0; k < N; k++) {
+        total[k] = VS[k] + VT[k] + cf_co[k];
+        under[k] = (total[k] + 0x8000) >> 31;
+        over[k]  = (0x7FFF - total[k]) >> 31;
+    }
+
+    vector_copy(VD, VACC_L);
+    for (k = 0; k < N; k++) {
+        VD[k] &= ~under[k];
+        VD[k] |=  over[k];
+        VD[k] ^= 0x8000 & (over[k] | under[k]);
+    }
+}
+/*
+ * Portable path:  VD = VACC_L with out-of-range lanes replaced by the
+ * signed 16-bit limits.
+ *
+ * The full-width difference VS - VT - cf_co is used only to detect
+ * overflow:  `under` / `over` become all-ones masks for lanes below -32768
+ * / above +32767.  An underflowing lane is forced to 0x8000 and an
+ * overflowing lane to 0x7FFF; all others keep the value copied from
+ * VACC_L, which the caller has filled in beforehand.
+ */
+static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT)
+{
+    i32 delta[N];
+    i16 over[N], under[N];
+    int k;
+
+    for (k = 0; k < N; k++) {
+        delta[k] = VS[k] - VT[k] - cf_co[k];
+        under[k] = (delta[k] + 0x8000) >> 31;
+        over[k]  = (0x7FFF - delta[k]) >> 31;
+    }
+
+    vector_copy(VD, VACC_L);
+    for (k = 0; k < N; k++) {
+        VD[k] &= ~under[k];
+        VD[k] |=  over[k];
+        VD[k] ^= 0x8000 & (over[k] | under[k]);
+    }
+}
+#endif
+
+/*
+ * Add with carry-in, then clear the carry state.
+ *
+ * VACC_L receives VS + VT + cf_co per lane, VD the clamped form of the
+ * same sum, and afterwards both cf_ne and cf_co are zeroed.
+ */
+INLINE static void clr_ci(pi16 VD, pi16 VS, pi16 VT)
+{
+    int lane;
+
+    for (lane = 0; lane < N; lane++) {
+        VACC_L[lane] = VS[lane] + VT[lane] + cf_co[lane];
+    }
+    SIGNED_CLAMP_ADD(VD, VS, VT);
+
+    /* Equivalent of `CTC2 $0, $vco`:  the flags have been consumed. */
+    vector_wipe(cf_ne);
+    vector_wipe(cf_co);
+}
+
+/*
+ * Subtract with borrow-in, then clear the carry state.
+ *
+ * VACC_L receives VS - VT - cf_co per lane, VD the clamped form of the
+ * same difference, and afterwards both cf_ne and cf_co are zeroed.
+ */
+INLINE static void clr_bi(pi16 VD, pi16 VS, pi16 VT)
+{
+    int lane;
+
+    for (lane = 0; lane < N; lane++) {
+        VACC_L[lane] = VS[lane] - VT[lane] - cf_co[lane];
+    }
+    SIGNED_CLAMP_SUB(VD, VS, VT);
+
+    /* Equivalent of `CTC2 $0, $vco`:  the flags have been consumed. */
+    vector_wipe(cf_ne);
+    vector_wipe(cf_co);
+}
+
+/*
+ * -1:  VT *= -1, because VS < 0 // VT ^= -2 if even, or ^= -1, += 1
+ *  0:  VT *=  0, because VS = 0 // VT ^= VT
+ * +1:  VT *= +1, because VS > 0 // VT ^=  0
+ *      VT ^= -1, "negate" -32768 as ~+32767 (corner case hack for N64 SP)
+ */
+INLINE static void do_abs(pi16 VD, pi16 VS, pi16 VT)
+{
+    i16 neg[N], pos[N];
+    i16 nez[N], cch[N]; /* corner case hack -- abs(-32768) == +32767 */
+    ALIGNED i16 res[N];
+    register int i;
+
+    /* cch flags the lanes where VT is exactly -32768 (before any scaling). */
+    vector_copy(res, VT);
+    for (i = 0; i < N; i++)
+        cch[i]  = (res[i] == -32768);
+
+    /* nez becomes the sign of VS per lane:  -1, 0 or +1. */
+    for (i = 0; i < N; i++)
+        neg[i]  = (VS[i] <  0x0000);
+    for (i = 0; i < N; i++)
+        pos[i]  = (VS[i] >  0x0000);
+    vector_wipe(nez);
+
+    for (i = 0; i < N; i++)
+        nez[i] -= neg[i];
+    for (i = 0; i < N; i++)
+        nez[i] += pos[i];
+
+    /* res = VT * sign(VS), then 1 is subtracted in the flagged lanes. */
+    for (i = 0; i < N; i++)
+        res[i] *= nez[i];
+    /*
+     * NOTE(review):  cch depends on VT only, so this adjustment is applied
+     * whatever the sign of VS is (also for VS == 0 and VS > 0) -- confirm
+     * that this matches the intended -32768 corner case.
+     */
+    for (i = 0; i < N; i++)
+        res[i] -= cch[i];
+    /* The same result goes to the low accumulator and to VD. */
+    vector_copy(VACC_L, res);
+    vector_copy(VD, VACC_L);
+    return;
+}
+
+/*
+ * Add without carry-in and record the carry-out.
+ *
+ * VACC_L and VD receive VS + VT per lane.  cf_ne is cleared, and cf_co is
+ * set from bit 16 upward of the sum of the operands taken as unsigned
+ * 16-bit values.
+ */
+INLINE static void set_co(pi16 VD, pi16 VS, pi16 VT)
+{
+    i32 wide[N];
+    int lane;
+
+    /* Unsigned widened sum, kept only to extract the carry-out below. */
+    for (lane = 0; lane < N; lane++) {
+        wide[lane] = (u16)(VS[lane]) + (u16)(VT[lane]);
+    }
+    for (lane = 0; lane < N; lane++) {
+        VACC_L[lane] = VS[lane] + VT[lane];
+    }
+    vector_copy(VD, VACC_L);
+
+    vector_wipe(cf_ne);
+    for (lane = 0; lane < N; lane++) {
+        cf_co[lane] = wide[lane] >> 16; /* i.e. (wide[lane] > +65535) */
+    }
+}
+
+/*
+ * Subtract without borrow-in and record the borrow-out.
+ *
+ * VACC_L and VD receive VS - VT per lane.  cf_ne is set where the operands
+ * differ, and cf_co where the difference of the operands taken as unsigned
+ * 16-bit values is negative.
+ */
+INLINE static void set_bo(pi16 VD, pi16 VS, pi16 VT)
+{
+    i32 wide[N];
+    int lane;
+
+    /* Unsigned widened difference, kept only to detect the borrow below. */
+    for (lane = 0; lane < N; lane++) {
+        wide[lane] = (u16)(VS[lane]) - (u16)(VT[lane]);
+    }
+    for (lane = 0; lane < N; lane++) {
+        VACC_L[lane] = VS[lane] - VT[lane];
+    }
+    for (lane = 0; lane < N; lane++) {
+        cf_ne[lane] = (VS[lane] != VT[lane]);
+    }
+    for (lane = 0; lane < N; lane++) {
+        cf_co[lane] = (wide[lane] < 0);
+    }
+    vector_copy(VD, VACC_L);
+}
+
+/*
+ * VADD:  run clr_ci() on the two operands.
+ * SSE2 builds spill the operands to aligned arrays and return the result
+ * vector; other builds pass the operands through and copy the result to
+ * V_result.
+ */
+VECTOR_OPERATION VADD(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    clr_ci(VD, VS, VT);
+    COMPILER_FENCE();
+    vs = *(v16 *)VD;
+    return (vs);
+#else
+    clr_ci(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VSUB:  run clr_bi() on the two operands.
+ * SSE2 builds spill the operands to aligned arrays and return the result
+ * vector; other builds pass the operands through and copy the result to
+ * V_result.
+ */
+VECTOR_OPERATION VSUB(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    clr_bi(VD, VS, VT);
+    COMPILER_FENCE();
+    vs = *(v16 *)VD;
+    return (vs);
+#else
+    clr_bi(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VABS:  run do_abs() on the two operands.
+ * SSE2 builds spill the operands to aligned arrays and return the result
+ * vector; other builds pass the operands through and copy the result to
+ * V_result.
+ */
+VECTOR_OPERATION VABS(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    do_abs(VD, VS, VT);
+    COMPILER_FENCE();
+    vs = *(v16 *)VD;
+    return (vs);
+#else
+    do_abs(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VADDC:  run set_co() on the two operands.
+ * SSE2 builds spill the operands to aligned arrays and return the result
+ * vector; other builds pass the operands through and copy the result to
+ * V_result.
+ */
+VECTOR_OPERATION VADDC(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    set_co(VD, VS, VT);
+    COMPILER_FENCE();
+    vs = *(v16 *)VD;
+    return (vs);
+#else
+    set_co(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VSUBC:  run set_bo() on the two operands.
+ * SSE2 builds spill the operands to aligned arrays and return the result
+ * vector; other builds pass the operands through and copy the result to
+ * V_result.
+ */
+VECTOR_OPERATION VSUBC(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    set_bo(VD, VS, VT);
+    COMPILER_FENCE();
+    vs = *(v16 *)VD;
+    return (vs);
+#else
+    set_bo(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VSAW:  read one slice of the accumulator.
+ *
+ * The slice index is the 4-bit field at bits 24..21 of inst_word with bit 3
+ * flipped, so field values 8..F select 0..7.  Indices 0..2 select
+ * VACC[index]; anything else reports "Illegal mask." and yields a zeroed
+ * vector.  The operands themselves are not used.
+ */
+VECTOR_OPERATION VSAW(v16 vs, v16 vt)
+{
+    unsigned int element;
+
+    element = 0xF & (inst_word >> 21);
+    element = element ^ 0x8;
+
+#ifdef ARCH_MIN_SSE2
+    if (element <= 0x2) {
+        vs = *(v16 *)VACC[element];
+    } else {
+        message("VSAW\nIllegal mask.");
+        vector_wipe(vs);
+    }
+    vt = vs; /* assignment keeps `vt` referenced */
+    return (vt);
+#else
+    if (element <= 0x2) {
+        vector_copy(V_result, VACC[element]);
+    } else {
+        message("VSAW\nIllegal mask.");
+        vector_wipe(V_result);
+    }
+    (void)vs; /* -Wunused-but-set-parameter */
+    (void)vt;
+#endif
+}
--- a/mupen64plus-rsp-cxd4/vu/add.h
+++ b/mupen64plus-rsp-cxd4/vu/add.h
@ -0,0 +1,34 @@
+/******************************************************************************\
+* Project:  Instruction Mnemonics for Vector Unit Computational Adds           *
+* Authors:  Iconoclast                                                         *
+* Release:  2014.10.15                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#ifndef _ADD_H_
+#define _ADD_H_
+
+#include "vu.h"
+
+/*
+ * Vector unit computational adds (implemented in add.c):
+ *     VADD   add with carry-in from the carry flags, clamped result,
+ *            carry flags cleared afterwards
+ *     VSUB   subtract with borrow-in from the carry flags, clamped result,
+ *            carry flags cleared afterwards
+ *     VABS   VT scaled by the sign of VS (-1, 0 or +1)
+ *     VADDC  add without carry-in, carry-out stored to the carry flags
+ *     VSUBC  subtract without borrow-in, borrow-out and "not equal" stored
+ *            to the flags
+ *     VSAW   read an accumulator slice selected by the instruction word
+ */
+VECTOR_EXTERN
+    VADD   (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VSUB   (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VABS   (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VADDC  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VSUBC  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VSAW   (v16 vs, v16 vt);
+
+#endif
--- a/mupen64plus-rsp-cxd4/vu/divide.c
+++ b/mupen64plus-rsp-cxd4/vu/divide.c
--- a/mupen64plus-rsp-cxd4/vu/divide.h
+++ b/mupen64plus-rsp-cxd4/vu/divide.h
@ -0,0 +1,38 @@
+/******************************************************************************\
+* Project:  Instruction Mnemonics for Vector Unit Computational Divides        *
+* Authors:  Iconoclast                                                         *
+* Release:  2015.11.29                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#ifndef _DIVIDE_H_
+#define _DIVIDE_H_
+
+#include "vu.h" /* presumably supplies v16 and VECTOR_EXTERN -- confirm */
+
+VECTOR_EXTERN /* one prototype per divide-class mnemonic; each takes (vs, vt) */
+    VRCP   (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VRCPL  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VRCPH  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VMOV   (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VRSQ   (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VRSQL  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VRSQH  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VNOP   (v16 vs, v16 vt);
+
+#endif /* _DIVIDE_H_ */
--- a/mupen64plus-rsp-cxd4/vu/logical.c
+++ b/mupen64plus-rsp-cxd4/vu/logical.c
@ -0,0 +1,112 @@
+/******************************************************************************\
+* Project:  MSP Simulation Layer for Vector Unit Computational Bit-Wise Logic  *
+* Authors:  Iconoclast                                                         *
+* Release:  2014.10.15                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#include "logical.h"
+
+VECTOR_OPERATION VAND(v16 vs, v16 vt)
+{ /* AND of both operands via vector_and; result is also kept in VACC_L */
+#ifdef ARCH_MIN_SSE2
+    vector_and(vs, vt); /* presumably vs &= vt -- confirm macro in vu.h */
+    *(v16 *)VACC_L = vs;
+    return (vs);
+#else /* no SSE2:  build the result in VACC_L, then copy it to V_result */
+    vector_copy(VACC_L, vt);
+    vector_and(VACC_L, vs);
+    vector_copy(V_result, VACC_L);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VNAND(v16 vs, v16 vt)
+{ /* NAND:  the AND result is XORed against a vector_fill()ed mask */
+#ifdef ARCH_MIN_SSE2
+    vector_and(vt, vs);
+    vector_fill(vs); /* presumably sets every bit of vs -- confirm in vu.h */
+    vector_xor(vs, vt); /* XOR with that mask inverts the AND result */
+    *(v16 *)VACC_L = vs;
+    return (vs);
+#else /* no SSE2:  V_result doubles as the scratch mask before the copy */
+    vector_copy(VACC_L, vt);
+    vector_and(VACC_L, vs);
+    vector_fill(V_result);
+    vector_xor(VACC_L, V_result);
+    vector_copy(V_result, VACC_L);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VOR(v16 vs, v16 vt)
+{ /* OR of both operands via vector_or; result is also kept in VACC_L */
+#ifdef ARCH_MIN_SSE2
+    vector_or(vs, vt); /* presumably vs |= vt -- confirm macro in vu.h */
+    *(v16 *)VACC_L = vs;
+    return (vs);
+#else /* no SSE2:  build the result in VACC_L, then copy it to V_result */
+    vector_copy(VACC_L, vt);
+    vector_or(VACC_L, vs);
+    vector_copy(V_result, VACC_L);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VNOR(v16 vs, v16 vt)
+{ /* NOR:  the OR result is XORed against a vector_fill()ed mask */
+#ifdef ARCH_MIN_SSE2
+    vector_or(vt, vs);
+    vector_fill(vs); /* presumably sets every bit of vs -- confirm in vu.h */
+    vector_xor(vs, vt); /* XOR with that mask inverts the OR result */
+    *(v16 *)VACC_L = vs;
+    return (vs);
+#else /* no SSE2:  V_result doubles as the scratch mask before the copy */
+    vector_copy(VACC_L, vt);
+    vector_or(VACC_L, vs);
+    vector_fill(V_result);
+    vector_xor(VACC_L, V_result);
+    vector_copy(V_result, VACC_L);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VXOR(v16 vs, v16 vt)
+{ /* XOR of both operands via vector_xor; result is also kept in VACC_L */
+#ifdef ARCH_MIN_SSE2
+    vector_xor(vs, vt); /* presumably vs ^= vt -- confirm macro in vu.h */
+    *(v16 *)VACC_L = vs;
+    return (vs);
+#else /* no SSE2:  build the result in VACC_L, then copy it to V_result */
+    vector_copy(VACC_L, vt);
+    vector_xor(VACC_L, vs);
+    vector_copy(V_result, VACC_L);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VNXOR(v16 vs, v16 vt)
+{ /* NXOR:  the XOR result is XORed again with a vector_fill()ed mask */
+#ifdef ARCH_MIN_SSE2
+    vector_xor(vt, vs);
+    vector_fill(vs); /* presumably sets every bit of vs -- confirm in vu.h */
+    vector_xor(vs, vt); /* second XOR with that mask inverts the first */
+    *(v16 *)VACC_L = vs;
+    return (vs);
+#else /* no SSE2:  V_result doubles as the scratch mask before the copy */
+    vector_copy(VACC_L, vt);
+    vector_xor(VACC_L, vs);
+    vector_fill(V_result);
+    vector_xor(VACC_L, V_result);
+    vector_copy(V_result, VACC_L);
+    return;
+#endif
+}
--- a/mupen64plus-rsp-cxd4/vu/logical.h
+++ b/mupen64plus-rsp-cxd4/vu/logical.h
@ -0,0 +1,34 @@
+/******************************************************************************\
+* Project:  Instruction Mnemonics for Vector Unit Computational Bit-Wise Logic *
+* Authors:  Iconoclast                                                         *
+* Release:  2014.10.15                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#ifndef _LOGICAL_H_
+#define _LOGICAL_H_
+
+#include "vu.h" /* presumably supplies v16 and VECTOR_EXTERN -- confirm */
+
+VECTOR_EXTERN /* one prototype per bit-wise mnemonic; each takes (vs, vt) */
+    VAND   (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VNAND  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VOR    (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VNOR   (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VXOR   (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VNXOR  (v16 vs, v16 vt);
+
+#endif /* _LOGICAL_H_ */
--- a/mupen64plus-rsp-cxd4/vu/multiply.c
+++ b/mupen64plus-rsp-cxd4/vu/multiply.c
@ -0,0 +1,754 @@
+/******************************************************************************\
+* Project:  MSP Simulation Layer for Vector Unit Computational Multiplies      *
+* Authors:  Iconoclast                                                         *
+* Release:  2015.11.30                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#include "multiply.h"
+
+#ifdef ARCH_MIN_SSE2
+#define _mm_cmple_epu16(dst, src) /* unsigned dst <= src:  sat. sub is 0 */ \
+    _mm_cmpeq_epi16(_mm_subs_epu16(dst, src), _mm_setzero_si128())
+#define _mm_cmpgt_epu16(dst, src) /* unsigned dst > src:  (src<=dst) & !eq */ \
+    _mm_andnot_si128(_mm_cmpeq_epi16(dst, src), _mm_cmple_epu16(src, dst))
+#define _mm_cmplt_epu16(dst, src) /* unsigned dst < src, operands swapped */ \
+    _mm_cmpgt_epu16(src, dst)
+
+#define _mm_mullo_epu16(dst, src) /* alias:  same call as the epi16 form */ \
+    _mm_mullo_epi16(dst, src)
+
+static INLINE void SIGNED_CLAMP_AM(pi16 VD)
+{ /* typical sign-clamp of accumulator-mid (bits 31:16) */
+    v16 dst, src;
+    v16 pvd, pvs;
+
+    pvs = _mm_load_si128((v16 *)VACC_H); /* aligned load of the high slice */
+    pvd = _mm_load_si128((v16 *)VACC_M); /* aligned load of the middle slice */
+    dst = _mm_unpacklo_epi16(pvd, pvs); /* lanes 0..3 as 32-bit high:mid */
+    src = _mm_unpackhi_epi16(pvd, pvs); /* lanes 4..7 as 32-bit high:mid */
+
+    dst = _mm_packs_epi32(dst, src); /* saturate each to -32768..+32767 */
+    _mm_store_si128((v16 *)VD, dst); /* aligned store:  VD needs 16-byte align */
+    return;
+}
+#else
+static INLINE void SIGNED_CLAMP_AM(pi16 VD)
+{ /* typical sign-clamp of accumulator-mid (bits 31:16) */
+    i16 hi[N], lo[N];
+    register int i;
+
+    for (i = 0; i < N; i++) /* lo = 1 when high:mid is below -32768 ... */
+        lo[i]  = (VACC_H[i] < ~0);
+    for (i = 0; i < N; i++) /* ... i.e. high < -1, or high < 0 with mid >= 0 */
+        lo[i] |= (VACC_H[i] < 0) & !(VACC_M[i] < 0);
+    for (i = 0; i < N; i++) /* hi = 1 when high:mid is above +32767 ... */
+        hi[i]  = (VACC_H[i] >  0);
+    for (i = 0; i < N; i++) /* ... i.e. high > 0, or high == 0 with mid < 0 */
+        hi[i] |= (VACC_H[i] == 0) & (VACC_M[i] < 0);
+    vector_copy(VD, VACC_M);
+    for (i = 0; i < N; i++) /* underflowing lanes are cleared to 0x0000 */
+        VD[i] &= -(lo[i] ^ 1);
+    for (i = 0; i < N; i++) /* overflowing lanes are forced to 0xFFFF */
+        VD[i] |= -(hi[i] ^ 0);
+    for (i = 0; i < N; i++) /* bit 15 flip:  0x0000 -> 0x8000, 0xFFFF -> 0x7FFF */
+        VD[i] ^= 0x8000 * (hi[i] | lo[i]);
+    return;
+}
+#endif
+
+static INLINE void UNSIGNED_CLAMP(pi16 VD)
+{ /* sign-zero hybrid clamp of accumulator-mid (bits 31:16) */
+    ALIGNED i16 temp[N];
+    i16 cond[N];
+    register int i;
+
+    SIGNED_CLAMP_AM(temp); /* no direct map in SSE, but closely based on this */
+    for (i = 0; i < N; i++) /* all-ones mask where clamped > raw VACC_M */
+        cond[i] = -(temp[i] >  VACC_M[i]); /* VD |= -(ACC47..16 > +32767) */
+    for (i = 0; i < N; i++) /* negative signed-clamp results become zero */
+        VD[i] = temp[i] & ~(temp[i] >> 15); /* Only this clamp is unsigned. */
+    for (i = 0; i < N; i++) /* lanes flagged in cond are forced to 0xFFFF */
+        VD[i] = VD[i] | cond[i];
+    return;
+}
+
+static INLINE void SIGNED_CLAMP_AL(pi16 VD)
+{ /* sign-clamp accumulator-low (bits 15:0) */
+    ALIGNED i16 temp[N];
+    i16 cond[N];
+    register int i;
+
+    SIGNED_CLAMP_AM(temp); /* no direct map in SSE, but closely based on this */
+    for (i = 0; i < N; i++) /* cond = 1 where the middle slice had to clamp */
+        cond[i] = (temp[i] != VACC_M[i]); /* result_clamped != result_raw ? */
+    for (i = 0; i < N; i++)
+        temp[i] ^= 0x8000; /* clamps 0x0000:0xFFFF instead of -0x8000:+0x7FFF */
+    for (i = 0; i < N; i++) /* unclamped lanes pass VACC_L through unchanged */
+        VD[i] = (cond[i] ? temp[i] : VACC_L[i]);
+    return;
+}
+
+INLINE static void do_macf(pi16 VD, pi16 VS, pi16 VT)
+{ /* acc += 2 * VS * VT per lane; VD = signed clamp of acc bits 31:16 */
+    i32 product[N];
+    u32 addend[N];
+    register int i;
+
+    for (i = 0; i < N; i++)
+        product[i] = VS[i] * VT[i];
+    for (i = 0; i < N; i++) /* shift as u32:  << on a negative i32 is undefined */
+        addend[i] = ((u32)product[i] << 1) & 0x00000000FFFF;
+    for (i = 0; i < N; i++)
+        addend[i] = (u16)(VACC_L[i]) + addend[i];
+    for (i = 0; i < N; i++)
+        VACC_L[i] = (i16)(addend[i]);
+    for (i = 0; i < N; i++) /* low-slice carry + bits 30:15 of the product */
+        addend[i] = (addend[i] >> 16) + (u16)(product[i] >> 15);
+    for (i = 0; i < N; i++)
+        addend[i] = (u16)(VACC_M[i]) + addend[i];
+    for (i = 0; i < N; i++)
+        VACC_M[i] = (i16)(addend[i]);
+    for (i = 0; i < N; i++) /* a negative product sign-extends as -1 into high */
+        VACC_H[i] -= (product[i] < 0);
+    for (i = 0; i < N; i++) /* carry out of the middle slice */
+        VACC_H[i] += addend[i] >> 16;
+    SIGNED_CLAMP_AM(VD);
+    return;
+}
+
+INLINE static void do_macu(pi16 VD, pi16 VS, pi16 VT)
+{ /* acc += 2 * VS * VT per lane; VD = UNSIGNED_CLAMP of acc bits 31:16 */
+    i32 product[N];
+    u32 addend[N];
+    register int i;
+
+    for (i = 0; i < N; i++)
+        product[i] = VS[i] * VT[i];
+    for (i = 0; i < N; i++) /* shift as u32:  << on a negative i32 is undefined */
+        addend[i] = ((u32)product[i] << 1) & 0x00000000FFFF;
+    for (i = 0; i < N; i++)
+        addend[i] = (u16)(VACC_L[i]) + addend[i];
+    for (i = 0; i < N; i++)
+        VACC_L[i] = (i16)(addend[i]);
+    for (i = 0; i < N; i++) /* low-slice carry + bits 30:15 of the product */
+        addend[i] = (addend[i] >> 16) + (u16)(product[i] >> 15);
+    for (i = 0; i < N; i++)
+        addend[i] = (u16)(VACC_M[i]) + addend[i];
+    for (i = 0; i < N; i++)
+        VACC_M[i] = (i16)(addend[i]);
+    for (i = 0; i < N; i++) /* a negative product sign-extends as -1 into high */
+        VACC_H[i] -= (product[i] < 0);
+    for (i = 0; i < N; i++) /* carry out of the middle slice */
+        VACC_H[i] += addend[i] >> 16;
+    UNSIGNED_CLAMP(VD);
+    return;
+}
+
+VECTOR_OPERATION VMULF(v16 vs, v16 vt)
+{ /* acc = 2*vs*vt + 0x8000 per lane; result = signed clamp of acc 31:16 */
+#ifdef ARCH_MIN_SSE2
+    v16 negative;
+    v16 round;
+    v16 prod_hi, prod_lo;
+
+/*
+ * We cannot save register allocations by doing xmm0 *= xmm1 or xmm1 *= xmm0
+ * because we need to do future computations on the original source factors.
+ */
+    prod_lo = _mm_mullo_epi16(vs, vt);
+    prod_hi = _mm_mulhi_epi16(vs, vt);
+
+/*
+ * The final product is really 2*s*t + 32768.  Fortunately for us, however,
+ * no two 16-bit values can cause overflow when <<= 1 the HIGH word, anyway.
+ */
+    prod_hi = _mm_add_epi16(prod_hi, prod_hi); /* fast way of doing <<= 1 */
+    negative = _mm_srli_epi16(prod_lo, 15); /* shifting LOW overflows ? 1 : 0 */
+    prod_hi = _mm_add_epi16(prod_hi, negative); /* hi<<1 += MSB of lo */
+    prod_lo = _mm_add_epi16(prod_lo, prod_lo); /* fast way of doing <<= 1 */
+    negative = _mm_srli_epi16(prod_lo, 15); /* Adding 0x8000 sets MSB to 0? */
+
+/*
+ * special fractional round value:  (32-bit product) += 32768 (0x8000)
+ * two's complement computation:  (0xFFFF << 15) & 0xFFFF
+ */
+    round = _mm_cmpeq_epi16(vs, vs); /* PCMPEQW xmmA, xmmA # all 1's forced */
+    round = _mm_slli_epi16(round, 15);
+
+    prod_lo = _mm_xor_si128(prod_lo, round); /* Or += 32768 works also. */
+    *(v16 *)VACC_L = prod_lo;
+    prod_hi = _mm_add_epi16(prod_hi, negative);
+    *(v16 *)VACC_M = prod_hi;
+
+/*
+ * VMULF does signed clamping.  However, in VMULF's case, the only possible
+ * combination of inputs to even cause a 32-bit signed clamp to a saturated
+ * 16-bit result is (-32768 * -32768), so, rather than fully emulating a
+ * signed clamp with SSE, we do an accurate-enough hack for this corner case.
+ */
+    negative = _mm_srai_epi16(prod_hi, 15);
+    vs = _mm_cmpeq_epi16(vs, round); /* vs == -32768 ? ~0 : 0 */
+    vt = _mm_cmpeq_epi16(vt, round); /* vt == -32768 ? ~0 : 0 */
+    vs = _mm_and_si128(vs, vt); /* vs == vt == -32768:  corner case confirmed */
+
+    negative = _mm_xor_si128(negative, vs);
+    *(v16 *)VACC_H = negative; /* 2*i16*i16 only fills L/M; VACC_H = 0 or ~0. */
+    vs = _mm_add_epi16(vs, prod_hi); /* prod_hi must be -32768; + -1 = +32767 */
+    return (vs);
+#else /* no SSE2:  widen each lane, then split it across VACC_L/M/H */
+    word_64 product[N]; /* (-32768 * -32768)<<1 + 32768 confuses 32-bit type. */
+    register unsigned int i;
+
+    for (i = 0; i < N; i++)
+        product[i].W = vs[i] * vt[i];
+    for (i = 0; i < N; i++)
+        product[i].W <<= 1; /* special fractional shift value */
+    for (i = 0; i < N; i++)
+        product[i].W += 32768; /* special fractional round value */
+    for (i = 0; i < N; i++)
+        VACC_L[i] = (product[i].UW & 0x00000000FFFF) >>  0;
+    for (i = 0; i < N; i++)
+        VACC_M[i] = (product[i].UW & 0x0000FFFF0000) >> 16;
+    for (i = 0; i < N; i++)
+        VACC_H[i] = -(product[i].SW < 0); /* product>>32 & 0xFFFF */
+    SIGNED_CLAMP_AM(V_result);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMULU(v16 vs, v16 vt)
+{ /* acc = 2*vs*vt + 0x8000 per lane; result = unsigned clamp of acc 31:16 */
+#ifdef ARCH_MIN_SSE2
+    v16 negative;
+    v16 round;
+    v16 prod_hi, prod_lo;
+
+/*
+ * Besides the unsigned clamping method (as opposed to VMULF's signed clamp),
+ * this operation's multiplication matches VMULF.  See VMULF for annotations.
+ */
+    prod_lo = _mm_mullo_epi16(vs, vt);
+    prod_hi = _mm_mulhi_epi16(vs, vt);
+
+    prod_hi = _mm_add_epi16(prod_hi, prod_hi);
+    negative = _mm_srli_epi16(prod_lo, 15);
+    prod_hi = _mm_add_epi16(prod_hi, negative);
+    prod_lo = _mm_add_epi16(prod_lo, prod_lo);
+    negative = _mm_srli_epi16(prod_lo, 15);
+
+    round = _mm_cmpeq_epi16(vs, vs);
+    round = _mm_slli_epi16(round, 15);
+
+    prod_lo = _mm_xor_si128(prod_lo, round);
+    *(v16 *)VACC_L = prod_lo;
+    prod_hi = _mm_add_epi16(prod_hi, negative);
+    *(v16 *)VACC_M = prod_hi;
+
+/*
+ * VMULU does unsigned clamping.  However, in VMULU's case, the only possible
+ * combinations that overflow, are either negative values or -32768 * -32768.
+ */
+    negative = _mm_srai_epi16(prod_hi, 15);
+    vs = _mm_cmpeq_epi16(vs, round); /* vs == -32768 ? ~0 : 0 */
+    vt = _mm_cmpeq_epi16(vt, round); /* vt == -32768 ? ~0 : 0 */
+    vs = _mm_and_si128(vs, vt); /* vs == vt == -32768:  corner case confirmed */
+    negative = _mm_xor_si128(negative, vs);
+    *(v16 *)VACC_H = negative; /* 2*i16*i16 only fills L/M; VACC_H = 0 or ~0. */
+
+    prod_lo = _mm_srai_epi16(prod_hi, 15); /* unsigned overflow mask */
+    vs = _mm_or_si128(prod_hi, prod_lo);
+    vs = _mm_andnot_si128(negative, vs); /* unsigned underflow mask */
+    return (vs);
+#else /* no SSE2:  same widening as VMULF, finished by UNSIGNED_CLAMP */
+    word_64 product[N]; /* (-32768 * -32768)<<1 + 32768 confuses 32-bit type. */
+    register unsigned int i;
+
+    for (i = 0; i < N; i++)
+        product[i].W = vs[i] * vt[i];
+    for (i = 0; i < N; i++)
+        product[i].W <<= 1; /* special fractional shift value */
+    for (i = 0; i < N; i++)
+        product[i].W += 32768; /* special fractional round value */
+    for (i = 0; i < N; i++)
+        VACC_L[i] = (product[i].UW & 0x00000000FFFF) >>  0;
+    for (i = 0; i < N; i++)
+        VACC_M[i] = (product[i].UW & 0x0000FFFF0000) >> 16;
+    for (i = 0; i < N; i++)
+        VACC_H[i] = -(product[i].SW < 0); /* product>>32 & 0xFFFF */
+    UNSIGNED_CLAMP(V_result);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMUDL(v16 vs, v16 vt)
+{ /* VACC_L = (u16 vs * u16 vt) >> 16; VACC_M, VACC_H wiped; result = low */
+#ifdef ARCH_MIN_SSE2
+    vs = _mm_mulhi_epu16(vs, vt);
+    vector_wipe(vt); /* (UINT16_MAX * UINT16_MAX) >> 16 too small for MD/HI */
+    *(v16 *)VACC_L = vs;
+    *(v16 *)VACC_M = vt;
+    *(v16 *)VACC_H = vt;
+    return (vs); /* no possibilities to clamp */
+#else /* no SSE2:  per-lane 32-bit unsigned product, keep the top half */
+    word_32 product[N];
+    register unsigned int i;
+
+    for (i = 0; i < N; i++)
+        product[i].UW = (u16)vs[i] * (u16)vt[i];
+    for (i = 0; i < N; i++)
+        VACC_L[i] = product[i].UW >> 16; /* product[i].H[HES(0) >> 1] */
+    vector_copy(V_result, VACC_L);
+    vector_wipe(VACC_M);
+    vector_wipe(VACC_H);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMUDM(v16 vs, v16 vt)
+{ /* acc = signed vs * unsigned vt per lane; result = VACC_M, not clamped */
+#ifdef ARCH_MIN_SSE2
+    v16 prod_hi, prod_lo;
+
+    prod_lo = _mm_mullo_epi16(vs, vt);
+    prod_hi = _mm_mulhi_epu16(vs, vt);
+
+/*
+ * Based on a little pattern found by MarathonMan...
+ * If (vs < 0), then high 16 bits of (u16)vs * (u16)vt += ~(vt) + 1, or -vt.
+ */
+    vs = _mm_srai_epi16(vs, 15);
+    vt = _mm_and_si128(vt, vs);
+    prod_hi = _mm_sub_epi16(prod_hi, vt);
+
+    *(v16 *)VACC_L = prod_lo;
+    *(v16 *)VACC_M = prod_hi;
+    vs = prod_hi;
+    prod_hi = _mm_srai_epi16(prod_hi, 15); /* sign of mid fills the high slice */
+    *(v16 *)VACC_H = prod_hi;
+    return (vs);
+#else /* no SSE2:  32-bit product split into low/mid; high = sign of mid */
+    word_32 product[N];
+    register unsigned int i;
+
+    for (i = 0; i < N; i++)
+        product[i].SW = (s16)vs[i] * (u16)vt[i];
+    for (i = 0; i < N; i++)
+        VACC_L[i] = (product[i].W & 0x00000000FFFF) >>  0;
+    for (i = 0; i < N; i++)
+        VACC_M[i] = (product[i].W & 0x0000FFFF0000) >> 16;
+    for (i = 0; i < N; i++)
+        VACC_H[i] = -(VACC_M[i] < 0);
+    vector_copy(V_result, VACC_M);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMUDN(v16 vs, v16 vt)
+{ /* acc = unsigned vs * signed vt per lane; result = VACC_L, not clamped */
+#ifdef ARCH_MIN_SSE2
+    v16 prod_hi, prod_lo;
+
+    prod_lo = _mm_mullo_epi16(vs, vt);
+    prod_hi = _mm_mulhi_epu16(vs, vt);
+
+/*
+ * Based on the pattern discovered for the similar VMUDM operation.
+ * If (vt < 0), then high 16 bits of (u16)vs * (u16)vt += ~(vs) + 1, or -vs.
+ */
+    vt = _mm_srai_epi16(vt, 15);
+    vs = _mm_and_si128(vs, vt);
+    prod_hi = _mm_sub_epi16(prod_hi, vs);
+
+    *(v16 *)VACC_L = prod_lo;
+    *(v16 *)VACC_M = prod_hi;
+    prod_hi = _mm_srai_epi16(prod_hi, 15); /* sign of mid fills the high slice */
+    *(v16 *)VACC_H = prod_hi;
+    return (vs = prod_lo);
+#else /* no SSE2:  32-bit product split into low/mid; high = sign of mid */
+    word_32 product[N];
+    register unsigned int i;
+
+    for (i = 0; i < N; i++)
+        product[i].SW = (u16)vs[i] * (s16)vt[i];
+    for (i = 0; i < N; i++)
+        VACC_L[i] = (product[i].W & 0x00000000FFFF) >>  0;
+    for (i = 0; i < N; i++)
+        VACC_M[i] = (product[i].W & 0x0000FFFF0000) >> 16;
+    for (i = 0; i < N; i++)
+        VACC_H[i] = -(VACC_M[i] < 0);
+    vector_copy(V_result, VACC_L);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMUDH(v16 vs, v16 vt)
+{ /* acc 47:16 = signed vs * signed vt; acc 15:0 = 0; result = clamped mid */
+#ifdef ARCH_MIN_SSE2
+    v16 prod_high;
+
+    prod_high = _mm_mulhi_epi16(vs, vt);
+    vs        = _mm_mullo_epi16(vs, vt);
+
+    *(v16 *)VACC_L = _mm_setzero_si128();
+    *(v16 *)VACC_M = vs; /* acc 31..16 storing (VS*VT)15..0 */
+    *(v16 *)VACC_H = prod_high; /* acc 47..32 storing (VS*VT)31..16 */
+
+/*
+ * "Unpack" the low 16 bits and the high 16 bits of each 32-bit product to a
+ * couple xmm registers, re-storing them as 2 32-bit products each.
+ */
+    vt = _mm_unpackhi_epi16(vs, prod_high);
+    vs = _mm_unpacklo_epi16(vs, prod_high);
+
+/*
+ * Re-interleave or pack both 32-bit products in both xmm registers with
+ * signed saturation:  prod < -32768 to -32768 and prod > +32767 to +32767.
+ */
+    vs = _mm_packs_epi32(vs, vt);
+    return (vs);
+#else /* no SSE2:  product halves go to VACC_M/VACC_H, then the usual clamp */
+    word_32 product[N];
+    register unsigned int i;
+
+    for (i = 0; i < N; i++)
+        product[i].SW = (s16)vs[i] * (s16)vt[i];
+    vector_wipe(VACC_L);
+    for (i = 0; i < N; i++)
+        VACC_M[i] = (s16)(product[i].W >>  0); /* product[i].HW[HES(0) >> 1] */
+    for (i = 0; i < N; i++)
+        VACC_H[i] = (s16)(product[i].W >> 16); /* product[i].HW[HES(2) >> 1] */
+    SIGNED_CLAMP_AM(V_result);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMACF(v16 vs, v16 vt)
+{ /* wrapper:  hands both operands to do_macf and returns/copies its VD */
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs; /* spill the SSE registers so do_macf can index lanes */
+    *(v16 *)VT = vt;
+#else /* no SSE2:  v16 is assigned directly (presumably a pointer -- confirm) */
+    v16 VS, VT;
+
+    VS = vs;
+    VT = vt;
+#endif
+    do_macf(VD, VS, VT);
+#ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
+    vs = *(v16 *)VD;
+    return (vs);
+#else
+    vector_copy(V_result, VD);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMACU(v16 vs, v16 vt)
+{ /* wrapper:  hands both operands to do_macu and returns/copies its VD */
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs; /* spill the SSE registers so do_macu can index lanes */
+    *(v16 *)VT = vt;
+#else /* no SSE2:  v16 is assigned directly (presumably a pointer -- confirm) */
+    v16 VS, VT;
+
+    VS = vs;
+    VT = vt;
+#endif
+    do_macu(VD, VS, VT);
+#ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
+    vs = *(v16 *)VD;
+    return (vs);
+#else
+    vector_copy(V_result, VD);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMADL(v16 vs, v16 vt)
+{ /* acc += (u16 vs * u16 vt) >> 16 per lane; result = low-slice clamp */
+#ifdef ARCH_MIN_SSE2
+    v16 acc_hi, acc_md, acc_lo;
+    v16 prod_hi;
+    v16 overflow, overflow_new;
+
+ /* prod_lo = _mm_mullo_epu16(vs, vt); */
+    prod_hi = _mm_mulhi_epu16(vs, vt);
+
+    acc_lo = *(v16 *)VACC_L;
+    acc_md = *(v16 *)VACC_M;
+    acc_hi = *(v16 *)VACC_H;
+
+    acc_lo = _mm_add_epi16(acc_lo, prod_hi);
+    *(v16 *)VACC_L = acc_lo;
+
+    overflow = _mm_cmplt_epu16(acc_lo, prod_hi); /* overflow:  (x + y < y) */
+    acc_md = _mm_sub_epi16(acc_md, overflow); /* -= ~0 adds the carry of 1 */
+    *(v16 *)VACC_M = acc_md;
+
+/*
+ * Luckily for us, taking unsigned * unsigned always evaluates to something
+ * nonnegative, so we only have to worry about overflow from accumulating.
+ */
+    overflow_new = _mm_cmpeq_epi16(acc_md, _mm_setzero_si128());
+    overflow = _mm_and_si128(overflow, overflow_new); /* mid wrapped to zero */
+    acc_hi = _mm_sub_epi16(acc_hi, overflow);
+    *(v16 *)VACC_H = acc_hi;
+
+/*
+ * Do a signed clamp...sort of (VM?DM, VM?DH:  middle; VM?DL, VM?DN:  low).
+ *     if (acc_47..16 < -32768) result = -32768 ^ 0x8000;      # 0000
+ *     else if (acc_47..16 > +32767) result = +32767 ^ 0x8000; # FFFF
+ *     else { result = acc_15..0 & 0xFFFF; }
+ * So it is based on the standard signed clamping logic for VM?DM, VM?DH,
+ * except that extra steps must be concatenated to that definition.
+ */
+    vt = _mm_unpackhi_epi16(acc_md, acc_hi);
+    vs = _mm_unpacklo_epi16(acc_md, acc_hi);
+    vs = _mm_packs_epi32(vs, vt);
+
+    acc_md = _mm_cmpeq_epi16(acc_md, vs); /* (unclamped == clamped) ... */
+    acc_lo = _mm_and_si128(acc_lo, acc_md); /* ... ? low : mid */
+    vt = _mm_cmpeq_epi16(vt, vt);
+    acc_md = _mm_xor_si128(acc_md, vt); /* (unclamped != clamped) ... */
+
+    vs = _mm_and_si128(vs, acc_md); /* ... ? VS_clamped : 0x0000 */
+    vs = _mm_or_si128(vs, acc_lo); /*                   : acc_lo */
+    acc_md = _mm_slli_epi16(acc_md, 15); /* ... ? ^ 0x8000 : ^ 0x0000 */
+    vs = _mm_xor_si128(vs, acc_md); /* Stupid unsigned-clamp-ish adjustment. */
+    return (vs);
+#else /* no SSE2:  ripple the carry low -> mid -> high, then SIGNED_CLAMP_AL */
+    word_32 product[N], addend[N];
+    register unsigned int i;
+
+    for (i = 0; i < N; i++)
+        product[i].UW = (u16)vs[i] * (u16)vt[i];
+    for (i = 0; i < N; i++)
+        addend[i].UW = (u16)(product[i].UW >> 16) + (u16)VACC_L[i];
+    for (i = 0; i < N; i++)
+        VACC_L[i] = addend[i].UW & 0x0000FFFF;
+    for (i = 0; i < N; i++) /* second term is a literal zero:  carry only */
+        addend[i].UW = (addend[i].UW >> 16) + (0x000000000000 >> 16);
+    for (i = 0; i < N; i++)
+        addend[i].UW += (u16)VACC_M[i];
+    for (i = 0; i < N; i++)
+        VACC_M[i] = addend[i].UW & 0x0000FFFF;
+    for (i = 0; i < N; i++)
+        VACC_H[i] += addend[i].UW >> 16;
+    SIGNED_CLAMP_AL(V_result);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMADM(v16 vs, v16 vt)
+{ /* acc += signed vs * unsigned vt per lane; result = signed clamp of mid */
+#ifdef ARCH_MIN_SSE2
+    v16 acc_hi, acc_md, acc_lo;
+    v16 prod_hi, prod_lo;
+    v16 overflow;
+
+    prod_lo = _mm_mullo_epi16(vs, vt);
+    prod_hi = _mm_mulhi_epu16(vs, vt);
+
+    vs = _mm_srai_epi16(vs, 15); /* same signed-high fix-up as in VMUDM */
+    vt = _mm_and_si128(vt, vs);
+    prod_hi = _mm_sub_epi16(prod_hi, vt);
+
+/*
+ * Writeback phase to the accumulator.
+ * VMADM stores accumulator += the product achieved by VMUDM.
+ */
+    acc_lo = *(v16 *)VACC_L;
+    acc_md = *(v16 *)VACC_M;
+    acc_hi = *(v16 *)VACC_H;
+
+    acc_lo = _mm_add_epi16(acc_lo, prod_lo);
+    *(v16 *)VACC_L = acc_lo;
+
+    overflow = _mm_cmplt_epu16(acc_lo, prod_lo); /* overflow:  (x + y < y) */
+    prod_hi = _mm_sub_epi16(prod_hi, overflow); /* fold low carry into prod_hi */
+    acc_md = _mm_add_epi16(acc_md, prod_hi);
+    *(v16 *)VACC_M = acc_md;
+
+    overflow = _mm_cmplt_epu16(acc_md, prod_hi);
+    prod_hi = _mm_srai_epi16(prod_hi, 15); /* sign extension of the addend */
+    acc_hi = _mm_add_epi16(acc_hi, prod_hi);
+    acc_hi = _mm_sub_epi16(acc_hi, overflow); /* plus carry out of the middle */
+    *(v16 *)VACC_H = acc_hi;
+
+    vt = _mm_unpackhi_epi16(acc_md, acc_hi);
+    vs = _mm_unpacklo_epi16(acc_md, acc_hi);
+    vs = _mm_packs_epi32(vs, vt); /* signed saturation of high:mid to 16 bits */
+    return (vs);
+#else /* no SSE2:  ripple the carry low -> mid -> high, then SIGNED_CLAMP_AM */
+    word_32 product[N], addend[N];
+    register unsigned int i;
+
+    for (i = 0; i < N; i++)
+        product[i].SW = (s16)vs[i] * (u16)vt[i];
+    for (i = 0; i < N; i++)
+        addend[i].UW = (product[i].W & 0x0000FFFF) + (u16)VACC_L[i];
+    for (i = 0; i < N; i++)
+        VACC_L[i] = addend[i].UW & 0x0000FFFF;
+    for (i = 0; i < N; i++)
+        addend[i].UW = (addend[i].UW >> 16) + (product[i].SW >> 16);
+    for (i = 0; i < N; i++)
+        addend[i].UW += (u16)VACC_M[i];
+    for (i = 0; i < N; i++)
+        VACC_M[i] = addend[i].UW & 0x0000FFFF;
+    for (i = 0; i < N; i++)
+        VACC_H[i] += addend[i].UW >> 16;
+    SIGNED_CLAMP_AM(V_result);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMADN(v16 vs, v16 vt)
+{ /* acc += unsigned vs * signed vt per lane; result = low-slice clamp */
+#ifdef ARCH_MIN_SSE2
+    v16 acc_hi, acc_md, acc_lo;
+    v16 prod_hi, prod_lo;
+    v16 overflow;
+
+    prod_lo = _mm_mullo_epi16(vs, vt);
+    prod_hi = _mm_mulhi_epu16(vs, vt);
+
+    vt = _mm_srai_epi16(vt, 15); /* same signed-high fix-up as in VMUDN */
+    vs = _mm_and_si128(vs, vt);
+    prod_hi = _mm_sub_epi16(prod_hi, vs);
+
+/*
+ * Writeback phase to the accumulator.
+ * VMADN stores accumulator += the product achieved by VMUDN.
+ */
+    acc_lo = *(v16 *)VACC_L;
+    acc_md = *(v16 *)VACC_M;
+    acc_hi = *(v16 *)VACC_H;
+
+    acc_lo = _mm_add_epi16(acc_lo, prod_lo);
+    *(v16 *)VACC_L = acc_lo;
+
+    overflow = _mm_cmplt_epu16(acc_lo, prod_lo); /* overflow:  (x + y < y) */
+    prod_hi = _mm_sub_epi16(prod_hi, overflow); /* fold low carry into prod_hi */
+    acc_md = _mm_add_epi16(acc_md, prod_hi);
+    *(v16 *)VACC_M = acc_md;
+
+    overflow = _mm_cmplt_epu16(acc_md, prod_hi);
+    prod_hi = _mm_srai_epi16(prod_hi, 15); /* sign extension of the addend */
+    acc_hi = _mm_add_epi16(acc_hi, prod_hi);
+    acc_hi = _mm_sub_epi16(acc_hi, overflow); /* plus carry out of the middle */
+    *(v16 *)VACC_H = acc_hi;
+
+/*
+ * Do a signed clamp...sort of (VM?DM, VM?DH:  middle; VM?DL, VM?DN:  low).
+ *     if (acc_47..16 < -32768) result = -32768 ^ 0x8000;      # 0000
+ *     else if (acc_47..16 > +32767) result = +32767 ^ 0x8000; # FFFF
+ *     else { result = acc_15..0 & 0xFFFF; }
+ * So it is based on the standard signed clamping logic for VM?DM, VM?DH,
+ * except that extra steps must be concatenated to that definition.
+ */
+    vt = _mm_unpackhi_epi16(acc_md, acc_hi);
+    vs = _mm_unpacklo_epi16(acc_md, acc_hi);
+    vs = _mm_packs_epi32(vs, vt);
+
+    acc_md = _mm_cmpeq_epi16(acc_md, vs); /* (unclamped == clamped) ... */
+    acc_lo = _mm_and_si128(acc_lo, acc_md); /* ... ? low : mid */
+    vt = _mm_cmpeq_epi16(vt, vt);
+    acc_md = _mm_xor_si128(acc_md, vt); /* (unclamped != clamped) ... */
+
+    vs = _mm_and_si128(vs, acc_md); /* ... ? VS_clamped : 0x0000 */
+    vs = _mm_or_si128(vs, acc_lo); /*                   : acc_lo */
+    acc_md = _mm_slli_epi16(acc_md, 15); /* ... ? ^ 0x8000 : ^ 0x0000 */
+    vs = _mm_xor_si128(vs, acc_md); /* Stupid unsigned-clamp-ish adjustment. */
+    return (vs);
+#else /* no SSE2:  ripple the carry low -> mid -> high, then SIGNED_CLAMP_AL */
+    word_32 product[N], addend[N];
+    register unsigned int i;
+
+    for (i = 0; i < N; i++)
+        product[i].SW = (u16)vs[i] * (s16)vt[i];
+    for (i = 0; i < N; i++)
+        addend[i].UW = (product[i].W & 0x0000FFFF) + (u16)VACC_L[i];
+    for (i = 0; i < N; i++)
+        VACC_L[i] = addend[i].UW & 0x0000FFFF;
+    for (i = 0; i < N; i++)
+        addend[i].UW = (addend[i].UW >> 16) + (product[i].SW >> 16);
+    for (i = 0; i < N; i++)
+        addend[i].UW += (u16)VACC_M[i];
+    for (i = 0; i < N; i++)
+        VACC_M[i] = addend[i].UW & 0x0000FFFF;
+    for (i = 0; i < N; i++)
+        VACC_H[i] += addend[i].UW >> 16;
+    SIGNED_CLAMP_AL(V_result);
+    return;
+#endif
+}
+
+VECTOR_OPERATION VMADH(v16 vs, v16 vt)
+{ /* acc 47:16 += signed vs * signed vt; VACC_L untouched; result = clamp */
+#ifdef ARCH_MIN_SSE2
+    v16 acc_mid;
+    v16 prod_high;
+
+    prod_high = _mm_mulhi_epi16(vs, vt);
+    vs        = _mm_mullo_epi16(vs, vt);
+
+/*
+ * We're required to load the source product from the accumulator to add to.
+ * While we're at it, conveniently sneak in a acc[31..16] += (vs*vt)[15..0].
+ */
+    acc_mid = *(v16 *)VACC_M;
+    vs = _mm_add_epi16(vs, acc_mid);
+    *(v16 *)VACC_M = vs;
+    vt = *(v16 *)VACC_H;
+
+/*
+ * While accumulating base_lo + product_lo is easy, getting the correct data
+ * for base_hi + product_hi is tricky and needs unsigned overflow detection.
+ *
+ * The one-liner solution to detecting unsigned overflow (thus adding a carry
+ * value of 1 to the higher word) is _mm_cmplt_epu16, but none of the Intel
+ * MMX-based instruction sets define unsigned comparison ops FOR us, so...
+ */
+    vt = _mm_add_epi16(vt, prod_high);
+    vs = _mm_cmplt_epu16(vs, acc_mid); /* acc.mid + prod.low < acc.mid */
+    vt = _mm_sub_epi16(vt, vs); /* += 1 if overflow, by doing -= ~0 */
+    *(v16 *)VACC_H = vt;
+
+    vs = *(v16 *)VACC_M; /* reload mid:  vs was reused for the carry mask */
+    prod_high = _mm_unpackhi_epi16(vs, vt);
+    vs        = _mm_unpacklo_epi16(vs, vt);
+    vs = _mm_packs_epi32(vs, prod_high); /* signed saturation of high:mid */
+    return (vs);
+#else /* no SSE2:  addend keeps the mid-slice carry before VACC_M wraps */
+    word_32 product[N], addend[N];
+    register unsigned int i;
+
+    for (i = 0; i < N; i++)
+        product[i].SW = (s16)vs[i] * (s16)vt[i];
+    for (i = 0; i < N; i++)
+        addend[i].UW = (u16)VACC_M[i] + (u16)(product[i].W);
+    for (i = 0; i < N; i++)
+        VACC_M[i] += (i16)product[i].SW;
+    for (i = 0; i < N; i++)
+        VACC_H[i] += (addend[i].UW >> 16) + (product[i].SW >> 16);
+    SIGNED_CLAMP_AM(V_result);
+    return;
+#endif
+}
--- a/mupen64plus-rsp-cxd4/vu/multiply.h
+++ b/mupen64plus-rsp-cxd4/vu/multiply.h
@ -0,0 +1,85 @@
+/******************************************************************************\
+* Project:  Instruction Mnemonics for Vector Unit Computational Multiplies     *
+* Authors:  Iconoclast                                                         *
+* Release:  2015.11.30                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#ifndef _MULTIPLY_H_
+#define _MULTIPLY_H_
+
+#include "vu.h"
+
+/*
+ * Prototypes for the vector multiply operations.  They are declared in the
+ * same order as rows 000 and 001 of the COP2_C2 dispatch table in vu.c.
+ */
+
+/*
+ * signed or unsigned multiplication of fractions
+ */
+VECTOR_EXTERN
+    VMULF  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VMULU  (v16 vs, v16 vt);
+/*
+ *  VRNDP  (v16 vs, v16 vt); # was on Ultra64 RCP but removed
+ *  VMULQ  (v16 vs, v16 vt); # was on Ultra64 RCP but removed
+ */
+
+/*
+ * double-precision multiplication of fractions
+ */
+VECTOR_EXTERN
+    VMUDL  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VMUDM  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VMUDN  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VMUDH  (v16 vs, v16 vt);
+
+/*
+ * signed or unsigned accumulative multiplication and VMACQ
+ */
+VECTOR_EXTERN
+    VMACF  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VMACU  (v16 vs, v16 vt);
+/*
+ *  VRNDN  (v16 vs, v16 vt); # was on Ultra64 RCP but removed
+ *  VMACQ  (v16 vs, v16 vt); # mentioned probably by mistake in RSP manual
+ */
+
+/*
+ * double-precision accumulative multiplication
+ */
+VECTOR_EXTERN
+    VMADL  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VMADM  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VMADN  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VMADH  (v16 vs, v16 vt);
+
+/*
+ * a useful idea I thought of for the single-precision multiplies
+ * VMULF and VMULU
+ *
+ * SEMIFRAC is only defined here if nothing defined it earlier.  It expands
+ * in terms of `VS`, `VT` and `i`, so all three must be in scope wherever
+ * the macro is used.
+ */
+#ifndef SEMIFRAC
+/*
+ * acc = VS * VT;
+ * acc = acc + 0x8000; # round value
+ * acc = acc << 1; # partial value shift
+ *
+ * Wrong:  ACC(HI) = -((INT32)(acc) < 0)
+ * Right:  ACC(HI) = -(SEMIFRAC < 0)
+ */
+#define SEMIFRAC    (VS[i]*VT[i]*2/2 + 0x8000/2)
+#endif
+
+#endif
--- a/mupen64plus-rsp-cxd4/vu/pack.h
+++ b/mupen64plus-rsp-cxd4/vu/pack.h
@ -0,0 +1,34 @@
+/******************************************************************************\
+* Project:  Instruction Mnemonics for Vector Unit Computational Packs          *
+* Authors:  Iconoclast                                                         *
+* Release:  2014.10.15                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#ifndef _PACK_H_
+#define _PACK_H_
+
+#include "vu.h"
+
+/*
+ * Prototypes for the vector pack operations (extract:  VEXT*, insert:
+ * VINS*).  All share the (v16 vs, v16 vt) signature used by every other
+ * vector operation.
+ *
+ * NOTE(review):  vu.c wraps its include of this header in `#if 0` and the
+ * COP2_C2 table has no entries for these names, so they appear to be
+ * unused at present -- confirm before relying on them.
+ */
+VECTOR_EXTERN
+    VEXTT  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VEXTQ  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VEXTN  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VINST  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VINSQ  (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VINSN  (v16 vs, v16 vt);
+
+#endif
--- a/mupen64plus-rsp-cxd4/vu/select.c
+++ b/mupen64plus-rsp-cxd4/vu/select.c
@ -0,0 +1,551 @@
+/******************************************************************************\
+* Project:  MSP Simulation Layer for Vector Unit Computational Test Selects    *
+* Authors:  Iconoclast                                                         *
+* Release:  2018.11.26                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#include "select.h"
+
+/*
+ * Per-lane select, the building block behind `VMRG` and the compare ops.
+ *
+ * Conceptually, for every lane:
+ *     VD = (cmp != 0) ? pass : fail;
+ *
+ * The ternary form is not used because it still does not vectorize to
+ * SSE2.  Instead the choice is made arithmetically as
+ *     VD = fail + cmp * (pass - fail)
+ * which only equals the ternary when every selector in `cmp` is exactly
+ * 0 or 1.  Callers must normalize their selectors accordingly.
+ *
+ * All differences are formed before any lane of VD is written.
+ */
+static void merge(pi16 VD, pi16 cmp, pi16 pass, pi16 fail)
+{
+    i16 delta[N];
+    register int lane;
+
+    for (lane = 0; lane < N; lane++)
+        delta[lane] = pass[lane] - fail[lane];
+    for (lane = 0; lane < N; lane++)
+        VD[lane] = fail[lane] + cmp[lane]*delta[lane];
+}
+
+/*
+ * VLT core:  per-lane signed "less than" select.
+ *
+ * A lane passes when VS < VT, or when VS == VT while both the NOTEQUAL
+ * (cf_ne) and carry (cf_co) flags are set for that lane.  Passing lanes
+ * take VS, the others take VT; the selection is written to VACC_L and then
+ * copied to VD.  On exit cf_comp holds the pass bits, and cf_ne, cf_co and
+ * cf_clip have been wiped.
+ */
+INLINE static void do_lt(pi16 VD, pi16 VS, pi16 VT)
+{
+    i16 carry_tie[N]; /* cf_ne & cf_co */
+    i16 tie[N];       /* VS == VT, later gated by carry_tie */
+    register int lane;
+
+/* One simple operation per loop, matching the rest of this file. */
+    for (lane = 0; lane < N; lane++)
+        carry_tie[lane] = cf_ne[lane] & cf_co[lane];
+    for (lane = 0; lane < N; lane++)
+        tie[lane] = (VS[lane] == VT[lane]);
+    for (lane = 0; lane < N; lane++)
+        tie[lane] &= carry_tie[lane];
+    for (lane = 0; lane < N; lane++)
+        cf_comp[lane] = (VS[lane] < VT[lane]); /* strictly less */
+    for (lane = 0; lane < N; lane++)
+        cf_comp[lane] |= tie[lane]; /* ... or the flagged tie */
+
+    merge(VACC_L, cf_comp, VS, VT);
+    vector_copy(VD, VACC_L);
+
+ /* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
+    vector_wipe(cf_ne);
+    vector_wipe(cf_co);
+
+    vector_wipe(cf_clip);
+}
+
+/*
+ * VEQ core:  per-lane equality test.
+ *
+ * cf_comp is set for lanes where VS == VT and the NOTEQUAL flag (cf_ne) is
+ * clear.  The value result is always VT:  a merge on cf_comp would be
+ * correct but redundant, since every passing lane has VS == VT anyway.
+ * VT goes to VACC_L and from there to VD.  cf_ne, cf_co and cf_clip are
+ * wiped on exit.
+ */
+INLINE static void do_eq(pi16 VD, pi16 VS, pi16 VT)
+{
+    register int lane;
+
+    for (lane = 0; lane < N; lane++)
+        cf_comp[lane] = (VS[lane] == VT[lane]);
+    for (lane = 0; lane < N; lane++)
+        cf_comp[lane] &= (cf_ne[lane] ^ 1);
+
+    vector_copy(VACC_L, VT);
+    vector_copy(VD, VACC_L);
+
+ /* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
+    vector_wipe(cf_ne);
+    vector_wipe(cf_co);
+
+    vector_wipe(cf_clip);
+}
+
+/*
+ * VNE core:  per-lane inequality test.
+ *
+ * cf_comp is set for lanes where VS != VT or the NOTEQUAL flag (cf_ne) is
+ * set.  The value result is always VS:  a merge on cf_comp would be correct
+ * but redundant, since every failing lane has VS == VT anyway.  VS goes to
+ * VACC_L and from there to VD.  cf_ne, cf_co and cf_clip are wiped on exit.
+ */
+INLINE static void do_ne(pi16 VD, pi16 VS, pi16 VT)
+{
+    register int lane;
+
+    for (lane = 0; lane < N; lane++)
+        cf_comp[lane] = (VS[lane] != VT[lane]);
+    for (lane = 0; lane < N; lane++)
+        cf_comp[lane] |= cf_ne[lane];
+
+    vector_copy(VACC_L, VS);
+    vector_copy(VD, VACC_L);
+
+ /* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
+    vector_wipe(cf_ne);
+    vector_wipe(cf_co);
+
+    vector_wipe(cf_clip);
+}
+
+/*
+ * VGE core:  per-lane signed "greater than or equal" select.
+ *
+ * A lane passes when VS > VT, or when VS == VT unless both the NOTEQUAL
+ * (cf_ne) and carry (cf_co) flags are set for that lane.  Passing lanes
+ * take VS, the others take VT; the selection is written to VACC_L and then
+ * copied to VD.  On exit cf_comp holds the pass bits, and cf_ne, cf_co and
+ * cf_clip have been wiped.
+ */
+INLINE static void do_ge(pi16 VD, pi16 VS, pi16 VT)
+{
+    i16 tie_allowed[N]; /* !(cf_ne & cf_co) */
+    i16 tie[N];         /* VS == VT, later gated by tie_allowed */
+    register int lane;
+
+/* One simple operation per loop, matching the rest of this file. */
+    for (lane = 0; lane < N; lane++)
+        tie_allowed[lane] = (cf_ne[lane] & cf_co[lane]) ^ 1;
+    for (lane = 0; lane < N; lane++)
+        tie[lane] = (VS[lane] == VT[lane]);
+    for (lane = 0; lane < N; lane++)
+        tie[lane] &= tie_allowed[lane];
+    for (lane = 0; lane < N; lane++)
+        cf_comp[lane] = (VS[lane] > VT[lane]); /* strictly greater */
+    for (lane = 0; lane < N; lane++)
+        cf_comp[lane] |= tie[lane]; /* ... or the permitted tie */
+
+    merge(VACC_L, cf_comp, VS, VT);
+    vector_copy(VD, VACC_L);
+
+ /* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
+    vector_wipe(cf_ne);
+    vector_wipe(cf_co);
+
+    vector_wipe(cf_clip);
+}
+
+/*
+ * VCL core:  clip test that consumes the flags left in cf_ne, cf_co,
+ * cf_vce, cf_clip and cf_comp.
+ *
+ * Per lane, with sn = cf_co and eq = !cf_ne:
+ *   - VC is VT, negated (two's complement) when sn is set;
+ *   - cf_comp (le) is recomputed only where eq && sn, else kept;
+ *   - cf_clip (ge) is recomputed only where eq && !sn, else kept;
+ *   - the result is VC where (sn ? le : ge) holds, otherwise VS.
+ * The result goes to VACC_L and then VD.  cf_ne, cf_co and cf_vce are wiped
+ * on exit.
+ *
+ * The flag arrays are assumed to hold exactly 0 or 1 per lane:  sn is used
+ * both arithmetically (`-sn`, `+ sn`) and as a merge() selector.
+ */
+INLINE static void do_cl(pi16 VD, pi16 VS, pi16 VT)
+{
+    ALIGNED u16 VB[N], VC[N];
+    ALIGNED i16 eq[N], ge[N], le[N];
+    ALIGNED i16 gen[N], len[N], lz[N], uz[N], sn[N];
+    i16 diff[N];
+    i16 cmp[N];
+    register int i;
+
+    /* Unsigned working copies of both operands. */
+    vector_copy((pi16)VB, VS);
+    vector_copy((pi16)VC, VT);
+
+/*
+    for (i = 0; i < N; i++)
+        ge[i] = cf_clip[i];
+    for (i = 0; i < N; i++)
+        le[i] = cf_comp[i];
+*/
+    for (i = 0; i < N; i++)
+        eq[i] = cf_ne[i] ^ 1;
+    vector_copy(sn, cf_co);
+
+/*
+ * Now that we have extracted all the flags, we will essentially be masking
+ * them back in where they came from redundantly, unless the corresponding
+ * NOTEQUAL bit from VCO upper was not set....
+ */
+    for (i = 0; i < N; i++)
+        VC[i] = VC[i] ^ -sn[i];
+    for (i = 0; i < N; i++)
+        VC[i] = VC[i] + sn[i]; /* conditional negation, if sn */
+    for (i = 0; i < N; i++)
+        diff[i] = VB[i] - VC[i];
+    /*
+     * uz:  negative (all ones, given an arithmetic right shift) when the
+     * unsigned 16-bit sum VS + VT stays below 65536, i.e. has no carry out;
+     * zero otherwise.
+     */
+    for (i = 0; i < N; i++)
+        uz[i] = (VB[i] + (u16)VT[i] - 65536) >> 31;
+    for (i = 0; i < N; i++)
+        lz[i] = (diff[i] == 0x0000);
+    /*
+     * len = cf_vce ? (lz | uz) : (lz & uz), reduced to 0 or 1.
+     * gen is only a scratch value while this is being built.
+     */
+    for (i = 0; i < N; i++)
+        gen[i] = lz[i] | uz[i];
+    for (i = 0; i < N; i++)
+        len[i] = lz[i] & uz[i];
+    for (i = 0; i < N; i++)
+        gen[i] = gen[i] & cf_vce[i];
+    for (i = 0; i < N; i++)
+        len[i] = len[i] & (cf_vce[i] ^ 1);
+    for (i = 0; i < N; i++)
+        len[i] = len[i] | gen[i];
+    /* gen is now repurposed:  unsigned VS >= (conditionally negated) VT. */
+    for (i = 0; i < N; i++)
+        gen[i] = (VB[i] >= VC[i]);
+
+    /* Replace cf_comp with len only where eq && sn. */
+    for (i = 0; i < N; i++)
+        cmp[i] = eq[i] & sn[i];
+    merge(le, cmp, len, cf_comp);
+
+    /* Replace cf_clip with gen only where eq && !sn. */
+    for (i = 0; i < N; i++)
+        cmp[i] = eq[i] & (sn[i] ^ 1);
+    merge(ge, cmp, gen, cf_clip);
+
+    /* Pick le or ge by sign, then pick VC or VS by that outcome. */
+    merge(cmp, sn, le, ge);
+    merge(VACC_L, cmp, (pi16)VC, VS);
+    vector_copy(VD, VACC_L);
+
+ /* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
+    vector_wipe(cf_ne);
+    vector_wipe(cf_co);
+
+    vector_copy(cf_clip, ge);
+    vector_copy(cf_comp, le);
+
+ /* CTC2    $0, $vce # zeroing RSP flags VCF[2] */
+    vector_wipe(cf_vce);
+    return;
+}
+
+/*
+ * VCH core:  clip test that (re)generates every compare flag.
+ *
+ * Per lane, sn is set when VS and VT differ in sign.  VC is VT when the
+ * signs agree and -VT when they differ (left as ~VT for VT == -32768,
+ * whose negation is not representable).  The result is VC where
+ * (sn ? le : ge) holds and VS otherwise; it goes to VACC_L and then VD.
+ *
+ * Flags written:  cf_vce, cf_clip (ge), cf_comp (le), cf_ne (!eq) and
+ * cf_co (sn).  Every one of them is stored as exactly 0 or 1.
+ *
+ * sn starts out as a ~0/0 mask because the XOR and subtraction steps need a
+ * mask.  It must be reduced to 0/1 before it is handed to merge() as a
+ * selector or copied into cf_co.  Every preprocessor branch below therefore
+ * performs that reduction; previously only the release branch did, so
+ * _DEBUG builds selected wrongly and stored ~0 in cf_co.
+ */
+INLINE static void do_ch(pi16 VD, pi16 VS, pi16 VT)
+{
+    ALIGNED i16 VC[N];
+    ALIGNED i16 eq[N], ge[N], le[N];
+    ALIGNED i16 sn[N];
+#ifndef _DEBUG
+    i16 diff[N];
+#endif
+    i16 cch[N]; /* corner case hack:  -(-32768) with undefined sign */
+    register int i;
+
+    for (i = 0; i < N; i++)
+        cch[i] = (VT[i] == -32768) ? ~0 : 0; /* -(-32768) might not be >= 0. */
+    vector_copy(VC, VT);
+    for (i = 0; i < N; i++)
+        sn[i] = VS[i] ^ VT[i];
+    for (i = 0; i < N; i++)
+        sn[i] = (sn[i] < 0) ? ~0 :  0; /* signed SRA (sn), 15 */
+    for (i = 0; i < N; i++)
+        VC[i] ^= sn[i]; /* if (sn == ~0) {VT = ~VT;} else {VT =  VT;} */
+    for (i = 0; i < N; i++)
+        cf_vce[i]  = (VS[i] == VC[i]); /* 2's complement:  VC = -VT - 1 = ~VT */
+    for (i = 0; i < N; i++)
+        cf_vce[i] &= sn[i];
+
+/*
+ * if (sign flag), then converts ~(VT) into -(VT) a.k.a. ~(VT) - (-1)
+ * Note that if (VT == INT16_MIN) a.k.a. cch[i], -(-32768) is undefined.
+ */
+    for (i = 0; i < N; i++)
+        VC[i] -= sn[i] & ~cch[i]; /* converts ~(VT) into -(VT) if (sign) */
+
+    for (i = 0; i < N; i++)
+        eq[i]  = (VS[i] == VC[i]) & ~cch[i]; /* VS = -(-32768) never happens. */
+    for (i = 0; i < N; i++)
+        eq[i] |= cf_vce[i];
+
+#ifdef _DEBUG
+    for (i = 0; i < N; i++)
+        le[i] = sn[i] ? (VS[i] <= VC[i]) : (VC[i] < 0);
+    for (i = 0; i < N; i++)
+        ge[i] = sn[i] ? (VC[i] > 0x0000) : (VS[i] >= VC[i]);
+
+    /* merge() and cf_co both need 0/1, not the ~0/0 mask. */
+    for (i = 0; i < N; i++)
+        sn[i] = (u16)(sn[i]) >> 15; /* ~0 to 1, 0 to 0 */
+#elif (0)
+    for (i = 0; i < N; i++)
+        le[i] = sn[i] ? (VT[i] <= -VS[i]) : (VT[i] <= ~0x0000);
+    for (i = 0; i < N; i++)
+        ge[i] = sn[i] ? (~0x0000 >= VT[i]) : (VS[i] >= VT[i]);
+
+    /* merge() and cf_co both need 0/1, not the ~0/0 mask. */
+    for (i = 0; i < N; i++)
+        sn[i] = (u16)(sn[i]) >> 15; /* ~0 to 1, 0 to 0 */
+#else
+    /* sn is still a mask here:  (sn | VS) is -1 when the signs differ. */
+    for (i = 0; i < N; i++)
+        diff[i] = sn[i] | VS[i];
+    for (i = 0; i < N; i++)
+        ge[i] = (diff[i] >= VT[i]);
+
+    for (i = 0; i < N; i++)
+        sn[i] = (u16)(sn[i]) >> 15; /* ~0 to 1, 0 to 0 */
+
+    /* le = sn ? (VC - VS >= 0) : (VT < 0) */
+    for (i = 0; i < N; i++)
+        diff[i] = VC[i] - VS[i];
+    for (i = 0; i < N; i++)
+        diff[i] = (diff[i] >= 0);
+    for (i = 0; i < N; i++)
+        le[i] = (VT[i] < 0);
+    merge(le, sn, diff, le);
+#endif
+
+    /* From here on sn is 0/1 in every configuration. */
+    merge(cf_comp, sn, le, ge);
+    merge(VACC_L, cf_comp, VC, VS);
+    vector_copy(VD, VACC_L);
+
+    vector_copy(cf_clip, ge);
+    vector_copy(cf_comp, le);
+    for (i = 0; i < N; i++)
+        cf_ne[i] = eq[i] ^ 1;
+    vector_copy(cf_co, sn);
+    return;
+}
+
+/*
+ * VCR core:  one's-complement clip test.
+ *
+ * Per lane, sn is set when VS and VT differ in sign, and:
+ *     le = sn ? (VT <= ~VS) : (VT <= -1)
+ *     ge = sn ? (-1 >= VT)  : (VS >= VT)
+ *     VC = sn ? ~VT : VT
+ *     result = (sn ? le : ge) ? VC : VS
+ * The result goes to VACC_L and then VD.  cf_clip receives ge and cf_comp
+ * receives le; cf_ne, cf_co and cf_vce are wiped.
+ *
+ * sn is built as a ~0/0 mask because the AND/OR/XOR steps need a mask, but
+ * merge() multiplies by its selector and so requires 0/1.  Passing the raw
+ * mask made merge() compute 2*ge - le whenever the signs differed
+ * (e.g. VS = 5, VT = -3 produced -1 instead of 5).  The mask is therefore
+ * reduced to 0/1 before the selecting merges.
+ */
+INLINE static void do_cr(pi16 VD, pi16 VS, pi16 VT)
+{
+    ALIGNED i16 ge[N], le[N], sn[N];
+    ALIGNED i16 VC[N];
+    i16 cmp[N];
+    register int i;
+
+    vector_copy(VC, VT);
+    for (i = 0; i < N; i++)
+        sn[i] = VS[i] ^ VT[i];
+    for (i = 0; i < N; i++)
+        sn[i] = (sn[i] < 0) ? ~0 : 0;
+#ifdef _DEBUG
+    for (i = 0; i < N; i++)
+        le[i] = sn[i] ? (VT[i] <= ~VS[i]) : (VT[i] <= ~0x0000);
+    for (i = 0; i < N; i++)
+        ge[i] = sn[i] ? (~0x0000 >= VT[i]) : (VS[i] >= VT[i]);
+#else
+    for (i = 0; i < N; i++)
+        cmp[i] = ~(VS[i] & sn[i]);
+    for (i = 0; i < N; i++)
+        le[i] = (VT[i] <= cmp[i]);
+    for (i = 0; i < N; i++)
+        cmp[i] =  (VS[i] | sn[i]);
+    for (i = 0; i < N; i++)
+        ge[i] = (cmp[i] >= VT[i]);
+#endif
+    for (i = 0; i < N; i++)
+        VC[i] ^= sn[i]; /* if (sn == ~0) {VT = ~VT;} else {VT =  VT;} */
+
+    /* Last use of sn as a mask is above; merge() needs a 0/1 selector. */
+    for (i = 0; i < N; i++)
+        sn[i] = (u16)(sn[i]) >> 15; /* ~0 to 1, 0 to 0 */
+
+    merge(cmp, sn, le, ge);
+    merge(VACC_L, cmp, VC, VS);
+    vector_copy(VD, VACC_L);
+
+ /* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
+    vector_wipe(cf_ne);
+    vector_wipe(cf_co);
+
+    vector_copy(cf_clip, ge);
+    vector_copy(cf_comp, le);
+
+ /* CTC2    $0, $vce # zeroing RSP flags VCF[2] */
+    vector_wipe(cf_vce);
+    return;
+}
+
+/*
+ * VMRG core:  lanes whose cf_comp flag is set take VS, the others take VT.
+ * The selection is written to VACC_L and then copied to VD.  This function
+ * itself writes none of the cf_* flag arrays.
+ */
+INLINE static void do_mrg(pi16 VD, pi16 VS, pi16 VT)
+{
+    merge(VACC_L, cf_comp, VS, VT);
+    vector_copy(VD, VACC_L);
+}
+
+/*
+ * VLT entry point:  runs do_lt() on the two operands.
+ * With ARCH_MIN_SSE2 the operands are spilled to aligned arrays and the
+ * result is returned as a v16; otherwise the result is left in V_result.
+ */
+VECTOR_OPERATION VLT(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    do_lt(VD, VS, VT);
+    COMPILER_FENCE(); /* sits between the stores to VD and the reload */
+    return (*(v16 *)VD);
+#else
+    do_lt(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VEQ entry point:  runs do_eq() on the two operands.
+ * With ARCH_MIN_SSE2 the operands are spilled to aligned arrays and the
+ * result is returned as a v16; otherwise the result is left in V_result.
+ */
+VECTOR_OPERATION VEQ(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    do_eq(VD, VS, VT);
+    COMPILER_FENCE(); /* sits between the stores to VD and the reload */
+    return (*(v16 *)VD);
+#else
+    do_eq(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VNE entry point:  runs do_ne() on the two operands.
+ * With ARCH_MIN_SSE2 the operands are spilled to aligned arrays and the
+ * result is returned as a v16; otherwise the result is left in V_result.
+ */
+VECTOR_OPERATION VNE(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    do_ne(VD, VS, VT);
+    COMPILER_FENCE(); /* sits between the stores to VD and the reload */
+    return (*(v16 *)VD);
+#else
+    do_ne(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VGE entry point:  runs do_ge() on the two operands.
+ * With ARCH_MIN_SSE2 the operands are spilled to aligned arrays and the
+ * result is returned as a v16; otherwise the result is left in V_result.
+ */
+VECTOR_OPERATION VGE(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    do_ge(VD, VS, VT);
+    COMPILER_FENCE(); /* sits between the stores to VD and the reload */
+    return (*(v16 *)VD);
+#else
+    do_ge(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VCL entry point:  runs do_cl() on the two operands.
+ * With ARCH_MIN_SSE2 the operands are spilled to aligned arrays and the
+ * result is returned as a v16; otherwise the result is left in V_result.
+ */
+VECTOR_OPERATION VCL(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    do_cl(VD, VS, VT);
+    COMPILER_FENCE(); /* sits between the stores to VD and the reload */
+    return (*(v16 *)VD);
+#else
+    do_cl(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VCH entry point:  runs do_ch() on the two operands.
+ * With ARCH_MIN_SSE2 the operands are spilled to aligned arrays and the
+ * result is returned as a v16; otherwise the result is left in V_result.
+ */
+VECTOR_OPERATION VCH(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    do_ch(VD, VS, VT);
+    COMPILER_FENCE(); /* sits between the stores to VD and the reload */
+    return (*(v16 *)VD);
+#else
+    do_ch(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VCR entry point:  runs do_cr() on the two operands.
+ * With ARCH_MIN_SSE2 the operands are spilled to aligned arrays and the
+ * result is returned as a v16; otherwise the result is left in V_result.
+ */
+VECTOR_OPERATION VCR(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    do_cr(VD, VS, VT);
+    COMPILER_FENCE(); /* sits between the stores to VD and the reload */
+    return (*(v16 *)VD);
+#else
+    do_cr(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
+
+/*
+ * VMRG entry point:  runs do_mrg() on the two operands.
+ * With ARCH_MIN_SSE2 the operands are spilled to aligned arrays and the
+ * result is returned as a v16; otherwise the result is left in V_result.
+ */
+VECTOR_OPERATION VMRG(v16 vs, v16 vt)
+{
+    ALIGNED i16 VD[N];
+#ifdef ARCH_MIN_SSE2
+    ALIGNED i16 VS[N], VT[N];
+
+    *(v16 *)VS = vs;
+    *(v16 *)VT = vt;
+    do_mrg(VD, VS, VT);
+    COMPILER_FENCE(); /* sits between the stores to VD and the reload */
+    return (*(v16 *)VD);
+#else
+    do_mrg(VD, vs, vt);
+    vector_copy(V_result, VD);
+#endif
+}
--- a/mupen64plus-rsp-cxd4/vu/select.h
+++ b/mupen64plus-rsp-cxd4/vu/select.h
@ -0,0 +1,38 @@
+/******************************************************************************\
+* Project:  Instruction Mnemonics for Vector Unit Computational Test Selects   *
+* Authors:  Iconoclast                                                         *
+* Release:  2015.01.18                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#ifndef _SELECT_H_
+#define _SELECT_H_
+
+#include "vu.h"
+
+/*
+ * Prototypes for the vector compare/select operations implemented in
+ * select.c.  They are declared in the same order as row 100 of the COP2_C2
+ * dispatch table in vu.c:
+ *     VLT, VEQ, VNE, VGE -- compares
+ *     VCL, VCH, VCR      -- clip tests
+ *     VMRG               -- merge on the existing compare flags
+ */
+VECTOR_EXTERN
+    VLT    (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VEQ    (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VNE    (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VGE    (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VCL    (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VCH    (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VCR    (v16 vs, v16 vt);
+VECTOR_EXTERN
+    VMRG   (v16 vs, v16 vt);
+
+#endif
--- a/mupen64plus-rsp-cxd4/vu/vu.c
+++ b/mupen64plus-rsp-cxd4/vu/vu.c
@ -0,0 +1,235 @@
+/******************************************************************************\
+* Project:  MSP Emulation Layer for Vector Unit Computational Operations       *
+* Authors:  Iconoclast                                                         *
+* Release:  2016.03.23                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+
+#include "vu.h"
+
+#include "multiply.h"
+#include "add.h"
+#include "select.h"
+#include "logical.h"
+#include "divide.h"
+#if 0
+#include "pack.h"
+#endif
+
+/*
+ * Vector unit state.
+ *
+ * VR:        32 vector registers.  Each row holds N << VR_STATIC_WRAPAROUND
+ *            elements; see the note on VR_STATIC_WRAPAROUND in vu.h for why
+ *            the storage may be doubled.
+ * VACC:      three N-lane accumulator slices (presumably what VACC_L,
+ *            VACC_M and VACC_H refer to -- confirm against vu.h).
+ * V_result:  result buffer for builds without ARCH_MIN_SSE2, where vector
+ *            operations return nothing and leave their result here.
+ */
+ALIGNED i16 VR[32][N << VR_STATIC_WRAPAROUND];
+ALIGNED i16 VACC[3][N];
+#ifndef ARCH_MIN_SSE2
+ALIGNED i16 V_result[N];
+#endif
+
+/*
+ * These normally should have type `int` because they are Boolean T/F arrays.
+ * However, since SSE2 uses 128-bit XMM's, and Win32 `int` storage is 32-bit,
+ * we have the problem of 32*8 > 128 bits, so we use `short` to reduce packs.
+ *
+ * Each lane is expected to hold exactly 0 or 1:  the get_V*() readers shift
+ * lanes into bit positions and the select code multiplies by them.
+ */
+ALIGNED i16 cf_ne[N]; /* $vco:  high "NOTEQUAL" */
+ALIGNED i16 cf_co[N]; /* $vco:  low "carry/borrow in/out" */
+ALIGNED i16 cf_clip[N]; /* $vcc:  high (clip tests:  VCL, VCH, VCR) */
+ALIGNED i16 cf_comp[N]; /* $vcc:  low (VEQ, VNE, VLT, VGE, VCL, VCH, VCR) */
+ALIGNED i16 cf_vce[N]; /* $vce:  vector compare extension register */
+
+/*
+ * Handler for reserved vector op-codes.  Reports the event through
+ * message() and yields an all-zero result:  returned directly under
+ * ARCH_MIN_SSE2, otherwise by wiping V_result.  Neither operand is used.
+ * (Uncertain how reserved op-codes should really be handled; untested.)
+ */
+VECTOR_OPERATION res_V(v16 vs, v16 vt)
+{
+    message("C2\nRESERVED");
+
+    /* Both parameters are intentionally ignored. */
+    (void)vs;
+    (void)vt;
+#ifdef ARCH_MIN_SSE2
+    return (_mm_setzero_si128());
+#else
+    vector_wipe(V_result);
+#endif
+}
+/*
+ * Handler for the multiply op-codes that have no implementation here.
+ * Ultra64 OS did have these, so one could implement this extension.
+ * Reports "VMUL IQ" through message() and then defers to res_V().
+ */
+VECTOR_OPERATION res_M(v16 vs, v16 vt)
+{
+    message("VMUL IQ");
+#ifdef ARCH_MIN_SSE2
+    return (res_V(vs, vt));
+#else
+    res_V(vs, vt);
+#endif
+}
+
+/*
+ * Op-code-accurate matrix of all the known RSP vector operations.
+ * To do:  Either remove VMACQ, or add VRNDP, VRNDN, and VMULQ.
+ *
+ * The table has 8 * 8 = 64 entries.  As the row and column labels show, the
+ * upper three bits of the index select the row and the lower three bits
+ * select the column.  Slots with no implementation point at res_M (the
+ * unimplemented multiplies) or res_V (reserved).
+ *
+ * Note that these are not our literal function names, just macro names.
+ */
+VECTOR_OPERATION (*COP2_C2[8 * 8])(v16, v16) = {
+    VMULF  ,VMULU  ,res_M  ,res_M  ,VMUDL  ,VMUDM  ,VMUDN  ,VMUDH  , /* 000 */
+    VMACF  ,VMACU  ,res_M  ,res_M  ,VMADL  ,VMADM  ,VMADN  ,VMADH  , /* 001 */
+    VADD   ,VSUB   ,res_V  ,VABS   ,VADDC  ,VSUBC  ,res_V  ,res_V  , /* 010 */
+    res_V  ,res_V  ,res_V  ,res_V  ,res_V  ,VSAW   ,res_V  ,res_V  , /* 011 */
+    VLT    ,VEQ    ,VNE    ,VGE    ,VCL    ,VCH    ,VCR    ,VMRG   , /* 100 */
+    VAND   ,VNAND  ,VOR    ,VNOR   ,VXOR   ,VNXOR  ,res_V  ,res_V  , /* 101 */
+    VRCP   ,VRCPL  ,VRCPH  ,VMOV   ,VRSQ   ,VRSQL  ,VRSQH  ,VNOP   , /* 110 */
+    res_V  ,res_V  ,res_V  ,res_V  ,res_V  ,res_V  ,res_V  ,res_V  , /* 111 */
+}; /* 000     001     010     011     100     101     110     111 */
+
+#ifndef ARCH_MIN_SSE2
+/*
+ * Pack the per-lane $vco flags into a 16-bit word (scalar build).
+ * Bit n (0..7) comes from cf_co[n]; bit n + 8 comes from cf_ne[n].
+ * Big endian becomes little.
+ */
+u16 get_VCO(void)
+{
+    int bits;
+    register int lane;
+
+    bits = 0x0000;
+    for (lane = 0; lane < N; lane++)
+        bits |= cf_co[lane] << lane;
+    for (lane = 0; lane < N; lane++)
+        bits |= cf_ne[lane] << (lane + 8);
+    return ((u16)bits);
+}
+/*
+ * Pack the per-lane $vcc flags into a 16-bit word (scalar build).
+ * Bit n (0..7) comes from cf_comp[n]; bit n + 8 comes from cf_clip[n].
+ * Big endian becomes little.
+ */
+u16 get_VCC(void)
+{
+    int bits;
+    register int lane;
+
+    bits = 0x0000;
+    for (lane = 0; lane < N; lane++)
+        bits |= cf_comp[lane] << lane;
+    for (lane = 0; lane < N; lane++)
+        bits |= cf_clip[lane] << (lane + 8);
+    return ((u16)bits);
+}
+/*
+ * Pack the per-lane $vce flags into a byte (scalar build).
+ * Bit n comes from cf_vce[n].  Big endian becomes little.
+ */
+u8 get_VCE(void)
+{
+    int bits;
+    register int lane;
+
+    bits = 0x00;
+    for (lane = 0; lane < N; lane++)
+        bits |= cf_vce[lane] << lane;
+    return ((u8)(bits & 0xFF));
+}
+#else
+/*
+ * Pack the per-lane $vco flags into a 16-bit word (SSE2 build).
+ * Bits 0..7 come from cf_co, bits 8..15 from cf_ne.
+ */
+u16 get_VCO(void)
+{
+    v16 ne_msb, co_msb, bytes;
+
+    /* Move each Boolean from bit 0 up to the sign bit of its 16-bit lane. */
+    ne_msb = _mm_slli_epi16(_mm_load_si128((v16 *)cf_ne), 15);
+    co_msb = _mm_slli_epi16(_mm_load_si128((v16 *)cf_co), 15);
+
+    /* Signed saturation narrows 0x8000 to 0x80, keeping the sign bit. */
+    bytes = _mm_packs_epi16(co_msb, ne_msb);
+
+    /* PMOVMSKB gathers the sign bit of each of the 16 bytes. */
+    return ((u16)(_mm_movemask_epi8(bytes) & 0x0000FFFF));
+}
+/*
+ * Pack the per-lane $vcc flags into a 16-bit word (SSE2 build).
+ * Bits 0..7 come from cf_comp, bits 8..15 from cf_clip.
+ */
+u16 get_VCC(void)
+{
+    v16 clip_msb, comp_msb, bytes;
+
+    /* Move each Boolean from bit 0 up to the sign bit of its 16-bit lane. */
+    clip_msb = _mm_slli_epi16(_mm_load_si128((v16 *)cf_clip), 15);
+    comp_msb = _mm_slli_epi16(_mm_load_si128((v16 *)cf_comp), 15);
+
+    /* Signed saturation narrows 0x8000 to 0x80, keeping the sign bit. */
+    bytes = _mm_packs_epi16(comp_msb, clip_msb);
+
+    /* PMOVMSKB gathers the sign bit of each of the 16 bytes. */
+    return ((u16)(_mm_movemask_epi8(bytes) & 0x0000FFFF));
+}
+/*
+ * Pack the per-lane $vce flags into a byte (SSE2 build).
+ * Bit n comes from cf_vce[n]; the upper half of the pack is zero.
+ */
+u8 get_VCE(void)
+{
+    v16 vce_msb, bytes;
+
+    /* Move each Boolean from bit 0 up to the sign bit of its 16-bit lane. */
+    vce_msb = _mm_slli_epi16(_mm_load_si128((v16 *)cf_vce), 15);
+
+    /* Signed saturation narrows 0x8000 to 0x80, keeping the sign bit. */
+    bytes = _mm_packs_epi16(vce_msb, _mm_setzero_si128());
+
+    /* PMOVMSKB gathers the sign bit of each byte; keep the low eight. */
+    return ((u8)(_mm_movemask_epi8(bytes) & 0x000000FF));
+}
+#endif
+
+/*
+ * CTC2 resources
+ * not sure how to vectorize going the other direction into SSE2
+ */
+
+/*
+ * Unpack a 16-bit $vco word into the per-lane flag arrays:
+ * bit n (0..7) goes to cf_co[n], bit n + 8 goes to cf_ne[n].
+ * Little endian becomes big.
+ */
+void set_VCO(u16 vco)
+{
+    register int lane;
+
+    for (lane = 0; lane < N; lane++) {
+        cf_co[lane] = (vco >> lane) & 1;
+        cf_ne[lane] = (vco >> (lane + 0x8)) & 1;
+    }
+}
+/*
+ * Unpack a 16-bit $vcc word into the per-lane flag arrays:
+ * bit n (0..7) goes to cf_comp[n], bit n + 8 goes to cf_clip[n].
+ * Little endian becomes big.
+ */
+void set_VCC(u16 vcc)
+{
+    register int lane;
+
+    for (lane = 0; lane < N; lane++) {
+        cf_comp[lane] = (vcc >> lane) & 1;
+        cf_clip[lane] = (vcc >> (lane + 0x8)) & 1;
+    }
+}
+/*
+ * Unpack an 8-bit $vce value into cf_vce:  bit n goes to cf_vce[n].
+ * Little endian becomes big.
+ */
+void set_VCE(u8 vce)
+{
+    register int lane;
+
+    for (lane = 0; lane < N; lane++) {
+        cf_vce[lane] = (vce >> lane) & 1;
+    }
+}
--- a/mupen64plus-rsp-cxd4/vu/vu.h
+++ b/mupen64plus-rsp-cxd4/vu/vu.h
@ -0,0 +1,355 @@
+/******************************************************************************\
+* Project:  MSP Emulation Layer for Vector Unit Computational Operations       *
+* Authors:  Iconoclast                                                         *
+* Release:  2016.03.23                                                         *
+* License:  CC0 Public Domain Dedication                                       *
+*                                                                              *
+* To the extent possible under law, the author(s) have dedicated all copyright *
+* and related and neighboring rights to this software to the public domain     *
+* worldwide. This software is distributed without any warranty.                *
+*                                                                              *
+* You should have received a copy of the CC0 Public Domain Dedication along    *
+* with this software.                                                          *
+* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
+\******************************************************************************/
+#ifndef _VU_H_
+#define _VU_H_
+
+#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
+#include <emmintrin.h>
+#endif
+
+#include "../my_types.h"
+
+#define N       8
+/* N:  number of processor elements in SIMD processor */
+
+/*
+ * Illegal, unaligned LWC2 operations on the RSP may write past the terminal
+ * byte of a vector, while SWC2 operations may have to wrap around stores
+ * from the end to the start of a vector.  Both of these risk out-of-bounds
+ * memory access, but by doubling the number of bytes allocated (shift left)
+ * per each vector register, we could stabilize and probably optimize this.
+ */
+/*
+ * Compile-time switch for the doubled vector-register allocation.
+ * The `#if 0` arm is compiled out, so VR_STATIC_WRAPAROUND is always 1 here.
+ * The VR declaration in this header uses it as a left-shift count on N.
+ */
+#if 0
+#define VR_STATIC_WRAPAROUND    0
+#else
+#define VR_STATIC_WRAPAROUND    1
+#endif
+
+/*
+ * We are going to need this for vector operations doing scalar things.
+ * The divides and VSAW need bit-wise information from the instruction word.
+ */
+extern u32 inst_word;
+
+/*
+ * RSP virtual registers (of vector unit)
+ * The most important are the 32 general-purpose vector registers.
+ * The correct way to accurately store these is using big-endian vectors.
+ *
+ * For ?WC2 we may need to do byte-precision access just as directly.
+ * This is amended by using the `VU_S` and `VU_B` macros defined in `rsp.h`.
+ */
+ALIGNED extern i16 VR[32][N << VR_STATIC_WRAPAROUND];
+
+/*
+ * The RSP accumulator is a vector of 3 48-bit integers.  Nearly all of the
+ * vector operations access it, but it's for multiply-accumulate operations.
+ *
+ * Access dimensions would be VACC[8][3] but are inverted for SIMD benefits.
+ */
+ALIGNED extern i16 VACC[3][N];
+
+/*
+ * When compiling without SSE2, we need to use a pointer to a destination
+ * vector instead of an XMM register in the return slot of the function.
+ * The vector "result" register will be emulated to serve this pointer
+ * as a shared global rather than the return slot of a function call.
+ */
+#ifndef ARCH_MIN_SSE2
+ALIGNED extern i16 V_result[N];
+#endif
+
+/*
+ * accumulator-indexing macros
+ */
+/* Row numbers of the three accumulator slices within VACC. */
+#define HI      0
+#define MD      1
+#define LO      2
+
+/* Whole-slice accessors, listed from the high slice to the low slice. */
+#define VACC_H      (VACC[HI])
+#define VACC_M      (VACC[MD])
+#define VACC_L      (VACC[LO])
+
+/* Single-element accessors:  element `i` of the chosen slice. */
+#define ACC_H(i)    (VACC[HI])[i]
+#define ACC_M(i)    (VACC[MD])[i]
+#define ACC_L(i)    (VACC[LO])[i]
+
+/*
+ * v16 is the vector operand type and VECTOR_OPERATION is the return type of
+ * a vector operation.  With ARCH_MIN_SSE2 the operation returns its result
+ * as a v16; otherwise it returns nothing.
+ */
+#ifdef ARCH_MIN_SSE2
+typedef __m128i v16;
+#define VECTOR_OPERATION    v16
+#else
+typedef pi16 v16;
+#define VECTOR_OPERATION    void
+#endif
+#define VECTOR_EXTERN       extern VECTOR_OPERATION
+
+NOINLINE extern void message(const char* body);
+
+VECTOR_EXTERN (*COP2_C2[8*7 + 8])(v16, v16);
+
+#ifdef ARCH_MIN_SSE2
+
+/*
+ * SSE2 forms of the whole-vector helper macros.  Each one reads and writes
+ * its operands as a single v16 (one 128-bit value).
+ *
+ * vector_copy casts `vd` and `vs` directly, while every other macro here
+ * takes the address of its operands first.
+ * NOTE(review): that suggests vector_copy expects pointer/array operands
+ * only -- confirm against the callers.
+ */
+#define vector_copy(vd, vs) { \
+    *(v16 *)(vd) = *(v16 *)(vs); }
+/* vd > vd is false in every lane, so the compare result is all zero bits. */
+#define vector_wipe(vd) { \
+    *(v16 *)&(vd) = _mm_cmpgt_epi16(*(v16 *)&(vd), *(v16 *)&(vd)); }
+/* vd == vd is true in every lane, so the compare result is all one bits. */
+#define vector_fill(vd) { \
+    *(v16 *)&(vd) = _mm_cmpeq_epi16(*(v16 *)&(vd), *(v16 *)&(vd)); }
+
+/* Bit-wise AND, OR and XOR of `vs` into `vd`. */
+#define vector_and(vd, vs) { \
+    *(v16 *)&(vd) = _mm_and_si128  (*(v16 *)&(vd), *(v16 *)&(vs)); }
+#define vector_or(vd, vs) { \
+    *(v16 *)&(vd) = _mm_or_si128   (*(v16 *)&(vd), *(v16 *)&(vs)); }
+#define vector_xor(vd, vs) { \
+    *(v16 *)&(vd) = _mm_xor_si128  (*(v16 *)&(vd), *(v16 *)&(vs)); }
+
+/*
+ * Every competent vector unit should have at least two vector comparison
+ * operations:  EQ and LT/GT.  (MMX makes us say GT; SSE's LT is just a GT.)
+ *
+ * Default examples when compiling for the x86 SSE2 architecture below.
+ */
+/* Each compare overwrites `vd` with the per-lane 16-bit compare mask. */
+#define vector_cmplt(vd, vs) { \
+    *(v16 *)&(vd) = _mm_cmplt_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
+#define vector_cmpeq(vd, vs) { \
+    *(v16 *)&(vd) = _mm_cmpeq_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
+#define vector_cmpgt(vd, vs) { \
+    *(v16 *)&(vd) = _mm_cmpgt_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
+
+#else
+
+/* Scalar fallback:  copy elements 0..N-1 of `vs` into `vd`, lowest first. */
+#define vector_copy(vd, vs) { \
+    int lane_; \
+    for (lane_ = 0; lane_ < N; lane_++) \
+        (vd)[lane_] = (vs)[lane_]; \
+}
+/* Scalar fallback:  clear elements 0..N-1 of `vd` to zero. */
+#define vector_wipe(vd) { \
+    int lane_; \
+    for (lane_ = 0; lane_ < N; lane_++) \
+        (vd)[lane_] = 0; \
+}
+/* Scalar fallback:  set every bit of elements 0..N-1 of `vd`. */
+#define vector_fill(vd) { \
+    int lane_; \
+    for (lane_ = 0; lane_ < N; lane_++) \
+        (vd)[lane_] = ~0; \
+}
+/* Scalar fallback:  bit-wise AND elements 0..N-1 of `vs` into `vd`. */
+#define vector_and(vd, vs) { \
+    int lane_; \
+    for (lane_ = 0; lane_ < N; lane_++) \
+        (vd)[lane_] &= (vs)[lane_]; \
+}
+/* Scalar fallback:  bit-wise OR elements 0..N-1 of `vs` into `vd`. */
+#define vector_or(vd, vs) { \
+    int lane_; \
+    for (lane_ = 0; lane_ < N; lane_++) \
+        (vd)[lane_] |= (vs)[lane_]; \
+}
+/* Scalar fallback:  bit-wise XOR elements 0..N-1 of `vs` into `vd`. */
+#define vector_xor(vd, vs) { \
+    int lane_; \
+    for (lane_ = 0; lane_ < N; lane_++) \
+        (vd)[lane_] ^= (vs)[lane_]; \
+}
+
+/*
+ * Scalar fallback for the per-lane "less than" mask.
+ * For each lane i in 0..7, vd[i] becomes all one bits (~0) when
+ * vd[i] < vs[i] and zero otherwise.  Every lane is read before it is
+ * overwritten and each result is stored back to its own lane.
+ */
+#define vector_cmplt(vd, vs) { \
+    (vd)[0] = ((vd)[0] < (vs)[0]) ? ~0x0000 :  0x0000; \
+    (vd)[1] = ((vd)[1] < (vs)[1]) ? ~0x0000 :  0x0000; \
+    (vd)[2] = ((vd)[2] < (vs)[2]) ? ~0x0000 :  0x0000; \
+    (vd)[3] = ((vd)[3] < (vs)[3]) ? ~0x0000 :  0x0000; \
+    (vd)[4] = ((vd)[4] < (vs)[4]) ? ~0x0000 :  0x0000; \
+    (vd)[5] = ((vd)[5] < (vs)[5]) ? ~0x0000 :  0x0000; \
+    (vd)[6] = ((vd)[6] < (vs)[6]) ? ~0x0000 :  0x0000; \
+    (vd)[7] = ((vd)[7] < (vs)[7]) ? ~0x0000 :  0x0000; \
+}
+/*
+ * Scalar fallback for the per-lane "equal" mask.
+ * For each lane i in 0..7, vd[i] becomes all one bits (~0) when
+ * vd[i] == vs[i] and zero otherwise.  Every lane is read before it is
+ * overwritten and each result is stored back to its own lane.
+ */
+#define vector_cmpeq(vd, vs) { \
+    (vd)[0] = ((vd)[0] == (vs)[0]) ? ~0x0000 :  0x0000; \
+    (vd)[1] = ((vd)[1] == (vs)[1]) ? ~0x0000 :  0x0000; \
+    (vd)[2] = ((vd)[2] == (vs)[2]) ? ~0x0000 :  0x0000; \
+    (vd)[3] = ((vd)[3] == (vs)[3]) ? ~0x0000 :  0x0000; \
+    (vd)[4] = ((vd)[4] == (vs)[4]) ? ~0x0000 :  0x0000; \
+    (vd)[5] = ((vd)[5] == (vs)[5]) ? ~0x0000 :  0x0000; \
+    (vd)[6] = ((vd)[6] == (vs)[6]) ? ~0x0000 :  0x0000; \
+    (vd)[7] = ((vd)[7] == (vs)[7]) ? ~0x0000 :  0x0000; \
+}
+/*
+ * Scalar fallback for the per-lane "greater than" mask.
+ * For each lane i in 0..7, vd[i] becomes all one bits (~0) when
+ * vd[i] > vs[i] and zero otherwise.  Every lane is read before it is
+ * overwritten and each result is stored back to its own lane.
+ */
+#define vector_cmpgt(vd, vs) { \
+    (vd)[0] = ((vd)[0] > (vs)[0]) ? ~0x0000 :  0x0000; \
+    (vd)[1] = ((vd)[1] > (vs)[1]) ? ~0x0000 :  0x0000; \
+    (vd)[2] = ((vd)[2] > (vs)[2]) ? ~0x0000 :  0x0000; \
+    (vd)[3] = ((vd)[3] > (vs)[3]) ? ~0x0000 :  0x0000; \
+    (vd)[4] = ((vd)[4] > (vs)[4]) ? ~0x0000 :  0x0000; \
+    (vd)[5] = ((vd)[5] > (vs)[5]) ? ~0x0000 :  0x0000; \
+    (vd)[6] = ((vd)[6] > (vs)[6]) ? ~0x0000 :  0x0000; \
+    (vd)[7] = ((vd)[7] > (vs)[7]) ? ~0x0000 :  0x0000; \
+}
+
+#endif
+
+/*
+ * Many vector units have pairs of "vector condition flags" registers.
+ * In SGI's vector unit implementation, these are denoted as the
+ * "vector control registers" under coprocessor 2.
+ *
+ * VCF-0 is the carry-out flags register:  $vco.
+ * VCF-1 is the compare code flags register:  $vcc.
+ * VCF-2 is the compare extension flags register:  $vce.
+ * There is no fourth RSP flags register.
+ */
+extern u16 VCO;
+extern u16 VCC;
+extern u8 VCE;
+
+ALIGNED extern i16 cf_ne[N];
+ALIGNED extern i16 cf_co[N];
+ALIGNED extern i16 cf_clip[N];
+ALIGNED extern i16 cf_comp[N];
+ALIGNED extern i16 cf_vce[N];
+
+extern u16 get_VCO(void);
+extern u16 get_VCC(void);
+extern u8 get_VCE(void);
+
+extern void set_VCO(u16 vco);
+extern void set_VCC(u16 vcc);
+extern void set_VCE(u8 vce);
+
+/*
+ * shuffling convenience macros for Intel SIMD
+ * An 8-bit shuffle imm. of SHUFFLE(0, 1, 2, 3) should be a null operation.
+ */
+/* B(x):  keep only the low two bits of x, i.e. one 2-bit selector. */
+#define B(x)    ((x) & 0x3)
+/* SHUFFLE:  pack four selectors into one byte, `a` lowest and `d` highest. */
+#define SHUFFLE(a,b,c,d)    (B(a) | (B(b) << 2) | (B(c) << 4) | (B(d) << 6))
+
+/*
+ * RSP vector opcode function names are currently just literally named after
+ * the actual opcode that is being emulated, but names this short could
+ * collide with global symbols exported from somewhere else within the
+ * emulation thread.  (This did happen on Linux Mupen64, with my old function
+ * name "MFC0", which had to be renamed.)  Rather than uglify the function
+ * names, we'll treat them as macros from now on, should the need arise.
+ */
+#ifndef _WIN32
+/*
+ * Alias table:  each upper-case opcode name is replaced by a lower-case
+ * identifier ending in "_v_msp".  Nothing is renamed when _WIN32 is defined.
+ *
+ * Entries whose replacement does not simply mirror the opcode name:
+ *   VMULI -> rndp_v_msp,  VMACI -> rndn_v_msp,  VSAW -> sar_v_msp,
+ *   and VNULLOP shares nop_v_msp with VNOP.
+ */
+
+#define VMULF       mulf_v_msp
+#define VMULU       mulu_v_msp
+#define VMULI       rndp_v_msp
+#define VMULQ       mulq_v_msp
+
+#define VMUDL       mudl_v_msp
+#define VMUDM       mudm_v_msp
+#define VMUDN       mudn_v_msp
+#define VMUDH       mudh_v_msp
+
+#define VMACF       macf_v_msp
+#define VMACU       macu_v_msp
+#define VMACI       rndn_v_msp
+#define VMACQ       macq_v_msp
+
+#define VMADL       madl_v_msp
+#define VMADM       madm_v_msp
+#define VMADN       madn_v_msp
+#define VMADH       madh_v_msp
+
+#define VADD        add_v_msp
+#define VSUB        sub_v_msp
+#define VSUT        sut_v_msp
+#define VABS        abs_v_msp
+
+#define VADDC       addc_v_msp
+#define VSUBC       subc_v_msp
+#define VADDB       addb_v_msp
+#define VSUBB       subb_v_msp
+
+#define VACCB       accb_v_msp
+#define VSUCB       sucb_v_msp
+#define VSAD        sad_v_msp
+#define VSAC        sac_v_msp
+
+#define VSUM        sum_v_msp
+#define VSAW        sar_v_msp
+/* #define VACC */
+/* #define VSUC */
+
+#define VLT         lt_v_msp
+#define VEQ         eq_v_msp
+#define VNE         ne_v_msp
+#define VGE         ge_v_msp
+
+#define VCL         cl_v_msp
+#define VCH         ch_v_msp
+#define VCR         cr_v_msp
+#define VMRG        mrg_v_msp
+
+#define VAND        and_v_msp
+#define VNAND       nand_v_msp
+#define VOR         or_v_msp
+#define VNOR        nor_v_msp
+#define VXOR        xor_v_msp
+#define VNXOR       nxor_v_msp
+
+#define VRCP        rcp_v_msp
+#define VRCPL       rcpl_v_msp
+#define VRCPH       rcph_v_msp
+#define VMOV        mov_v_msp
+
+#define VRSQ        rsq_v_msp
+#define VRSQL       rsql_v_msp
+#define VRSQH       rsqh_v_msp
+#define VNOP        nop_v_msp
+
+#define VEXTT       extt_v_msp
+#define VEXTQ       extq_v_msp
+#define VEXTN       extn_v_msp
+
+
+#define VINST       inst_v_msp
+#define VINSQ       insq_v_msp
+#define VINSN       insn_v_msp
+#define VNULLOP     nop_v_msp
+
+#endif
+
+#endif