commit 5b91733d6ff71a4c0efa9ce2ab955bf1822ccdcb
Author: Erik Abair <erik.abair@gmail.com>
Date:   Fri Jun 17 20:29:34 2022 -0700

    Initial import.

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..0ca0998
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,4 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..460435e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+*.cpp.d
+*.obj
+*.iso
+*.lib
+*.exe
+*.xbe
+*.pdb
+
+.DS_Store
+.vscode/
+.vs/
+.idea/
+bin/
+__pycache__/
+build/
+cmake-build*/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..6bc3038
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,60 @@
+cmake_minimum_required(VERSION 3.18)
+
+project(nv2a_vsh_cpu LANGUAGES C CXX)
+
+set(CMAKE_VERBOSE_MAKEFILE TRUE)
+
+enable_testing()
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+include(CMakePushCheckState)
+set(_CMAKE_PROCESSING_LANGUAGE "C")
+include(CheckSymbolExists)
+include(ExternalProject)
+include(FindPkgConfig)
+
+# Boost.Test is used by the unit tests only. It is consumed through the
+# Boost::unit_test_framework imported target, which carries its include paths.
+find_package(Boost 1.70 REQUIRED COMPONENTS unit_test_framework)
+
+set(GENERATED_FILES_DIR "${CMAKE_BINARY_DIR}/generated")
+include_directories("${GENERATED_FILES_DIR}")
+
+# These override the C++ per-config flags only; the C library is unaffected.
+set(CMAKE_CXX_FLAGS_DEBUG "-ggdb -O0")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+
+# nv2a_vsh_cpu library -------------------------------
+
+add_library(
+        nv2a_vsh_cpu
+        src/nv2a_vsh_cpu.c
+        src/nv2a_vsh_cpu.h
+)
+
+# PUBLIC so that anything linking the library can include nv2a_vsh_cpu.h.
+target_include_directories(
+        nv2a_vsh_cpu
+        PUBLIC
+        src
+)
+
+# Tests ----------------------------------------------
+add_executable(
+        operations_tests
+        test/operations/test_main.cpp
+        test/operations/test_basic.cpp
+)
+target_include_directories(
+        operations_tests
+        PRIVATE
+        test
+)
+# Linking nv2a_vsh_cpu also orders the build, so add_dependencies is redundant.
+target_link_libraries(
+        operations_tests
+        PRIVATE
+        nv2a_vsh_cpu
+        Boost::unit_test_framework
+)
+add_test(NAME operations_tests COMMAND operations_tests)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..fdddb29
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <https://unlicense.org>
diff --git a/githooks/pre-commit b/githooks/pre-commit
new file mode 100755
index 0000000..7d507af
--- /dev/null
+++ b/githooks/pre-commit
@@ -0,0 +1,102 @@
+#!/bin/bash
+#
+# To enable this hook, copy this file (keeping the name "pre-commit") into the
+# ../.git/hooks directory and make sure it is executable.
+
+
+# Cross platform projects tend to avoid non-ASCII filenames; prevent
+# them from being added to the repository. We exploit the fact that the
+# printable range starts at the space character and ends with tilde.
+function check_no_nonascii_characters {
+  # Opt-out: `git config hooks.allownonascii true` disables this check.
+  [ "${allownonascii}" == "true" ] && return
+
+  # Strip printable ASCII (space through tilde) and the NUL separators from
+  # the names of newly added files; any byte left over is non-ASCII. The
+  # brackets in the tr set lie inside that range and keep Solaris 10's tr happy.
+  local stray_bytes
+  stray_bytes=$(git diff --cached --name-only --diff-filter=A -z "${against}" |
+                  LC_ALL=C tr -d '[ -~]\0' | wc -c)
+  if test ${stray_bytes} != 0; then
+    cat <<\EOF
+Error: Attempt to add a non-ASCII file name.
+
+This can cause problems if you want to work with people on other platforms.
+
+To be portable it is advisable to rename the file.
+
+If you know what you are doing you can disable this check using:
+
+  git config hooks.allownonascii true
+EOF
+    exit 1
+  fi
+}
+
+
+function check_no_diffmarkers_or_whitespace_errors {
+  # `git diff-index --check` reports staged changes that introduce whitespace
+  # errors or conflict markers and exits non-zero when it finds any; abort the
+  # hook with that same status in that case.
+  git diff-index --check --cached "${against}" -- || exit $?
+}
+
+
+function run_clang_format {
+  # Drop 3rdparty files before the emptiness check, so a commit touching only
+  # those never runs clang-format or git add without any file arguments.
+  local files_to_format filename
+  files_to_format="$(echo "${changed_c_filenames}" | grep -v '3rdparty')"
+  if [[ -z "${files_to_format}" ]]; then
+    return
+  fi
+  echo "${files_to_format}"
+
+  if ! command -v clang-format > /dev/null; then
+    cat <<\EOF
+Warning: clang-format is not installed or is not in the PATH.
+
+Please install and amend this commit.
+
+Debian:
+        sudo apt install clang-format
+EOF
+    return
+  fi
+
+  # Reformat each C/C++ file in place and re-stage it, one path per line so
+  # names containing spaces survive. NOTE: re-adding also stages any unstaged
+  # edits already present in those files, not just the formatting changes.
+  while IFS= read -r filename; do
+    clang-format -i "${filename}"
+    git add -- "${filename}"
+  done <<< "${files_to_format}"
+}
+
+
+# Opt-out switch for the file name check, read from hooks.allownonascii.
+allownonascii="$(git config --bool hooks.allownonascii)"
+
+# Tree to diff the index against: HEAD, or the empty tree when the repository
+# has no commits yet.
+against=HEAD
+if ! git rev-parse --verify HEAD >/dev/null 2>&1; then
+  against="$(git hash-object -t tree /dev/null)"
+fi
+
+# From here on, everything written to stdout goes to stderr instead.
+exec 1>&2
+
+# Staged paths other than deletions, narrowed to names ending in .c/.cpp/.h/.hpp.
+added_and_modified_filenames="$(git diff --cached --name-only --diff-filter=d)"
+changed_c_filenames="$(grep -E '.*\.(c|cpp|h|hpp)$' \
+                            <<< "${added_and_modified_filenames}")"
+
+# Turn off the blank-at-eof whitespace error in this repository's local config
+# so that a trailing blank line does not fail the whitespace check below.
+git config --local core.whitespace -blank-at-eof
+
+check_no_nonascii_characters
+check_no_diffmarkers_or_whitespace_errors
+run_clang_format
+
diff --git a/src/nv2a_vsh_cpu.c b/src/nv2a_vsh_cpu.c
new file mode 100644
index 0000000..dca0cb9
--- /dev/null
+++ b/src/nv2a_vsh_cpu.c
@@ -0,0 +1,266 @@
+#include "nv2a_vsh_cpu.h"
+
+#include <math.h>
+#include <string.h>
+
+void nv2a_vsh_cpu_mov(nv2a_vsh_register *out, const nv2a_vsh_register *a) {
+  *out = *a; /* Plain assignment stays defined when out == a; memcpy is not. */
+}
+
+void nv2a_vsh_cpu_arl(nv2a_vsh_register *out, const nv2a_vsh_register *a) {
+  /* Only a->reg.x is read. It is nudged up by 0.001 before flooring, so an
+   * input a hair below an integer still yields that integer, and the result
+   * is broadcast to all four output components. */
+  const float address = floorf(a->reg.x + 0.001f);
+  out->reg.x = out->reg.y = out->reg.z = out->reg.w = address;
+}
+
+/*
+
+def _arl(inst: dict, input: Context, output: Context):
+    # TODO: Validate this behavior on HW.
+    val = input.get(inst["inputs"][0])[0]
+    val = int(math.floor(val + 0.001))
+    output.set(inst["output"], (val, val, val, val))
+
+
+def _mov(inst: dict, input: Context, output: Context):
+    for reg in inst["outputs"]:
+        output.set(reg, input.get(inst["inputs"][0]))
+
+
+def _mac_mul(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = [a_val * b_val for a_val, b_val in zip(a, b)]
+    for reg in inst["outputs"]:
+        output.set(reg, tuple(result))
+
+
+def _mac_add(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = [a_val + b_val for a_val, b_val in zip(a, b)]
+    for reg in inst["outputs"]:
+        output.set(reg, tuple(result))
+
+
+def _mac_mad(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = [a_val * b_val for a_val, b_val in zip(a, b)]
+    c = input.get(inst["inputs"][2])
+    result = [a_val + b_val for a_val, b_val in zip(result, c)]
+    for reg in inst["outputs"]:
+        output.set(reg, tuple(result))
+
+
+def _mac_dp3(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = [a_val * b_val for a_val, b_val in zip(a[:3], b[:3])]
+
+    val = functools.reduce(lambda x, y: x + y, result)
+    result = [val] * 4
+    for reg in inst["outputs"]:
+        output.set(reg, tuple(result))
+
+
+def _mac_dph(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = [a_val * b_val for a_val, b_val in zip(a[:3], b[:3])]
+
+    val = functools.reduce(lambda x, y: x + y, result)
+    val += b[3]
+    result = [val] * 4
+    for reg in inst["outputs"]:
+        output.set(reg, tuple(result))
+
+
+def _mac_dp4(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = [a_val * b_val for a_val, b_val in zip(a[:4], b[:4])]
+
+    val = functools.reduce(lambda x, y: x + y, result)
+    result = [val] * 4
+    for reg in inst["outputs"]:
+        output.set(reg, tuple(result))
+
+
+def _mac_dst(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = (1.0, a[1] * b[1], a[2], b[3])
+    for reg in inst["outputs"]:
+        output.set(reg, tuple(result))
+
+
+def _mac_min(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = [a_val if a_val < b_val else b_val for a_val, b_val in zip(a[:4],
+b[:4])] for reg in inst["outputs"]: output.set(reg, tuple(result))
+
+
+def _mac_max(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = [a_val if a_val >= b_val else b_val for a_val, b_val in zip(a[:4],
+b[:4])] for reg in inst["outputs"]: output.set(reg, tuple(result))
+
+
+def _mac_slt(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = [1.0 if a_val < b_val else 0.0 for a_val, b_val in zip(a[:4],
+b[:4])] for reg in inst["outputs"]: output.set(reg, tuple(result))
+
+
+def _mac_sge(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    b = input.get(inst["inputs"][1])
+    result = [1.0 if a_val >= b_val else 0.0 for a_val, b_val in zip(a[:4],
+b[:4])] for reg in inst["outputs"]: output.set(reg, tuple(result))
+
+
+_MAC_HANDLERS = {
+    nv2avsh.vsh_instruction.MAC.MAC_MOV: _mov,
+    nv2avsh.vsh_instruction.MAC.MAC_MUL: _mac_mul,
+    nv2avsh.vsh_instruction.MAC.MAC_ADD: _mac_add,
+    nv2avsh.vsh_instruction.MAC.MAC_MAD: _mac_mad,
+    nv2avsh.vsh_instruction.MAC.MAC_DP3: _mac_dp3,
+    nv2avsh.vsh_instruction.MAC.MAC_DPH: _mac_dph,
+    nv2avsh.vsh_instruction.MAC.MAC_DP4: _mac_dp4,
+    nv2avsh.vsh_instruction.MAC.MAC_DST: _mac_dst,
+    nv2avsh.vsh_instruction.MAC.MAC_MIN: _mac_min,
+    nv2avsh.vsh_instruction.MAC.MAC_MAX: _mac_max,
+    nv2avsh.vsh_instruction.MAC.MAC_SLT: _mac_slt,
+    nv2avsh.vsh_instruction.MAC.MAC_SGE: _mac_sge,
+    nv2avsh.vsh_instruction.MAC.MAC_ARL: _arl,
+}
+
+
+def _ilu_rcp(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+
+    def compute(val):
+        if val == 1.0:
+            return 1.0
+
+        if val == 0.0:
+            return math.inf
+
+        return 1.0 / val
+
+    result = [compute(val) for val in a[:4]]
+    for reg in inst["outputs"]:
+        output.set(reg, (result[0], result[1], result[2], result[3]))
+
+
+def _ilu_rcc(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+
+    def compute(input):
+        if input < -1.84467e19:
+            input = -1.84467e19
+        elif input > -5.42101e-20 and input < 0:
+            input = -5.42101e-020
+        elif input >= 0 and input < 5.42101e-20:
+            input = 5.42101e-20
+        elif input > 1.84467e19:
+            input = 1.84467e19
+
+        if input == 1.0:
+            return 1.0
+
+        return 1.0 / input
+
+    result = [compute(val) for val in a[:4]]
+    for reg in inst["outputs"]:
+        output.set(reg, (result[0], result[1], result[2], result[3]))
+
+
+def _ilu_rsq(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+
+    def compute(input):
+        if input == 1.0:
+            return 1.0
+
+        if input == 0:
+            return math.inf
+
+        return 1.0 / math.sqrt(input)
+
+    result = [compute(abs(val)) for val in a[:4]]
+    for reg in inst["outputs"]:
+        output.set(reg, (result[0], result[1], result[2], result[3]))
+
+
+def _ilu_exp(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+
+    tmp = math.floor(a[0])
+    x = math.pow(2, tmp)
+    y = a[0] - tmp
+    z = math.pow(2, a[0])
+    w = 1.0
+
+    for reg in inst["outputs"]:
+        output.set(reg, (x, y, z, w))
+
+
+def _ilu_log(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+
+    tmp = math.floor(a[0])
+    if tmp == 0.0:
+        x = -math.inf
+        y = 1.0
+        z = -math.inf
+        w = 1.0
+    else:
+        x = math.floor(math.log2(tmp))
+        y = tmp / math.pow(2, math.floor(math.log2(tmp)))
+        z = math.log2(tmp)
+        w = 1.0
+
+    for reg in inst["outputs"]:
+        output.set(reg, (x, y, z, w))
+
+
+def _clamp(val, min_val, max_val):
+    return max(min(val, max_val), min_val)
+
+
+def _ilu_lit(inst: dict, input: Context, output: Context):
+    a = input.get(inst["inputs"][0])
+    epsilon = 1.0 / 256.0
+
+    sx = max(a[0], 0.0)
+    sy = max(a[1], 0.0)
+    sw = _clamp(a[3], -(128 - epsilon), 128 - epsilon)
+
+    x = 1.0
+    y = sx
+    z = 0.0
+    if sx > 0:
+        z = math.pow(2, sw * math.log2(sy))
+    w = 1.0
+
+    output.set(inst["output"], (x, y, z, w))
+
+
+_ILU_HANDLERS = {
+    nv2avsh.vsh_instruction.ILU.ILU_MOV: _mov,
+    nv2avsh.vsh_instruction.ILU.ILU_RCP: _ilu_rcp,
+    nv2avsh.vsh_instruction.ILU.ILU_RCC: _ilu_rcc,
+    nv2avsh.vsh_instruction.ILU.ILU_RSQ: _ilu_rsq,
+    nv2avsh.vsh_instruction.ILU.ILU_EXP: _ilu_exp,
+    nv2avsh.vsh_instruction.ILU.ILU_LOG: _ilu_log,
+    nv2avsh.vsh_instruction.ILU.ILU_LIT: _ilu_lit,
+}
+
+ */
diff --git a/src/nv2a_vsh_cpu.h b/src/nv2a_vsh_cpu.h
new file mode 100644
index 0000000..171b0f7
--- /dev/null
+++ b/src/nv2a_vsh_cpu.h
@@ -0,0 +1,58 @@
+#ifndef NV2A_VSH_CPU_SRC_NV2A_VSH_CPU_H_
+#define NV2A_VSH_CPU_SRC_NV2A_VSH_CPU_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Named x/y/z/w view of a single four-float register. */
+struct nv2a_vsh_register_components {
+  float x, y, z, w;
+};
+
+/* A register viewed by name (`reg`) or by index (`raw`); both members share
+ * storage, so raw[0] aliases reg.x and raw[3] aliases reg.w. */
+typedef union nv2a_vsh_register_ {
+  struct nv2a_vsh_register_components reg;
+  float raw[4];
+} nv2a_vsh_register;
+
+#define DECLARE_OP_1(name) \
+  void nv2a_vsh_cpu_##name(nv2a_vsh_register *out, const nv2a_vsh_register *a)
+#define DECLARE_OP_2(name)                                                     \
+  void nv2a_vsh_cpu_##name(nv2a_vsh_register *out, const nv2a_vsh_register *a, \
+                           const nv2a_vsh_register *b)
+#define DECLARE_OP_3(name)                                                     \
+  void nv2a_vsh_cpu_##name(nv2a_vsh_register *out, const nv2a_vsh_register *a, \
+                           const nv2a_vsh_register *b,                         \
+                           const nv2a_vsh_register *c)
+/* Notes: intended result per the Python reference in nv2a_vsh_cpu.c. */
+DECLARE_OP_1(mov); /* a */
+DECLARE_OP_1(arl); /* floor(a.x + 0.001), broadcast */
+DECLARE_OP_2(mul); /* a * b, per component */
+DECLARE_OP_2(add); /* a + b, per component */
+DECLARE_OP_3(mad); /* a * b + c, per component */
+DECLARE_OP_2(dp3); /* dot(a.xyz, b.xyz), broadcast */
+DECLARE_OP_2(dph); /* dp3 + b.w, broadcast (TODO confirm) */
+DECLARE_OP_2(dp4); /* dot(a, b), broadcast */
+DECLARE_OP_2(dst); /* (1, a.y * b.y, a.z, b.w) */
+DECLARE_OP_2(min); /* smaller of a and b, per component */
+DECLARE_OP_2(max); /* larger of a and b, per component */
+DECLARE_OP_2(slt); /* a < b ? 1 : 0, per component */
+DECLARE_OP_2(sge); /* a >= b ? 1 : 0, per component */
+DECLARE_OP_1(rcp); /* 1 / a, per component */
+DECLARE_OP_1(rcc); /* 1 / a with a clamped first, per component */
+DECLARE_OP_1(rsq); /* 1 / sqrt(|a|), per component */
+DECLARE_OP_1(exp); /* (2^floor(a.x), a.x - floor(a.x), 2^a.x, 1) */
+DECLARE_OP_1(log); /* base-2 log terms derived from a.x */
+DECLARE_OP_1(lit); /* (1, max(a.x, 0), z, 1); z from a.y and a.w */
+
+#undef DECLARE_OP_1
+#undef DECLARE_OP_2
+#undef DECLARE_OP_3
+
+#ifdef __cplusplus
+}  // extern "C" (no trailing ';': a linkage block is not a declaration)
+#endif
+
+#endif  // NV2A_VSH_CPU_SRC_NV2A_VSH_CPU_H_
diff --git a/test/operations/test_basic.cpp b/test/operations/test_basic.cpp
new file mode 100644
index 0000000..6bdabca
--- /dev/null
+++ b/test/operations/test_basic.cpp
@@ -0,0 +1,55 @@
+#include <boost/test/unit_test.hpp>
+
+#include "nv2a_vsh_cpu.h"
+
+BOOST_AUTO_TEST_SUITE(basic_operation_suite)
+
+BOOST_AUTO_TEST_CASE(mov) {
+  // mov must copy every component of the source register unchanged.
+  const nv2a_vsh_register a = {{0.0f, -1000.0f, 1000.0f, 64.123456f}};
+  nv2a_vsh_register out;
+  nv2a_vsh_cpu_mov(&out, &a);
+
+  BOOST_TEST(out.reg.x == a.reg.x);
+  BOOST_TEST(out.reg.y == a.reg.y);
+  BOOST_TEST(out.reg.z == a.reg.z);
+  BOOST_TEST(out.reg.w == a.reg.w);
+}
+
+BOOST_AUTO_TEST_CASE(arl_trivial) {
+  // a.x is already integral, so arl must broadcast it unchanged.
+  const nv2a_vsh_register a = {{10.0f, -1000.0f, 1000.0f, 64.123456f}};
+  nv2a_vsh_register out;
+  nv2a_vsh_cpu_arl(&out, &a);
+
+  BOOST_TEST(out.reg.x == a.reg.x);
+  BOOST_TEST(out.reg.y == a.reg.x);
+  BOOST_TEST(out.reg.z == a.reg.x);
+  BOOST_TEST(out.reg.w == a.reg.x);
+}
+
+BOOST_AUTO_TEST_CASE(arl_truncate) {
+  // The fractional part of a.x is discarded: 10.12345 floors to 10.
+  const nv2a_vsh_register a = {{10.12345f, -1000.0f, 1000.0f, 64.123456f}};
+  nv2a_vsh_register out;
+  nv2a_vsh_cpu_arl(&out, &a);
+
+  BOOST_TEST(out.reg.x == 10.0f);
+  BOOST_TEST(out.reg.y == 10.0f);
+  BOOST_TEST(out.reg.z == 10.0f);
+  BOOST_TEST(out.reg.w == 10.0f);
+}
+
+BOOST_AUTO_TEST_CASE(arl_biased) {
+  // 9.9995f is strictly below 10 as a float; only the +0.001 bias makes the
+  // floor land on 10. (9.9999999f would parse to exactly 10.0f and test nothing.)
+  nv2a_vsh_register a = {9.9995f, -1000.0f, 1000.0f, 64.123456f};
+  nv2a_vsh_register out;
+  nv2a_vsh_cpu_arl(&out, &a);
+  BOOST_TEST(out.reg.x == 10.0f);
+  BOOST_TEST(out.reg.y == 10.0f);
+  BOOST_TEST(out.reg.z == 10.0f);
+  BOOST_TEST(out.reg.w == 10.0f);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/test/operations/test_main.cpp b/test/operations/test_main.cpp
new file mode 100644
index 0000000..ea2b541
--- /dev/null
+++ b/test/operations/test_main.cpp
@@ -0,0 +1,2 @@
+#define BOOST_TEST_MODULE OperationTests
+#include <boost/test/unit_test.hpp>