mirror of
https://github.com/PCSX2/pcsx2.git
synced 2026-01-31 01:15:24 +01:00
Compare commits
24 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4c8bf53e69 | ||
|
|
83fcd4bd1f | ||
|
|
bfd960ad1b | ||
|
|
6b81808ecc | ||
|
|
c615a6f6e2 | ||
|
|
40b522b42f | ||
|
|
f2655b763f | ||
|
|
fd145e65aa | ||
|
|
6596b7f27e | ||
|
|
9d767838d6 | ||
|
|
f55219bb1b | ||
|
|
805b647c73 | ||
|
|
fd0351ca8f | ||
|
|
ed5a7802f3 | ||
|
|
44f8317b7e | ||
|
|
0200933ddd | ||
|
|
f712b2b63a | ||
|
|
71923e7cba | ||
|
|
532d14611c | ||
|
|
3a91a07d51 | ||
|
|
6a8287ea9f | ||
|
|
dc051541bd | ||
|
|
3265c2a614 | ||
|
|
f798401e93 |
1
.github/workflows/scripts/controller-db/.gitignore
vendored
Normal file
1
.github/workflows/scripts/controller-db/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
*.txt
|
||||
38
.github/workflows/scripts/controller-db/update-db.py
vendored
Normal file
38
.github/workflows/scripts/controller-db/update-db.py
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
import os
|
||||
|
||||
# Section markers (as they appear in the upstream SDL controller database)
# whose entries should be kept in the trimmed database.
relevant_categories = [
    "# Mac OS X",
    "# Linux",
]

# The database is rewritten in place at this path (relative to the CWD).
DB_PATH = "./game_controller_db.txt"


def is_relevant_category(line):
    """Return True if `line` contains one of the `relevant_categories` markers."""
    return any(category in line for category in relevant_categories)


def filter_db(lines):
    """Split a controller database into its header and its relevant sections.

    Args:
        lines: Iterable of lines, each keeping its trailing newline.

    Returns:
        A ``(header_lines, new_db_contents)`` tuple of lists:

        * ``header_lines`` holds every line up to and including the first
          blank line (``"\\n"``), or every line if there is no blank line.
        * ``new_db_contents`` holds each section that starts with a line
          matching `is_relevant_category`, up to and including the blank
          line that terminates it (or the end of input).
    """
    header_lines = []
    new_db_contents = []
    finished_header = False
    processing_section = False

    for line in lines:
        if not finished_header:
            header_lines.append(line)
            if line == "\n":
                finished_header = True

        # A blank line closes the section currently being copied; keep one
        # blank line as the separator between sections.
        if processing_section and line == "\n":
            processing_section = False
            new_db_contents.append("\n")

        if not processing_section and is_relevant_category(line):
            # Section marker: start copying, marker line included.
            processing_section = True
            new_db_contents.append(line)
        elif processing_section:
            new_db_contents.append(line)

    return header_lines, new_db_contents


def main():
    """Rewrite `DB_PATH` in place, keeping only the header and relevant sections."""
    with open(DB_PATH, encoding="utf-8") as file:
        header_lines, new_db_contents = filter_db(file)

    # Opening with "w" truncates the existing file, so it does not need to be
    # removed first.
    with open(DB_PATH, "w", encoding="utf-8") as f:
        f.writelines(header_lines)
        f.writelines(new_db_contents)


if __name__ == "__main__":
    main()
|
||||
8
.github/workflows/scripts/linux/appimage.sh
vendored
8
.github/workflows/scripts/linux/appimage.sh
vendored
@@ -37,11 +37,15 @@ mkdir -p squashfs-root/usr/share/icons && cp ./squashfs-root/PCSX2.png ./squashf
|
||||
mkdir -p squashfs-root/usr/share/icons/hicolor/scalable/apps && cp ./squashfs-root/PCSX2.png ./squashfs-root/usr/share/icons/hicolor/scalable/apps
|
||||
mkdir -p squashfs-root/usr/share/pixmaps && cp ./squashfs-root/PCSX2.png ./squashfs-root/usr/share/pixmaps
|
||||
mkdir -p squashfs-root/usr/lib/
|
||||
mkdir -p squashfs-root/usr/optional/libstdc++
|
||||
mkdir -p squashfs-root/usr/optional/libgcc_s
|
||||
cp ./.github/workflows/scripts/linux/AppRun "$GITHUB_WORKSPACE"/squashfs-root/AppRun
|
||||
curl -sSfL "https://github.com/AppImage/AppImageKit/releases/download/continuous/AppRun-$APPARCH" -o "$GITHUB_WORKSPACE"/squashfs-root/AppRun-patched
|
||||
curl -sSfL "https://github.com/darealshinji/AppImageKit-checkrt/releases/download/continuous/AppRun-patched-$APPARCH" -o "$GITHUB_WORKSPACE"/squashfs-root/AppRun-patched
|
||||
curl -sSfL "https://github.com/darealshinji/AppImageKit-checkrt/releases/download/continuous/exec-$APPARCH.so" -o "$GITHUB_WORKSPACE"/squashfs-root/usr/optional/exec.so
|
||||
chmod a+x ./squashfs-root/AppRun
|
||||
chmod a+x ./squashfs-root/runtime
|
||||
chmod a+x ./squashfs-root/AppRun-patched
|
||||
chmod a+x ./squashfs-root/usr/optional/exec.so
|
||||
echo "$name" > "$GITHUB_WORKSPACE"/squashfs-root/version.txt
|
||||
mkdir -p "$GITHUB_WORKSPACE"/squashfs-root/usr/bin/app
|
||||
cp -r "$GITHUB_WORKSPACE"/bin/Langs "$GITHUB_WORKSPACE"/squashfs-root/usr/bin/
|
||||
@@ -49,6 +53,8 @@ cp "$GITHUB_WORKSPACE"/bin/docs/{Configuration_Guide.pdf,PCSX2_FAQ.pdf} "$GITHUB
|
||||
cp "$GITHUB_WORKSPACE"/bin/cheats_ws.zip "$GITHUB_WORKSPACE"/squashfs-root/usr/bin/app
|
||||
cp ./bin/GameIndex.yaml "$GITHUB_WORKSPACE"/squashfs-root/usr/bin/app/GameIndex.yaml
|
||||
cp /usr/lib/$LIBARCH/libthai.so.0 "$GITHUB_WORKSPACE"/squashfs-root/usr/lib/
|
||||
cp --dereference /usr/lib/"$LIBARCH"/libstdc++.so.6 "$GITHUB_WORKSPACE"/squashfs-root/usr/optional/libstdc++/libstdc++.so.6
|
||||
cp --dereference /lib/"$LIBARCH"/libgcc_s.so.1 "$GITHUB_WORKSPACE"/squashfs-root/usr/optional/libgcc_s/libgcc_s.so.1
|
||||
export UPD_INFO="gh-releases-zsync|PCSX2|pcsx2|latest|$name.AppImage.zsync"
|
||||
export OUTPUT="$name.AppImage"
|
||||
/tmp/squashfs-root/AppRun --appdir="$GITHUB_WORKSPACE"/squashfs-root/ --plugin gtk -d "$GITHUB_WORKSPACE"/squashfs-root/PCSX2.desktop -i "$GITHUB_WORKSPACE"/squashfs-root/PCSX2.png --output appimage
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
set -e
|
||||
|
||||
if [ "${COMPILER}" = "gcc" ]; then
|
||||
export CC=gcc
|
||||
export CXX=g++
|
||||
export CC=gcc-10
|
||||
export CXX=g++-10
|
||||
else
|
||||
export CC=clang
|
||||
export CXX=clang++
|
||||
|
||||
109
.github/workflows/scripts/linux/install-packages.sh
vendored
109
.github/workflows/scripts/linux/install-packages.sh
vendored
@@ -6,100 +6,42 @@ set -e
|
||||
declare -a BUILD_PACKAGES=(
|
||||
"ccache"
|
||||
"cmake"
|
||||
"g++-8-multilib"
|
||||
"ninja-build"
|
||||
)
|
||||
|
||||
declare -a GCC_PACKAGES=(
|
||||
# Nothing Unique Needed
|
||||
)
|
||||
|
||||
declare -a CLANG_PACKAGES=(
|
||||
"clang-format"
|
||||
"clang-tidy"
|
||||
"clang-tools"
|
||||
"clang"
|
||||
"clangd-10"
|
||||
"libc++-dev"
|
||||
"libc++1"
|
||||
"libc++abi-dev"
|
||||
"libc++abi1"
|
||||
"libclang-dev"
|
||||
"libclang1"
|
||||
"liblldb-10-dev"
|
||||
"libllvm-10-ocaml-dev"
|
||||
"libomp-dev"
|
||||
"libomp5"
|
||||
"lld"
|
||||
"lldb"
|
||||
"llvm-dev"
|
||||
"llvm-runtime"
|
||||
"llvm"
|
||||
"python3-clang-10"
|
||||
)
|
||||
|
||||
# Packages - PCSX2
|
||||
declare -a PCSX2_PACKAGES=(
|
||||
"curl"
|
||||
"fuse"
|
||||
"gettext"
|
||||
"libaio-dev"
|
||||
"libasound2-dev"
|
||||
"libatk1.0-dev"
|
||||
"libatk-bridge2.0-dev"
|
||||
"libbz2-dev"
|
||||
"libcairo2-dev"
|
||||
"libcggl"
|
||||
"libdbus-1-dev"
|
||||
"libegl1-mesa-dev"
|
||||
"libfontconfig1-dev"
|
||||
"libgdk-pixbuf2.0-dev"
|
||||
"libgirepository-1.0-1"
|
||||
"libgl-dev"
|
||||
"libgl1-mesa-dev"
|
||||
"libgl1-mesa-dri"
|
||||
"libgl1"
|
||||
"libgles2-mesa-dev"
|
||||
"libglew-dev"
|
||||
"libglib2.0-dev"
|
||||
"libglu1-mesa-dev"
|
||||
"libglu1-mesa"
|
||||
"libglvnd-dev"
|
||||
"libglx-mesa0"
|
||||
"libglx0"
|
||||
"libgtk-3-dev"
|
||||
"libgtk2.0-dev"
|
||||
"libharfbuzz-dev"
|
||||
"libibus-1.0-dev"
|
||||
"libjack-jackd2-dev"
|
||||
"libjpeg-dev"
|
||||
"libllvm10"
|
||||
"liblzma-dev"
|
||||
"liblzma5"
|
||||
"libpango1.0-dev"
|
||||
"libpcap0.8-dev"
|
||||
"libpng-dev"
|
||||
"libportaudiocpp0"
|
||||
"libpulse-dev"
|
||||
"librsvg2-dev"
|
||||
"libsdl1.2-dev"
|
||||
"libsdl2-dev"
|
||||
"libsamplerate0-dev"
|
||||
"libsoundtouch-dev"
|
||||
"libwxgtk3.0-dev"
|
||||
"libwxgtk3.0-gtk3-0v5"
|
||||
"libwxgtk3.0-gtk3-dev"
|
||||
"libx11-xcb-dev"
|
||||
"libxext-dev"
|
||||
"libxft-dev"
|
||||
"libxml2-dev"
|
||||
"nvidia-cg-toolkit"
|
||||
"pkg-config"
|
||||
"portaudio19-dev"
|
||||
"python"
|
||||
"zlib1g-dev"
|
||||
)
|
||||
|
||||
if [ "${COMPILER}" = "gcc" ]; then
|
||||
BUILD_PACKAGES+=("g++-10-multilib")
|
||||
else
|
||||
BUILD_PACKAGES+=("clang-9")
|
||||
PCSX2_PACKAGES+=("libstdc++-10-dev")
|
||||
fi
|
||||
|
||||
# - https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md
|
||||
ARCH=""
|
||||
echo "${PLATFORM}"
|
||||
@@ -111,41 +53,14 @@ fi
|
||||
sudo apt-get -qq update
|
||||
|
||||
# Install packages needed for building
|
||||
BUILD_PACKAGE_STR=""
|
||||
for i in "${BUILD_PACKAGES[@]}"; do
|
||||
BUILD_PACKAGE_STR="${BUILD_PACKAGE_STR} ${i}"
|
||||
done
|
||||
|
||||
if [ "${COMPILER}" = "gcc" ]; then
|
||||
for i in "${GCC_PACKAGES[@]}"; do
|
||||
BUILD_PACKAGE_STR="${BUILD_PACKAGE_STR} ${i}"
|
||||
done
|
||||
else
|
||||
for i in "${CLANG_PACKAGES[@]}"; do
|
||||
BUILD_PACKAGE_STR="${BUILD_PACKAGE_STR} ${i}"
|
||||
done
|
||||
fi
|
||||
|
||||
echo "Will install the following packages for building - ${BUILD_PACKAGE_STR}"
|
||||
echo "Will install the following packages for building - ${BUILD_PACKAGES[*]}"
|
||||
#sudo apt remove gcc-9 g++-9
|
||||
sudo apt-get -y install ${BUILD_PACKAGE_STR}
|
||||
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 10
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 10
|
||||
sudo update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 30
|
||||
sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30
|
||||
sudo apt-get -y install "${BUILD_PACKAGES[@]}"
|
||||
|
||||
# Install packages needed by pcsx2
|
||||
PCSX2_PACKAGES_STR=""
|
||||
for i in "${PCSX2_PACKAGES[@]}"; do
|
||||
PCSX2_PACKAGES_STR="${PCSX2_PACKAGES_STR} ${i}${ARCH}"
|
||||
done
|
||||
if [ "${PLATFORM}" == "x86" ]; then
|
||||
echo "Installing workaround attempt"
|
||||
sudo apt-get -y install libgcc-s1:i386
|
||||
fi
|
||||
echo "Will install the following packages for pcsx2 - ${PCSX2_PACKAGES_STR}"
|
||||
sudo apt-get -y install ${PCSX2_PACKAGES_STR}
|
||||
PCSX2_PACKAGES=("${PCSX2_PACKAGES[@]/%/"${ARCH}"}")
|
||||
echo "Will install the following packages for pcsx2 - ${PCSX2_PACKAGES[*]}"
|
||||
sudo apt-get -y install "${PCSX2_PACKAGES[@]}"
|
||||
|
||||
cd /tmp
|
||||
curl -sSfLO https://github.com/NixOS/patchelf/releases/download/0.12/patchelf-0.12.tar.bz2
|
||||
|
||||
@@ -11,9 +11,21 @@ for (var i = 0; i < assets.length; i++) {
|
||||
continue;
|
||||
}
|
||||
if (asset.name.includes("windows")) {
|
||||
windowsAssetLinks += `- [${asset.name}](${asset.browser_download_url})\n`
|
||||
let friendlyName = asset.name;
|
||||
try {
|
||||
friendlyName = asset.name.split("windows-")[1].split(".7z")[0].replace("-", " ");
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
windowsAssetLinks += `- [${friendlyName}](${asset.browser_download_url})\n`
|
||||
} else if (asset.name.includes("linux")) {
|
||||
linuxAssetLinks += `- [${asset.name}](${asset.browser_download_url})\n`
|
||||
let friendlyName = asset.name;
|
||||
try {
|
||||
friendlyName = asset.name.split("linux-")[1].split(".AppImage")[0].replace("-", " ");
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
}
|
||||
linuxAssetLinks += `- [${friendlyName}](${asset.browser_download_url})\n`
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,7 +36,8 @@ const embed = new MessageEmbed()
|
||||
.addFields(
|
||||
{ name: 'Version', value: github.context.payload.release.tag_name, inline: true },
|
||||
{ name: 'Release Link', value: `[Github Release](${github.context.payload.release.html_url})`, inline: true },
|
||||
{ name: 'Installation Steps', value: '[See Here](https://github.com/PCSX2/pcsx2/wiki/Nightly-Build-Usage-Guide)', inline: true }
|
||||
{ name: 'Installation Steps', value: '[See Here](https://github.com/PCSX2/pcsx2/wiki/Nightly-Build-Usage-Guide)', inline: true },
|
||||
{ name: 'Included Changes', value: github.context.payload.release.body, inline: false }
|
||||
);
|
||||
|
||||
if (windowsAssetLinks != "") {
|
||||
|
||||
29
.github/workflows/update-controller-db.yml
vendored
Normal file
29
.github/workflows/update-controller-db.yml
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
name: 🏭 Update Controller Database
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 16 * * 1" # every Monday @ 16:00 UTC (12pm EDT / 11am EST) - https://crontab.guru/#0_16_*_*_1
|
||||
|
||||
jobs:
|
||||
update-controller-db:
|
||||
if: github.repository == 'PCSX2/pcsx2'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Get Latest DB and Prepare DB File
|
||||
run: |
|
||||
cd .github/workflows/scripts/controller-db/
|
||||
wget -O game_controller_db.txt https://raw.githubusercontent.com/gabomdq/SDL_GameControllerDB/master/gamecontrollerdb.txt
|
||||
python ./update-db.py
|
||||
mv ./game_controller_db.txt ${{github.workspace}}/pcsx2/PAD/Linux/res/game_controller_db.txt
|
||||
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@v3
|
||||
with:
|
||||
title: "pad-linux: Update to latest controller database"
|
||||
commit-message: "pad-linux: Update to latest controller database."
|
||||
committer: "PCSX2 Bot <PCSX2Bot@users.noreply.github.com>"
|
||||
author: "PCSX2 Bot <PCSX2Bot@users.noreply.github.com>"
|
||||
body: "Weekly automatic update of SDL Controller DB for Linux / Mac OS"
|
||||
reviewers: lightningterror
|
||||
@@ -31,6 +31,10 @@
|
||||
|
||||
#include "common/emitter/x86_intrin.h"
|
||||
|
||||
// The C++ standard doesn't allow `offsetof` to be used on non-constant values (e.g. `offsetof(class, field[i])`)
|
||||
// Use this in those situations
|
||||
#define OFFSETOF(a, b) (reinterpret_cast<size_t>(&(static_cast<a*>(0)->b)))
|
||||
|
||||
// Renamed ARRAYSIZE to ArraySize -- looks nice and gets rid of Windows.h conflicts (air)
|
||||
// Notes: I'd have used ARRAY_SIZE instead but ran into cross-platform lib conflicts with
|
||||
// that as well. >_<
|
||||
|
||||
@@ -639,21 +639,12 @@ set(pcsx2GSSources
|
||||
GS/Renderers/HW/GSTextureCache.cpp
|
||||
GS/Renderers/SW/GSDrawScanline.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx2.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
|
||||
GS/Renderers/SW/GSNewCodeGenerator.cpp
|
||||
GS/Renderers/SW/GSRasterizer.cpp
|
||||
GS/Renderers/SW/GSRendererSW.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x64.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x64.avx.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x64.avx2.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x86.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x86.avx.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x86.avx2.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
|
||||
GS/Renderers/SW/GSTextureCacheSW.cpp
|
||||
GS/Renderers/SW/GSTextureSW.cpp
|
||||
GS/Renderers/OpenGL/GLLoader.cpp
|
||||
@@ -679,7 +670,6 @@ set(pcsx2GSHeaders
|
||||
GS/GSDrawingEnvironment.h
|
||||
GS/GSDump.h
|
||||
GS/GS_types.h
|
||||
GS/GS_codegen.h
|
||||
GS/GS.h
|
||||
GS/GSLocalMemory.h
|
||||
GS/GSLzma.h
|
||||
@@ -712,11 +702,14 @@ set(pcsx2GSHeaders
|
||||
GS/Renderers/HW/GSTextureCache.h
|
||||
GS/Renderers/HW/GSVertexHW.h
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.h
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h
|
||||
GS/Renderers/SW/GSDrawScanline.h
|
||||
GS/Renderers/SW/GSNewCodeGenerator.h
|
||||
GS/Renderers/SW/GSRasterizer.h
|
||||
GS/Renderers/SW/GSRendererSW.h
|
||||
GS/Renderers/SW/GSScanlineEnvironment.h
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.h
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
|
||||
GS/Renderers/SW/GSTextureCacheSW.h
|
||||
GS/Renderers/SW/GSTextureSW.h
|
||||
GS/Renderers/SW/GSVertexSW.h
|
||||
|
||||
@@ -1262,22 +1262,19 @@ void GSApp::Init()
|
||||
m_default_configuration["osd_monitor_enabled"] = "0";
|
||||
m_default_configuration["osd_max_log_messages"] = "2";
|
||||
m_default_configuration["override_geometry_shader"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_compute_shader"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_copy_image"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_clear_texture"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_clip_control"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_direct_state_access"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_draw_buffers_blend"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_get_texture_sub_image"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_gpu_shader5"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_multi_bind"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_shader_image_load_store"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_shader_storage_buffer_object"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_sparse_texture"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_sparse_texture2"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_texture_view"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_vertex_attrib_binding"] = "-1";
|
||||
m_default_configuration["override_GL_ARB_texture_barrier"] = "-1";
|
||||
#ifdef GL_EXT_TEX_SUB_IMAGE
|
||||
m_default_configuration["override_GL_ARB_get_texture_sub_image"] = "-1";
|
||||
#endif
|
||||
m_default_configuration["paltex"] = "0";
|
||||
m_default_configuration["png_compression_level"] = std::to_string(Z_BEST_SPEED);
|
||||
m_default_configuration["preload_frame_with_gs_data"] = "0";
|
||||
|
||||
@@ -18,10 +18,11 @@
|
||||
template <int i>
|
||||
class GSAlignedClass
|
||||
{
|
||||
public:
|
||||
GSAlignedClass() {}
|
||||
virtual ~GSAlignedClass() {}
|
||||
protected:
|
||||
GSAlignedClass() = default;
|
||||
~GSAlignedClass() = default;
|
||||
|
||||
public:
|
||||
void* operator new(size_t size)
|
||||
{
|
||||
return _aligned_malloc(size, i);
|
||||
|
||||
@@ -158,36 +158,12 @@ void GSClut::Write(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
||||
m_read.dirty = true;
|
||||
|
||||
(this->*m_wc[TEX0.CSM][TEX0.CPSM][TEX0.PSM])(TEX0, TEXCLUT);
|
||||
|
||||
// Mirror write to other half of buffer to simulate wrapping memory
|
||||
|
||||
int offset = (TEX0.CSA & (TEX0.CPSM < PSM_PSMCT16 ? 15 : 31)) * 16;
|
||||
|
||||
if (TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT8H)
|
||||
{
|
||||
int size = TEX0.CPSM < PSM_PSMCT16 ? 512 : 256;
|
||||
|
||||
memcpy(m_clut + 512 + offset, m_clut + offset, sizeof(*m_clut) * std::min(size, 512 - offset));
|
||||
memcpy(m_clut, m_clut + 512, sizeof(*m_clut) * std::max(0, size + offset - 512));
|
||||
}
|
||||
else
|
||||
{
|
||||
int size = 16;
|
||||
|
||||
memcpy(m_clut + 512 + offset, m_clut + offset, sizeof(*m_clut) * size);
|
||||
|
||||
if (TEX0.CPSM < PSM_PSMCT16)
|
||||
{
|
||||
memcpy(m_clut + 512 + 256 + offset, m_clut + 256 + offset, sizeof(*m_clut) * size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
||||
{
|
||||
ALIGN_STACK(32);
|
||||
//FIXME: Romance of the Three Kingdoms VIII text doesn't like the offset
|
||||
WriteCLUT_T32_I8_CSM1((uint32*)m_mem->BlockPtr32(0, 0, TEX0.CBP, 1), m_clut + ((TEX0.CSA & 15) << 4));
|
||||
WriteCLUT_T32_I8_CSM1((uint32*)m_mem->BlockPtr32(0, 0, TEX0.CBP, 1), m_clut, (TEX0.CSA & 15));
|
||||
}
|
||||
|
||||
void GSClut::WriteCLUT32_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
||||
@@ -339,8 +315,7 @@ void GSClut::Read32(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
|
||||
{
|
||||
case PSM_PSMT8:
|
||||
case PSM_PSMT8H:
|
||||
clut += (TEX0.CSA & 15) << 4; // disney golf title screen
|
||||
ReadCLUT_T32_I8(clut, m_buff32);
|
||||
ReadCLUT_T32_I8(clut, m_buff32, (TEX0.CSA & 15) << 4);
|
||||
break;
|
||||
case PSM_PSMT4:
|
||||
case PSM_PSMT4HL:
|
||||
@@ -443,16 +418,16 @@ void GSClut::GetAlphaMinMax32(int& amin_out, int& amax_out)
|
||||
|
||||
//
|
||||
|
||||
void GSClut::WriteCLUT_T32_I8_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut)
|
||||
void GSClut::WriteCLUT_T32_I8_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut, uint16 offset)
|
||||
{
|
||||
// 4 blocks
|
||||
|
||||
for (int i = 0; i < 64; i += 16)
|
||||
// This is required when CSA is offset from the base of the CLUT so we point to the right data
|
||||
for (int i = offset; i < 16; i ++)
|
||||
{
|
||||
WriteCLUT_T32_I4_CSM1(&src[i + 0], &clut[i * 2 + 0]);
|
||||
WriteCLUT_T32_I4_CSM1(&src[i + 64], &clut[i * 2 + 16]);
|
||||
WriteCLUT_T32_I4_CSM1(&src[i + 128], &clut[i * 2 + 128]);
|
||||
WriteCLUT_T32_I4_CSM1(&src[i + 192], &clut[i * 2 + 144]);
|
||||
const int off = i << 4; // WriteCLUT_T32_I4_CSM1 loads 16 at a time
|
||||
// Source column
|
||||
const int s = clutTableT32I8[off & 0x70] | (off & 0x80);
|
||||
|
||||
WriteCLUT_T32_I4_CSM1(&src[s], &clut[off]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -532,11 +507,18 @@ __forceinline void GSClut::WriteCLUT_T16_I4_CSM1(const uint16* RESTRICT src, uin
|
||||
}
|
||||
}
|
||||
|
||||
void GSClut::ReadCLUT_T32_I8(const uint16* RESTRICT clut, uint32* RESTRICT dst)
|
||||
void GSClut::ReadCLUT_T32_I8(const uint16* RESTRICT clut, uint32* RESTRICT dst, int offset)
|
||||
{
|
||||
// Okay this deserves a small explanation
|
||||
// T32 I8 can address up to 256 colors however the offset can be "more than zero" when reading
|
||||
// Previously I assumed that it would wrap around the end of the buffer to the beginning
|
||||
// but it turns out this is incorrect, the address doesn't mirror, it clamps to the last offset,
|
||||
// probably through some sort of addressing mechanism, then picks the color from the lower 0xF of the requested CLUT entry.
|
||||
// if we don't do this, the dirt on GTA SA goes transparent and actually cleans the car driving through dirt.
|
||||
for (int i = 0; i < 256; i += 16)
|
||||
{
|
||||
ReadCLUT_T32_I4(&clut[i], &dst[i]);
|
||||
// Min value + offset or Last CSA * 16 (240)
|
||||
ReadCLUT_T32_I4(&clut[std::min((i + offset), 240)], &dst[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -74,11 +74,11 @@ class alignas(32) GSClut : public GSAlignedClass<32>
|
||||
|
||||
void WriteCLUT_NULL(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
|
||||
|
||||
static void WriteCLUT_T32_I8_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut);
|
||||
static void WriteCLUT_T32_I8_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut, uint16 offset);
|
||||
static void WriteCLUT_T32_I4_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut);
|
||||
static void WriteCLUT_T16_I8_CSM1(const uint16* RESTRICT src, uint16* RESTRICT clut);
|
||||
static void WriteCLUT_T16_I4_CSM1(const uint16* RESTRICT src, uint16* RESTRICT clut);
|
||||
static void ReadCLUT_T32_I8(const uint16* RESTRICT clut, uint32* RESTRICT dst);
|
||||
static void ReadCLUT_T32_I8(const uint16* RESTRICT clut, uint32* RESTRICT dst, int offset);
|
||||
static void ReadCLUT_T32_I4(const uint16* RESTRICT clut, uint32* RESTRICT dst);
|
||||
//static void ReadCLUT_T32_I4(const uint16* RESTRICT clut, uint32* RESTRICT dst32, uint64* RESTRICT dst64);
|
||||
//static void ReadCLUT_T16_I8(const uint16* RESTRICT clut, uint32* RESTRICT dst);
|
||||
|
||||
@@ -110,11 +110,7 @@ extern void vmfree(void* ptr, size_t size);
|
||||
|
||||
// Convert gcc SSE define into GS (windows) define
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__x86_64__)
|
||||
#define _M_SSE 0x500 // TODO
|
||||
#else
|
||||
#define _M_SSE 0x501
|
||||
#endif
|
||||
#define _M_SSE 0x501
|
||||
#elif defined(__AVX__)
|
||||
#define _M_SSE 0x500
|
||||
#elif defined(__SSE4_1__)
|
||||
|
||||
@@ -156,22 +156,19 @@ namespace GLLoader
|
||||
|
||||
bool found_geometry_shader = true; // we require GL3.3 so geometry must be supported by default
|
||||
bool found_GL_ARB_clear_texture = false;
|
||||
bool found_GL_ARB_get_texture_sub_image = false; // Not yet used
|
||||
// DX11 GPU
|
||||
bool found_GL_ARB_gpu_shader5 = false; // Require IvyBridge
|
||||
bool found_GL_ARB_shader_image_load_store = false; // Intel IB. Nvidia/AMD miss Mesa implementation.
|
||||
bool found_GL_ARB_shader_storage_buffer_object = false;
|
||||
bool found_GL_ARB_compute_shader = false;
|
||||
bool found_GL_ARB_texture_view = false; // maybe older gpu can support it ?
|
||||
|
||||
// Mandatory in the future
|
||||
bool found_GL_ARB_multi_bind = false;
|
||||
bool found_GL_ARB_vertex_attrib_binding = false;
|
||||
|
||||
// In case sparse2 isn't supported
|
||||
bool found_compatible_GL_ARB_sparse_texture2 = false;
|
||||
bool found_compatible_sparse_depth = false;
|
||||
|
||||
// Not yet used
|
||||
#ifdef GL_EXT_TEX_SUB_IMAGE
|
||||
bool found_GL_ARB_get_texture_sub_image = false;
|
||||
#endif
|
||||
|
||||
static void mandatory(const std::string& ext)
|
||||
{
|
||||
if (!GLExtension::Has(ext))
|
||||
@@ -310,20 +307,17 @@ namespace GLLoader
|
||||
found_GL_ARB_gpu_shader5 = optional("GL_ARB_gpu_shader5");
|
||||
// GL4.2
|
||||
found_GL_ARB_shader_image_load_store = optional("GL_ARB_shader_image_load_store");
|
||||
// GL4.3
|
||||
found_GL_ARB_compute_shader = optional("GL_ARB_compute_shader");
|
||||
found_GL_ARB_shader_storage_buffer_object = optional("GL_ARB_shader_storage_buffer_object");
|
||||
found_GL_ARB_texture_view = optional("GL_ARB_texture_view");
|
||||
found_GL_ARB_vertex_attrib_binding = optional("GL_ARB_vertex_attrib_binding");
|
||||
// GL4.4
|
||||
found_GL_ARB_clear_texture = optional("GL_ARB_clear_texture");
|
||||
found_GL_ARB_multi_bind = optional("GL_ARB_multi_bind");
|
||||
// GL4.5
|
||||
optional("GL_ARB_direct_state_access");
|
||||
// Mandatory for the advanced HW renderer effects. Unfortunately Mesa LLVMPIPE/SWR renderers don't support this extension.
|
||||
// Rendering might be corrupted but it could be good enough for test/virtual machine.
|
||||
optional("GL_ARB_texture_barrier");
|
||||
// Not yet used
|
||||
#ifdef GL_EXT_TEX_SUB_IMAGE
|
||||
found_GL_ARB_get_texture_sub_image = optional("GL_ARB_get_texture_sub_image");
|
||||
#endif
|
||||
}
|
||||
|
||||
if (vendor_id_amd)
|
||||
|
||||
@@ -485,8 +485,6 @@ private:
|
||||
std::unique_ptr<GL::StreamBuffer> m_vertex_stream_buffer;
|
||||
std::unique_ptr<GL::StreamBuffer> m_index_stream_buffer;
|
||||
GLuint m_vertex_array_object = 0;
|
||||
u32 m_vertex_buffer_base_vertex = 0;
|
||||
u32 m_index_buffer_offset = 0;
|
||||
GLenum m_draw_topology = 0;
|
||||
|
||||
std::unique_ptr<GL::StreamBuffer> m_vertex_uniform_stream_buffer;
|
||||
|
||||
@@ -468,7 +468,7 @@ bool GSTextureOGL::Map(GSMap& m, const GSVector4i* _r, int layer)
|
||||
// The fastest way will be to use a PBO to read the data asynchronously. Unfortunately GS
|
||||
// architecture is waiting for the data right now.
|
||||
|
||||
#if 0
|
||||
#ifdef GL_EXT_TEX_SUB_IMAGE
|
||||
// Maybe it is as good as the code below. I don't know
|
||||
// With openGL 4.5 you can use glGetTextureSubImage
|
||||
|
||||
|
||||
3508
pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
Normal file
3508
pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
Normal file
File diff suppressed because it is too large
Load Diff
189
pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h
Normal file
189
pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h
Normal file
@@ -0,0 +1,189 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GSScanlineEnvironment.h"
|
||||
#include "GSNewCodeGenerator.h"
|
||||
|
||||
#undef _t // Conflict with wx, hopefully no one needs this
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
#define DRAW_SCANLINE_VECTOR_REGISTER Xbyak::Ymm
|
||||
#define DRAW_SCANLINE_USING_XMM 0
|
||||
#define DRAW_SCANLINE_USING_YMM 1
|
||||
#else
|
||||
#define DRAW_SCANLINE_VECTOR_REGISTER Xbyak::Xmm
|
||||
#define DRAW_SCANLINE_USING_XMM 1
|
||||
#define DRAW_SCANLINE_USING_YMM 0
|
||||
#endif
|
||||
|
||||
class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
|
||||
{
|
||||
using _parent = GSNewCodeGenerator;
|
||||
using XYm = DRAW_SCANLINE_VECTOR_REGISTER;
|
||||
|
||||
/// On x86-64 we reserve a bunch of GPRs for holding addresses of locals that would otherwise be hard to reach
|
||||
/// On x86-32 the same values are just raw 32-bit addresses
|
||||
using LocalAddr = Choose3264<size_t, AddressReg>::type;
|
||||
|
||||
constexpr static bool isXmm = std::is_same<XYm, Xbyak::Xmm>::value;
|
||||
constexpr static bool isYmm = std::is_same<XYm, Xbyak::Ymm>::value;
|
||||
constexpr static int wordsize = is64 ? 8 : 4;
|
||||
constexpr static int vecsize = isXmm ? 16 : 32;
|
||||
constexpr static int vecsizelog = isXmm ? 4 : 5;
|
||||
constexpr static int vecints = vecsize / 4;
|
||||
|
||||
|
||||
// MARK: - Constants
|
||||
|
||||
constexpr static int _32_args = 16;
|
||||
constexpr static int _invalid = 0xaaaaaaaa;
|
||||
#ifdef _WIN32
|
||||
constexpr static int _64_top = 8 * 0;
|
||||
// XMM registers will be saved to `rsp + _64_win_xmm_start + id - 6`
|
||||
// Which will put xmm6 after the temporaries, then xmm7, etc
|
||||
constexpr static int _64_win_xmm_start = 8 * 2;
|
||||
// Windows has no redzone and also has 10 xmm registers to save
|
||||
constexpr static int _64_win_stack_size = _64_win_xmm_start + 16 * 10;
|
||||
#else
|
||||
// System-V has a redzone so stick everything there
|
||||
constexpr static int _64_rz_rbx = -8 * 1;
|
||||
constexpr static int _64_rz_r12 = -8 * 2;
|
||||
constexpr static int _64_rz_r13 = -8 * 3;
|
||||
constexpr static int _64_rz_r14 = -8 * 4;
|
||||
constexpr static int _64_rz_r15 = -8 * 5;
|
||||
constexpr static int _64_top = -8 * 6;
|
||||
#endif
|
||||
constexpr static int _top = is64 ? _64_top : _32_args + 4;
|
||||
constexpr static int _v = is64 ? _invalid : _32_args + 8;
|
||||
|
||||
GSScanlineSelector m_sel;
|
||||
GSScanlineLocalData& m_local;
|
||||
bool m_rip;
|
||||
bool use_lod;
|
||||
|
||||
const XYm xym0{0}, xym1{1}, xym2{2}, xym3{3}, xym4{4}, xym5{5}, xym6{6}, xym7{7}, xym8{8}, xym9{9}, xym10{10}, xym11{11}, xym12{12}, xym13{13}, xym14{14}, xym15{15};
|
||||
/// Note: a2 and t3 are only available on x86-64
|
||||
/// Outside of Init, usable registers are a0, t0, t1, t2, t3[x64], rax, rbx, rdx, r10+
|
||||
const AddressReg a0, a1, a2, a3, t0, t1, t2, t3;
|
||||
const LocalAddr _g_const, _m_local, _m_local__gd, _m_local__gd__vm;
|
||||
/// Available on both x86 and x64, not always valid
|
||||
const XYm _rb, _ga, _fm, _zm, _fd, _test;
|
||||
/// Always valid if needed, x64 only
|
||||
const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga;
|
||||
|
||||
/// Returns the first arg on 32-bit, second on 64-bit
|
||||
static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)
|
||||
{
|
||||
return choose3264((size_t)addr32, reg64);
|
||||
}
|
||||
|
||||
public:
|
||||
GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key);
|
||||
void Generate();
|
||||
|
||||
private:
|
||||
/// Loads the given address into the given register if needed, and returns something that can be used in a `ptr[]`
|
||||
LocalAddr loadAddress(AddressReg reg, const void* addr);
|
||||
/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
|
||||
void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast 128 bits of integers from memory to the whole register, whatever size that register might be
|
||||
void broadcasti128(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a floating-point variable stored in GSScanlineLocalData to the whole register
|
||||
/// On YMM registers this will be a broadcast from a 32-bit value
|
||||
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
|
||||
void broadcastssLocal(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a qword variable stored in GSScanlineLocalData to the whole register
|
||||
/// On YMM registers this will be a broadcast from a 64-bit value
|
||||
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
|
||||
void pbroadcastqLocal(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a dword variable stored in GSScanlineLocalData to the whole register
|
||||
/// On YMM registers this will be a broadcast from a 32-bit value
|
||||
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
|
||||
void pbroadcastdLocal(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a word variable stored in GSScanlineLocalData to the whole register
|
||||
/// On YMM registers this will be a broadcast from a 16-bit value
|
||||
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
|
||||
void pbroadcastwLocal(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a 32-bit GPR to a vector register
|
||||
void broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr);
|
||||
void modulate16(const XYm& a, const Xbyak::Operand& f, uint8 shift);
|
||||
void lerp16(const XYm& a, const XYm& b, const XYm& f, uint8 shift);
|
||||
void lerp16_4(const XYm& a, const XYm& b, const XYm& f);
|
||||
void mix16(const XYm& a, const XYm& b, const XYm& temp);
|
||||
void clamp16(const XYm& a, const XYm& temp);
|
||||
void alltrue(const XYm& test);
|
||||
void blend(const XYm& a, const XYm& b, const XYm& mask);
|
||||
void blendr(const XYm& b, const XYm& a, const XYm& mask);
|
||||
void blend8(const XYm& a, const XYm& b);
|
||||
void blend8r(const XYm& b, const XYm& a);
|
||||
void split16_2x8(const XYm& l, const XYm& h, const XYm& src);
|
||||
|
||||
void Init();
|
||||
void Step();
|
||||
void TestZ(const XYm& temp1, const XYm& temp2);
|
||||
void SampleTexture();
|
||||
void SampleTexture_TexelReadHelper(int mip_offset);
|
||||
void Wrap(const XYm& uv);
|
||||
void Wrap(const XYm& uv0, const XYm& uv1);
|
||||
void SampleTextureLOD();
|
||||
void WrapLOD(const XYm& uv);
|
||||
void WrapLOD(const XYm& uv0, const XYm& uv1);
|
||||
void AlphaTFX();
|
||||
void ReadMask();
|
||||
void TestAlpha();
|
||||
void ColorTFX();
|
||||
void Fog();
|
||||
void ReadFrame();
|
||||
void TestDestAlpha();
|
||||
void WriteMask();
|
||||
void WriteZBuf();
|
||||
void AlphaBlend();
|
||||
void WriteFrame();
|
||||
void ReadPixel(const XYm& dst, const XYm& tmp, const AddressReg& addr);
|
||||
#if DRAW_SCANLINE_USING_XMM
|
||||
void WritePixel(const XYm& src_, const AddressReg& addr, const Xbyak::Reg8& mask, bool fast, int psm, int fz);
|
||||
#else
|
||||
void WritePixel(const XYm& src_, const AddressReg& addr, const Xbyak::Reg32& mask, bool fast, int psm, int fz);
|
||||
#endif
|
||||
void WritePixel(const Xmm& src, const AddressReg& addr, uint8 i, uint8 j, int psm);
|
||||
void ReadTexel1(const XYm& dst, const XYm& src, const XYm& tmp1, const XYm& tmp2, int mip_offset);
|
||||
void ReadTexel4(
|
||||
const XYm& d0, const XYm& d1,
|
||||
const XYm& d2s0, const XYm& d3s1,
|
||||
const XYm& s2, const XYm& s3,
|
||||
const XYm& tmp1, const XYm& tmp2,
|
||||
int mip_offset);
|
||||
void ReadTexelImpl(
|
||||
const XYm& d0, const XYm& d1,
|
||||
const XYm& d2s0, const XYm& d3s1,
|
||||
const XYm& s2, const XYm& s3,
|
||||
const XYm& tmp1, const XYm& tmp2,
|
||||
int pixels, int mip_offset);
|
||||
void ReadTexelImplLoadTexLOD(int lod, int mip_offset);
|
||||
void ReadTexelImplYmm(
|
||||
const Ymm& d0, const Ymm& d1,
|
||||
const Ymm& d2s0, const Ymm& d3s1,
|
||||
const Ymm& s2, const Ymm& s3,
|
||||
const Ymm& tmp,
|
||||
int pixels, int mip_offset);
|
||||
void ReadTexelImplSSE4(
|
||||
const Xmm& d0, const Xmm& d1,
|
||||
const Xmm& d2s0, const Xmm& d3s1,
|
||||
const Xmm& s2, const Xmm& s3,
|
||||
int pixels, int mip_offset);
|
||||
void ReadTexelImpl(const Xmm& dst, const Xmm& addr, uint8 i, bool texInA3, bool preserveDst);
|
||||
};
|
||||
@@ -15,17 +15,8 @@
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
#include "GSDrawScanlineCodeGenerator.all.h"
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
#else
|
||||
void GSDrawScanlineCodeGenerator::Generate()
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
Generate_AVX();
|
||||
else
|
||||
Generate_SSE();
|
||||
}
|
||||
#endif
|
||||
|
||||
GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
@@ -37,227 +28,5 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key
|
||||
if (m_sel.breakpoint)
|
||||
db(0xCC);
|
||||
|
||||
try
|
||||
{
|
||||
Generate();
|
||||
}
|
||||
catch (std::exception& e)
|
||||
{
|
||||
fprintf(stderr, "ERR:GSDrawScanlineCodeGenerator %s\n", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, uint8 shift)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
if (shift == 0)
|
||||
{
|
||||
vpmulhrsw(a, f);
|
||||
}
|
||||
else
|
||||
{
|
||||
vpsllw(a, shift + 1);
|
||||
vpmulhw(a, f);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (shift == 0 && m_cpu.has(Xbyak::util::Cpu::tSSSE3))
|
||||
{
|
||||
pmulhrsw(a, f);
|
||||
}
|
||||
else
|
||||
{
|
||||
psllw(a, shift + 1);
|
||||
pmulhw(a, f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, uint8 shift)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpsubw(a, b);
|
||||
modulate16(a, f, shift);
|
||||
vpaddw(a, b);
|
||||
}
|
||||
else
|
||||
{
|
||||
psubw(a, b);
|
||||
modulate16(a, f, shift);
|
||||
paddw(a, b);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpsubw(a, b);
|
||||
vpmullw(a, f);
|
||||
vpsraw(a, 4);
|
||||
vpaddw(a, b);
|
||||
}
|
||||
else
|
||||
{
|
||||
psubw(a, b);
|
||||
pmullw(a, f);
|
||||
psraw(a, 4);
|
||||
paddw(a, b);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpblendw(a, b, 0xaa);
|
||||
}
|
||||
else
|
||||
{
|
||||
pblendw(a, b, 0xaa);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpackuswb(a, a);
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
// Greg: why ?
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX2))
|
||||
{
|
||||
ASSERT(a.isYMM());
|
||||
vpermq(Ymm(a.getIdx()), Ymm(a.getIdx()), _MM_SHUFFLE(3, 1, 2, 0)); // this sucks
|
||||
}
|
||||
#endif
|
||||
|
||||
vpmovzxbw(a, a);
|
||||
}
|
||||
else
|
||||
{
|
||||
packuswb(a, a);
|
||||
pmovzxbw(a, a);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::alltrue(const Xmm& test)
|
||||
{
|
||||
uint32 mask = test.isYMM() ? 0xffffffff : 0xffff;
|
||||
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpmovmskb(eax, test);
|
||||
cmp(eax, mask);
|
||||
je("step", T_NEAR);
|
||||
}
|
||||
else
|
||||
{
|
||||
pmovmskb(eax, test);
|
||||
cmp(eax, mask);
|
||||
je("step", T_NEAR);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpand(b, mask);
|
||||
vpandn(mask, a);
|
||||
vpor(a, b, mask);
|
||||
}
|
||||
else
|
||||
{
|
||||
pand(b, mask);
|
||||
pandn(mask, a);
|
||||
por(b, mask);
|
||||
movdqa(a, b);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpand(b, mask);
|
||||
vpandn(mask, a);
|
||||
vpor(b, mask);
|
||||
}
|
||||
else
|
||||
{
|
||||
pand(b, mask);
|
||||
pandn(mask, a);
|
||||
por(b, mask);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
vpblendvb(a, a, b, xmm0);
|
||||
else
|
||||
pblendvb(a, b);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpblendvb(b, a, b, xmm0);
|
||||
}
|
||||
else
|
||||
{
|
||||
pblendvb(a, b);
|
||||
movdqa(b, a);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src)
|
||||
{
|
||||
// l = src & 0xFF; (1 left shift + 1 right shift)
|
||||
// h = (src >> 8) & 0xFF; (1 right shift)
|
||||
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
if (src == h)
|
||||
{
|
||||
vpsllw(l, src, 8);
|
||||
vpsrlw(h, 8);
|
||||
}
|
||||
else if (src == l)
|
||||
{
|
||||
vpsrlw(h, src, 8);
|
||||
vpsllw(l, 8);
|
||||
}
|
||||
else
|
||||
{
|
||||
vpsllw(l, src, 8);
|
||||
vpsrlw(h, src, 8);
|
||||
}
|
||||
vpsrlw(l, 8);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (src == h)
|
||||
{
|
||||
movdqa(l, src);
|
||||
}
|
||||
else if (src == l)
|
||||
{
|
||||
movdqa(h, src);
|
||||
}
|
||||
else
|
||||
{
|
||||
movdqa(l, src);
|
||||
movdqa(h, src);
|
||||
}
|
||||
psllw(l, 8);
|
||||
psrlw(l, 8);
|
||||
psrlw(h, 8);
|
||||
}
|
||||
GSDrawScanlineCodeGenerator2(this, CPUInfo(m_cpu), (void*)&m_local, m_sel.key).Generate();
|
||||
}
|
||||
|
||||
@@ -27,117 +27,12 @@
|
||||
|
||||
class GSDrawScanlineCodeGenerator : public GSCodeGenerator
|
||||
{
|
||||
typedef Xbyak::Ymm Ymm;
|
||||
typedef Xbyak::Xmm Xmm;
|
||||
typedef Xbyak::Reg8 Reg8;
|
||||
typedef Xbyak::Operand Operand;
|
||||
|
||||
void operator=(const GSDrawScanlineCodeGenerator&);
|
||||
|
||||
GSScanlineSelector m_sel;
|
||||
GSScanlineLocalData& m_local;
|
||||
bool m_rip;
|
||||
|
||||
void Generate();
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
void Init();
|
||||
void Step();
|
||||
void TestZ(const Ymm& temp1, const Ymm& temp2);
|
||||
void SampleTexture();
|
||||
void Wrap(const Ymm& uv0);
|
||||
void Wrap(const Ymm& uv0, const Ymm& uv1);
|
||||
void SampleTextureLOD();
|
||||
void WrapLOD(const Ymm& uv0);
|
||||
void WrapLOD(const Ymm& uv0, const Ymm& uv1);
|
||||
void AlphaTFX();
|
||||
void ReadMask();
|
||||
void TestAlpha();
|
||||
void ColorTFX();
|
||||
void Fog();
|
||||
void ReadFrame();
|
||||
void TestDestAlpha();
|
||||
void WriteMask();
|
||||
void WriteZBuf();
|
||||
void AlphaBlend();
|
||||
void WriteFrame();
|
||||
void ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr);
|
||||
void WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const Xbyak::Reg32& mask, bool fast, int psm, int fz);
|
||||
void WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm);
|
||||
void ReadTexel(int pixels, int mip_offset = 0);
|
||||
void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i);
|
||||
|
||||
#else
|
||||
|
||||
void Generate_SSE();
|
||||
void Init_SSE();
|
||||
void Step_SSE();
|
||||
void TestZ_SSE(const Xmm& temp1, const Xmm& temp2);
|
||||
void SampleTexture_SSE();
|
||||
void Wrap_SSE(const Xmm& uv0);
|
||||
void Wrap_SSE(const Xmm& uv0, const Xmm& uv1);
|
||||
void SampleTextureLOD_SSE();
|
||||
void WrapLOD_SSE(const Xmm& uv0);
|
||||
void WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1);
|
||||
void AlphaTFX_SSE();
|
||||
void ReadMask_SSE();
|
||||
void TestAlpha_SSE();
|
||||
void ColorTFX_SSE();
|
||||
void Fog_SSE();
|
||||
void ReadFrame_SSE();
|
||||
void TestDestAlpha_SSE();
|
||||
void WriteMask_SSE();
|
||||
void WriteZBuf_SSE();
|
||||
void AlphaBlend_SSE();
|
||||
void WriteFrame_SSE();
|
||||
void ReadPixel_SSE(const Xmm& dst, const RegLong& addr);
|
||||
void WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
|
||||
void WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm);
|
||||
void ReadTexel_SSE(int pixels, int mip_offset = 0);
|
||||
void ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i);
|
||||
|
||||
void Generate_AVX();
|
||||
void Init_AVX();
|
||||
void Step_AVX();
|
||||
void TestZ_AVX(const Xmm& temp1, const Xmm& temp2);
|
||||
void SampleTexture_AVX();
|
||||
void Wrap_AVX(const Xmm& uv0);
|
||||
void Wrap_AVX(const Xmm& uv0, const Xmm& uv1);
|
||||
void SampleTextureLOD_AVX();
|
||||
void WrapLOD_AVX(const Xmm& uv0);
|
||||
void WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1);
|
||||
void AlphaTFX_AVX();
|
||||
void ReadMask_AVX();
|
||||
void TestAlpha_AVX();
|
||||
void ColorTFX_AVX();
|
||||
void Fog_AVX();
|
||||
void ReadFrame_AVX();
|
||||
void TestDestAlpha_AVX();
|
||||
void WriteMask_AVX();
|
||||
void WriteZBuf_AVX();
|
||||
void AlphaBlend_AVX();
|
||||
void WriteFrame_AVX();
|
||||
void ReadPixel_AVX(const Xmm& dst, const RegLong& addr);
|
||||
void WritePixel_AVX(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
|
||||
void WritePixel_AVX(const Xmm& src, const RegLong& addr, uint8 i, int psm);
|
||||
void ReadTexel_AVX(int pixels, int mip_offset = 0);
|
||||
void ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i);
|
||||
|
||||
#endif
|
||||
|
||||
void modulate16(const Xmm& a, const Operand& f, uint8 shift);
|
||||
void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, uint8 shift);
|
||||
void lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f);
|
||||
void mix16(const Xmm& a, const Xmm& b, const Xmm& temp);
|
||||
void clamp16(const Xmm& a, const Xmm& temp);
|
||||
void alltrue(const Xmm& test);
|
||||
void blend(const Xmm& a, const Xmm& b, const Xmm& mask);
|
||||
void blendr(const Xmm& b, const Xmm& a, const Xmm& mask);
|
||||
void blend8(const Xmm& a, const Xmm& b);
|
||||
void blend8r(const Xmm& b, const Xmm& a);
|
||||
void split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src);
|
||||
|
||||
public:
|
||||
GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
|
||||
};
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,118 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
// It is useless to port the code to SSEx; it is better to use the faster 32-bit version instead
|
||||
void GSDrawScanlineCodeGenerator::Generate_SSE()
|
||||
{
|
||||
// Avoid a crash if someone wants to use it
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Init_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Step_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::SampleTexture_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaTFX_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadMask_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestAlpha_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ColorTFX_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Fog_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadFrame_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteMask_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaBlend_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteFrame_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const RegLong& addr)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz)
|
||||
{
|
||||
}
|
||||
|
||||
//static const int s_offsets[4] = {0, 2, 8, 10};
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i)
|
||||
{
|
||||
}
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -13,26 +13,5 @@
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
#ifdef _M_AMD64
|
||||
// Yeah, let's use MIPS naming ;)
|
||||
#ifdef _WIN64
|
||||
#define a0 rcx
|
||||
#define a1 rdx
|
||||
#define a2 r8
|
||||
#define a3 r9
|
||||
#define t0 rdi
|
||||
#define t1 rsi
|
||||
#else
|
||||
#define a0 rdi
|
||||
#define a1 rsi
|
||||
#define a2 rdx
|
||||
#define a3 rcx
|
||||
#define t0 r8
|
||||
#define t1 r9
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSNewCodeGenerator.h"
|
||||
489
pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
Normal file
489
pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
Normal file
@@ -0,0 +1,489 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GS/GS_types.h"
|
||||
#include "xbyak/xbyak.h"
|
||||
#include "xbyak/xbyak_util.h"
|
||||
|
||||
/// Vector instruction-set levels the code generator can target.
/// The numeric values increase with capability (SSE4.1 < AVX < AVX2), so levels can be compared with `>=`.
namespace SSEVersion
|
||||
{
|
||||
enum SSEVersion
|
||||
{
|
||||
AVX2 = 0x501,
|
||||
AVX = 0x500,
|
||||
SSE41 = 0x401,
|
||||
};
|
||||
}
|
||||
|
||||
/// Similar to Xbyak::util::cpu but more open to us putting in extra flags (e.g. "vpgatherdd is fast"), as well as making it easier to test other configurations by artificially limiting features
|
||||
struct CPUInfo
|
||||
{
|
||||
// True when the FMA feature flag was reported by the CPU this was built from
bool hasFMA = false;
|
||||
// Highest supported vector level; a default-constructed CPUInfo reports SSE4.1 without FMA
SSEVersion::SSEVersion sseVersion = SSEVersion::SSE41;
|
||||
|
||||
CPUInfo() = default;
|
||||
/// Builds the feature summary from Xbyak's CPU feature query
CPUInfo(const Xbyak::util::Cpu& cpu)
|
||||
{
|
||||
// Start at the SSE4.1 baseline and upgrade; the AVX2 check runs last so it overrides AVX when both are reported
auto version = SSEVersion::SSE41;
|
||||
if (cpu.has(cpu.tAVX))
|
||||
version = SSEVersion::AVX;
|
||||
if (cpu.has(cpu.tAVX2))
|
||||
version = SSEVersion::AVX2;
|
||||
|
||||
hasFMA = cpu.has(cpu.tFMA);
|
||||
sseVersion = version;
|
||||
}
|
||||
};
|
||||
|
||||
/// Code generator that automatically selects between SSE and AVX, x86 and x64 so you don't have to
|
||||
/// Should make combined SSE and AVX codegen much easier
|
||||
class GSNewCodeGenerator
|
||||
{
|
||||
public:
|
||||
using Address = Xbyak::Address;
|
||||
using Label = Xbyak::Label;
|
||||
using Operand = Xbyak::Operand;
|
||||
using Reg32e = Xbyak::Reg32e;
|
||||
using Reg32 = Xbyak::Reg32;
|
||||
using Reg16 = Xbyak::Reg16;
|
||||
using Reg8 = Xbyak::Reg8;
|
||||
using Reg = Xbyak::Reg;
|
||||
using Xmm = Xbyak::Xmm;
|
||||
using Ymm = Xbyak::Ymm;
|
||||
using Zmm = Xbyak::Zmm;
|
||||
|
||||
/// Exception thrown by the forwarding wrappers when an instruction or register is not valid for the current target
class Error : public std::exception
|
||||
{
|
||||
public:
|
||||
// NOTE: the order of these values must match the order of the messages in `tbl` inside what()
enum Value
|
||||
{
|
||||
ERR_64_BIT_REG_IN_32,
|
||||
ERR_64_INSTR_IN_32,
|
||||
ERR_SSE_INSTR_IN_AVX,
|
||||
ERR_AVX_INSTR_IN_SSE,
|
||||
};
|
||||
|
||||
// The specific failure this exception reports
Value value;
|
||||
|
||||
Error(Value value) : value(value) {}
|
||||
|
||||
/// Returns a static, human-readable description of `value`
const char* what() const noexcept
|
||||
{
|
||||
// Indexed directly by `value`, so entries are listed in the same order as the Value enum
static const char* tbl[] = {
|
||||
"used 64-bit register in 32-bit code",
|
||||
"used 64-bit only instruction in 32-bit code",
|
||||
"used SSE instruction in AVX code",
|
||||
"used AVX instruction in SSE code",
|
||||
};
|
||||
// Bounds check so an out-of-range value yields a generic message instead of reading past the table
if (static_cast<uint32>(value) < (sizeof(tbl) / sizeof(*tbl)))
|
||||
{
|
||||
return tbl[value];
|
||||
}
|
||||
else
|
||||
{
|
||||
return "GSNewCodeGenerator Unknown Error";
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
/// Make sure the operand is okay to use on the current target
/// Throws Error::ERR_64_BIT_REG_IN_32 if a 32-bit build is given a register that is reported as 64-bit only
/// Memory operands are checked recursively through their index and base registers
void validateRegister(const Operand& op)
|
||||
{
|
||||
// Nothing to check on 64-bit builds
if (is64)
|
||||
return;
|
||||
// NOTE(review): presumably isExtIdx/isExt8bit identify registers that need 64-bit mode (r8+, xmm8+, spl/sil/...) -- confirm against Xbyak
if (op.isREG() && (op.isExtIdx() || op.isExt8bit()))
|
||||
throw Error(Error::ERR_64_BIT_REG_IN_32);
|
||||
if (op.isMEM())
|
||||
{
|
||||
// Validate the registers used inside the address expression as well
auto e = static_cast<const Address&>(op).getRegExp();
|
||||
validateRegister(e.getIndex());
|
||||
validateRegister(e.getBase());
|
||||
}
|
||||
}
|
||||
/// For easier macro-ing
|
||||
void validateRegister(int imm)
|
||||
{
|
||||
}
|
||||
|
||||
void require64()
|
||||
{
|
||||
if (!is64)
|
||||
throw Error(Error::ERR_64_INSTR_IN_32);
|
||||
}
|
||||
void requireAVX()
|
||||
{
|
||||
if (!hasAVX)
|
||||
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
|
||||
}
|
||||
|
||||
public:
|
||||
Xbyak::CodeGenerator& actual;
|
||||
|
||||
#if defined(_M_X86_64)
|
||||
constexpr static bool is32 = false;
|
||||
constexpr static bool is64 = true;
|
||||
using AddressReg = Xbyak::Reg64;
|
||||
using RipType = Xbyak::RegRip;
|
||||
|
||||
template <typename T32, typename T64>
|
||||
struct Choose3264 { using type = T64; };
|
||||
|
||||
template <typename T32, typename T64>
|
||||
static T64 choose3264(T32 t32, T64 t64) { return t64; }
|
||||
#else
|
||||
constexpr static bool is32 = true;
|
||||
constexpr static bool is64 = false;
|
||||
using AddressReg = Xbyak::Reg32;
|
||||
using RipType = int;
|
||||
|
||||
template <typename T32, typename T64>
|
||||
struct Choose3264 { using type = T32; };
|
||||
|
||||
template <typename T32, typename T64>
|
||||
static T32 choose3264(T32 t32, T64 t64) { return t32; }
|
||||
#endif
|
||||
|
||||
const bool hasAVX, hasAVX2, hasFMA;
|
||||
|
||||
const Xmm xmm0{0}, xmm1{1}, xmm2{2}, xmm3{3}, xmm4{4}, xmm5{5}, xmm6{6}, xmm7{7}, xmm8{8}, xmm9{9}, xmm10{10}, xmm11{11}, xmm12{12}, xmm13{13}, xmm14{14}, xmm15{15};
|
||||
const Ymm ymm0{0}, ymm1{1}, ymm2{2}, ymm3{3}, ymm4{4}, ymm5{5}, ymm6{6}, ymm7{7}, ymm8{8}, ymm9{9}, ymm10{10}, ymm11{11}, ymm12{12}, ymm13{13}, ymm14{14}, ymm15{15};
|
||||
const AddressReg rax{0}, rcx{1}, rdx{2}, rbx{3}, rsp{4}, rbp{5}, rsi{6}, rdi{7}, r8{8}, r9{9}, r10{10}, r11{11}, r12{12}, r13{13}, r14{14}, r15{15};
|
||||
const Reg32 eax{0}, ecx{1}, edx{2}, ebx{3}, esp{4}, ebp{5}, esi{6}, edi{7}, r8d{8}, r9d{9}, r10d{10}, r11d{11}, r12d{12}, r13d{13}, r14d{14}, r15d{15};
|
||||
const Reg16 ax{0}, cx{1}, dx{2}, bx{3}, sp{4}, bp{5}, si{6}, di{7};
|
||||
const Reg8 al{0}, cl{1}, dl{2}, bl{3}, ah{4}, ch{5}, dh{6}, bh{7};
|
||||
|
||||
const RipType rip{};
|
||||
const Xbyak::AddressFrame ptr{0}, byte{8}, word{16}, dword{32}, qword{64}, xword{128}, yword{256}, zword{512};
|
||||
|
||||
GSNewCodeGenerator(Xbyak::CodeGenerator* actual, CPUInfo cpu)
|
||||
: actual(*actual)
|
||||
, hasAVX(cpu.sseVersion >= SSEVersion::AVX)
|
||||
, hasAVX2(cpu.sseVersion >= SSEVersion::AVX2)
|
||||
, hasFMA(cpu.hasFMA)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
// ------------ Forwarding instructions ------------
|
||||
// Note: Only instructions used by codegen were added here, so if you're modifying codegen, you may need to add instructions here
|
||||
|
||||
// For instructions available in both SSE and AVX, there are functions with the SSE name and arguments that forward to SSE or AVX depending on the target, as well as functions with the AVX name and arguments that forward to the AVX version or throw an Error on SSE
|
||||
|
||||
// ARGS_* macros are provided for shorter argument lists. The following single-letter abbreviations are used: X=Xmm, Y=Ymm, O=Operand, A=Address, I=Immediate
|
||||
// FORWARD(argcount, category, instrname, argtypes...) forwards an instruction. The following categories are available:
|
||||
// BASE: non-SSE
|
||||
// SSE: available on SSE and v-prefixed on AVX
|
||||
// SSEONLY: available only on SSE (exception on AVX)
|
||||
// AVX: available only on AVX (exception on SSE)
|
||||
// AVX2: available only on AVX2 (exception on AVX/SSE)
|
||||
// FMA: available only with FMA
|
||||
// SFORWARD forwards an SSE-AVX pair where the AVX variant takes the same number of registers (e.g. pshufd dst, src + vpshufd dst, src)
|
||||
// AFORWARD forwards an SSE-AVX pair where the AVX variant takes an extra destination register (e.g. shufps dst, src + vshufps dst, src, src)
|
||||
|
||||
// Implementation details:
|
||||
// ACTUAL_FORWARD_*: Actually forward the function of the given type
|
||||
// FORWARD#: First validates the arguments (e.g. make sure you're not passing registers over 7 on x86), then forwards to an ACTUAL_FORWARD_*
|
||||
|
||||
// Big thanks to https://stackoverflow.com/a/24028231 for helping me figure out how to work around MSVC's terrible macro expander
|
||||
// Of course GCC/Clang don't like the workaround so enjoy the ifdefs
|
||||
#define EXPAND_ARGS(macro, args) macro args
|
||||
|
||||
#define ACTUAL_FORWARD_BASE(name, ...) \
|
||||
actual.name(__VA_ARGS__);
|
||||
|
||||
#define ACTUAL_FORWARD_SSE(name, ...) \
|
||||
if (hasAVX) \
|
||||
actual.v##name(__VA_ARGS__); \
|
||||
else \
|
||||
actual.name(__VA_ARGS__);
|
||||
|
||||
#define ACTUAL_FORWARD_SSEONLY(name, ...) \
|
||||
if (hasAVX) \
|
||||
throw Error(Error::ERR_SSE_INSTR_IN_AVX); \
|
||||
else \
|
||||
actual.name(__VA_ARGS__);
|
||||
|
||||
#define ACTUAL_FORWARD_AVX(name, ...) \
|
||||
if (hasAVX) \
|
||||
actual.name(__VA_ARGS__); \
|
||||
else \
|
||||
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
|
||||
|
||||
#define ACTUAL_FORWARD_AVX2(name, ...) \
|
||||
if (hasAVX2) \
|
||||
actual.name(__VA_ARGS__); \
|
||||
else \
|
||||
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
|
||||
|
||||
#define ACTUAL_FORWARD_FMA(name, ...) \
|
||||
if (hasFMA) \
|
||||
actual.name(__VA_ARGS__); \
|
||||
else \
|
||||
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
|
||||
|
||||
#define FORWARD1(category, name, type) \
|
||||
void name(type a) \
|
||||
{ \
|
||||
validateRegister(a); \
|
||||
ACTUAL_FORWARD_##category(name, a) \
|
||||
}
|
||||
|
||||
#define FORWARD2(category, name, type1, type2) \
|
||||
void name(type1 a, type2 b) \
|
||||
{ \
|
||||
validateRegister(a); \
|
||||
validateRegister(b); \
|
||||
ACTUAL_FORWARD_##category(name, a, b) \
|
||||
}
|
||||
|
||||
#define FORWARD3(category, name, type1, type2, type3) \
|
||||
void name(type1 a, type2 b, type3 c) \
|
||||
{ \
|
||||
validateRegister(a); \
|
||||
validateRegister(b); \
|
||||
validateRegister(c); \
|
||||
ACTUAL_FORWARD_##category(name, a, b, c) \
|
||||
}
|
||||
|
||||
#define FORWARD4(category, name, type1, type2, type3, type4) \
|
||||
void name(type1 a, type2 b, type3 c, type4 d) \
|
||||
{ \
|
||||
validateRegister(a); \
|
||||
validateRegister(b); \
|
||||
validateRegister(c); \
|
||||
validateRegister(d); \
|
||||
ACTUAL_FORWARD_##category(name, a, b, c, d) \
|
||||
}
|
||||
|
||||
#ifdef __GNUC__
|
||||
#define FORWARD_(argcount, ...) FORWARD##argcount(__VA_ARGS__)
|
||||
// Gets the macro evaluator to evaluate in the right order
|
||||
#define FORWARD(...) FORWARD_(__VA_ARGS__)
|
||||
#else
|
||||
#define FORWARD_(argcount, ...) EXPAND_ARGS(FORWARD##argcount, (__VA_ARGS__))
|
||||
// Gets the macro evaluator to evaluate in the right order
|
||||
#define FORWARD(...) EXPAND_ARGS(FORWARD_, (__VA_ARGS__))
|
||||
#endif
|
||||
|
||||
#define FORWARD_SSE_XMM0(name) \
|
||||
void name(const Xmm& a, const Operand& b) \
|
||||
{ \
|
||||
validateRegister(a); \
|
||||
validateRegister(b); \
|
||||
if (hasAVX) \
|
||||
actual.v##name(a, b, Xmm(0)); \
|
||||
else \
|
||||
actual.name(a, b); \
|
||||
} \
|
||||
FORWARD(4, AVX, v##name, const Xmm&, const Xmm&, const Operand&, const Xmm&)
|
||||
|
||||
#define FORWARD_JUMP(name) \
|
||||
void name(const void *addr) { actual.name(addr); } \
|
||||
void name(const Label& label, Xbyak::CodeGenerator::LabelType type = Xbyak::CodeGenerator::T_AUTO) { actual.name(label, type); } \
|
||||
void name(const char *label, Xbyak::CodeGenerator::LabelType type = Xbyak::CodeGenerator::T_AUTO) { actual.name(label, type); }
|
||||
|
||||
#define ADD_ONE_2 3
|
||||
#define ADD_ONE_3 4
|
||||
|
||||
#ifdef __GNUC__
|
||||
#define SFORWARD(argcount, name, ...) FORWARD(argcount, SSE, name, __VA_ARGS__)
|
||||
#define AFORWARD_(argcount, name, arg1, ...) \
|
||||
SFORWARD(argcount, name, arg1, __VA_ARGS__) \
|
||||
FORWARD(ADD_ONE_##argcount, AVX, v##name, arg1, arg1, __VA_ARGS__)
|
||||
// Gets the macro evaluator to evaluate in the right order
|
||||
#define AFORWARD(...) EXPAND_ARGS(AFORWARD_, (__VA_ARGS__))
|
||||
#else
|
||||
#define SFORWARD(argcount, name, ...) EXPAND_ARGS(FORWARD, (argcount, SSE, name, __VA_ARGS__))
|
||||
#define AFORWARD_(argcount, name, arg1, ...) \
|
||||
EXPAND_ARGS(SFORWARD, (argcount, name, arg1, __VA_ARGS__)) \
|
||||
EXPAND_ARGS(FORWARD, (ADD_ONE_##argcount, AVX, v##name, arg1, arg1, __VA_ARGS__))
|
||||
// Gets the macro evaluator to evaluate in the right order
|
||||
#define AFORWARD(...) EXPAND_ARGS(AFORWARD_, (__VA_ARGS__))
|
||||
#endif
|
||||
|
||||
#define FORWARD_OO_OI(name) \
|
||||
FORWARD(2, BASE, name, ARGS_OO) \
|
||||
FORWARD(2, BASE, name, ARGS_OI)
|
||||
|
||||
#define ARGS_OI const Operand&, uint32
|
||||
#define ARGS_OO const Operand&, const Operand&
|
||||
#define ARGS_XI const Xmm&, int
|
||||
#define ARGS_XO const Xmm&, const Operand&
|
||||
#define ARGS_XOI const Xmm&, const Operand&, uint8
|
||||
#define ARGS_XXO const Xmm&, const Xmm&, const Operand&
|
||||
|
||||
// For instructions that are ifdef'd out without XBYAK64
|
||||
#ifdef XBYAK64
|
||||
#define REQUIRE64(action) require64(); action
|
||||
#else
|
||||
#define REQUIRE64(action) require64()
|
||||
#endif
|
||||
|
||||
const uint8 *getCurr() { return actual.getCurr(); }
|
||||
void align(int x = 16) { return actual.align(x); }
|
||||
void db(int code) { actual.db(code); }
|
||||
void L(const std::string& label) { actual.L(label); }
|
||||
|
||||
void cdqe() { REQUIRE64(actual.cdqe()); }
|
||||
void ret(int imm = 0) { actual.ret(imm); }
|
||||
void vzeroupper() { requireAVX(); actual.vzeroupper(); }
|
||||
/// Calls requireAVX(), then forwards vzeroall() to the wrapped generator.
void vzeroall()
{
	requireAVX();
	actual.vzeroall();
}
|
||||
|
||||
FORWARD_OO_OI(add)
|
||||
FORWARD_OO_OI(and)
|
||||
FORWARD_OO_OI(cmp)
|
||||
FORWARD_OO_OI(or)
|
||||
FORWARD_OO_OI(sub)
|
||||
FORWARD_OO_OI(xor)
|
||||
FORWARD(2, BASE, lea, const Reg&, const Address&)
|
||||
FORWARD(2, BASE, mov, const Operand&, size_t)
|
||||
FORWARD(2, BASE, mov, ARGS_OO)
|
||||
FORWARD(2, BASE, movzx, const Reg&, const Operand&)
|
||||
FORWARD(1, BASE, not, const Operand&)
|
||||
FORWARD(1, BASE, pop, const Operand&)
|
||||
FORWARD(1, BASE, push, const Operand&)
|
||||
FORWARD(2, BASE, sar, const Operand&, const Reg8&)
|
||||
FORWARD(2, BASE, sar, ARGS_OI)
|
||||
FORWARD(2, BASE, shl, const Operand&, const Reg8&)
|
||||
FORWARD(2, BASE, shl, ARGS_OI)
|
||||
FORWARD(2, BASE, shr, const Operand&, const Reg8&)
|
||||
FORWARD(2, BASE, shr, ARGS_OI)
|
||||
FORWARD(2, BASE, test, const Operand&, const Reg&);
|
||||
FORWARD(2, BASE, test, ARGS_OI);
|
||||
|
||||
FORWARD_JUMP(je)
|
||||
FORWARD_JUMP(jle)
|
||||
FORWARD_JUMP(jmp)
|
||||
|
||||
AFORWARD(2, addps, ARGS_XO)
|
||||
SFORWARD(2, cvtdq2ps, ARGS_XO)
|
||||
SFORWARD(2, cvtps2dq, ARGS_XO)
|
||||
SFORWARD(2, cvttps2dq, ARGS_XO)
|
||||
SFORWARD(3, extractps, const Operand&, const Xmm&, uint8)
|
||||
AFORWARD(2, maxps, ARGS_XO)
|
||||
AFORWARD(2, minps, ARGS_XO)
|
||||
SFORWARD(2, movaps, ARGS_XO)
|
||||
SFORWARD(2, movaps, const Address&, const Xmm&)
|
||||
SFORWARD(2, movd, const Address&, const Xmm&)
|
||||
SFORWARD(2, movd, const Reg32&, const Xmm&)
|
||||
SFORWARD(2, movd, const Xmm&, const Address&)
|
||||
SFORWARD(2, movd, const Xmm&, const Reg32&)
|
||||
SFORWARD(2, movdqa, ARGS_XO)
|
||||
SFORWARD(2, movdqa, const Address&, const Xmm&)
|
||||
SFORWARD(2, movhps, ARGS_XO)
|
||||
SFORWARD(2, movhps, const Address&, const Xmm&)
|
||||
SFORWARD(2, movq, const Address&, const Xmm&)
|
||||
SFORWARD(2, movq, const Xmm&, const Address&)
|
||||
AFORWARD(2, mulps, ARGS_XO)
|
||||
AFORWARD(2, orps, ARGS_XO)
|
||||
AFORWARD(2, packssdw, ARGS_XO)
|
||||
AFORWARD(2, packusdw, ARGS_XO)
|
||||
AFORWARD(2, packuswb, ARGS_XO)
|
||||
AFORWARD(2, paddd, ARGS_XO)
|
||||
AFORWARD(2, paddusb, ARGS_XO)
|
||||
AFORWARD(2, paddw, ARGS_XO)
|
||||
AFORWARD(2, pand, ARGS_XO)
|
||||
AFORWARD(2, pandn, ARGS_XO)
|
||||
AFORWARD(3, pblendw, ARGS_XOI)
|
||||
AFORWARD(2, pcmpeqd, ARGS_XO)
|
||||
AFORWARD(2, pcmpeqw, ARGS_XO)
|
||||
AFORWARD(2, pcmpgtd, ARGS_XO)
|
||||
SFORWARD(3, pextrd, const Operand&, const Xmm&, uint8)
|
||||
SFORWARD(3, pextrw, const Operand&, const Xmm&, uint8)
|
||||
AFORWARD(3, pinsrd, ARGS_XOI)
|
||||
AFORWARD(2, pmaxsw, ARGS_XO)
|
||||
AFORWARD(2, pminsd, ARGS_XO)
|
||||
AFORWARD(2, pminsw, ARGS_XO)
|
||||
SFORWARD(2, pmovsxbd, ARGS_XO)
|
||||
SFORWARD(2, pmovmskb, const Reg32e&, const Xmm&)
|
||||
SFORWARD(2, pmovzxbw, ARGS_XO)
|
||||
AFORWARD(2, pmulhrsw, ARGS_XO)
|
||||
AFORWARD(2, pmulhw, ARGS_XO)
|
||||
AFORWARD(2, pmullw, ARGS_XO)
|
||||
AFORWARD(2, por, ARGS_XO)
|
||||
SFORWARD(3, pshufd, ARGS_XOI)
|
||||
SFORWARD(3, pshufhw, ARGS_XOI)
|
||||
SFORWARD(3, pshuflw, ARGS_XOI)
|
||||
AFORWARD(2, pslld, ARGS_XI)
|
||||
AFORWARD(2, psllw, ARGS_XI)
|
||||
AFORWARD(2, psrad, ARGS_XI)
|
||||
AFORWARD(2, psrad, ARGS_XO)
|
||||
AFORWARD(2, psraw, ARGS_XI)
|
||||
AFORWARD(2, psrld, ARGS_XI)
|
||||
AFORWARD(2, psrldq, ARGS_XI)
|
||||
AFORWARD(2, psrlw, ARGS_XI)
|
||||
AFORWARD(2, psrlw, ARGS_XO)
|
||||
AFORWARD(2, psubd, ARGS_XO)
|
||||
AFORWARD(2, psubw, ARGS_XO)
|
||||
AFORWARD(2, punpckhdq, ARGS_XO)
|
||||
AFORWARD(2, punpckhwd, ARGS_XO)
|
||||
AFORWARD(2, punpcklbw, ARGS_XO)
|
||||
AFORWARD(2, punpckldq, ARGS_XO)
|
||||
AFORWARD(2, punpcklqdq,ARGS_XO)
|
||||
AFORWARD(2, punpcklwd, ARGS_XO)
|
||||
AFORWARD(2, pxor, ARGS_XO)
|
||||
SFORWARD(2, rcpps, ARGS_XO)
|
||||
AFORWARD(3, shufps, ARGS_XOI)
|
||||
AFORWARD(2, subps, ARGS_XO)
|
||||
AFORWARD(2, xorps, ARGS_XO)
|
||||
|
||||
FORWARD_SSE_XMM0(pblendvb)
|
||||
|
||||
FORWARD(2, AVX, vbroadcastss, ARGS_XO)
|
||||
FORWARD(2, AVX2, vbroadcasti128, const Ymm&, const Address&)
|
||||
FORWARD(2, AVX, vbroadcastf128, const Ymm&, const Address&)
|
||||
FORWARD(3, FMA, vfmadd213ps, ARGS_XXO)
|
||||
FORWARD(3, AVX2, vextracti128, const Operand&, const Ymm&, uint8)
|
||||
FORWARD(4, AVX2, vinserti128, const Ymm&, const Ymm&, const Operand&, uint8);
|
||||
FORWARD(2, AVX2, vpbroadcastd, ARGS_XO)
|
||||
FORWARD(2, AVX2, vpbroadcastq, ARGS_XO)
|
||||
FORWARD(2, AVX2, vpbroadcastw, ARGS_XO)
|
||||
FORWARD(3, AVX2, vpermq, const Ymm&, const Operand&, uint8)
|
||||
FORWARD(3, AVX2, vpgatherdd, const Xmm&, const Address&, const Xmm&);
|
||||
FORWARD(3, AVX2, vpsravd, ARGS_XXO)
|
||||
FORWARD(3, AVX2, vpsrlvd, ARGS_XXO)
|
||||
|
||||
#undef REQUIRE64
|
||||
#undef ARGS_OI
|
||||
#undef ARGS_OO
|
||||
#undef ARGS_XI
|
||||
#undef ARGS_XO
|
||||
#undef ARGS_XOI
|
||||
#undef ARGS_XXO
|
||||
#undef FORWARD_OO_OI
|
||||
#undef AFORWARD
|
||||
#undef AFORWARD_
|
||||
#undef SFORWARD
|
||||
#undef ADD_ONE_2
|
||||
#undef ADD_ONE_3
|
||||
#undef FORWARD_SSE_XMM0
|
||||
#undef FORWARD_JUMP
|
||||
#undef FORWARD
|
||||
#undef FORWARD_
|
||||
#undef FORWARD4
|
||||
#undef FORWARD3
|
||||
#undef FORWARD2
|
||||
#undef FORWARD1
|
||||
#undef ACTUAL_FORWARD_FMA
|
||||
#undef ACTUAL_FORWARD_AVX2
|
||||
#undef ACTUAL_FORWARD_AVX
|
||||
#undef ACTUAL_FORWARD_SSE
|
||||
#undef ACTUAL_FORWARD_SSEONLY
|
||||
#undef ACTUAL_FORWARD_BASE
|
||||
#undef EXPAND_ARGS
|
||||
};
|
||||
566
pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
Normal file
566
pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
Normal file
@@ -0,0 +1,566 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GS/GS_types.h"
|
||||
#include "GSSetupPrimCodeGenerator.all.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
#define _rip_local(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
|
||||
|
||||
#define _64_m_local _64_t0
|
||||
|
||||
/// Emits `operation` with a separate destination, whichever instruction set is in use.
/// Without AVX: copies src1 into dst with movdqa, then emits the two-operand `operation` on dst.
/// With AVX: emits the v-prefixed form, which takes dst and src1 separately.
/// Note: on the non-AVX path dst is overwritten before the trailing operands are read,
/// so a trailing operand that is the same register as dst will see src1's value.
#define THREEARG(operation, dst, src1, ...) \
	do \
	{ \
		if (!hasAVX) \
		{ \
			movdqa(dst, src1); \
			operation(dst, __VA_ARGS__); \
		} \
		else \
		{ \
			v##operation(dst, src1, __VA_ARGS__); \
		} \
	} while (0)
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
#define _rip_local_d(x) _rip_local(d8.x)
|
||||
#define _rip_local_d_p(x) _rip_local_d(p.x)
|
||||
#else
|
||||
#define _rip_local_d(x) _rip_local(d4.x)
|
||||
#define _rip_local_d_p(x) _rip_local_d(x)
|
||||
#endif
|
||||
|
||||
/// Binds the generator to the GSScanlineLocalData passed in `param`, selects the
/// registers used for arguments and temporaries for the target ABI, and derives
/// the m_en.{z,f,t,c} enable bits from the selector `key`.
GSSetupPrimCodeGenerator2::GSSetupPrimCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key)
	: _parent(base, cpu)
	, m_local(*(GSScanlineLocalData*)param)
	, m_rip(false)
	, many_regs(false)
	// On x86 arg registers are very temporary but on x64 they aren't, so on x86 some registers overlap
#ifdef _WIN32
	, _64_vertex(is64 ? rcx : r8)
	, _index(is64 ? rdx : rcx)
	, _dscan(is64 ? r8 : rdx)
	, _64_t0(r9)
	, t1(is64 ? r10 : rcx)
#else
	, _64_vertex(is64 ? rdi : r8)
	, _index(is64 ? rsi : rcx)
	, _dscan(rdx)
	, _64_t0(is64 ? rcx : r8)
	, t1(is64 ? r8 : rcx)
#endif
	, _m_local(chooseLocal(&m_local, _64_m_local))
{
	m_sel.key = key;

	// Every stage except z additionally requires m_sel.fb.
	const bool fb = m_sel.fb;
	const bool decal_with_tcc = m_sel.tfx == TFX_DECAL && m_sel.tcc;

	m_en.z = m_sel.zb ? 1 : 0;
	m_en.f = (fb && m_sel.fge) ? 1 : 0;
	m_en.t = (fb && m_sel.tfx != TFX_NONE) ? 1 : 0;
	m_en.c = (fb && !decal_with_tcc) ? 1 : 0;
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::broadcastf128(const XYm& reg, const Address& mem)
|
||||
{
|
||||
#if SETUP_PRIM_USING_YMM
|
||||
vbroadcastf128(reg, mem);
|
||||
#else
|
||||
movaps(reg, mem);
|
||||
#endif
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Generate()
|
||||
{
|
||||
// Technically we just need the delta < 2GB
|
||||
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
|
||||
|
||||
bool needs_shift = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip;
|
||||
many_regs = is64 && isYmm && !m_sel.notest && needs_shift;
|
||||
|
||||
#ifdef _WIN64
|
||||
int needs_saving = many_regs ? 6 : m_sel.notest ? 0 : 2;
|
||||
if (needs_saving)
|
||||
{
|
||||
sub(rsp, 8 + 16 * needs_saving);
|
||||
for (int i = 0; i < needs_saving; i++)
|
||||
{
|
||||
movdqa(ptr[rsp + i * 16], Xmm(i + 6));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (is64 && !m_rip)
|
||||
mov(_64_m_local, (size_t)&m_local);
|
||||
|
||||
if (needs_shift)
|
||||
{
|
||||
if (is32)
|
||||
mov(_dscan, ptr[rsp + _32_dscan]);
|
||||
|
||||
if (isXmm)
|
||||
mov(rax, (size_t)g_const->m_shift_128b);
|
||||
else
|
||||
mov(rax, (size_t)g_const->m_shift_256b);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : many_regs ? 9 : 5); i++)
|
||||
{
|
||||
movaps(XYm(3 + i), ptr[rax + i * vecsize]);
|
||||
}
|
||||
}
|
||||
|
||||
if (isXmm)
|
||||
Depth_XMM();
|
||||
else
|
||||
Depth_YMM();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
#ifdef _WIN64
|
||||
if (needs_saving)
|
||||
{
|
||||
for (int i = 0; i < needs_saving; i++)
|
||||
{
|
||||
movdqa(Xmm(i + 6), ptr[rsp + i * 16]);
|
||||
}
|
||||
add(rsp, 8 + 16 * needs_saving);
|
||||
}
|
||||
#endif
|
||||
if (isYmm)
|
||||
vzeroupper();
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Depth_XMM()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
|
||||
movaps(xmm0, ptr[_dscan + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
THREEARG(shufps, xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
THREEARG(mulps, xmm2, xmm1, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(_rip_local_d_p(f), xmm2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
THREEARG(mulps, xmm2, xmm1, XYm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(_rip_local(d[i].f), xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, xmm3);
|
||||
movdqa(_rip_local_d_p(z), xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
|
||||
movdqa(_rip_local(d[i].z), xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
if (is32)
|
||||
mov(_index, ptr[rsp + _32_index]);
|
||||
mov(eax, ptr[_index + sizeof(uint32) * 1]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
if (is64)
|
||||
add(rax, _64_vertex);
|
||||
else
|
||||
add(rax, ptr[rsp + _32_vertex]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(_rip_local(p.f), xmm1);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
|
||||
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
movdqa(_rip_local(p.z), xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Depth_YMM()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
|
||||
|
||||
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.d8.p.z = dp8.extract32<2>();
|
||||
|
||||
extractps(_rip_local_d_p(z), xmm1, 2);
|
||||
|
||||
// GSVector8 dz = GSVector8(dscan.p).zzzz();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
|
||||
|
||||
cvtps2dq(ymm1, ymm1);
|
||||
pextrd(_rip_local_d_p(f), xmm1, 3);
|
||||
|
||||
// GSVector8 df = GSVector8(dscan.p).wwww();
|
||||
|
||||
vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.d[i].z = dz * shift[1 + i];
|
||||
|
||||
// Save a byte in the encoding for ymm8-11 by swapping with ymm2 (multiplication is communative)
|
||||
if (i < 4 || many_regs)
|
||||
vmulps(ymm0, Ymm(4 + i), ymm2);
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
movaps(_rip_local(d[i].z), ymm0);
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
vmulps(ymm0, Ymm(4 + i), ymm1);
|
||||
else
|
||||
vmulps(ymm0, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
cvttps2dq(ymm0, ymm0);
|
||||
pshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(_rip_local(d[i].f), ymm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
if (is32)
|
||||
mov(_index, ptr[rsp + _32_index]);
|
||||
mov(eax, ptr[_index + sizeof(uint32) * 1]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
if (is64)
|
||||
add(rax, _64_vertex);
|
||||
else
|
||||
add(rax, ptr[rsp + _32_vertex]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
|
||||
|
||||
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
pextrd(_rip_local(p.f), xmm0, 3);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
|
||||
|
||||
mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]);
|
||||
mov(_rip_local(p.z), t1.cvt32());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Texture()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, t)]);
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, xmm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
|
||||
movdqa(_rip_local_d(stq), xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(_rip_local_d(stq), xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
THREEARG(shufps, xym1, xym0, xym0, _MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
THREEARG(mulps, xym2, XYm(4 + i), xym1);
|
||||
else
|
||||
vmulps(ymm2, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xym2, xym2);
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movdqa(_rip_local(d[i].s), xym2); break;
|
||||
case 1: movdqa(_rip_local(d[i].t), xym2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movaps(_rip_local(d[i].s), xym2); break;
|
||||
case 1: movaps(_rip_local(d[i].t), xym2); break;
|
||||
case 2: movaps(_rip_local(d[i].q), xym2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Color()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, xmm3);
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm1, xmm1);
|
||||
if (isXmm)
|
||||
movdqa(_rip_local_d(c), xmm1);
|
||||
else
|
||||
movq(_rip_local_d(c), xmm1);
|
||||
|
||||
// xym3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
THREEARG(shufps, xym2, xym0, xym0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
THREEARG(shufps, xym3, xym0, xym0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
THREEARG(mulps, xym0, XYm(4 + i), xym2);
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
cvttps2dq(xym0, xym0);
|
||||
packssdw(xym0, xym0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
THREEARG(mulps, xym1, XYm(4 + i), xym3);
|
||||
else
|
||||
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
cvttps2dq(xym1, xym1);
|
||||
packssdw(xym1, xym1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xym0, xym1);
|
||||
movdqa(_rip_local(d[i].rb), xym0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
THREEARG(shufps, xym2, xym0, xym0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
THREEARG(shufps, xym3, xym0, xym0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
THREEARG(mulps, xym0, XYm(4 + i), xym2);
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
cvttps2dq(xym0, xym0);
|
||||
packssdw(xym0, xym0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
THREEARG(mulps, xym1, XYm(4 + i), xym3);
|
||||
else
|
||||
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
cvttps2dq(xym1, xym1);
|
||||
packssdw(xym1, xym1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xym0, xym1);
|
||||
movdqa(_rip_local(d[i].ga), xym0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
if (is32)
|
||||
mov(_index, ptr[rsp + _32_index]);
|
||||
mov(eax, ptr[_index + sizeof(uint32) * last]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
if (is64)
|
||||
add(rax, _64_vertex);
|
||||
else
|
||||
add(rax, ptr[rsp + _32_vertex]);
|
||||
}
|
||||
|
||||
if (isXmm)
|
||||
{
|
||||
cvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
|
||||
}
|
||||
else
|
||||
{
|
||||
vbroadcasti128(ymm0, ptr[rax + offsetof(GSVertexSW, c)]);
|
||||
cvttps2dq(ymm0, ymm0);
|
||||
}
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xym1, xym0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xym0, xym1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xym0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xym1, xym0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xym2, xym0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(_rip_local(c.rb), xym1);
|
||||
movdqa(_rip_local(c.ga), xym2);
|
||||
}
|
||||
}
|
||||
83
pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
Normal file
83
pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
Normal file
@@ -0,0 +1,83 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GSScanlineEnvironment.h"
|
||||
#include "GSNewCodeGenerator.h"
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
#define SETUP_PRIM_VECTOR_REGISTER Xbyak::Ymm
|
||||
#define SETUP_PRIM_USING_XMM 0
|
||||
#define SETUP_PRIM_USING_YMM 1
|
||||
#else
|
||||
#define SETUP_PRIM_VECTOR_REGISTER Xbyak::Xmm
|
||||
#define SETUP_PRIM_USING_XMM 1
|
||||
#define SETUP_PRIM_USING_YMM 0
|
||||
#endif
|
||||
|
||||
class GSSetupPrimCodeGenerator2 : public GSNewCodeGenerator
|
||||
{
|
||||
using _parent = GSNewCodeGenerator;
|
||||
using XYm = SETUP_PRIM_VECTOR_REGISTER;
|
||||
|
||||
using Xmm = Xbyak::Xmm;
|
||||
using Ymm = Xbyak::Ymm;
|
||||
|
||||
/// On x86-64 we reserve a bunch of GPRs for holding addresses of locals that would otherwise be hard to reach
|
||||
/// On x86-32 the same values are just raw 32-bit addresses
|
||||
using LocalAddr = Choose3264<size_t, AddressReg>::type;
|
||||
|
||||
constexpr static bool isXmm = std::is_same<XYm, Xbyak::Xmm>::value;
|
||||
constexpr static bool isYmm = std::is_same<XYm, Xbyak::Ymm>::value;
|
||||
constexpr static int vecsize = isXmm ? 16 : 32;
|
||||
|
||||
constexpr static int dsize = isXmm ? 4 : 8;
|
||||
|
||||
constexpr static int _32_args = 0;
|
||||
constexpr static int _invalid = 0xaaaaaaaa;
|
||||
constexpr static int _32_vertex = is64 ? _invalid : _32_args + 4;
|
||||
constexpr static int _32_index = is64 ? _invalid : _32_args + 8;
|
||||
constexpr static int _32_dscan = is64 ? _invalid : _32_args + 12;
|
||||
|
||||
GSScanlineSelector m_sel;
|
||||
GSScanlineLocalData& m_local;
|
||||
bool m_rip;
|
||||
bool many_regs;
|
||||
|
||||
struct {uint32 z:1, f:1, t:1, c:1;} m_en;
|
||||
|
||||
const XYm xym0{0}, xym1{1}, xym2{2}, xym3{3}, xym4{4}, xym5{5}, xym6{6}, xym7{7}, xym8{8}, xym9{9}, xym10{10}, xym11{11}, xym12{12}, xym13{13}, xym14{14}, xym15{15};
|
||||
const AddressReg _64_vertex, _index, _dscan, _64_t0, t1;
|
||||
const LocalAddr _m_local;
|
||||
/// Returns the first arg on 32-bit, second on 64-bit
|
||||
static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)
|
||||
{
|
||||
return choose3264((size_t)addr32, reg64);
|
||||
}
|
||||
|
||||
public:
|
||||
GSSetupPrimCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key);
|
||||
void Generate();
|
||||
|
||||
private:
|
||||
/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
|
||||
void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
|
||||
|
||||
void Depth_XMM();
|
||||
void Depth_YMM();
|
||||
void Texture();
|
||||
void Color();
|
||||
};
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSSetupPrimCodeGenerator.all.h"
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
@@ -30,19 +31,5 @@ GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void
|
||||
m_en.t = m_sel.fb && m_sel.tfx != TFX_NONE ? 1 : 0;
|
||||
m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0;
|
||||
|
||||
try
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
Generate_AVX2();
|
||||
#else
|
||||
if (m_cpu.has(util::Cpu::tAVX))
|
||||
Generate_AVX();
|
||||
else
|
||||
Generate_SSE();
|
||||
#endif
|
||||
}
|
||||
catch (std::exception& e)
|
||||
{
|
||||
fprintf(stderr, "ERR:GSSetupPrimCodeGenerator %s\n", e.what());
|
||||
}
|
||||
GSSetupPrimCodeGenerator2(this, CPUInfo(m_cpu), param, key).Generate();
|
||||
}
|
||||
|
||||
@@ -32,23 +32,6 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
|
||||
uint32 z : 1, f : 1, t : 1, c : 1;
|
||||
} m_en;
|
||||
|
||||
#if _M_SSE < 0x501
|
||||
void Generate_SSE();
|
||||
void Depth_SSE();
|
||||
void Texture_SSE();
|
||||
void Color_SSE();
|
||||
|
||||
void Generate_AVX();
|
||||
void Depth_AVX();
|
||||
void Texture_AVX();
|
||||
void Color_AVX();
|
||||
#else
|
||||
void Generate_AVX2();
|
||||
void Depth_AVX2();
|
||||
void Texture_AVX2();
|
||||
void Color_AVX2();
|
||||
#endif
|
||||
|
||||
public:
|
||||
GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
|
||||
};
|
||||
|
||||
@@ -1,365 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offsetof(GSScanlineLocalData, field)])
|
||||
#define _rip_local_v(field, offset) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offset])
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_AVX()
|
||||
{
|
||||
// Technically we just need the delta < 2GB
|
||||
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
|
||||
|
||||
#ifdef _WIN64
|
||||
sub(rsp, 8 + 2 * 16);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
#endif
|
||||
|
||||
if (!m_rip)
|
||||
mov(t0, (size_t)&m_local);
|
||||
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(rax, (size_t)g_const->m_shift_128b);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth_AVX();
|
||||
|
||||
Texture_AVX();
|
||||
|
||||
Color_AVX();
|
||||
|
||||
#ifdef _WIN64
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
add(rsp, 8 + 2 * 16);
|
||||
#endif
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_AVX()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(_rip_local(d4.f), xmm2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].f, variableOffset), xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(_rip_local(d4.z), xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].z, variableOffset), xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(eax, ptr[a1 + sizeof(uint32) * 1]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
add(rax, a0);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
vmovaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(_rip_local(p.f), xmm1);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
|
||||
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
vmovdqa(_rip_local(p.z), xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_AVX()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
|
||||
vmovdqa(_rip_local(d4.stq), xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(_rip_local(d4.stq), xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovdqa(_rip_local_v(d[i].s, variableOffsetS), xmm2); break;
|
||||
case 1: vmovdqa(_rip_local_v(d[i].t, variableOffsetT), xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetQ = offsetof(GSScanlineLocalData, d[0].q) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovaps(_rip_local_v(d[i].s, variableOffsetS), xmm2); break;
|
||||
case 1: vmovaps(_rip_local_v(d[i].t, variableOffsetT), xmm2); break;
|
||||
case 2: vmovaps(_rip_local_v(d[i].q, variableOffsetQ), xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_AVX()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(_rip_local(d4.c), xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].rb, variableOffset), xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].ga, variableOffset), xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(eax, ptr[a1 + sizeof(uint32) * last]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
add(rax, a0);
|
||||
}
|
||||
|
||||
vcvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(_rip_local(c.rb), xmm1);
|
||||
vmovdqa(_rip_local(c.ga), xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,368 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE >= 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offsetof(GSScanlineLocalData, field)])
|
||||
#define _rip_local_v(field, offset) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offset])
|
||||
|
||||
#define _m_shift(i) (Ymm(7 + i))
|
||||
|
||||
// FIXME windows ?
|
||||
#define _vertex rcx
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_AVX2()
|
||||
{
|
||||
// Technically we just need the delta < 2GB
|
||||
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
|
||||
|
||||
#ifdef _WIN64
|
||||
sub(rsp, 8 + 2 * 16);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], ymm6);
|
||||
vmovdqa(ptr[rsp + 16], ymm7);
|
||||
#endif
|
||||
|
||||
if (!m_rip)
|
||||
mov(t0, (size_t)&m_local);
|
||||
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(rax, (size_t)g_const->m_shift_256b);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 9); i++)
|
||||
{
|
||||
vmovaps(_m_shift(i), ptr[rax + i * 32]);
|
||||
}
|
||||
}
|
||||
// ymm7 to ymm 15 = m_shift[i]
|
||||
|
||||
Depth_AVX2();
|
||||
|
||||
Texture_AVX2();
|
||||
|
||||
Color_AVX2();
|
||||
|
||||
#ifdef _WIN64
|
||||
vmovdqa(ymm6, ptr[rsp + 0]);
|
||||
vmovdqa(ymm7, ptr[rsp + 16]);
|
||||
|
||||
add(rsp, 8 + 2 * 16);
|
||||
#endif
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_AVX2()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
const Ymm& dscan_p = ymm6;
|
||||
|
||||
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
|
||||
|
||||
vbroadcastf128(dscan_p, ptr[a2 + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vmulps(ymm1, dscan_p, _m_shift(0));
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.d8.p.z = dp8.extract32<2>();
|
||||
|
||||
vextractps(_rip_local(d8.p.z), xmm1, 2);
|
||||
|
||||
// GSVector8 dz = GSVector8(dscan.p).zzzz();
|
||||
|
||||
vshufps(ymm2, dscan_p, dscan_p, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * shift[1 + i];
|
||||
|
||||
vmulps(ymm0, ymm2, _m_shift(1 + i));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovaps(_rip_local_v(d[i].z, variableOffset), ymm0);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
|
||||
|
||||
// FIXME no truncate ? why ? vcvttps2dq ?
|
||||
//vcvtps2dq(ymm2, ymm1); // let's guess a typo
|
||||
vcvttps2dq(ymm2, ymm1);
|
||||
vpextrd(_rip_local(d8.p.f), xmm2, 3);
|
||||
|
||||
// GSVector8 df = GSVector8(dscan.p).wwww();
|
||||
|
||||
vshufps(ymm3, dscan_p, dscan_p, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(ymm0, ymm3, _m_shift(1 + i));
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
|
||||
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].f, variableOffset), ymm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(_vertex.cvt32(), ptr[a1 + sizeof(uint32) * 1]);
|
||||
shl(_vertex.cvt32(), 6); // * sizeof(GSVertexSW)
|
||||
add(_vertex, a0);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
|
||||
|
||||
vmovaps(xmm0, ptr[_vertex + offsetof(GSVertexSW, p)]);
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpextrd(_rip_local(p.f), xmm0, 3);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
|
||||
|
||||
mov(eax, ptr[ecx + offsetof(GSVertexSW, t.w)]);
|
||||
mov(_rip_local(p.z), eax);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_AVX2()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector8 dt(dscan.t);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[a2 + offsetof(GSVertexSW, t)]);
|
||||
|
||||
// GSVector8 dt8 = dt * shift[0];
|
||||
|
||||
vmulps(ymm1, ymm0, _m_shift(0));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.84.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
|
||||
vmovdqa(_rip_local(d8.stq), xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d8.stq = t * 4.0f;
|
||||
|
||||
vmovaps(_rip_local(d8.stq), xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
|
||||
|
||||
vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8 v = dstq * shift[1 + i];
|
||||
|
||||
vmulps(ymm2, ymm1, _m_shift(1 + i));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector8::cast(GSVector8i(v));
|
||||
|
||||
vcvttps2dq(ymm2, ymm2);
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovdqa(_rip_local_v(d[i].s, variableOffsetS), ymm2); break;
|
||||
case 1: vmovdqa(_rip_local_v(d[i].t, variableOffsetT), ymm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetQ = offsetof(GSScanlineLocalData, d[0].q) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovaps(_rip_local_v(d[i].s, variableOffsetS), ymm2); break;
|
||||
case 1: vmovaps(_rip_local_v(d[i].t, variableOffsetT), ymm2); break;
|
||||
case 2: vmovaps(_rip_local_v(d[i].q, variableOffsetQ), ymm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_AVX2()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
const Ymm& dscan_c = ymm6;
|
||||
|
||||
// GSVector8 dc(dscan.c);
|
||||
|
||||
vbroadcastf128(dscan_c, ptr[a2 + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d8.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(ymm1, dscan_c, ymm3);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(ymm1, ymm1);
|
||||
vmovq(_rip_local(d8.c), xmm1);
|
||||
|
||||
// GSVector8 dr = dc.xxxx();
|
||||
// GSVector8 db = dc.zzzz();
|
||||
|
||||
vshufps(ymm2, dscan_c, dscan_c, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(ymm3, dscan_c, dscan_c, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
|
||||
|
||||
vmulps(ymm0, ymm2, _m_shift(1 + i));
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpackssdw(ymm0, ymm0);
|
||||
|
||||
// GSVector4i b = GSVector8i(db * shift[1 + i]).ps32();
|
||||
|
||||
vmulps(ymm1, ymm3, _m_shift(1 + i));
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpackssdw(ymm1, ymm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].rb, variableOffset), ymm0);
|
||||
}
|
||||
|
||||
// GSVector8 dg = dc.yyyy();
|
||||
// GSVector8 da = dc.wwww();
|
||||
|
||||
vshufps(ymm2, dscan_c, dscan_c, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(ymm3, dscan_c, dscan_c, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
|
||||
|
||||
vmulps(ymm0, ymm2, _m_shift(1 + i));
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpackssdw(ymm0, ymm0);
|
||||
|
||||
// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
|
||||
|
||||
vmulps(ymm1, ymm3, _m_shift(1 + i));
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpackssdw(ymm1, ymm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].ga, variableOffset), ymm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(_vertex.cvt32(), ptr[a1 + sizeof(uint32) * last]);
|
||||
shl(_vertex.cvt32(), 6); // * sizeof(GSVertexSW)
|
||||
add(_vertex, a0);
|
||||
}
|
||||
|
||||
vbroadcasti128(ymm0, ptr[_vertex + offsetof(GSVertexSW, c)]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(ymm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(_rip_local(c.rb), ymm1);
|
||||
vmovdqa(_rip_local(c.ga), ymm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,374 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_SSE()
|
||||
{
|
||||
#ifdef _WIN64
|
||||
sub(rsp, 8 + 2 * 16);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
#endif
|
||||
|
||||
mov(t0, (size_t)&m_local);
|
||||
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(rax, (size_t)g_const->m_shift_128b[0]);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth_SSE();
|
||||
|
||||
Texture_SSE();
|
||||
|
||||
Color_SSE();
|
||||
|
||||
#ifdef _WIN64
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
add(rsp, 8 + 2 * 16);
|
||||
#endif
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_SSE()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
movdqa(ptr[t0 + variableOffset], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
movdqa(ptr[t0 + variableOffset], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(eax, ptr[a1 + sizeof(uint32) * 1]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
add(rax, a0);
|
||||
|
||||
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
|
||||
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_SSE()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movdqa(ptr[t0 + variableOffsetS], xmm2); break;
|
||||
case 1: movdqa(ptr[t0 + variableOffsetT], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetQ = offsetof(GSScanlineLocalData, d[0].q) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movaps(ptr[t0 + variableOffsetS], xmm2); break;
|
||||
case 1: movaps(ptr[t0 + variableOffsetT], xmm2); break;
|
||||
case 2: movaps(ptr[t0 + variableOffsetQ], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_SSE()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
movdqa(ptr[t0 + variableOffset], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
movdqa(ptr[t0 + variableOffset], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(eax, ptr[a1 + sizeof(uint32) * last]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
add(rax, a0);
|
||||
}
|
||||
|
||||
cvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,335 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
static const int _args = 0;
|
||||
static const int _vertex = _args + 4;
|
||||
static const int _index = _args + 8;
|
||||
static const int _dscan = _args + 12;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_AVX()
|
||||
{
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(edx, dword[esp + _dscan]);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[g_const->m_shift_128b[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth_AVX();
|
||||
|
||||
Texture_AVX();
|
||||
|
||||
Color_AVX();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_AVX()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
vmovdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
|
||||
vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
vmovdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
|
||||
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
vmovdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_AVX()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
|
||||
vmovdqa(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: vmovdqa(ptr[&m_local.d[i].t], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_AVX()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(ptr[&m_local.d4.c], xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].rb], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].ga], xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
}
|
||||
|
||||
vcvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[&m_local.c.rb], xmm1);
|
||||
vmovdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,360 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
static const int _args = 0;
|
||||
static const int _vertex = _args + 4;
|
||||
static const int _index = _args + 8;
|
||||
static const int _dscan = _args + 12;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_AVX2()
|
||||
{
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(edx, dword[esp + _dscan]);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
vmovaps(Ymm(3 + i), ptr[g_const->m_shift_256b[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth_AVX2();
|
||||
|
||||
Texture_AVX2();
|
||||
|
||||
Color_AVX2();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_AVX2()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.d8.p.z = dp8.extract32<2>();
|
||||
|
||||
vextractps(ptr[&m_local.d8.p.z], xmm1, 2);
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
|
||||
|
||||
vcvtps2dq(ymm2, ymm1);
|
||||
vpextrd(ptr[&m_local.d8.p.f], xmm2, 3);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector8 dz = GSVector8(dscan.p).zzzz();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector8 df = GSVector8(dscan.p).wwww();
|
||||
|
||||
vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.d[i].z = dz * shift[1 + i];
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm0, ymm2, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vmovaps(ptr[&m_local.d[i].z], ymm0);
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm0, ymm1, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm0, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d[i].f], ymm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
|
||||
|
||||
vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpextrd(ptr[&m_local.p.f], xmm0, 3);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
|
||||
|
||||
mov(eax, ptr[ecx + offsetof(GSVertexSW, t.w)]);
|
||||
mov(ptr[&m_local.p.z], eax);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_AVX2()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector8 dt(dscan.t);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
// GSVector8 dt8 = dt * shift[0];
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d8.stq = GSVector8::cast(GSVector8i(dt8));
|
||||
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
|
||||
vmovdqa(ptr[&m_local.d8.stq], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d8.stq = dt8;
|
||||
|
||||
vmovaps(ptr[&m_local.d8.stq], xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
|
||||
|
||||
vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8 v = dstq * shift[1 + i];
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm2, ymm1, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm2, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector8::cast(GSVector8i(v));
|
||||
|
||||
vcvttps2dq(ymm2, ymm2);
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[&m_local.d[i].s], ymm2); break;
|
||||
case 1: vmovdqa(ptr[&m_local.d[i].t], ymm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovaps(ptr[&m_local.d[i].s], ymm2); break;
|
||||
case 1: vmovaps(ptr[&m_local.d[i].t], ymm2); break;
|
||||
case 2: vmovaps(ptr[&m_local.d[i].q], ymm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_AVX2()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector8 dc(dscan.c);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32();
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(ymm1, ymm1);
|
||||
vmovq(ptr[&m_local.d8.c], xmm1);
|
||||
|
||||
// ymm3 is not needed anymore
|
||||
|
||||
// GSVector8 dr = dc.xxxx();
|
||||
// GSVector8 db = dc.zzzz();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm0, ymm2, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpackssdw(ymm0, ymm0);
|
||||
|
||||
// GSVector4i b = GSVector8i(db * shift[1 + i]).ps32();
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm1, ymm3, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpackssdw(ymm1, ymm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
vmovdqa(ptr[&m_local.d[i].rb], ymm0);
|
||||
}
|
||||
|
||||
// GSVector8 dc(dscan.c);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector8 dg = dc.yyyy();
|
||||
// GSVector8 da = dc.wwww();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm0, ymm2, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpackssdw(ymm0, ymm0);
|
||||
|
||||
// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm1, ymm3, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpackssdw(ymm1, ymm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
vmovdqa(ptr[&m_local.d[i].ga], ymm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c));
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
}
|
||||
|
||||
vbroadcasti128(ymm0, ptr[ecx + offsetof(GSVertexSW, c)]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(ymm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[&m_local.c.rb], ymm1);
|
||||
vmovdqa(ptr[&m_local.c.ga], ymm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,350 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
static const int _args = 0;
|
||||
static const int _vertex = _args + 4;
|
||||
static const int _index = _args + 8;
|
||||
static const int _dscan = _args + 12;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_SSE()
|
||||
{
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(edx, dword[esp + _dscan]);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[g_const->m_shift_128b[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth_SSE();
|
||||
|
||||
Texture_SSE();
|
||||
|
||||
Color_SSE();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_SSE()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
movdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
|
||||
movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
movdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
|
||||
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
movdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_SSE()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
|
||||
movdqa(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movdqa(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: movdqa(ptr[&m_local.d[i].t], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: movaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: movaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_SSE()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[&m_local.d4.c], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].rb], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].ga], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
}
|
||||
|
||||
cvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[&m_local.c.rb], xmm1);
|
||||
movdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -28,6 +28,8 @@
|
||||
|
||||
//#define DISABLE_DATE
|
||||
|
||||
// Not yet used/experimental OpenGL extensions
|
||||
//#define GL_EXT_TEX_SUB_IMAGE
|
||||
|
||||
#if !defined(NDEBUG) || defined(_DEBUG) || defined(_DEVEL)
|
||||
#define ENABLE_OGL_DEBUG // Create a debug context and check opengl command status. Allow also to dump various textures/states.
|
||||
|
||||
@@ -111,11 +111,17 @@ StereoOut32 V_Core::ReadInput()
|
||||
SetIrqCall(i);
|
||||
|
||||
// PlayMode & 2 is Bypass Mode, so it doesn't go through the SPU
|
||||
if ((AutoDMACtrl & (Index + 1)) && !(Index == 0 && (PlayMode & 2) != 0))
|
||||
if ((Index == 1) || !(Index == 0 && (PlayMode & 2) != 0))
|
||||
{
|
||||
retval = StereoOut32(
|
||||
(s32)(*GetMemPtr(0x2000 + (Index << 10) + ReadIndex)),
|
||||
(s32)(*GetMemPtr(0x2200 + (Index << 10) + ReadIndex)));
|
||||
|
||||
// Not accurate behaviour but shouldn't hurt for now, need to run some tests
|
||||
// to see why Prince of Persia Warrior Within buzzes when going in to the map
|
||||
// since it starts an ADMA of music, then kills ADMA, so it loops on a few ms of data.
|
||||
GetMemPtr(0x2000 + (Index << 10) + ReadIndex)[0] = 0;
|
||||
GetMemPtr(0x2200 + (Index << 10) + ReadIndex)[0] = 0;
|
||||
}
|
||||
|
||||
#ifdef PCSX2_DEVBUILD
|
||||
|
||||
@@ -161,7 +161,7 @@ void SysLogMachineCaps()
|
||||
// tagged commit - more modern implementation of dev build versioning
|
||||
// - there is no need to include the commit - that is associated with the tag,
|
||||
// - git is implied and the tag is timestamped
|
||||
Console.WriteLn(Color_StrongGreen, "\nPCSX2 Nightly - %s", GIT_TAG);
|
||||
Console.WriteLn(Color_StrongGreen, "\nPCSX2 Nightly - %s Compiled on %s", GIT_TAG, __DATE__);
|
||||
} else {
|
||||
Console.WriteLn(Color_StrongGreen, "\nPCSX2 %u.%u.%u-%lld"
|
||||
#ifndef DISABLE_BUILD_DATE
|
||||
|
||||
@@ -466,12 +466,8 @@
|
||||
<ClCompile Include="GS\GSDrawingContext.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanline.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx2.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx2.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSNewCodeGenerator.cpp" />
|
||||
<ClCompile Include="GS\GSDump.cpp" />
|
||||
<ClCompile Include="GS\Renderers\Common\GSFunctionMap.cpp" />
|
||||
<ClCompile Include="GS\Renderers\HW\GSHwHack.cpp" />
|
||||
@@ -490,12 +486,7 @@
|
||||
<ClCompile Include="GS\Window\GSSetting.cpp" />
|
||||
<ClCompile Include="GS\Window\GSSettingsDlg.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx2.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx2.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.cpp" />
|
||||
<ClCompile Include="GS\Renderers\OpenGL\GSShaderOGL.cpp" />
|
||||
<ClCompile Include="GS\GSState.cpp" />
|
||||
<ClCompile Include="GS\GSTables.cpp" />
|
||||
@@ -815,7 +806,6 @@
|
||||
<ClInclude Include="GS\Renderers\OpenGL\GLLoader.h" />
|
||||
<ClInclude Include="GS\Renderers\OpenGL\GLState.h" />
|
||||
<ClInclude Include="GS\GS.h" />
|
||||
<ClInclude Include="GS\GS_codegen.h" />
|
||||
<ClInclude Include="GS\GS_types.h" />
|
||||
<ClInclude Include="GS\GSAlignedClass.h" />
|
||||
<ClInclude Include="GS\GSBlock.h" />
|
||||
@@ -834,6 +824,8 @@
|
||||
<ClInclude Include="GS\GSDrawingEnvironment.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanline.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSNewCodeGenerator.h" />
|
||||
<ClInclude Include="GS\GSDump.h" />
|
||||
<ClInclude Include="GS\Renderers\Common\GSFastList.h" />
|
||||
<ClInclude Include="GS\Renderers\Common\GSFunctionMap.h" />
|
||||
@@ -853,6 +845,7 @@
|
||||
<ClInclude Include="GS\Window\GSSetting.h" />
|
||||
<ClInclude Include="GS\Window\GSSettingsDlg.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.h" />
|
||||
<ClInclude Include="GS\Renderers\OpenGL\GSShaderOGL.h" />
|
||||
<ClInclude Include="GS\GSState.h" />
|
||||
<ClInclude Include="GS\GSTables.h" />
|
||||
|
||||
@@ -1517,22 +1517,10 @@
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx.cpp">
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx2.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx2.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.cpp">
|
||||
<ClCompile Include="GS\Renderers\SW\GSNewCodeGenerator.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSRendererSW.cpp">
|
||||
@@ -1541,24 +1529,6 @@
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx2.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx2.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSTextureCacheSW.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
@@ -2508,9 +2478,6 @@
|
||||
<ClInclude Include="GS\GS.h">
|
||||
<Filter>System\Ps2\GS</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\GS_codegen.h">
|
||||
<Filter>System\Ps2\GS</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\GS_types.h">
|
||||
<Filter>System\Ps2\GS</Filter>
|
||||
</ClInclude>
|
||||
@@ -2631,6 +2598,12 @@
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSNewCodeGenerator.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSRendererSW.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
@@ -2640,6 +2613,9 @@
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSTextureCacheSW.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
|
||||
@@ -538,6 +538,11 @@ void recLDL()
|
||||
if (GPR_IS_CONST1(_Rs_))
|
||||
{
|
||||
u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
|
||||
|
||||
// If _Rs_ is equal to _Rt_ we need to put the shift in to eax since it won't take the CONST path
|
||||
if (_Rs_ == _Rt_)
|
||||
xMOV(calleeSavedReg1d, srcadr);
|
||||
|
||||
srcadr &= ~0x07;
|
||||
|
||||
t2reg = vtlb_DynGenRead64_Const(64, srcadr, -1);
|
||||
@@ -609,6 +614,11 @@ void recLDR()
|
||||
if (GPR_IS_CONST1(_Rs_))
|
||||
{
|
||||
u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
|
||||
|
||||
// If _Rs_ is equal to _Rt_ we need to put the shift in to eax since it won't take the CONST path
|
||||
if(_Rs_ == _Rt_)
|
||||
xMOV(calleeSavedReg1d, srcadr);
|
||||
|
||||
srcadr &= ~0x07;
|
||||
|
||||
t2reg = vtlb_DynGenRead64_Const(64, srcadr, -1);
|
||||
|
||||
Reference in New Issue
Block a user