Merge branch 'master' into vulkan

2024-11-23 11:39:45 +00:00 · 2021-06-08 12:15:34 +03:00 · 2021-06-08 12:15:34 +03:00 · 6cd9d42fd0
commit 6cd9d42fd0
parent e7a6d7921d 89ddf6456a
607 changed files with 452347 additions and 127070 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -7,3 +7,5 @@
 *.csproj    text eol=crlf -whitespace merge=union
 *.vcxproj   text eol=crlf -whitespace merge=union
 *.props     text eol=crlf -whitespace merge=union
+
+src/**/shaders/bytecode/** linguist-generated=true
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,9 +7,6 @@
 [submodule "third_party/binutils-ppc-cygwin"]
 	path = third_party/binutils-ppc-cygwin
 	url = https://github.com/benvanik/binutils-ppc-cygwin.git
-[submodule "third_party/libav"]
-	path = third_party/libav
-	url = https://github.com/xenia-project/libav.git
 [submodule "third_party/catch"]
 	path = third_party/catch
 	url = https://github.com/catchorg/Catch2.git
@@ -42,7 +39,7 @@
 	url = https://github.com/jarro2783/cxxopts.git
 [submodule "third_party/SDL2"]
 	path = third_party/SDL2
-	url = https://github.com/spurious/SDL-mirror.git
+	url = https://github.com/JoelLinn/SDL.git
 [submodule "third_party/utfcpp"]
 	path = third_party/utfcpp
 	url = https://github.com/xenia-project/utfcpp.git
@@ -67,6 +64,9 @@
 [submodule "third_party/xxhash"]
 	path = third_party/xxhash
 	url = https://github.com/Cyan4973/xxHash.git
+[submodule "third_party/FFmpeg"]
+	path = third_party/FFmpeg
+	url = https://github.com/xenia-project/FFmpeg.git
 [submodule "third_party/glslang"]
 	path = third_party/glslang
 	url = https://github.com/KhronosGroup/glslang.git
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ Fixes and optimizations are always welcome (please!), but in addition to
 that there are some major work areas still untouched:

 * Help work through [missing functionality/bugs in games](https://github.com/xenia-project/xenia/labels/compat)
-* Add input drivers for [DualShock4 (PS4) controllers](https://github.com/xenia-project/xenia/issues/60) (or anything else)
+* Add input drivers for [third-party controllers](https://github.com/xenia-project/xenia/issues/1333)
 * Skilled with Linux? A strong contributor is needed to [help with porting](https://github.com/xenia-project/xenia/labels/platform-linux)

 See more projects [good for contributors](https://github.com/xenia-project/xenia/labels/good%20first%20issue). It's a good idea to ask on Discord and check the issues page before beginning work on
--- a/premake5.lua
+++ b/premake5.lua
@@ -42,7 +42,6 @@ end

 characterset("Unicode")
 flags({
-  --"ExtraWarnings",        -- Sets the compiler's maximum warning level.
  "FatalWarnings",        -- Treat warnings as errors.
 })

@@ -100,8 +99,8 @@ filter("platforms:Linux")
  toolset("clang")
  buildoptions({
    -- "-mlzcnt",  -- (don't) Assume lzcnt is supported.
-    ({os.outputof("pkg-config --cflags gtk+-x11-3.0")})[1],
  })
+  pkg_config.all("gtk+-x11-3.0")
  links({
    "stdc++fs",
    "dl",
@@ -109,16 +108,11 @@ filter("platforms:Linux")
    "pthread",
    "rt",
  })
-  linkoptions({
-    ({os.outputof("pkg-config --libs gtk+-3.0")})[1],
-  })

 filter({"platforms:Linux", "kind:*App"})
  linkgroups("On")

 filter({"platforms:Linux", "language:C++", "toolset:gcc"})
-  links({
-  })
  disablewarnings({
    "unused-result"
  })
@@ -136,10 +130,6 @@ filter({"platforms:Linux", "toolset:gcc"})
  end

 filter({"platforms:Linux", "language:C++", "toolset:clang"})
-  links({
-    "c++",
-    "c++abi"
-  })
  disablewarnings({
    "deprecated-register"
  })
@@ -196,6 +186,7 @@ filter("platforms:Windows")
    "shcore",
    "shlwapi",
    "dxguid",
+    "bcrypt",
  })

 -- Create scratch/ path
@@ -203,7 +194,7 @@ if not os.isdir("scratch") then
  os.mkdir("scratch")
 end

-solution("xenia")
+workspace("xenia")
  uuid("931ef4b0-6170-4f7a-aaf2-0fece7632747")
  startproject("xenia-app")
  if os.istarget("android") then
@@ -233,14 +224,32 @@ solution("xenia")
  include("third_party/discord-rpc.lua")
  include("third_party/cxxopts.lua")
  include("third_party/cpptoml.lua")
+  include("third_party/FFmpeg/premake5.lua")
  include("third_party/fmt.lua")
  include("third_party/glslang-spirv.lua")
  include("third_party/imgui.lua")
-  include("third_party/libav.lua")
  include("third_party/mspack.lua")
  include("third_party/snappy.lua")
  include("third_party/xxhash.lua")

+  if not os.istarget("android") then
+    -- SDL2 requires sdl2-config, and as of November 2020 isn't high-quality on
+    -- Android yet, most importantly in game controllers - the keycode and axis
+    -- enums are being ruined during conversion to SDL2 enums resulting in only
+    -- one controller (Nvidia Shield) being supported, digital triggers are also
+    -- not supported; lifecycle management (especially surface loss) is also
+    -- complicated.
+    include("third_party/SDL2.lua")
+  end
+
+  -- Disable treating warnings as fatal errors for all third party projects:
+  for _, prj in ipairs(premake.api.scope.current.solution.projects) do
+    project(prj.name)
+    removeflags({
+      "FatalWarnings",
+    })
+  end
+
  include("src/xenia")
  include("src/xenia/app/discord")
  include("src/xenia/apu")
@@ -260,14 +269,6 @@ solution("xenia")
  include("src/xenia/vfs")

  if not os.istarget("android") then
-    -- SDL2 requires sdl2-config, and as of November 2020 isn't high-quality on
-    -- Android yet, most importantly in game controllers - the keycode and axis
-    -- enums are being ruined during conversion to SDL2 enums resulting in only
-    -- one controller (Nvidia Shield) being supported, digital triggers are also
-    -- not supported; lifecycle management (especially surface loss) is also
-    -- complicated.
-    include("third_party/SDL2.lua")
-
    include("src/xenia/apu/sdl")
    include("src/xenia/helper/sdl")
    include("src/xenia/hid/sdl")
--- a/src/xenia/app/emulator_window.cc
+++ b/src/xenia/app/emulator_window.cc
@@ -430,31 +430,46 @@ void EmulatorWindow::ShowCommitID() {
 }

 void EmulatorWindow::UpdateTitle() {
-  std::string title(base_title_);
+  xe::StringBuffer sb;
+  sb.Append(base_title_);

+  // Title information, if available
  if (emulator()->is_title_open()) {
-    auto game_title = emulator()->game_title();
-    title += fmt::format(" | [{:08X}] {}", emulator()->title_id(), game_title);
+    sb.AppendFormat(u8" | [{:08X}", emulator()->title_id());
+    auto title_version = emulator()->title_version();
+    if (!title_version.empty()) {
+      sb.Append(u8" v");
+      sb.Append(title_version);
+    }
+    sb.Append(u8"]");
+
+    auto title_name = emulator()->title_name();
+    if (!title_name.empty()) {
+      sb.Append(u8" ");
+      sb.Append(title_name);
+    }
  }

+  // Graphics system name, if available
  auto graphics_system = emulator()->graphics_system();
  if (graphics_system) {
    auto graphics_name = graphics_system->name();
-    title += fmt::format(" <{}>", graphics_name);
+    if (!graphics_name.empty()) {
+      sb.Append(u8" <");
+      sb.Append(graphics_name);
+      sb.Append(u8">");
+    }
  }

  if (Clock::guest_time_scalar() != 1.0) {
-    title += fmt::format(" (@{:.2f}x)", Clock::guest_time_scalar());
+    sb.AppendFormat(u8" (@{:.2f}x)", Clock::guest_time_scalar());
  }

  if (initializing_shader_storage_) {
-    title +=
-        " (Preloading shaders"
-        u8"\u2026"
-        ")";
+    sb.Append(u8" (Preloading shaders\u2026)");
  }

-  window_->set_title(title);
+  window_->set_title(sb.to_string_view());
 }

 void EmulatorWindow::SetInitializingShaderStorage(bool initializing) {
--- a/src/xenia/apu/audio_system.cc
+++ b/src/xenia/apu/audio_system.cc
@@ -223,7 +223,7 @@ void AudioSystem::UnregisterClient(size_t index) {
 }

 bool AudioSystem::Save(ByteStream* stream) {
-  stream->Write('XAUD');
+  stream->Write(kAudioSaveSignature);

  // Count the number of used clients first.
  // Any gaps should be handled gracefully.
@@ -251,7 +251,7 @@ bool AudioSystem::Save(ByteStream* stream) {
 }

 bool AudioSystem::Restore(ByteStream* stream) {
-  if (stream->Read<uint32_t>() != 'XAUD') {
+  if (stream->Read<uint32_t>() != kAudioSaveSignature) {
    XELOGE("AudioSystem::Restore - Invalid magic value!");
    return false;
  }
--- a/src/xenia/apu/audio_system.h
+++ b/src/xenia/apu/audio_system.h
@@ -23,6 +23,8 @@
 namespace xe {
 namespace apu {

+constexpr fourcc_t kAudioSaveSignature = make_fourcc("XAUD");
+
 class AudioDriver;
 class XmaDecoder;

--- a/src/xenia/apu/premake5.lua
+++ b/src/xenia/apu/premake5.lua
@@ -14,6 +14,6 @@ project("xenia-apu")
  defines({
  })
  includedirs({
-    project_root.."/third_party/libav/",
+    project_root.."/third_party/FFmpeg/",
  })
  local_platform_files()
--- a/src/xenia/apu/xma_context.cc
+++ b/src/xenia/apu/xma_context.cc
--- a/src/xenia/apu/xma_context.h
+++ b/src/xenia/apu/xma_context.h
@@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2015 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@@ -10,10 +10,11 @@
 #ifndef XENIA_APU_XMA_CONTEXT_H_
 #define XENIA_APU_XMA_CONTEXT_H_

+#include <array>
 #include <atomic>
 #include <mutex>
 #include <queue>
-#include <vector>
+//#include <vector>

 #include "xenia/memory.h"
 #include "xenia/xbox.h"
@@ -30,6 +31,7 @@

 // Forward declarations
 struct AVCodec;
+struct AVCodecParserContext;
 struct AVCodecContext;
 struct AVFrame;
 struct AVPacket;
@@ -121,29 +123,29 @@ struct XMA_CONTEXT_DATA {
 static_assert_size(XMA_CONTEXT_DATA, 64);

 #pragma pack(push, 1)
-struct WmaProExtraData {
-  uint16_t bits_per_sample;
-  uint32_t channel_mask;
-  uint8_t unk06[8];
-  uint16_t decode_flags;
-  uint8_t unk10[2];
+// XMA2WAVEFORMATEX
+struct Xma2ExtraData {
+  uint8_t raw[34];
 };
-static_assert_size(WmaProExtraData, 18);
+static_assert_size(Xma2ExtraData, 34);
 #pragma pack(pop)

 class XmaContext {
 public:
  static const uint32_t kBytesPerPacket = 2048;
+  static const uint32_t kBitsPerPacket = kBytesPerPacket * 8;
+  static const uint32_t kBitsPerHeader = 33;

  static const uint32_t kBytesPerSample = 2;
  static const uint32_t kSamplesPerFrame = 512;
  static const uint32_t kSamplesPerSubframe = 128;
-  static const uint32_t kBytesPerFrame = kSamplesPerFrame * kBytesPerSample;
-  static const uint32_t kBytesPerSubframe =
+  static const uint32_t kBytesPerFrameChannel =
+      kSamplesPerFrame * kBytesPerSample;
+  static const uint32_t kBytesPerSubframeChannel =
      kSamplesPerSubframe * kBytesPerSample;

-  static const uint32_t kOutputBytesPerBlock = 256;
-  static const uint32_t kOutputMaxSizeBytes = 31 * kOutputBytesPerBlock;
+  // static const uint32_t kOutputBytesPerBlock = 256;
+  // static const uint32_t kOutputMaxSizeBytes = 31 * kOutputBytesPerBlock;

  explicit XmaContext();
  ~XmaContext();
@@ -168,28 +170,29 @@ class XmaContext {
  void set_is_enabled(bool is_enabled) { is_enabled_ = is_enabled; }

 private:
+  static void SwapInputBuffer(XMA_CONTEXT_DATA* data);
+  static void NextPacket(XMA_CONTEXT_DATA* data);
  static int GetSampleRate(int id);
+  // Get the offset of the next frame. Does not traverse packets.
+  static size_t GetNextFrame(uint8_t* block, size_t size, size_t bit_offset);
+  // Get the containing packet number of the frame pointed to by the offset.
+  static int GetFramePacketNumber(uint8_t* block, size_t size,
+                                  size_t bit_offset);
+  // Get the packet number and the index of the frame inside that packet
+  static std::tuple<int, int> GetFrameNumber(uint8_t* block, size_t size,
+                                             size_t bit_offset);
+  // Get the number of frames contained in the packet (including truncated) and
+  // if the last frame is split.
+  static std::tuple<int, bool> GetPacketFrameCount(uint8_t* packet);
+
+  // Convert sample format and swap bytes
+  static void ConvertFrame(const uint8_t** samples, bool is_two_channel,
+                           uint8_t* output_buffer);

-  size_t SavePartial(uint8_t* packet, uint32_t frame_offset_bits,
-                     size_t frame_size_bits, bool append);
  bool ValidFrameOffset(uint8_t* block, size_t size_bytes,
                        size_t frame_offset_bits);
-  void DecodePackets(XMA_CONTEXT_DATA* data);
-  uint32_t GetFramePacketNumber(uint8_t* block, size_t size, size_t bit_offset);
-  int PrepareDecoder(uint8_t* block, size_t size, int sample_rate,
-                     int channels);
-
-  bool ConvertFrame(const uint8_t** samples, int num_channels, int num_samples,
-                    uint8_t* output_buffer);
-
-  int StartPacket(XMA_CONTEXT_DATA* data);
-
-  int PreparePacket(uint8_t* input, size_t seq_offset, size_t size,
-                    int sample_rate, int channels);
-  void DiscardPacket();
-
-  int DecodePacket(uint8_t* output, size_t offset, size_t size,
-                   size_t* read_bytes);
+  void Decode(XMA_CONTEXT_DATA* data);
+  int PrepareDecoder(uint8_t* packet, int sample_rate, bool is_two_channel);

  Memory* memory_ = nullptr;

@@ -198,22 +201,35 @@ class XmaContext {
  std::mutex lock_;
  bool is_allocated_ = false;
  bool is_enabled_ = false;
+  // bool is_dirty_ = true;

-  // libav structures
-  AVCodec* codec_ = nullptr;
-  AVCodecContext* context_ = nullptr;
-  AVFrame* decoded_frame_ = nullptr;
-  AVPacket* packet_ = nullptr;
-  WmaProExtraData extra_data_;
+  // ffmpeg structures
+  AVPacket* av_packet_ = nullptr;
+  AVCodec* av_codec_ = nullptr;
+  AVCodecContext* av_context_ = nullptr;
+  AVFrame* av_frame_ = nullptr;
+  // uint32_t decoded_consumed_samples_ = 0; // TODO do this dynamically
+  // int decoded_idx_ = -1;

-  bool partial_frame_saved_ = false;
-  bool partial_frame_size_known_ = false;
-  size_t partial_frame_total_size_bits_ = 0;
-  size_t partial_frame_start_offset_bits_ = 0;
-  size_t partial_frame_offset_bits_ = 0;  // blah internal don't use this
-  std::vector<uint8_t> partial_frame_buffer_;
+  // bool partial_frame_saved_ = false;
+  // bool partial_frame_size_known_ = false;
+  // size_t partial_frame_total_size_bits_ = 0;
+  // size_t partial_frame_start_offset_bits_ = 0;
+  // size_t partial_frame_offset_bits_ = 0;  // blah internal don't use this
+  // std::vector<uint8_t> partial_frame_buffer_;
+  uint32_t packets_skip_ = 0;

-  uint8_t* current_frame_ = nullptr;
+  // bool split_frame_pending_ = false;
+  uint32_t split_frame_len_ = 0;
+  uint32_t split_frame_len_partial_ = 0;
+  uint8_t split_frame_padding_start_ = 0;
+  // first byte contains bit offset information
+  std::array<uint8_t, 1 + 4096> xma_frame_;
+
+  // uint8_t* current_frame_ = nullptr;
+  // conversion buffer for 2 channel frame
+  std::array<uint8_t, kBytesPerFrameChannel * 2> raw_frame_;
+  // std::vector<uint8_t> current_frame_ = std::vector<uint8_t>(0);
 };

 }  // namespace apu
--- a/src/xenia/apu/xma_decoder.cc
+++ b/src/xenia/apu/xma_decoder.cc
@@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2013 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@@ -21,7 +21,7 @@
 #include "xenia/kernel/xthread.h"

 extern "C" {
-#include "third_party/libav/libavutil/log.h"
+#include "third_party/FFmpeg/libavutil/log.h"
 }  // extern "C"

 // As with normal Microsoft, there are like twelve different ways to access
@@ -48,7 +48,7 @@ extern "C" {
 // do this, it's likely they are either passing the context to XAudio or
 // using the XMA* functions.

-DEFINE_bool(libav_verbose, false, "Verbose libav output (debug and above)",
+DEFINE_bool(ffmpeg_verbose, false, "Verbose FFmpeg output (debug and above)",
            "APU");

 namespace xe {
@@ -60,7 +60,7 @@ XmaDecoder::XmaDecoder(cpu::Processor* processor)
 XmaDecoder::~XmaDecoder() = default;

 void av_log_callback(void* avcl, int level, const char* fmt, va_list va) {
-  if (!cvars::libav_verbose && level > AV_LOG_WARNING) {
+  if (!cvars::ffmpeg_verbose && level > AV_LOG_WARNING) {
    return;
  }

@@ -101,12 +101,12 @@ void av_log_callback(void* avcl, int level, const char* fmt, va_list va) {

  StringBuffer buff;
  buff.AppendVarargs(fmt, va);
-  xe::logging::AppendLogLineFormat(log_level, level_char, "libav: {}",
+  xe::logging::AppendLogLineFormat(log_level, level_char, "ffmpeg: {}",
                                   buff.to_string_view());
 }

 X_STATUS XmaDecoder::Setup(kernel::KernelState* kernel_state) {
-  // Setup libav logging callback
+  // Setup ffmpeg logging callback
  av_log_set_callback(av_log_callback);

  // Let the processor know we want register access callbacks.
@@ -277,10 +277,10 @@ uint32_t XmaDecoder::ReadRegister(uint32_t addr) {
    default: {
      const auto register_info = register_file_.GetRegisterInfo(r);
      if (register_info) {
-        XELOGE("XMA: Read from unhandled register ({:04X}, {})", r,
+        XELOGW("XMA: Read from unhandled register ({:04X}, {})", r,
               register_info->name);
      } else {
-        XELOGE("XMA: Read from unknown register ({:04X})", r);
+        XELOGW("XMA: Read from unknown register ({:04X})", r);
      }
      break;
    }
@@ -348,10 +348,10 @@ void XmaDecoder::WriteRegister(uint32_t addr, uint32_t value) {
      default: {
        const auto register_info = register_file_.GetRegisterInfo(r);
        if (register_info) {
-          XELOGE("XMA: Write to unhandled register ({:04X}, {}): {:08X}", r,
+          XELOGW("XMA: Write to unhandled register ({:04X}, {}): {:08X}", r,
                 register_info->name, value);
        } else {
-          XELOGE("XMA: Write to unknown register ({:04X}): {:08X}", r, value);
+          XELOGW("XMA: Write to unknown register ({:04X}): {:08X}", r, value);
        }
        break;
      }
--- a/src/xenia/apu/xma_helpers.h
+++ b/src/xenia/apu/xma_helpers.h
@@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2015 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@@ -18,6 +18,8 @@ namespace xe {
 namespace apu {
 namespace xma {

+static const uint32_t kMaxFrameLength = 0x7FFF;
+
 // Get number of frames that /begin/ in this packet.
 uint32_t GetPacketFrameCount(uint8_t* packet) {
  return (uint8_t)(packet[0] >> 2);
@@ -27,11 +29,12 @@ uint32_t GetPacketFrameCount(uint8_t* packet) {
 uint32_t GetPacketFrameOffset(uint8_t* packet) {
  uint32_t val = (uint16_t)(((packet[0] & 0x3) << 13) | (packet[1] << 5) |
                            (packet[2] >> 3));
-  if (val == 0x7FFF) {
-    return -1;
-  } else {
-    return val + 32;
-  }
+  // if (val > kBitsPerPacket - kBitsPerHeader) {
+  //   // There is no data in this packet
+  //   return -1;
+  // } else {
+  return val + 32;
+  // }
 }

 uint32_t GetPacketMetadata(uint8_t* packet) {
--- a/src/xenia/base/arena.cc
+++ b/src/xenia/base/arena.cc
@@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2013 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@@ -13,6 +13,7 @@
 #include <memory>

 #include "xenia/base/assert.h"
+#include "xenia/base/math.h"

 namespace xe {

@@ -45,12 +46,25 @@ void Arena::DebugFill() {
  }
 }

-void* Arena::Alloc(size_t size) {
+void* Arena::Alloc(size_t size, size_t align) {
+  assert_true(
+      xe::bit_count(align) == 1 && align <= 16,
+      "align needs to be a power of 2 and not greater than Chunk alignment");
+
+  // for alignment
+  const auto get_padding = [this, align]() -> size_t {
+    const size_t mask = align - 1;
+    size_t deviation = active_chunk_->offset & mask;
+    return (align - deviation) & mask;
+  };
+
  if (active_chunk_) {
-    if (active_chunk_->capacity - active_chunk_->offset < size + 4096) {
+    if (active_chunk_->capacity - active_chunk_->offset <
+        size + get_padding() + 4096) {
      Chunk* next = active_chunk_->next;
      if (!next) {
-        assert_true(size < chunk_size_, "need to support larger chunks");
+        assert_true(size + get_padding() < chunk_size_,
+                    "need to support larger chunks");
        next = new Chunk(chunk_size_);
        active_chunk_->next = next;
      }
@@ -61,8 +75,11 @@ void* Arena::Alloc(size_t size) {
    head_chunk_ = active_chunk_ = new Chunk(chunk_size_);
  }

+  active_chunk_->offset += get_padding();
  uint8_t* p = active_chunk_->buffer + active_chunk_->offset;
  active_chunk_->offset += size;
+  assert_true((reinterpret_cast<size_t>(p) & (align - 1)) == 0,
+              "alignment failed");
  return p;
 }

@@ -113,6 +130,8 @@ void Arena::CloneContents(void* buffer, size_t buffer_length) {
 Arena::Chunk::Chunk(size_t chunk_size)
    : next(nullptr), capacity(chunk_size), buffer(0), offset(0) {
  buffer = reinterpret_cast<uint8_t*>(malloc(capacity));
+  assert_true((reinterpret_cast<size_t>(buffer) & size_t(15)) == 0,
+              "16 byte alignment required");
 }

 Arena::Chunk::~Chunk() {
--- a/src/xenia/base/arena.h
+++ b/src/xenia/base/arena.h
@@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2013 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@@ -24,11 +24,13 @@ class Arena {
  void Reset();
  void DebugFill();

-  void* Alloc(size_t size);
+  void* Alloc(size_t size, size_t align);
  template <typename T>
  T* Alloc() {
-    return reinterpret_cast<T*>(Alloc(sizeof(T)));
+    return reinterpret_cast<T*>(Alloc(sizeof(T), alignof(T)));
  }
+  // When rewinding aligned allocations, any padding that was applied during
+  // allocation will be leaked
  void Rewind(size_t size);

  void* CloneContents();
--- a/src/xenia/base/bit_stream.cc
+++ b/src/xenia/base/bit_stream.cc
@@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2015 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@@ -111,6 +111,8 @@ size_t BitStream::Copy(uint8_t* dest_buffer, size_t num_bits) {
  // First: Copy the first few bits up to a byte boundary.
  if (rel_offset_bits) {
    uint64_t bits = Peek(8 - rel_offset_bits);
+    uint8_t clear_mask = ~((uint8_t(1) << rel_offset_bits) - 1);
+    dest_buffer[out_offset_bytes] &= clear_mask;
    dest_buffer[out_offset_bytes] |= (uint8_t)bits;

    bits_left -= 8 - rel_offset_bits;
@@ -132,6 +134,8 @@ size_t BitStream::Copy(uint8_t* dest_buffer, size_t num_bits) {
    uint64_t bits = Peek(bits_left);
    bits <<= 8 - bits_left;

+    uint8_t clear_mask = ((uint8_t(1) << bits_left) - 1);
+    dest_buffer[out_offset_bytes] &= clear_mask;
    dest_buffer[out_offset_bytes] |= (uint8_t)bits;
    Advance(bits_left);
  }
--- a/src/xenia/base/byte_order.h
+++ b/src/xenia/base/byte_order.h
@@ -11,103 +11,113 @@
 #define XENIA_BASE_BYTE_ORDER_H_

 #include <cstdint>
+#if defined __has_include
+#if __has_include(<version>)
+#include <version>
+#endif
+#endif
+#if __cpp_lib_endian
+#include <bit>
+#endif

 #include "xenia/base/assert.h"
 #include "xenia/base/platform.h"

-#if XE_PLATFORM_LINUX
-#include <byteswap.h>
+#if !__cpp_lib_endian
+// Polyfill
+#ifdef __BYTE_ORDER__
+namespace std {
+enum class endian {
+  little = __ORDER_LITTLE_ENDIAN__,
+  big = __ORDER_BIG_ENDIAN__,
+  native = __BYTE_ORDER__
+};
+}
+#else
+// Hardcode to little endian for now
+namespace std {
+enum class endian { little = 0, big = 1, native = 0 };
+}
 #endif
+#endif
+// Check for mixed endian
+static_assert((std::endian::native == std::endian::big) ||
+              (std::endian::native == std::endian::little));

 namespace xe {

-#if XE_PLATFORM_WIN32
+#if XE_COMPILER_MSVC
 #define XENIA_BASE_BYTE_SWAP_16 _byteswap_ushort
 #define XENIA_BASE_BYTE_SWAP_32 _byteswap_ulong
 #define XENIA_BASE_BYTE_SWAP_64 _byteswap_uint64
-#elif XE_PLATFORM_MAC
-#define XENIA_BASE_BYTE_SWAP_16 OSSwapInt16
-#define XENIA_BASE_BYTE_SWAP_32 OSSwapInt32
-#define XENIA_BASE_BYTE_SWAP_64 OSSwapInt64
 #else
-#define XENIA_BASE_BYTE_SWAP_16 bswap_16
-#define XENIA_BASE_BYTE_SWAP_32 bswap_32
-#define XENIA_BASE_BYTE_SWAP_64 bswap_64
+#define XENIA_BASE_BYTE_SWAP_16 __builtin_bswap16
+#define XENIA_BASE_BYTE_SWAP_32 __builtin_bswap32
+#define XENIA_BASE_BYTE_SWAP_64 __builtin_bswap64
 #endif  // XE_PLATFORM_WIN32

-inline int8_t byte_swap(int8_t value) { return value; }
-inline uint8_t byte_swap(uint8_t value) { return value; }
-inline int16_t byte_swap(int16_t value) {
-  return static_cast<int16_t>(
-      XENIA_BASE_BYTE_SWAP_16(static_cast<int16_t>(value)));
-}
-inline uint16_t byte_swap(uint16_t value) {
-  return XENIA_BASE_BYTE_SWAP_16(value);
-}
-inline uint16_t byte_swap(char16_t value) {
-  return static_cast<char16_t>(XENIA_BASE_BYTE_SWAP_16(value));
-}
-inline int32_t byte_swap(int32_t value) {
-  return static_cast<int32_t>(
-      XENIA_BASE_BYTE_SWAP_32(static_cast<int32_t>(value)));
-}
-inline uint32_t byte_swap(uint32_t value) {
-  return XENIA_BASE_BYTE_SWAP_32(value);
-}
-inline int64_t byte_swap(int64_t value) {
-  return static_cast<int64_t>(
-      XENIA_BASE_BYTE_SWAP_64(static_cast<int64_t>(value)));
-}
-inline uint64_t byte_swap(uint64_t value) {
-  return XENIA_BASE_BYTE_SWAP_64(value);
-}
-inline float byte_swap(float value) {
-  uint32_t temp = byte_swap(*reinterpret_cast<uint32_t*>(&value));
-  return *reinterpret_cast<float*>(&temp);
-}
-inline double byte_swap(double value) {
-  uint64_t temp = byte_swap(*reinterpret_cast<uint64_t*>(&value));
-  return *reinterpret_cast<double*>(&temp);
-}
-template <typename T>
+template <class T>
 inline T byte_swap(T value) {
-  if (sizeof(T) == 4) {
-    return static_cast<T>(byte_swap(static_cast<uint32_t>(value)));
-  } else if (sizeof(T) == 2) {
-    return static_cast<T>(byte_swap(static_cast<uint16_t>(value)));
-  } else {
-    assert_always("not handled");
+  static_assert(
+      sizeof(T) == 8 || sizeof(T) == 4 || sizeof(T) == 2 || sizeof(T) == 1,
+      "byte_swap(T value): Type T has illegal size");
+  if constexpr (sizeof(T) == 8) {
+    uint64_t temp =
+        XENIA_BASE_BYTE_SWAP_64(*reinterpret_cast<uint64_t*>(&value));
+    return *reinterpret_cast<T*>(&temp);
+  } else if constexpr (sizeof(T) == 4) {
+    uint32_t temp =
+        XENIA_BASE_BYTE_SWAP_32(*reinterpret_cast<uint32_t*>(&value));
+    return *reinterpret_cast<T*>(&temp);
+  } else if constexpr (sizeof(T) == 2) {
+    uint16_t temp =
+        XENIA_BASE_BYTE_SWAP_16(*reinterpret_cast<uint16_t*>(&value));
+    return *reinterpret_cast<T*>(&temp);
+  } else if constexpr (sizeof(T) == 1) {
+    return value;
  }
 }

-template <typename T>
-struct be {
-  be() = default;
-  be(const T& src) : value(xe::byte_swap(src)) {}  // NOLINT(runtime/explicit)
-  be(const be& other) { value = other.value; }     // NOLINT(runtime/explicit)
-  operator T() const { return xe::byte_swap(value); }
+template <typename T, std::endian E>
+struct endian_store {
+  endian_store() = default;
+  endian_store(const T& src) {
+    if constexpr (std::endian::native == E) {
+      value = src;
+    } else {
+      value = xe::byte_swap(src);
+    }
+  }
+  endian_store(const endian_store& other) { value = other.value; }
+  operator T() const {
+    if constexpr (std::endian::native == E) {
+      return value;
+    } else {
+      return xe::byte_swap(value);
+    }
+  }

-  be<T>& operator+=(int a) {
+  endian_store<T, E>& operator+=(int a) {
    *this = *this + a;
    return *this;
  }
-  be<T>& operator-=(int a) {
+  endian_store<T, E>& operator-=(int a) {
    *this = *this - a;
    return *this;
  }
-  be<T>& operator++() {
+  endian_store<T, E>& operator++() {
    *this += 1;
    return *this;
  }  // ++a
-  be<T> operator++(int) {
+  endian_store<T, E> operator++(int) {
    *this += 1;
    return (*this - 1);
  }  // a++
-  be<T>& operator--() {
+  endian_store<T, E>& operator--() {
    *this -= 1;
    return *this;
  }  // --a
-  be<T> operator--(int) {
+  endian_store<T, E> operator--(int) {
    *this -= 1;
    return (*this + 1);
  }  // a--
@@ -115,6 +125,11 @@ struct be {
  T value;
 };

+template <typename T>
+using be = endian_store<T, std::endian::big>;
+template <typename T>
+using le = endian_store<T, std::endian::little>;
+
 }  // namespace xe

 #endif  // XENIA_BASE_BYTE_ORDER_H_
--- a/src/xenia/base/cvar.cc
+++ b/src/xenia/base/cvar.cc
@@ -23,6 +23,7 @@ namespace cvar {
 cxxopts::Options options("xenia", "Xbox 360 Emulator");
 std::map<std::string, ICommandVar*>* CmdVars;
 std::map<std::string, IConfigVar*>* ConfigVars;
+std::multimap<uint32_t, const IConfigVarUpdate*>* IConfigVarUpdate::updates_;

 void PrintHelpAndExit() {
  std::cout << options.help({""}) << std::endl;
--- a/src/xenia/base/cvar.h
+++ b/src/xenia/base/cvar.h
@@ -17,6 +17,7 @@

 #include "cpptoml/include/cpptoml.h"
 #include "cxxopts/include/cxxopts.hpp"
+#include "xenia/base/assert.h"
 #include "xenia/base/filesystem.h"
 #include "xenia/base/string_util.h"

@@ -43,6 +44,7 @@ class IConfigVar : virtual public ICommandVar {
  virtual std::string config_value() const = 0;
  virtual void LoadConfigValue(std::shared_ptr<cpptoml::base> result) = 0;
  virtual void LoadGameConfigValue(std::shared_ptr<cpptoml::base> result) = 0;
+  virtual void ResetConfigValueToDefault() = 0;
 };

 template <class T>
@@ -75,6 +77,7 @@ class ConfigVar : public CommandVar<T>, virtual public IConfigVar {
  ConfigVar<T>(const char* name, T* default_value, const char* description,
               const char* category, bool is_transient);
  std::string config_value() const override;
+  const T& GetTypedConfigValue() const;
  const std::string& category() const override;
  bool is_transient() const override;
  void AddToLaunchOptions(cxxopts::Options* options) override;
@ -89,6 +92,7 @@ class ConfigVar : public CommandVar<T>, virtual public IConfigVar {
  std::unique_ptr<T> config_value_ = nullptr;
  std::unique_ptr<T> game_config_value_ = nullptr;
  void UpdateValue() override;
+  void ResetConfigValueToDefault() override;
 };

 #pragma warning(pop)
@ -233,6 +237,10 @@ std::string ConfigVar<T>::config_value() const {
  return this->ToString(this->default_value_);
 }
 template <class T>
+const T& ConfigVar<T>::GetTypedConfigValue() const {
+  // Prefer config_value_ when it is set; otherwise fall back to the default.
+  if (config_value_) {
+    return *config_value_;
+  }
+  return this->default_value_;
+}
+template <class T>
 void CommandVar<T>::SetCommandLineValue(const T val) {
  commandline_value_ = std::make_unique<T>(val);
  UpdateValue();
@ -247,36 +255,47 @@ void ConfigVar<T>::SetGameConfigValue(T val) {
  game_config_value_ = std::make_unique<T>(val);
  UpdateValue();
 }
+// Overwrites the config value with this variable's default_value_ by passing it
+// to SetConfigValue.
+template <class T>
+void ConfigVar<T>::ResetConfigValueToDefault() {
+  SetConfigValue(this->default_value_);
+}

+// CVars can be initialized before these, thus initialized on-demand using new.
 extern std::map<std::string, ICommandVar*>* CmdVars;
 extern std::map<std::string, IConfigVar*>* ConfigVars;

 inline void AddConfigVar(IConfigVar* cv) {
-  if (!ConfigVars) ConfigVars = new std::map<std::string, IConfigVar*>();
-  ConfigVars->insert(std::pair<std::string, IConfigVar*>(cv->name(), cv));
+  if (!ConfigVars) {
+    ConfigVars = new std::map<std::string, IConfigVar*>;
+  }
+  ConfigVars->emplace(cv->name(), cv);
 }
 inline void AddCommandVar(ICommandVar* cv) {
-  if (!CmdVars) CmdVars = new std::map<std::string, ICommandVar*>();
-  CmdVars->insert(std::pair<std::string, ICommandVar*>(cv->name(), cv));
+  if (!CmdVars) {
+    CmdVars = new std::map<std::string, ICommandVar*>;
+  }
+  CmdVars->emplace(cv->name(), cv);
 }
 void ParseLaunchArguments(int& argc, char**& argv,
                          const std::string_view positional_help,
                          const std::vector<std::string>& positional_options);

 template <typename T>
-T* define_configvar(const char* name, T* default_value, const char* description,
-                    const char* category, bool is_transient) {
-  IConfigVar* cfgVar = new ConfigVar<T>(name, default_value, description,
+IConfigVar* define_configvar(const char* name, T* default_value,
+                             const char* description, const char* category,
+                             bool is_transient) {
+  IConfigVar* cfgvar = new ConfigVar<T>(name, default_value, description,
                                        category, is_transient);
-  AddConfigVar(cfgVar);
-  return default_value;
+  AddConfigVar(cfgvar);
+  return cfgvar;
 }

 template <typename T>
-T* define_cmdvar(const char* name, T* default_value, const char* description) {
-  ICommandVar* cmdVar = new CommandVar<T>(name, default_value, description);
-  AddCommandVar(cmdVar);
-  return default_value;
+ICommandVar* define_cmdvar(const char* name, T* default_value,
+                           const char* description) {
+  ICommandVar* cmdvar = new CommandVar<T>(name, default_value, description);
+  AddCommandVar(cmdvar);
+  return cmdvar;
 }

 #define DEFINE_bool(name, default_value, description, category) \
@ -285,6 +304,9 @@ T* define_cmdvar(const char* name, T* default_value, const char* description) {
 #define DEFINE_int32(name, default_value, description, category) \
  DEFINE_CVar(name, default_value, description, category, false, int32_t)

+#define DEFINE_uint32(name, default_value, description, category) \
+  DEFINE_CVar(name, default_value, description, category, false, uint32_t)
+
 #define DEFINE_uint64(name, default_value, description, category) \
  DEFINE_CVar(name, default_value, description, category, false, uint64_t)

@ -314,7 +336,7 @@ T* define_cmdvar(const char* name, T* default_value, const char* description) {
  type name = default_value;                                                  \
  }                                                                           \
  namespace cv {                                                              \
-  static auto cv_##name = cvar::define_configvar(                             \
+  static cvar::IConfigVar* const cv_##name = cvar::define_configvar(          \
      #name, &cvars::name, description, category, is_transient);              \
  }

@ -324,7 +346,7 @@ T* define_cmdvar(const char* name, T* default_value, const char* description) {
  std::string name = default_value;                          \
  }                                                          \
  namespace cv {                                             \
-  static auto cv_##name =                                    \
+  static cvar::ICommandVar* const cv_##name =                \
      cvar::define_cmdvar(#name, &cvars::name, description); \
  }

@ -332,6 +354,8 @@ T* define_cmdvar(const char* name, T* default_value, const char* description) {

 #define DECLARE_int32(name) DECLARE_CVar(name, int32_t)

+#define DECLARE_uint32(name) DECLARE_CVar(name, uint32_t)
+
 #define DECLARE_uint64(name) DECLARE_CVar(name, uint64_t)

 #define DECLARE_double(name) DECLARE_CVar(name, double)
@ -345,6 +369,212 @@ T* define_cmdvar(const char* name, T* default_value, const char* description) {
  extern type name;              \
  }

+// Interface for changing the default value of a variable with auto-upgrading of
+// users' configs (to distinguish between a leftover old default and an explicit
+// override), without having to rename the variable.
+//
+// Two types of updates are supported:
+// - Changing the value of the variable (UPDATE_from_type) from an explicitly
+//   specified previous default value to a new one, but keeping the
+//   user-specified value if it was not the default, and thus explicitly
+//   overridden.
+// - Changing the meaning / domain of the variable (UPDATE_from_any), when
+//   previous user-specified overrides also stop making sense. Config variable
+//   type changes are also considered this type of updates (though
+//   UPDATE_from_type, if the new type doesn't match the previous one, is also
+//   safe to use - it behaves like UPDATE_from_any in this case).
+//
+// Rules of using UPDATE_:
+// - Do not remove previous UPDATE_ entries (both typed and from-any) if you're
+//   adding a new UPDATE_from_type.
+//   This ensures that if the default was changed from 1 to 2 and then to 3,
+//   both users who last launched Xenia when it was 1 and when it was 2 receive
+//   the update (however, those who have explicitly changed it from 2 to 1 when
+//   2 was the default will have it kept at 1).
+//   It's safe to remove the history before a new UPDATE_from_any, however.
+// - The date should preferably be in UTC+0 timezone.
+// - No other recent pull requests should have the same date (since builds
+//   are made after every commit).
+// - IConfigVarUpdate::kLastCommittedUpdateDate must be updated - see the
+//   comment near its declaration.
+
+constexpr uint32_t MakeConfigVarUpdateDate(uint32_t year, uint32_t month,
+                                           uint32_t day, uint32_t utc_hour) {
+  // The result is written to the config as a decimal number, so the fields are
+  // packed as decimal digit groups (YYYYMMDDHH) to stay readable for users.
+  // This already uses 31 bits in the 3rd millennium - don't add more digits.
+  return ((year * 100 + month) * 100 + day) * 100 + utc_hour;
+}
+
+// Base class of one dated update of a config variable's default value.
+// Every instance registers itself, keyed by its packed date, in a static
+// multimap when it is constructed; ApplyUpdates later walks that map.
+class IConfigVarUpdate {
+ public:
+  // This global highest version constant is used to ensure that version (which
+  // is stored as one value for the whole config file) is monotonically
+  // increased when commits - primarily pull requests - are pushed to the main
+  // branch.
+  //
+  // This is to prevent the following situation:
+  // - Pull request #1 created on day 1.
+  // - Pull request #2 created on day 2.
+  // - Pull request #2 from day 2 merged on day 3.
+  // - User launches the latest version on day 4.
+  //   CVar default changes from PR #2 (day 2) applied because the user's config
+  //   version is day 0, which is < 2.
+  //   User's config has day 2 version now.
+  // - Pull request #1 from day 1 merged on day 5.
+  // - User launches the latest version on day 5.
+  //   CVar default changes from PR #1 (day 1) IGNORED because the user's config
+  //   version is day 2, which is >= 1.
+  //
+  // If this constant is not updated, static_assert will be triggered for a new
+  // DEFINE_, requiring this constant to be raised. But changing this will
+  // result in merge conflicts in all other pull requests also changing cvar
+  // defaults - before they're merged, they will need to be updated, which will
+  // ensure monotonic growth of the versions of all cvars on the main branch. In
+  // the example above, PR #1 will need to be updated before it's merged.
+  //
+  // If you've encountered a merge conflict here in your pull request:
+  //   1) Update any UPDATE_s you've added in the pull request to the current
+  //      date.
+  //   2) Change this value to the same date.
+  // If you're reviewing a pull request with a change here, check if 1) has been
+  // done by the submitter before merging.
+  static constexpr uint32_t kLastCommittedUpdateDate =
+      MakeConfigVarUpdateDate(2020, 12, 31, 13);
+
+  virtual ~IConfigVarUpdate() = default;
+
+  // Performs this update on the referenced config variable; implemented by the
+  // derived update types.
+  virtual void Apply() const = 0;
+
+  // Applies, in ascending date order, every registered update whose date is
+  // strictly greater than config_date (upper_bound skips equal dates).
+  // Does nothing if no update has ever been registered.
+  static void ApplyUpdates(uint32_t config_date) {
+    if (!updates_) {
+      return;
+    }
+    auto it_end = updates_->end();
+    for (auto it = updates_->upper_bound(config_date); it != it_end; ++it) {
+      it->second->Apply();
+    }
+  }
+
+  // More reliable than kLastCommittedUpdateDate for actual usage
+  // (kLastCommittedUpdateDate is just a pull request merge order guard), though
+  // usually should be the same, but kLastCommittedUpdateDate may not include
+  // removal of cvars.
+  // Returns the greatest registered update date, or 0 if there are none.
+  static uint32_t GetLastUpdateDate() {
+    return (updates_ && !updates_->empty()) ? updates_->crbegin()->first : 0;
+  }
+
+ protected:
+  // Stores a reference to the caller's IConfigVar pointer (not a copy of the
+  // pointer) and registers this object in updates_ under the packed date,
+  // allocating the map on first use.
+  IConfigVarUpdate(IConfigVar* const& config_var, uint32_t year, uint32_t month,
+                   uint32_t day, uint32_t utc_hour)
+      : config_var_(config_var) {
+    if (!updates_) {
+      updates_ = new std::multimap<uint32_t, const IConfigVarUpdate*>;
+    }
+    updates_->emplace(MakeConfigVarUpdateDate(year, month, day, utc_hour),
+                      this);
+  }
+
+  // Reads the referenced pointer at call time, asserts that it is not null and
+  // returns the variable it points to.
+  IConfigVar& config_var() const {
+    assert_not_null(config_var_);
+    return *config_var_;
+  }
+
+ private:
+  // Reference to pointer to loosen initialization order requirements.
+  IConfigVar* const& config_var_;
+
+  // Updates can be initialized before these, thus initialized on demand using
+  // `new`.
+  static std::multimap<uint32_t, const IConfigVarUpdate*>* updates_;
+};
+
+// Update that resets the config variable to its default unconditionally,
+// whatever value is currently stored. Instantiated by UPDATE_from_any.
+class ConfigVarUpdateFromAny : public IConfigVarUpdate {
+ public:
+  // Forwards the variable reference and the date fields to IConfigVarUpdate.
+  ConfigVarUpdateFromAny(IConfigVar* const& config_var, uint32_t year,
+                         uint32_t month, uint32_t day, uint32_t utc_hour)
+      : IConfigVarUpdate(config_var, year, month, day, utc_hour) {}
+  void Apply() const override { config_var().ResetConfigValueToDefault(); }
+};
+
+// Update that resets the config variable to its default only when the stored
+// value still equals a given previous default of type T.
+template <typename T>
+class ConfigVarUpdate : public IConfigVarUpdate {
+ public:
+  ConfigVarUpdate(IConfigVar* const& config_var, uint32_t year, uint32_t month,
+                  uint32_t day, uint32_t utc_hour, const T& old_default_value)
+      : IConfigVarUpdate(config_var, year, month, day, utc_hour),
+        old_default_value_(old_default_value) {}
+
+  void Apply() const override {
+    IConfigVar& untyped_var = config_var();
+    const auto* typed_var = dynamic_cast<ConfigVar<T>*>(&untyped_var);
+    // A variable that still has type T and holds something other than the
+    // previous default was overridden explicitly - leave it alone.
+    if (typed_var &&
+        !(typed_var->GetTypedConfigValue() == old_default_value_)) {
+      return;
+    }
+    // Either the type has changed (unconditional reset) or the stored value is
+    // the previous default.
+    untyped_var.ResetConfigValueToDefault();
+  }
+
+ private:
+  T old_default_value_;
+};
+
+// Declares an unconditional default-value update for config variable `name`
+// dated year/month/day/utc_hour (see the UPDATE_ rules above).
+// The date must not exceed IConfigVarUpdate::kLastCommittedUpdateDate, which is
+// enforced at compile time.
+// The name, year, month, day and utc_hour arguments are pasted into the
+// identifier of the generated object, each separated by its own `_` token.
+// Writing `name_##year_` would paste the literal identifiers `name_` and
+// `year_` instead of the arguments, and make updates sharing an hour collide.
+#define UPDATE_from_any(name, year, month, day, utc_hour)                     \
+  static_assert(                                                              \
+      cvar::MakeConfigVarUpdateDate(year, month, day, utc_hour) <=            \
+          cvar::IConfigVarUpdate::kLastCommittedUpdateDate,                   \
+      "A new config variable default value update was added - raise "         \
+      "cvar::IConfigVarUpdate::kLastCommittedUpdateDate to the same date in " \
+      "base/cvar.h to ensure coherence between different pull requests "      \
+      "updating config variable defaults.");                                  \
+  namespace cv {                                                              \
+  static const cvar::ConfigVarUpdateFromAny                                   \
+      update_##name##_##year##_##month##_##day##_##utc_hour(                  \
+          cv_##name, year, month, day, utc_hour);                             \
+  }
+
+// Declares a typed default-value update for config variable `name` dated
+// year/month/day/utc_hour: the variable is reset to its new default only if it
+// still holds old_default_value (or if its type is no longer `type`).
+// The date must not exceed IConfigVarUpdate::kLastCommittedUpdateDate, which is
+// enforced at compile time.
+// The name, year, month, day and utc_hour arguments are pasted into the
+// identifier of the generated object, each separated by its own `_` token.
+// Writing `name_##year_` would paste the literal identifiers `name_` and
+// `year_` instead of the arguments, and make updates sharing an hour collide.
+#define UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, type) \
+  static_assert(                                                               \
+      cvar::MakeConfigVarUpdateDate(year, month, day, utc_hour) <=             \
+          cvar::IConfigVarUpdate::kLastCommittedUpdateDate,                    \
+      "A new config variable default value update was added - raise "          \
+      "cvar::IConfigVarUpdate::kLastCommittedUpdateDate to the same date in "  \
+      "base/cvar.h to ensure coherence between different pull requests "       \
+      "updating config variable defaults.");                                   \
+  namespace cv {                                                               \
+  static const cvar::ConfigVarUpdate<type>                                     \
+      update_##name##_##year##_##month##_##day##_##utc_hour(                   \
+          cv_##name, year, month, day, utc_hour, old_default_value);           \
+  }
+
+#define UPDATE_from_bool(name, year, month, day, utc_hour, old_default_value) \
+  UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, bool)
+
+#define UPDATE_from_int32(name, year, month, day, utc_hour, old_default_value) \
+  UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, int32_t)
+
+#define UPDATE_from_uint32(name, year, month, day, utc_hour, \
+                           old_default_value)                \
+  UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, uint32_t)
+
+#define UPDATE_from_uint64(name, year, month, day, utc_hour, \
+                           old_default_value)                \
+  UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, uint64_t)
+
+#define UPDATE_from_double(name, year, month, day, utc_hour, \
+                           old_default_value)                \
+  UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, double)
+
+#define UPDATE_from_string(name, year, month, day, utc_hour, \
+                           old_default_value)                \
+  UPDATE_CVar(name, year, month, day, utc_hour, old_default_value, std::string)
+
+#define UPDATE_from_path(name, year, month, day, utc_hour, old_default_value) \
+  UPDATE_CVar(name, year, month, day, utc_hour, old_default_value,            \
+              std::filesystem::path)
+
 }  // namespace cvar

 #endif  // XENIA_CVAR_H_
--- a/src/xenia/base/fuzzy.cc
+++ b/src/xenia/base/fuzzy.cc
@ -9,6 +9,7 @@

 #include "xenia/base/fuzzy.h"

+#include <cctype>
 #include <cstring>
 #include <iostream>

--- a/src/xenia/base/hash.h
+++ b/src/xenia/base/hash.h
@ -12,6 +12,8 @@

 #include <cstddef>

+#include "xenia/base/xxhash.h"
+
 namespace xe {
 namespace hash {

@ -24,6 +26,13 @@ struct IdentityHasher {
  size_t operator()(const Key& key) const { return static_cast<size_t>(key); }
 };

+// Hasher that passes the sizeof(key) bytes at the key's address to XXH3_64bits
+// and converts the result to size_t.
+// NOTE(review): the raw bytes hashed include any padding inside Key, so this
+// presumably expects key types without uninitialized padding - confirm at the
+// use sites.
+template <typename Key>
+struct XXHasher {
+  size_t operator()(const Key& key) const {
+    return static_cast<size_t>(XXH3_64bits(&key, sizeof(key)));
+  }
+};
+
 }  // namespace hash
 }  // namespace xe

--- a/src/xenia/base/main.h
+++ b/src/xenia/base/main.h
@ -10,6 +10,7 @@
 #ifndef XENIA_BASE_MAIN_H_
 #define XENIA_BASE_MAIN_H_

+#include <optional>
 #include <string>
 #include <vector>

@ -25,19 +26,26 @@ bool has_console_attached();
 // launch.
 struct EntryInfo {
  std::string name;
-  std::string positional_usage;
-  std::vector<std::string> positional_options;
  int (*entry_point)(const std::vector<std::string>& args);
+  bool transparent_options;  // no argument parsing
+  std::optional<std::string> positional_usage;
+  std::optional<std::vector<std::string>> positional_options;
 };
 EntryInfo GetEntryInfo();

 #define DEFINE_ENTRY_POINT(name, entry_point, positional_usage, ...)       \
  xe::EntryInfo xe::GetEntryInfo() {                                       \
    std::initializer_list<std::string> positional_options = {__VA_ARGS__}; \
-    return xe::EntryInfo(                                                  \
-        {name, positional_usage,                                           \
-         std::vector<std::string>(std::move(positional_options)),          \
-         entry_point});                                                    \
+    return xe::EntryInfo{                                                  \
+        name, entry_point, false, positional_usage,                        \
+        std::vector<std::string>(std::move(positional_options))};          \
+  }
+
+// TODO(Joel Linn): Add some way to filter consumed arguments in
+// cvar::ParseLaunchArguments()
+#define DEFINE_ENTRY_POINT_TRANSPARENT(name, entry_point)                      \
+  xe::EntryInfo xe::GetEntryInfo() {                                           \
+    return xe::EntryInfo{name, entry_point, true, std::nullopt, std::nullopt}; \
  }

 }  // namespace xe
--- a/src/xenia/base/main_posix.cc
+++ b/src/xenia/base/main_posix.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -23,8 +23,10 @@ bool has_console_attached() { return true; }
 extern "C" int main(int argc, char** argv) {
  auto entry_info = xe::GetEntryInfo();

-  cvar::ParseLaunchArguments(argc, argv, entry_info.positional_usage,
-                             entry_info.positional_options);
+  if (!entry_info.transparent_options) {
+    cvar::ParseLaunchArguments(argc, argv, entry_info.positional_usage.value(),
+                               entry_info.positional_options.value());
+  }

  std::vector<std::string> args;
  for (int n = 0; n < argc; n++) {
--- a/src/xenia/base/main_win.cc
+++ b/src/xenia/base/main_win.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -104,8 +104,10 @@ static bool parse_launch_arguments(const xe::EntryInfo& entry_info,

  LocalFree(wargv);

-  cvar::ParseLaunchArguments(argc, argv, entry_info.positional_usage,
-                             entry_info.positional_options);
+  if (!entry_info.transparent_options) {
+    cvar::ParseLaunchArguments(argc, argv, entry_info.positional_usage.value(),
+                               entry_info.positional_options.value());
+  }

  args.clear();
  for (int n = 0; n < argc; n++) {
--- a/src/xenia/base/mapped_memory_win.cc
+++ b/src/xenia/base/mapped_memory_win.cc
@ -18,6 +18,11 @@
 #include "xenia/base/memory.h"
 #include "xenia/base/platform_win.h"

+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP | \
+                            WINAPI_PARTITION_SYSTEM | WINAPI_PARTITION_GAMES)
+#define XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
+#endif
+
 namespace xe {

 class Win32MappedMemory : public MappedMemory {
@ -70,7 +75,7 @@ class Win32MappedMemory : public MappedMemory {
    size_t aligned_length = length + (offset - aligned_offset);

    UnmapViewOfFile(data_);
-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
    data_ = MapViewOfFile(mapping_handle, view_access_, aligned_offset >> 32,
                          aligned_offset & 0xFFFFFFFF, aligned_length);
 #else
@ -139,7 +144,7 @@ std::unique_ptr<MappedMemory> MappedMemory::Open(
    return nullptr;
  }

-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
  mm->mapping_handle = CreateFileMapping(
      mm->file_handle, nullptr, mapping_protect, DWORD(aligned_length >> 32),
      DWORD(aligned_length), nullptr);
@ -152,7 +157,7 @@ std::unique_ptr<MappedMemory> MappedMemory::Open(
    return nullptr;
  }

-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
  mm->data_ = reinterpret_cast<uint8_t*>(MapViewOfFile(
      mm->mapping_handle, view_access, DWORD(aligned_offset >> 32),
      DWORD(aligned_offset), aligned_length));
@ -257,7 +262,7 @@ class Win32ChunkedMappedMemoryWriter : public ChunkedMappedMemoryWriter {
        return false;
      }

-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
      mapping_handle_ =
          CreateFileMapping(file_handle_, nullptr, mapping_protect,
                            DWORD(capacity_ >> 32), DWORD(capacity_), nullptr);
@ -275,11 +280,11 @@ class Win32ChunkedMappedMemoryWriter : public ChunkedMappedMemoryWriter {
      if (low_address_space) {
        bool successful = false;
        data_ = reinterpret_cast<uint8_t*>(0x10000000);
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifndef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
        HANDLE process = GetCurrentProcess();
 #endif
        for (int i = 0; i < 1000; ++i) {
-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
          if (MapViewOfFileEx(mapping_handle_, view_access, 0, 0, capacity_,
                              data_)) {
            successful = true;
@ -311,7 +316,7 @@ class Win32ChunkedMappedMemoryWriter : public ChunkedMappedMemoryWriter {
          }
        }
      } else {
-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MAPPED_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
        data_ = reinterpret_cast<uint8_t*>(
            MapViewOfFile(mapping_handle_, view_access, 0, 0, capacity_));
 #else
--- a/src/xenia/base/math.h
+++ b/src/xenia/base/math.h
@ -17,6 +17,16 @@
 #include <limits>
 #include <numeric>
 #include <type_traits>
+
+#if defined __has_include
+#if __has_include(<version>)
+#include <version>
+#endif
+#endif
+#if __cpp_lib_bitops
+#include <bit>
+#endif
+
 #include "xenia/base/platform.h"

 #if XE_ARCH_AMD64
@ -50,8 +60,20 @@ constexpr T round_up(T value, V multiple, bool force_non_zero = true) {
  return (value + multiple - 1) / multiple * multiple;
 }

-constexpr float saturate(float value) {
-  return std::max(std::min(1.0f, value), -1.0f);
+// Using the same conventions as in shading languages, returning 0 for NaN.
+// std::max is `a < b ? b : a`, thus in case of NaN, the first argument is
+// always returned. Also -0 is not < +0, so +0 is also chosen for it.
+template <typename T>
+constexpr T saturate_unsigned(T value) {
+  return std::min(static_cast<T>(1.0f), std::max(static_cast<T>(0.0f), value));
+}
+
+// This diverges from the GPU NaN rules for signed normalized formats (NaN
+// should be converted to 0, not to -1), but this expectation is not needed most
+// of the time, and cannot be met for free (unlike for 0...1 clamping).
+template <typename T>
+constexpr T saturate_signed(T value) {
+  return std::min(static_cast<T>(1.0f), std::max(static_cast<T>(-1.0f), value));
 }

 // Gets the next power of two value that is greater than or equal to the given
@ -104,6 +126,23 @@ constexpr uint32_t select_bits(uint32_t value, uint32_t a, uint32_t b) {
  return (value & make_bitmask(a, b)) >> a;
 }

+#if __cpp_lib_bitops
+template <class T>
+constexpr inline uint32_t bit_count(T v) {
+  return static_cast<uint32_t>(std::popcount(v));
+}
+#else
+#if XE_COMPILER_MSVC || XE_COMPILER_INTEL
+inline uint32_t bit_count(uint32_t v) { return __popcnt(v); }
+inline uint32_t bit_count(uint64_t v) {
+  return static_cast<uint32_t>(__popcnt64(v));
+}
+#elif XE_COMPILER_GCC || XE_COMPILER_CLANG
+static_assert(sizeof(unsigned int) == sizeof(uint32_t));
+static_assert(sizeof(unsigned long long) == sizeof(uint64_t));
+inline uint32_t bit_count(uint32_t v) { return __builtin_popcount(v); }
+inline uint32_t bit_count(uint64_t v) { return __builtin_popcountll(v); }
+#else
 inline uint32_t bit_count(uint32_t v) {
  v = v - ((v >> 1) & 0x55555555);
  v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
@ -119,6 +158,8 @@ inline uint32_t bit_count(uint64_t v) {
  v = v + (v >> 32) & 0x0000007F;
  return static_cast<uint32_t>(v);
 }
+#endif
+#endif

 // lzcnt instruction, typed for integers of all sizes.
 // The number of leading zero bits in the value parameter. If value is zero, the
@ -245,7 +286,7 @@ inline bool bit_scan_forward(uint32_t v, uint32_t* out_first_set_index) {
  return i != 0;
 }
 inline bool bit_scan_forward(uint64_t v, uint32_t* out_first_set_index) {
-  int i = ffsll(v);
+  int i = __builtin_ffsll(v);
  *out_first_set_index = i - 1;
  return i != 0;
 }
--- a/src/xenia/base/memory.cc
+++ b/src/xenia/base/memory.cc
@ -43,6 +43,16 @@ void copy_128_aligned(void* dest, const void* src, size_t count) {
 }

 #if XE_ARCH_AMD64
+
+// This works around a GCC bug
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100801
+// TODO(Joel Linn): Remove this when fixed GCC versions are commonplace.
+#if XE_COMPILER_GNUC
+#define XE_WORKAROUND_LOOP_KILL_MOD(x) \
+  if ((count % (x)) == 0) __builtin_unreachable();
+#else
+#define XE_WORKAROUND_LOOP_KILL_MOD(x)
+#endif
 void copy_and_swap_16_aligned(void* dest_ptr, const void* src_ptr,
                              size_t count) {
  assert_zero(reinterpret_cast<uintptr_t>(dest_ptr) & 0xF);
@ -61,6 +71,7 @@ void copy_and_swap_16_aligned(void* dest_ptr, const void* src_ptr,
    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
  }
  for (; i < count; ++i) {  // handle residual elements
+    XE_WORKAROUND_LOOP_KILL_MOD(8);
    dest[i] = byte_swap(src[i]);
  }
 }
@ -80,6 +91,7 @@ void copy_and_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
  }
  for (; i < count; ++i) {  // handle residual elements
+    XE_WORKAROUND_LOOP_KILL_MOD(8);
    dest[i] = byte_swap(src[i]);
  }
 }
@ -102,6 +114,7 @@ void copy_and_swap_32_aligned(void* dest_ptr, const void* src_ptr,
    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
  }
  for (; i < count; ++i) {  // handle residual elements
+    XE_WORKAROUND_LOOP_KILL_MOD(4);
    dest[i] = byte_swap(src[i]);
  }
 }
@ -121,6 +134,7 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
  }
  for (; i < count; ++i) {  // handle residual elements
+    XE_WORKAROUND_LOOP_KILL_MOD(4);
    dest[i] = byte_swap(src[i]);
  }
 }
@ -143,6 +157,7 @@ void copy_and_swap_64_aligned(void* dest_ptr, const void* src_ptr,
    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
  }
  for (; i < count; ++i) {  // handle residual elements
+    XE_WORKAROUND_LOOP_KILL_MOD(2);
    dest[i] = byte_swap(src[i]);
  }
 }
@ -162,6 +177,7 @@ void copy_and_swap_64_unaligned(void* dest_ptr, const void* src_ptr,
    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
  }
  for (; i < count; ++i) {  // handle residual elements
+    XE_WORKAROUND_LOOP_KILL_MOD(2);
    dest[i] = byte_swap(src[i]);
  }
 }
@ -178,6 +194,7 @@ void copy_and_swap_16_in_32_aligned(void* dest_ptr, const void* src_ptr,
    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
  }
  for (; i < count; ++i) {  // handle residual elements
+    XE_WORKAROUND_LOOP_KILL_MOD(4);
    dest[i] = (src[i] >> 16) | (src[i] << 16);
  }
 }
@ -194,6 +211,7 @@ void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,
    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
  }
  for (; i < count; ++i) {  // handle residual elements
+    XE_WORKAROUND_LOOP_KILL_MOD(4);
    dest[i] = (src[i] >> 16) | (src[i] << 16);
  }
 }
--- a/src/xenia/base/memory.h
+++ b/src/xenia/base/memory.h
@ -15,6 +15,7 @@
 #include <filesystem>
 #include <functional>
 #include <string>
+#include <string_view>

 #include "xenia/base/assert.h"
 #include "xenia/base/byte_order.h"
@ -441,6 +442,26 @@ inline void store_and_swap<std::u16string>(void* mem,
  return store_and_swap<std::u16string_view>(mem, value);
 }

+using fourcc_t = uint32_t;
+
+// Get FourCC in host byte order
+// make_fourcc('a', 'b', 'c', 'd') == 0x61626364
+// `a` ends up in the most significant byte, `d` in the least significant one.
+// Each character is converted through unsigned char first so that bytes with
+// the high bit set are not sign-extended into the upper bytes on platforms
+// where plain char is signed.
+constexpr inline fourcc_t make_fourcc(char a, char b, char c, char d) {
+  return fourcc_t(
+      (static_cast<fourcc_t>(static_cast<unsigned char>(a)) << 24) |
+      (static_cast<fourcc_t>(static_cast<unsigned char>(b)) << 16) |
+      (static_cast<fourcc_t>(static_cast<unsigned char>(c)) << 8) |
+      static_cast<fourcc_t>(static_cast<unsigned char>(d)));
+}
+
+// Get FourCC in host byte order
+// This overload requires fourcc.length() == 4
+// make_fourcc("abcd") == 'abcd' == 0x61626364 for most compilers
+// Throws std::runtime_error("Invalid fourcc length") for any other length; in
+// a constant expression the throw makes the evaluation ill-formed instead.
+constexpr inline fourcc_t make_fourcc(const std::string_view fourcc) {
+  if (fourcc.length() != 4) {
+    throw std::runtime_error("Invalid fourcc length");
+  }
+  return make_fourcc(fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
+}
+
 }  // namespace xe

 #endif  // XENIA_BASE_MEMORY_H_
--- a/src/xenia/base/memory_win.cc
+++ b/src/xenia/base/memory_win.cc
@ -11,6 +11,11 @@

 #include "xenia/base/platform_win.h"

+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP | \
+                            WINAPI_PARTITION_SYSTEM | WINAPI_PARTITION_GAMES)
+#define XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
+#endif
+
 namespace xe {
 namespace memory {

@ -75,12 +80,11 @@ PageAccess ToXeniaProtectFlags(DWORD access) {
 }

 bool IsWritableExecutableMemorySupported() {
-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
  return true;
 #else
-  // To test FromApp functions on desktop, replace
-  // WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) with 0 in the #ifs and
-  // link to WindowsApp.lib.
+  // To test FromApp functions on desktop, undefine
+  // XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS and link to WindowsApp.lib.
  return false;
 #endif
 }
@ -103,7 +107,7 @@ void* AllocFixed(void* base_address, size_t length,
      break;
  }
  DWORD protect = ToWin32ProtectFlags(access);
-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
  return VirtualAlloc(base_address, length, alloc_type, protect);
 #else
  return VirtualAllocFromApp(base_address, length, ULONG(alloc_type),
@ -135,7 +139,7 @@ bool Protect(void* base_address, size_t length, PageAccess access,
    *out_old_access = PageAccess::kNoAccess;
  }
  DWORD new_protect = ToWin32ProtectFlags(access);
-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
  DWORD old_protect = 0;
  BOOL result = VirtualProtect(base_address, length, new_protect, &old_protect);
 #else
@ -174,7 +178,7 @@ FileMappingHandle CreateFileMappingHandle(const std::filesystem::path& path,
  DWORD protect =
      ToWin32ProtectFlags(access) | (commit ? SEC_COMMIT : SEC_RESERVE);
  auto full_path = "Local" / path;
-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
  return CreateFileMappingW(INVALID_HANDLE_VALUE, nullptr, protect,
                            static_cast<DWORD>(length >> 32),
                            static_cast<DWORD>(length), full_path.c_str());
@ -191,7 +195,7 @@ void CloseFileMappingHandle(FileMappingHandle handle,

 void* MapFileView(FileMappingHandle handle, void* base_address, size_t length,
                  PageAccess access, size_t file_offset) {
-#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
  DWORD target_address_low = static_cast<DWORD>(file_offset);
  DWORD target_address_high = static_cast<DWORD>(file_offset >> 32);
  DWORD file_access = 0;
--- a/src/xenia/base/platform.h
+++ b/src/xenia/base/platform.h
@ -85,18 +85,17 @@
 #endif  // XE_PLATFORM_MAC

 #if XE_COMPILER_MSVC
-#define XEPACKEDSTRUCT(name, value) \
-  __pragma(pack(push, 1)) struct name value __pragma(pack(pop));
-#define XEPACKEDSTRUCTANONYMOUS(value) \
-  __pragma(pack(push, 1)) struct value __pragma(pack(pop));
-#define XEPACKEDUNION(name, value) \
-  __pragma(pack(push, 1)) union name value __pragma(pack(pop));
+#define _XEPACKEDSCOPE(body) __pragma(pack(push, 1)) body __pragma(pack(pop));
 #else
-#define XEPACKEDSTRUCT(name, value) struct __attribute__((packed)) name value;
-#define XEPACKEDSTRUCTANONYMOUS(value) struct __attribute__((packed)) value;
-#define XEPACKEDUNION(name, value) union __attribute__((packed)) name value;
+#define _XEPACKEDSCOPE(body)     \
+  _Pragma("pack(push, 1)") body; \
+  _Pragma("pack(pop)");
-#endif  // XE_PLATFORM_WIN32
+#endif  // XE_COMPILER_MSVC

+// Declare a struct/union under 1-byte packing (pack(push, 1) ... pack(pop)).
+// `value` supplies the braced member list that follows the struct/union
+// keyword (and name, if any); the macros add the terminating semicolon.
+#define XEPACKEDSTRUCT(name, value) _XEPACKEDSCOPE(struct name value)
+#define XEPACKEDSTRUCTANONYMOUS(value) _XEPACKEDSCOPE(struct value)
+#define XEPACKEDUNION(name, value) _XEPACKEDSCOPE(union name value)
+
 namespace xe {

 #if XE_PLATFORM_WIN32
--- a/src/xenia/base/platform_win.h
+++ b/src/xenia/base/platform_win.h
@ -22,6 +22,7 @@
 #define NOMINMAX
 #include <ObjBase.h>
 #include <SDKDDKVer.h>
+#include <bcrypt.h>
 #include <dwmapi.h>
 #include <shellapi.h>
 #include <shlwapi.h>
--- a/src/xenia/base/string_key.h
+++ b/src/xenia/base/string_key.h
@ -87,12 +87,12 @@ struct string_key_case : internal::string_key_base {

 namespace std {
 template <>
-struct std::hash<xe::string_key> {
+struct hash<xe::string_key> {
  std::size_t operator()(const xe::string_key& t) const { return t.hash(); }
 };

 template <>
-struct std::hash<xe::string_key_case> {
+struct hash<xe::string_key_case> {
  std::size_t operator()(const xe::string_key_case& t) const {
    return t.hash();
  }
--- a/src/xenia/base/testing/memory_test.cc
+++ b/src/xenia/base/testing/memory_test.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2015 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -18,7 +18,7 @@ namespace xe {
 namespace base {
 namespace test {

-TEST_CASE("copy_128_aligned", "Copy and Swap") {
+TEST_CASE("copy_128_aligned", "[copy_and_swap]") {
  alignas(128) uint8_t src[256], dest[256];
  for (uint8_t i = 0; i < 255; ++i) {
    src[i] = 255 - i;
@ -37,7 +37,7 @@ TEST_CASE("copy_128_aligned", "Copy and Swap") {
  REQUIRE(std::memcmp(dest, src + 1, 128));
 }

-TEST_CASE("copy_and_swap_16_aligned", "Copy and Swap") {
+TEST_CASE("copy_and_swap_16_aligned", "[copy_and_swap]") {
  alignas(16) uint16_t a = 0x1111, b = 0xABCD;
  copy_and_swap_16_aligned(&a, &b, 1);
  REQUIRE(a == 0xCDAB);
@ -93,7 +93,7 @@ TEST_CASE("copy_and_swap_16_aligned", "Copy and Swap") {
  REQUIRE(std::strcmp(f, "s atdnra dlagimnne.t") == 0);
 }

-TEST_CASE("copy_and_swap_16_unaligned", "Copy and Swap") {
+TEST_CASE("copy_and_swap_16_unaligned", "[copy_and_swap]") {
  uint16_t a = 0x1111, b = 0xABCD;
  copy_and_swap_16_unaligned(&a, &b, 1);
  REQUIRE(a == 0xCDAB);
@ -139,7 +139,7 @@ TEST_CASE("copy_and_swap_16_unaligned", "Copy and Swap") {
                      "noeg rhtnas atdnra dlagimnne.t") == 0);
 }

-TEST_CASE("copy_and_swap_32_aligned", "Copy and Swap") {
+TEST_CASE("copy_and_swap_32_aligned", "[copy_and_swap]") {
  alignas(32) uint32_t a = 0x11111111, b = 0x89ABCDEF;
  copy_and_swap_32_aligned(&a, &b, 1);
  REQUIRE(a == 0xEFCDAB89);
@ -195,7 +195,7 @@ TEST_CASE("copy_and_swap_32_aligned", "Copy and Swap") {
  REQUIRE(std::strcmp(f, "ats radnla dmngi.tne") == 0);
 }

-TEST_CASE("copy_and_swap_32_unaligned", "Copy and Swap") {
+TEST_CASE("copy_and_swap_32_unaligned", "[copy_and_swap]") {
  uint32_t a = 0x11111111, b = 0x89ABCDEF;
  copy_and_swap_32_unaligned(&a, &b, 1);
  REQUIRE(a == 0xEFCDAB89);
@ -259,7 +259,7 @@ TEST_CASE("copy_and_swap_32_unaligned", "Copy and Swap") {
                      "regnahtats radnla dmngi.tne") == 0);
 }

-TEST_CASE("copy_and_swap_64_aligned", "Copy and Swap") {
+TEST_CASE("copy_and_swap_64_aligned", "[copy_and_swap]") {
  alignas(64) uint64_t a = 0x1111111111111111, b = 0x0123456789ABCDEF;
  copy_and_swap_64_aligned(&a, &b, 1);
  REQUIRE(a == 0xEFCDAB8967452301);
@ -317,7 +317,7 @@ TEST_CASE("copy_and_swap_64_aligned", "Copy and Swap") {
  REQUIRE(std::strcmp(f, "radnats mngila d") == 0);
 }

-TEST_CASE("copy_and_swap_64_unaligned", "Copy and Swap") {
+TEST_CASE("copy_and_swap_64_unaligned", "[copy_and_swap]") {
  uint64_t a = 0x1111111111111111, b = 0x0123456789ABCDEF;
  copy_and_swap_64_unaligned(&a, &b, 1);
  REQUIRE(a == 0xEFCDAB8967452301);
@ -407,12 +407,12 @@ TEST_CASE("copy_and_swap_64_unaligned", "Copy and Swap") {
                      "regradnats mngila d") == 0);
 }

-TEST_CASE("copy_and_swap_16_in_32_aligned", "Copy and Swap") {
+TEST_CASE("copy_and_swap_16_in_32_aligned", "[copy_and_swap]") {
  // TODO(bwrsandman): test once properly understood.
  REQUIRE(true == true);
 }

-TEST_CASE("copy_and_swap_16_in_32_unaligned", "Copy and Swap") {
+TEST_CASE("copy_and_swap_16_in_32_unaligned", "[copy_and_swap]") {
  // TODO(bwrsandman): test once properly understood.
  REQUIRE(true == true);
 }
@ -425,7 +425,7 @@ TEST_CASE("create_and_close_file_mapping", "Virtual Memory Mapping") {
  xe::memory::CloseFileMappingHandle(memory, path);
 }

-TEST_CASE("map_view", "Virtual Memory Mapping") {
+TEST_CASE("map_view", "[virtual_memory_mapping]") {
  auto path = fmt::format("xenia_test_{}", Clock::QueryHostTickCount());
  const size_t length = 0x100;
  auto memory = xe::memory::CreateFileMappingHandle(
@ -442,7 +442,7 @@ TEST_CASE("map_view", "Virtual Memory Mapping") {
  xe::memory::CloseFileMappingHandle(memory, path);
 }

-TEST_CASE("read_write_view", "Virtual Memory Mapping") {
+TEST_CASE("read_write_view", "[virtual_memory_mapping]") {
  const size_t length = 0x100;
  auto path = fmt::format("xenia_test_{}", Clock::QueryHostTickCount());
  auto memory = xe::memory::CreateFileMappingHandle(
@ -469,6 +469,40 @@ TEST_CASE("read_write_view", "Virtual Memory Mapping") {
  xe::memory::CloseFileMappingHandle(memory, path);
 }

+TEST_CASE("make_fourcc", "[fourcc]") {
+  // Both overloads must place the first character in the most significant
+  // byte of the resulting 32-bit value.
+  SECTION("'1234'") {
+    constexpr fourcc_t from_chars = make_fourcc('1', '2', '3', '4');
+    constexpr fourcc_t from_string = make_fourcc("1234");
+    const uint32_t expected = 0x31323334;
+    REQUIRE(from_chars == expected);
+    REQUIRE(from_string == expected);
+    REQUIRE(from_chars == from_string);
+    REQUIRE(from_string == from_chars);
+  }
+
+  SECTION("'ABcd'") {
+    constexpr fourcc_t from_chars = make_fourcc('A', 'B', 'c', 'd');
+    constexpr fourcc_t from_string = make_fourcc("ABcd");
+    const uint32_t expected = 0x41426364;
+    REQUIRE(from_chars == expected);
+    REQUIRE(from_string == expected);
+    REQUIRE(from_chars == from_string);
+    REQUIRE(from_string == from_chars);
+  }
+
+  // A trailing NUL can only be expressed through the character overload.
+  SECTION("'XEN\\0'") {
+    constexpr fourcc_t packed = make_fourcc('X', 'E', 'N', '\0');
+    const uint32_t expected = 0x58454E00;
+    REQUIRE(packed == expected);
+  }
+
+  // The literals are converted to std::string_view through const char*, so
+  // their length is measured up to the first NUL: the first two cases have
+  // length 2, the last one has length 8.
+  SECTION("length()!=4") {
+    REQUIRE_THROWS(make_fourcc("AB\0\0"));
+    REQUIRE_THROWS(make_fourcc("AB\0\0AB"));
+    REQUIRE_THROWS(make_fourcc("ABCDEFGH"));
+  }
+}
+
 }  // namespace test
 }  // namespace base
 }  // namespace xe
--- a/src/xenia/base/testing/threading_test.cc
+++ b/src/xenia/base/testing/threading_test.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
-* Copyright 2018 Ben Vanik. All rights reserved.                             *
+* Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -84,17 +84,17 @@ TEST_CASE("Enable process to set thread affinity") {
  EnableAffinityConfiguration();
 }

-TEST_CASE("Yield Current Thread", "MaybeYield") {
+TEST_CASE("Yield Current Thread", "[maybe_yield]") {
  // Run to see if there are any errors
  MaybeYield();
 }

-TEST_CASE("Sync with Memory Barrier", "SyncMemory") {
+TEST_CASE("Sync with Memory Barrier", "[sync_memory]") {
  // Run to see if there are any errors
  SyncMemory();
 }

-TEST_CASE("Sleep Current Thread", "Sleep") {
+TEST_CASE("Sleep Current Thread", "[sleep]") {
  auto wait_time = 50ms;
  auto start = std::chrono::steady_clock::now();
  Sleep(wait_time);
@ -102,7 +102,7 @@ TEST_CASE("Sleep Current Thread", "Sleep") {
  REQUIRE(duration >= wait_time);
 }

-TEST_CASE("Sleep Current Thread in Alertable State", "Sleep") {
+TEST_CASE("Sleep Current Thread in Alertable State", "[sleep]") {
  auto wait_time = 50ms;
  auto start = std::chrono::steady_clock::now();
  auto result = threading::AlertableSleep(wait_time);
@ -154,7 +154,7 @@ TEST_CASE("HighResolutionTimer") {
  // Time the actual sleep duration
  {
    const auto interval = 50ms;
-    std::atomic<uint64_t> counter;
+    std::atomic<uint64_t> counter(0);
    auto start = std::chrono::steady_clock::now();
    auto cb = [&counter] { ++counter; };
    auto pTimer = HighResolutionTimer::CreateRepeating(interval, cb);
@ -201,7 +201,7 @@ TEST_CASE("HighResolutionTimer") {
  // spawned from differing threads
 }

-TEST_CASE("Wait on Multiple Handles", "Wait") {
+TEST_CASE("Wait on Multiple Handles", "[wait]") {
  auto mutant = Mutant::Create(true);
  auto semaphore = Semaphore::Create(10, 10);
  auto event_ = Event::CreateManualResetEvent(false);
@ -244,7 +244,7 @@ TEST_CASE("Signal and Wait") {
  REQUIRE(result == WaitResult::kSuccess);
 }

-TEST_CASE("Wait on Event", "Event") {
+TEST_CASE("Wait on Event", "[event]") {
  auto evt = Event::CreateAutoResetEvent(false);
  WaitResult result;

@ -262,7 +262,7 @@ TEST_CASE("Wait on Event", "Event") {
  REQUIRE(result == WaitResult::kTimeout);
 }

-TEST_CASE("Reset Event", "Event") {
+TEST_CASE("Reset Event", "[event]") {
  auto evt = Event::CreateAutoResetEvent(false);
  WaitResult result;

@ -283,7 +283,7 @@ TEST_CASE("Reset Event", "Event") {
  REQUIRE(result == WaitResult::kSuccess);
 }

-TEST_CASE("Wait on Multiple Events", "Event") {
+TEST_CASE("Wait on Multiple Events", "[event]") {
  auto events = std::array<std::unique_ptr<Event>, 4>{
      Event::CreateAutoResetEvent(false),
      Event::CreateAutoResetEvent(false),
@ -348,7 +348,7 @@ TEST_CASE("Wait on Multiple Events", "Event") {
  // REQUIRE(order[3] == '3');
 }

-TEST_CASE("Wait on Semaphore", "Semaphore") {
+TEST_CASE("Wait on Semaphore", "[semaphore]") {
  WaitResult result;
  std::unique_ptr<Semaphore> sem;
  int previous_count = 0;
@ -406,9 +406,13 @@ TEST_CASE("Wait on Semaphore", "Semaphore") {
  sem = Semaphore::Create(5, 5);
  Sleep(10ms);
  // Occupy the semaphore with 5 threads
-  auto func = [&sem] {
+  // Number of worker threads that have returned from their Wait() call.
+  std::atomic<int> wait_count(0);
+  // Written by the test thread and polled by the workers, so it must be
+  // atomic: volatile gives no inter-thread synchronization and a plain
+  // read/write here is a data race.
+  std::atomic<bool> threads_terminate(false);
+  auto func = [&sem, &wait_count, &threads_terminate] {
     auto res = Wait(sem.get(), false, 100ms);
-    Sleep(500ms);
+    wait_count++;
+    // Hold on to the semaphore until the test thread has finished its check.
+    while (!threads_terminate) {
+    }
    if (res == WaitResult::kSuccess) {
      sem->Release(1, nullptr);
    }
@ -417,12 +421,14 @@ TEST_CASE("Wait on Semaphore", "Semaphore") {
      std::thread(func), std::thread(func), std::thread(func),
      std::thread(func), std::thread(func),
  };
-  // Give threads time to acquire semaphore
-  Sleep(10ms);
+  // Wait for threads to finish semaphore calls
+  while (wait_count != 5) {
+  }
  // Attempt to acquire full semaphore with current (6th) thread
  result = Wait(sem.get(), false, 20ms);
  REQUIRE(result == WaitResult::kTimeout);
  // Give threads time to release semaphore
+  threads_terminate = true;
  for (auto& t : threads) {
    t.join();
  }
@ -444,7 +450,7 @@ TEST_CASE("Wait on Semaphore", "Semaphore") {
  // REQUIRE(sem.get() == nullptr);
 }

-TEST_CASE("Wait on Multiple Semaphores", "Semaphore") {
+TEST_CASE("Wait on Multiple Semaphores", "[semaphore]") {
  WaitResult all_result;
  std::pair<WaitResult, size_t> any_result;
  int previous_count;
@ -501,7 +507,7 @@ TEST_CASE("Wait on Multiple Semaphores", "Semaphore") {
  REQUIRE(previous_count == 4);
 }

-TEST_CASE("Wait on Mutant", "Mutant") {
+TEST_CASE("Wait on Mutant", "[mutant]") {
  WaitResult result;
  std::unique_ptr<Mutant> mut;

@ -558,7 +564,7 @@ TEST_CASE("Wait on Mutant", "Mutant") {
  REQUIRE(mut->Release());
 }

-TEST_CASE("Wait on Multiple Mutants", "Mutant") {
+TEST_CASE("Wait on Multiple Mutants", "[mutant]") {
  WaitResult all_result;
  std::pair<WaitResult, size_t> any_result;
  std::unique_ptr<Mutant> mut0, mut1;
@ -621,7 +627,7 @@ TEST_CASE("Wait on Multiple Mutants", "Mutant") {
  thread2.join();
 }

-TEST_CASE("Wait on Timer", "Timer") {
+TEST_CASE("Wait on Timer", "[timer]") {
  WaitResult result;
  std::unique_ptr<Timer> timer;

@ -686,7 +692,7 @@ TEST_CASE("Wait on Timer", "Timer") {
  REQUIRE(result == WaitResult::kTimeout);  // No more signals from repeating
 }

-TEST_CASE("Wait on Multiple Timers", "Timer") {
+TEST_CASE("Wait on Multiple Timers", "[timer]") {
  WaitResult all_result;
  std::pair<WaitResult, size_t> any_result;

@ -724,13 +730,13 @@ TEST_CASE("Wait on Multiple Timers", "Timer") {
  REQUIRE(any_result.second == 1);
 }

-TEST_CASE("Create and Trigger Timer Callbacks", "Timer") {
+TEST_CASE("Create and Trigger Timer Callbacks", "[timer]") {
  // TODO(bwrsandman): Check which thread performs callback and timing of
  // callback
  REQUIRE(true);
 }

-TEST_CASE("Set and Test Current Thread ID", "Thread") {
+TEST_CASE("Set and Test Current Thread ID", "[thread]") {
  // System ID
  auto system_id = current_thread_system_id();
  REQUIRE(system_id > 0);
@ -763,71 +769,76 @@ TEST_CASE("Set and Test Current Thread Name", "Thread") {
  REQUIRE_NOTHROW(set_name(old_thread_name));
 }

-TEST_CASE("Create and Run Thread", "Thread") {
+TEST_CASE("Create and Run Thread", "[thread]") {
  std::unique_ptr<Thread> thread;
  WaitResult result;
  Thread::CreationParameters params = {};
  auto func = [] { Sleep(20ms); };

-  // Create most basic case of thread
-  thread = Thread::Create(params, func);
-  REQUIRE(thread->native_handle() != nullptr);
-  REQUIRE_NOTHROW(thread->affinity_mask());
-  REQUIRE(thread->name().empty());
-  result = Wait(thread.get(), false, 50ms);
-  REQUIRE(result == WaitResult::kSuccess);
+  SECTION("Create most basic case of thread") {
+    thread = Thread::Create(params, func);
+    REQUIRE(thread->native_handle() != nullptr);
+    REQUIRE_NOTHROW(thread->affinity_mask());
+    REQUIRE(thread->name().empty());
+    result = Wait(thread.get(), false, 50ms);
+    REQUIRE(result == WaitResult::kSuccess);
+  }

-  // Add thread name
-  std::string new_name = "Test thread name";
-  thread = Thread::Create(params, func);
-  auto name = thread->name();
-  INFO(name.c_str());
-  REQUIRE(name.empty());
-  thread->set_name(new_name);
-  REQUIRE(thread->name() == new_name);
-  result = Wait(thread.get(), false, 50ms);
-  REQUIRE(result == WaitResult::kSuccess);
+  SECTION("Add thread name") {
+    std::string new_name = "Test thread name";
+    thread = Thread::Create(params, func);
+    auto name = thread->name();
+    INFO(name.c_str());
+    REQUIRE(name.empty());
+    thread->set_name(new_name);
+    REQUIRE(thread->name() == new_name);
+    result = Wait(thread.get(), false, 50ms);
+    REQUIRE(result == WaitResult::kSuccess);
+  }

-  // Use Terminate to end an infinitely looping thread
-  thread = Thread::Create(params, [] {
-    while (true) {
-      Sleep(1ms);
-    }
-  });
-  result = Wait(thread.get(), false, 50ms);
-  REQUIRE(result == WaitResult::kTimeout);
-  thread->Terminate(-1);
-  result = Wait(thread.get(), false, 50ms);
-  REQUIRE(result == WaitResult::kSuccess);
+  SECTION("Use Terminate to end an infinitely looping thread") {
+    thread = Thread::Create(params, [] {
+      while (true) {
+        Sleep(1ms);
+      }
+    });
+    result = Wait(thread.get(), false, 50ms);
+    REQUIRE(result == WaitResult::kTimeout);
+    thread->Terminate(-1);
+    result = Wait(thread.get(), false, 50ms);
+    REQUIRE(result == WaitResult::kSuccess);
+  }

-  // Call Exit from inside an infinitely looping thread
-  thread = Thread::Create(params, [] {
-    while (true) {
+  SECTION("Call Exit from inside an infinitely looping thread") {
+    thread = Thread::Create(params, [] {
      Thread::Exit(-1);
-    }
-  });
-  result = Wait(thread.get(), false, 50ms);
-  REQUIRE(result == WaitResult::kSuccess);
+      FAIL("Function must not return");
+    });
+    result = Wait(thread.get(), false, 50ms);
+    REQUIRE(result == WaitResult::kSuccess);
+  }

-  // Call timeout wait on self
-  result = Wait(Thread::GetCurrentThread(), false, 50ms);
-  REQUIRE(result == WaitResult::kTimeout);
+  SECTION("Call timeout wait on self") {
+    result = Wait(Thread::GetCurrentThread(), false, 50ms);
+    REQUIRE(result == WaitResult::kTimeout);
+  }

-  params.stack_size = 16 * 1024 * 1024;
-  thread = Thread::Create(params, [] {
-    while (true) {
+  SECTION("16MB stack size") {
+    // 16 * 1024 * 1024 bytes, i.e. 16 MiB.
+    params.stack_size = 16 * 1024 * 1024;
+    thread = Thread::Create(params, [] {
       Thread::Exit(-1);
-    }
-  });
-  REQUIRE(thread != nullptr);
-  result = Wait(thread.get(), false, 50ms);
-  REQUIRE(result == WaitResult::kSuccess);
+      FAIL("Function must not return");
+    });
+    REQUIRE(thread != nullptr);
+    result = Wait(thread.get(), false, 50ms);
+    REQUIRE(result == WaitResult::kSuccess);
+  }

  // TODO(bwrsandman): Test with different priorities
  // TODO(bwrsandman): Test setting and getting thread affinity
 }

-TEST_CASE("Test Suspending Thread", "Thread") {
+TEST_CASE("Test Suspending Thread", "[thread]") {
  std::unique_ptr<Thread> thread;
  WaitResult result;
  Thread::CreationParameters params = {};
@ -888,7 +899,7 @@ TEST_CASE("Test Suspending Thread", "Thread") {
  REQUIRE(result == threading::WaitResult::kSuccess);
 }

-TEST_CASE("Test Thread QueueUserCallback", "Thread") {
+TEST_CASE("Test Thread QueueUserCallback", "[thread]") {
  std::unique_ptr<Thread> thread;
  WaitResult result;
  Thread::CreationParameters params = {};
--- a/src/xenia/base/testing/utf8_test.cc
+++ b/src/xenia/base/testing/utf8_test.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -16,16 +16,220 @@

 namespace xe::base::test {

+// TODO(gibbed): bit messy?
+// TODO(gibbed): predicate variant?
+
+// Checks a single example: func(left) must compare equal to right.
+#define TEST_EXAMPLE(func, left, right) REQUIRE(func(left) == right)
+
+// TEST_EXAMPLES_N checks the first N entries of examples::k<language>Values
+// against the matching entries of results.<language>.
+#define TEST_EXAMPLES_1(func, language, results) \
+  TEST_EXAMPLE(func, examples::k##language##Values[0], results.language[0])
+#define TEST_EXAMPLES_2(func, language, results)                             \
+  TEST_EXAMPLE(func, examples::k##language##Values[0], results.language[0]); \
+  TEST_EXAMPLE(func, examples::k##language##Values[1], results.language[1])
+#define TEST_EXAMPLES_3(func, language, results)                             \
+  TEST_EXAMPLE(func, examples::k##language##Values[0], results.language[0]); \
+  TEST_EXAMPLE(func, examples::k##language##Values[1], results.language[1]); \
+  TEST_EXAMPLE(func, examples::k##language##Values[2], results.language[2])
+
+namespace examples {
+
 // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/quickbrown.txt

-TEST_CASE("utf8::split", "UTF-8 Split") {
+const size_t kDanishCount = 1;
+const char* kDanishValues[kDanishCount] = {
+    u8"Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen Wolther "
+    u8"spillede på xylofon.",
+};
+#define TEST_LANGUAGE_EXAMPLES_Danish(func, results) \
+  TEST_EXAMPLES_1(func, Danish, results)
+
+const size_t kGermanCount = 3;
+const char* kGermanValues[kGermanCount] = {
+    u8"Falsches Üben von Xylophonmusik quält jeden größeren Zwerg",
+    u8"Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich",
+    u8"Heizölrückstoßabdämpfung",
+};
+// NOTE(review): kGermanCount is 3, but TEST_EXAMPLES_2 only exercises
+// kGermanValues[0] and [1]. Moving to TEST_EXAMPLES_3 requires every results
+// table passed to this macro to fill in German[2] first.
+#define TEST_LANGUAGE_EXAMPLES_German(func, results) \
+  TEST_EXAMPLES_2(func, German, results)
+
+const size_t kGreekCount = 2;
+const char* kGreekValues[kGreekCount] = {
+    u8"Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο",
+    u8"Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία",
+};
+#define TEST_LANGUAGE_EXAMPLES_Greek(func, results) \
+  TEST_EXAMPLES_2(func, Greek, results)
+
+const size_t kEnglishCount = 1;
+const char* kEnglishValues[kEnglishCount] = {
+    u8"The quick brown fox jumps over the lazy dog",
+};
+#define TEST_LANGUAGE_EXAMPLES_English(func, results) \
+  TEST_EXAMPLES_1(func, English, results)
+
+const size_t kSpanishCount = 1;
+const char* kSpanishValues[kSpanishCount] = {
+    u8"El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, "
+    u8"añoraba a su querido cachorro.",
+};
+#define TEST_LANGUAGE_EXAMPLES_Spanish(func, results) \
+  TEST_EXAMPLES_1(func, Spanish, results)
+
+const size_t kFrenchCount = 3;
+const char* kFrenchValues[kFrenchCount] = {
+    u8"Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à "
+    u8"côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce qui "
+    u8"lui permet de penser à la cænogenèse de l'être dont il est question "
+    u8"dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui, "
+    u8"pense-t-il, diminue çà et là la qualité de son œuvre.",
+    u8"l'île exiguë\n"
+    u8"Où l'obèse jury mûr\n"
+    u8"Fête l'haï volapük,\n"
+    u8"Âne ex aéquo au whist,\n"
+    u8"Ôtez ce vœu déçu.",
+    u8"Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en canoë "
+    u8"au delà des îles, près du mälström où brûlent les novæ.",
+};
+#define TEST_LANGUAGE_EXAMPLES_French(func, results) \
+  TEST_EXAMPLES_3(func, French, results)
+
+const size_t kIrishGaelicCount = 1;
+const char* kIrishGaelicValues[kIrishGaelicCount] = {
+    u8"D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh",
+};
+#define TEST_LANGUAGE_EXAMPLES_IrishGaelic(func, results) \
+  TEST_EXAMPLES_1(func, IrishGaelic, results)
+
+const size_t kHungarianCount = 1;
+const char* kHungarianValues[kHungarianCount] = {
+    u8"Árvíztűrő tükörfúrógép",
+};
+#define TEST_LANGUAGE_EXAMPLES_Hungarian(func, results) \
+  TEST_EXAMPLES_1(func, Hungarian, results)
+
+const size_t kIcelandicCount = 2;
+const char* kIcelandicValues[kIcelandicCount] = {
+    u8"Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa",
+    u8"Sævör grét áðan því úlpan var ónýt",
+};
+#define TEST_LANGUAGE_EXAMPLES_Icelandic(func, results) \
+  TEST_EXAMPLES_2(func, Icelandic, results)
+
+const size_t kJapaneseCount = 2;
+const char* kJapaneseValues[kJapaneseCount] = {
+    u8"いろはにほへとちりぬるを\n"
+    u8"わかよたれそつねならむ\n"
+    u8"うゐのおくやまけふこえて\n"
+    u8"あさきゆめみしゑひもせす\n",
+    u8"イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム\n"
+    u8"ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン",
+};
+#define TEST_LANGUAGE_EXAMPLES_Japanese(func, results) \
+  TEST_EXAMPLES_2(func, Japanese, results)
+
+const size_t kHebrewCount = 1;
+const char* kHebrewValues[kHebrewCount] = {
+    u8"? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה",
+};
+#define TEST_LANGUAGE_EXAMPLES_Hebrew(func, results) \
+  TEST_EXAMPLES_1(func, Hebrew, results)
+
+const size_t kPolishCount = 1;
+const char* kPolishValues[kPolishCount] = {
+    u8"Pchnąć w tę łódź jeża lub ośm skrzyń fig",
+};
+#define TEST_LANGUAGE_EXAMPLES_Polish(func, results) \
+  TEST_EXAMPLES_1(func, Polish, results)
+
+const size_t kRussianCount = 2;
+const char* kRussianValues[kRussianCount] = {
+    u8"В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!",
+    u8"Съешь же ещё этих мягких французских булок да выпей чаю",
+};
+#define TEST_LANGUAGE_EXAMPLES_Russian(func, results) \
+  TEST_EXAMPLES_2(func, Russian, results)
+
+const size_t kTurkishCount = 1;
+const char* kTurkishValues[kTurkishCount] = {
+    u8"Pijamalı hasta, yağız şoföre çabucak güvendi.",
+};
+#define TEST_LANGUAGE_EXAMPLES_Turkish(func, results) \
+  TEST_EXAMPLES_1(func, Turkish, results)
+
+#define TEST_LANGUAGE_EXAMPLES(func, results)        \
+  TEST_LANGUAGE_EXAMPLES_Danish(func, results);      \
+  TEST_LANGUAGE_EXAMPLES_German(func, results);      \
+  TEST_LANGUAGE_EXAMPLES_Greek(func, results);       \
+  TEST_LANGUAGE_EXAMPLES_English(func, results);     \
+  TEST_LANGUAGE_EXAMPLES_Spanish(func, results);     \
+  TEST_LANGUAGE_EXAMPLES_French(func, results);      \
+  TEST_LANGUAGE_EXAMPLES_IrishGaelic(func, results); \
+  TEST_LANGUAGE_EXAMPLES_Hungarian(func, results);   \
+  TEST_LANGUAGE_EXAMPLES_Icelandic(func, results);   \
+  TEST_LANGUAGE_EXAMPLES_Japanese(func, results);    \
+  TEST_LANGUAGE_EXAMPLES_Hebrew(func, results);      \
+  TEST_LANGUAGE_EXAMPLES_Polish(func, results);      \
+  TEST_LANGUAGE_EXAMPLES_Russian(func, results);     \
+  TEST_LANGUAGE_EXAMPLES_Turkish(func, results)
+
+}  // namespace examples
+
+// Declares one array member per language, sized by examples::k<language>Count.
+#define TEST_EXAMPLE_RESULT(language) T language[examples::k##language##Count]
+// Table of expected values of type T, with one slot for every example string
+// in the examples namespace. Consumed by the TEST_EXAMPLES_N macros as
+// results.<language>[i].
+template <typename T>
+struct example_results {
+  TEST_EXAMPLE_RESULT(Danish);
+  TEST_EXAMPLE_RESULT(German);
+  TEST_EXAMPLE_RESULT(Greek);
+  TEST_EXAMPLE_RESULT(English);
+  TEST_EXAMPLE_RESULT(Spanish);
+  TEST_EXAMPLE_RESULT(French);
+  TEST_EXAMPLE_RESULT(IrishGaelic);
+  TEST_EXAMPLE_RESULT(Hungarian);
+  TEST_EXAMPLE_RESULT(Icelandic);
+  TEST_EXAMPLE_RESULT(Japanese);
+  TEST_EXAMPLE_RESULT(Hebrew);
+  TEST_EXAMPLE_RESULT(Polish);
+  TEST_EXAMPLE_RESULT(Russian);
+  TEST_EXAMPLE_RESULT(Turkish);
+};
+#undef TEST_EXAMPLE_RESULT
+
+TEST_CASE("UTF-8 Count", "[utf8]") {
+  // Expected utf8::count result for every example string; the values are the
+  // number of code points in each string, not the number of bytes.
+  example_results<size_t> results = {};
+  results.Danish[0] = 88;
+  results.German[0] = 58;
+  results.German[1] = 54;
+  results.German[2] = 24;
+  results.Greek[0] = 52;
+  results.Greek[1] = 33;
+  results.English[0] = 43;
+  results.Spanish[0] = 99;
+  results.French[0] = 327;
+  results.French[1] = 93;
+  results.French[2] = 126;
+  results.IrishGaelic[0] = 68;
+  results.Hungarian[0] = 22;
+  results.Icelandic[0] = 50;
+  results.Icelandic[1] = 34;
+  results.Japanese[0] = 51;
+  results.Japanese[1] = 55;
+  results.Hebrew[0] = 52;
+  results.Polish[0] = 40;
+  results.Russian[0] = 54;
+  results.Russian[1] = 55;
+  results.Turkish[0] = 45;
+  TEST_LANGUAGE_EXAMPLES(utf8::count, results);
+  // TEST_LANGUAGE_EXAMPLES_German expands to TEST_EXAMPLES_2 and therefore
+  // skips the third German string, so it is checked explicitly here.
+  TEST_EXAMPLE(utf8::count, examples::kGermanValues[2], results.German[2]);
+}
+
+// TODO(gibbed): lower_ascii
+// TODO(gibbed): upper_ascii
+// TODO(gibbed): hash_fnv1a
+// TODO(gibbed): hash_fnv1a_case
+
+TEST_CASE("UTF-8 Split", "[utf8]") {
  std::vector<std::string_view> parts;

  // Danish
-  parts = utf8::split(
-      u8"Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen Wolther "
-      u8"spillede på xylofon.",
-      u8"æcå");
+  parts = utf8::split(examples::kDanishValues[0], u8"æcå");
  REQUIRE(parts.size() == 4);
  REQUIRE(parts[0] == u8"Quizdeltagerne spiste jordb");
  REQUIRE(parts[1] == u8"r med fløde, mens ");
@ -33,43 +237,41 @@ TEST_CASE("utf8::split", "UTF-8 Split") {
  REQUIRE(parts[3] == u8" xylofon.");

  // German
-  parts = utf8::split(
-      u8"Falsches Üben von Xylophonmusik quält jeden größeren Zwerg\n"
-      u8"Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich\n"
-      u8"Heizölrückstoßabdämpfung",
-      u8"ßS");
-  REQUIRE(parts.size() == 4);
+  parts = utf8::split(examples::kGermanValues[0], u8"ßS");
+  REQUIRE(parts.size() == 2);
  REQUIRE(parts[0] == u8"Falsches Üben von Xylophonmusik quält jeden grö");
-  REQUIRE(parts[1] ==
-          u8"eren Zwerg\nZwölf Boxkämpfer jagten Eva quer über den ");
-  REQUIRE(parts[2] == u8"ylter Deich\nHeizölrücksto");
-  REQUIRE(parts[3] == u8"abdämpfung");
+  REQUIRE(parts[1] == u8"eren Zwerg");
+  parts = utf8::split(examples::kGermanValues[1], u8"ßS");
+  REQUIRE(parts.size() == 2);
+  REQUIRE(parts[0] == u8"Zwölf Boxkämpfer jagten Eva quer über den ");
+  REQUIRE(parts[1] == u8"ylter Deich");
+  parts = utf8::split(examples::kGermanValues[2], u8"ßS");
+  REQUIRE(parts.size() == 2);
+  REQUIRE(parts[0] == u8"Heizölrücksto");
+  REQUIRE(parts[1] == u8"abdämpfung");

  // Greek
-  parts = utf8::split(
-      u8"Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο\n"
-      u8"Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία",
-      u8"πφ");
-  REQUIRE(parts.size() == 6);
+  parts = utf8::split(examples::kGreekValues[0], u8"πφ");
+  REQUIRE(parts.size() == 4);
  REQUIRE(parts[0] == u8"Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ ");
  REQUIRE(parts[1] == u8"ιὰ στὸ χρυσα");
  REQUIRE(parts[2] == u8"ὶ ξέ");
-  REQUIRE(parts[3] == u8"ωτο\nΞεσκε");
-  REQUIRE(parts[4] == u8"άζω τὴν ψυχο");
-  REQUIRE(parts[5] == u8"θόρα βδελυγμία");
+  REQUIRE(parts[3] == u8"ωτο");
+  parts = utf8::split(examples::kGreekValues[1], u8"πφ");
+  REQUIRE(parts.size() == 3);
+  REQUIRE(parts[0] == u8"Ξεσκε");
+  REQUIRE(parts[1] == u8"άζω τὴν ψυχο");
+  REQUIRE(parts[2] == u8"θόρα βδελυγμία");

  // English
-  parts = utf8::split("The quick brown fox jumps over the lazy dog", "xy");
+  parts = utf8::split(examples::kEnglishValues[0], "xy");
  REQUIRE(parts.size() == 3);
  REQUIRE(parts[0] == u8"The quick brown fo");
  REQUIRE(parts[1] == u8" jumps over the laz");
  REQUIRE(parts[2] == u8" dog");

  // Spanish
-  parts = utf8::split(
-      u8"El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y "
-      u8"frío, añoraba a su querido cachorro.",
-      u8"ójd");
+  parts = utf8::split(examples::kSpanishValues[0], u8"ójd");
  REQUIRE(parts.size() == 4);
  REQUIRE(parts[0] == u8"El pingüino Wenceslao hizo kil");
  REQUIRE(parts[1] == u8"metros ba");
@ -88,52 +290,254 @@ TEST_CASE("utf8::split", "UTF-8 Split") {
  // TODO(gibbed): Turkish
 }

-TEST_CASE("utf8::equal_z", "UTF-8 Equal Z") {
+TEST_CASE("UTF-8 Equal Z", "[utf8]") {
  REQUIRE(utf8::equal_z(u8"foo", u8"foo\0"));
  REQUIRE_FALSE(utf8::equal_z(u8"bar", u8"baz\0"));
 }

-TEST_CASE("utf8::equal_case_z", "UTF-8 Equal Case Z") {
-  REQUIRE(utf8::equal_z(u8"foo", u8"foo\0"));
-  REQUIRE_FALSE(utf8::equal_z(u8"bar", u8"baz\0"));
+TEST_CASE("UTF-8 Equal Case", "[utf8]") {
+  REQUIRE(utf8::equal_case(u8"foo", u8"foo\0"));
+  REQUIRE_FALSE(utf8::equal_case(u8"bar", u8"baz\0"));
 }

-TEST_CASE("utf8::join_paths", "UTF-8 Join Paths") {
-  REQUIRE(utf8::join_paths({u8"X:", u8"foo", u8"bar", u8"baz", u8"qux"},
-                           '\\') == "X:\\foo\\bar\\baz\\qux");
-  REQUIRE(utf8::join_paths({u8"X:", u8"foo", u8"bar", u8"baz", u8"qux"}, '/') ==
-          "X:/foo/bar/baz/qux");
+TEST_CASE("UTF-8 Equal Case Z", "[utf8]") {
+  REQUIRE(utf8::equal_case_z(u8"foo", u8"foo\0"));
+  REQUIRE_FALSE(utf8::equal_case_z(u8"bar", u8"baz\0"));
 }

-TEST_CASE("utf8::fix_path_separators", "UTF-8 Fix Path Separators") {
-  REQUIRE(utf8::fix_path_separators("X:\\foo/bar\\baz/qux", '\\') ==
-          "X:\\foo\\bar\\baz\\qux");
-  REQUIRE(utf8::fix_path_separators("X:\\foo/bar\\baz/qux", '/') ==
-          "X:/foo/bar/baz/qux");
+// TODO(gibbed): find_any_of
+// TODO(gibbed): find_any_of_case
+// TODO(gibbed): find_first_of
+// TODO(gibbed): find_first_of_case
+// TODO(gibbed): starts_with
+// TODO(gibbed): starts_with_case
+// TODO(gibbed): ends_with
+// TODO(gibbed): ends_with_case
+// TODO(gibbed): split_path
+
+#define TEST_PATH(func, input, output)                                 \
+  do {                                                                 \
+    std::string input_value = input;                                   \
+    std::string output_value = output;                                 \
+    REQUIRE(func(input_value, '/') == output_value);                   \
+    std::replace(input_value.begin(), input_value.end(), '/', '\\');   \
+    std::replace(output_value.begin(), output_value.end(), '/', '\\'); \
+    REQUIRE(func(input_value, '\\') == output_value);                  \
+  } while (0)
+
+#define TEST_PATH_RAW(func, input, output)                             \
+  do {                                                                 \
+    std::string output_value = output;                                 \
+    REQUIRE(func(input, '/') == output_value);                         \
+    std::replace(output_value.begin(), output_value.end(), '/', '\\'); \
+    REQUIRE(func(input, '\\') == output_value);                        \
+  } while (0)
+
+#define TEST_PATHS(func, output, ...)                                      \
+  do {                                                                     \
+    std::vector<std::string> input_values = {__VA_ARGS__};                 \
+    std::string output_value = output;                                     \
+    REQUIRE(func(input_values, '/') == output_value);                      \
+    for (auto it = input_values.begin(); it != input_values.end(); ++it) { \
+      std::replace((*it).begin(), (*it).end(), '/', '\\');                 \
+    }                                                                      \
+    std::replace(output_value.begin(), output_value.end(), '/', '\\');     \
+    REQUIRE(func(input_values, '\\') == output_value);                     \
+  } while (0)
+
+TEST_CASE("UTF-8 Join Paths", "[utf8]") {
+  TEST_PATHS(utf8::join_paths, u8"");
+  TEST_PATHS(utf8::join_paths, u8"foo", u8"foo");
+  TEST_PATHS(utf8::join_paths, u8"foo/bar", u8"foo", u8"bar");
+  TEST_PATHS(utf8::join_paths, "X:/foo/bar/baz/qux", u8"X:", u8"foo", u8"bar",
+             u8"baz", u8"qux");
 }

-TEST_CASE("utf8::find_name_from_path", "UTF-8 Find Name From Path") {
-  REQUIRE(utf8::find_name_from_path("X:\\foo\\bar\\baz\\qux", '\\') == "qux");
-  REQUIRE(utf8::find_name_from_path("X:/foo/bar/baz/qux", '/') == "qux");
+// TODO(gibbed): join_guest_paths
+
+TEST_CASE("UTF-8 Fix Path Separators", "[utf8]") {
+  TEST_PATH_RAW(utf8::fix_path_separators, "", "");
+  TEST_PATH_RAW(utf8::fix_path_separators, "\\", "/");
+  TEST_PATH_RAW(utf8::fix_path_separators, "/", "/");
+  TEST_PATH_RAW(utf8::fix_path_separators, "\\foo", "/foo");
+  TEST_PATH_RAW(utf8::fix_path_separators, "\\foo/", "/foo/");
+  TEST_PATH_RAW(utf8::fix_path_separators, "/foo", "/foo");
+  TEST_PATH_RAW(utf8::fix_path_separators, "\\foo/bar\\baz/qux",
+                "/foo/bar/baz/qux");
+  TEST_PATH_RAW(utf8::fix_path_separators, "\\\\foo//bar\\\\baz//qux",
+                "/foo/bar/baz/qux");
+  TEST_PATH_RAW(utf8::fix_path_separators, "foo", "foo");
+  TEST_PATH_RAW(utf8::fix_path_separators, "foo/", "foo/");
+  TEST_PATH_RAW(utf8::fix_path_separators, "foo/bar\\baz/qux",
+                "foo/bar/baz/qux");
+  TEST_PATH_RAW(utf8::fix_path_separators, "foo//bar\\\\baz//qux",
+                "foo/bar/baz/qux");
+  TEST_PATH_RAW(utf8::fix_path_separators, "X:", "X:");
+  TEST_PATH_RAW(utf8::fix_path_separators, "X:\\", "X:/");
+  TEST_PATH_RAW(utf8::fix_path_separators, "X:/", "X:/");
+  TEST_PATH_RAW(utf8::fix_path_separators, "X:\\foo", "X:/foo");
+  TEST_PATH_RAW(utf8::fix_path_separators, "X:\\foo/", "X:/foo/");
+  TEST_PATH_RAW(utf8::fix_path_separators, "X:/foo", "X:/foo");
+  TEST_PATH_RAW(utf8::fix_path_separators, "X:\\foo/bar\\baz/qux",
+                "X:/foo/bar/baz/qux");
+  TEST_PATH_RAW(utf8::fix_path_separators, "X:\\\\foo//bar\\\\baz//qux",
+                "X:/foo/bar/baz/qux");
 }

-TEST_CASE("utf8::find_base_path", "UTF-8 Find Base Path") {
-  REQUIRE(utf8::find_base_path("X:\\foo\\bar\\baz\\qux", '\\') ==
-          "X:\\foo\\bar\\baz");
-  REQUIRE(utf8::find_base_path("X:/foo/bar/baz/qux", '/') == "X:/foo/bar/baz");
+// TODO(gibbed): fix_guest_path_separators
+
+TEST_CASE("UTF-8 Find Name From Path", "[utf8]") {
+  TEST_PATH(utf8::find_name_from_path, "/", "");
+  TEST_PATH(utf8::find_name_from_path, "foo/bar/baz/qux/", "qux");
+  TEST_PATH(utf8::find_name_from_path, "foo/bar/baz/qux.txt", "qux.txt");
+  TEST_PATH(utf8::find_name_from_path, "ほげ/ぴよ/ふが/ほげら/ほげほげ/",
+            "ほげほげ");
+  TEST_PATH(utf8::find_name_from_path, "ほげ/ぴよ/ふが/ほげら/ほげほげ.txt",
+            "ほげほげ.txt");
+  TEST_PATH(utf8::find_name_from_path, "/foo/bar/baz/qux.txt", "qux.txt");
+  TEST_PATH(utf8::find_name_from_path, "/ほげ/ぴよ/ふが/ほげら/ほげほげ/",
+            "ほげほげ");
+  TEST_PATH(utf8::find_name_from_path, "/ほげ/ぴよ/ふが/ほげら/ほげほげ.txt",
+            "ほげほげ.txt");
+  TEST_PATH(utf8::find_name_from_path, "X:/foo/bar/baz/qux.txt", "qux.txt");
+  TEST_PATH(utf8::find_name_from_path, "X:/ほげ/ぴよ/ふが/ほげら/ほげほげ/",
+            "ほげほげ");
+  TEST_PATH(utf8::find_name_from_path, "X:/ほげ/ぴよ/ふが/ほげら/ほげほげ.txt",
+            "ほげほげ.txt");
+  TEST_PATH(utf8::find_name_from_path, "X:/ほげ/ぴよ/ふが/ほげら.ほげほげ",
+            "ほげら.ほげほげ");
 }

-TEST_CASE("utf8::canonicalize_path", "UTF-8 Canonicalize Path") {
-  REQUIRE(utf8::canonicalize_path("X:\\foo\\bar\\baz\\qux", '\\') ==
-          "X:\\foo\\bar\\baz\\qux");
-  REQUIRE(utf8::canonicalize_path("X:\\foo\\.\\baz\\qux", '\\') ==
-          "X:\\foo\\baz\\qux");
-  REQUIRE(utf8::canonicalize_path("X:\\foo\\..\\baz\\qux", '\\') ==
-          "X:\\baz\\qux");
-  REQUIRE(utf8::canonicalize_path("X:\\.\\bar\\baz\\qux", '\\') ==
-          "X:\\bar\\baz\\qux");
-  REQUIRE(utf8::canonicalize_path("X:\\..\\bar\\baz\\qux", '\\') ==
-          "X:\\bar\\baz\\qux");
+// TODO(gibbed): find_name_from_guest_path
+
+TEST_CASE("UTF-8 Find Base Name From Path", "[utf8]") {
+  TEST_PATH(utf8::find_base_name_from_path, "foo/bar/baz/qux.txt", "qux");
+  TEST_PATH(utf8::find_base_name_from_path, "foo/bar/baz/qux/", "qux");
+  TEST_PATH(utf8::find_base_name_from_path,
+            "ほげ/ぴよ/ふが/ほげら/ほげほげ.txt", "ほげほげ");
+  TEST_PATH(utf8::find_base_name_from_path, "ほげ/ぴよ/ふが/ほげら/ほげほげ/",
+            "ほげほげ");
+  TEST_PATH(utf8::find_base_name_from_path, "ほげ/ぴよ/ふが/ほげら.ほげほげ",
+            "ほげら");
+  TEST_PATH(utf8::find_base_name_from_path, "/foo/bar/baz/qux.txt", "qux");
+  TEST_PATH(utf8::find_base_name_from_path, "/foo/bar/baz/qux/", "qux");
+  TEST_PATH(utf8::find_base_name_from_path,
+            "/ほげ/ぴよ/ふが/ほげら/ほげほげ.txt", "ほげほげ");
+  TEST_PATH(utf8::find_base_name_from_path, "/ほげ/ぴよ/ふが/ほげら/ほげほげ/",
+            "ほげほげ");
+  TEST_PATH(utf8::find_base_name_from_path, "/ほげ/ぴよ/ふが/ほげら.ほげほげ",
+            "ほげら");
+  TEST_PATH(utf8::find_base_name_from_path, "X:/foo/bar/baz/qux.txt", "qux");
+  TEST_PATH(utf8::find_base_name_from_path, "X:/foo/bar/baz/qux/", "qux");
+  TEST_PATH(utf8::find_base_name_from_path,
+            "X:/ほげ/ぴよ/ふが/ほげら/ほげほげ.txt", "ほげほげ");
+  TEST_PATH(utf8::find_base_name_from_path,
+            "X:/ほげ/ぴよ/ふが/ほげら/ほげほげ/", "ほげほげ");
+  TEST_PATH(utf8::find_base_name_from_path, "X:/ほげ/ぴよ/ふが/ほげら.ほげほげ",
+            "ほげら");
 }

+// TODO(gibbed): find_base_name_from_guest_path
+
+TEST_CASE("UTF-8 Find Base Path", "[utf8]") {
+  TEST_PATH(utf8::find_base_path, "", "");
+  TEST_PATH(utf8::find_base_path, "/", "");
+  TEST_PATH(utf8::find_base_path, "//", "");
+  TEST_PATH(utf8::find_base_path, "/foo", "");
+  TEST_PATH(utf8::find_base_path, "/foo/", "");
+  TEST_PATH(utf8::find_base_path, "/foo/bar", "/foo");
+  TEST_PATH(utf8::find_base_path, "/foo/bar/", "/foo");
+  TEST_PATH(utf8::find_base_path, "/foo/bar/baz/qux", "/foo/bar/baz");
+  TEST_PATH(utf8::find_base_path, "/foo/bar/baz/qux/", "/foo/bar/baz");
+  TEST_PATH(utf8::find_base_path, "/ほげ/ぴよ/ふが/ほげら/ほげほげ",
+            "/ほげ/ぴよ/ふが/ほげら");
+  TEST_PATH(utf8::find_base_path, "/ほげ/ぴよ/ふが/ほげら/ほげほげ/",
+            "/ほげ/ぴよ/ふが/ほげら");
+  TEST_PATH(utf8::find_base_path, "foo", "");
+  TEST_PATH(utf8::find_base_path, "foo/", "");
+  TEST_PATH(utf8::find_base_path, "foo/bar", "foo");
+  TEST_PATH(utf8::find_base_path, "foo/bar/", "foo");
+  TEST_PATH(utf8::find_base_path, "foo/bar/baz/qux", "foo/bar/baz");
+  TEST_PATH(utf8::find_base_path, "foo/bar/baz/qux/", "foo/bar/baz");
+  TEST_PATH(utf8::find_base_path, "ほげ/ぴよ/ふが/ほげら/ほげほげ",
+            "ほげ/ぴよ/ふが/ほげら");
+  TEST_PATH(utf8::find_base_path, "ほげ/ぴよ/ふが/ほげら/ほげほげ/",
+            "ほげ/ぴよ/ふが/ほげら");
+  TEST_PATH(utf8::find_base_path, "X:", "");
+  TEST_PATH(utf8::find_base_path, "X:/", "");
+  TEST_PATH(utf8::find_base_path, "X:/foo", "X:");
+  TEST_PATH(utf8::find_base_path, "X:/foo/", "X:");
+  TEST_PATH(utf8::find_base_path, "X:/foo/bar", "X:/foo");
+  TEST_PATH(utf8::find_base_path, "X:/foo/bar/", "X:/foo");
+  TEST_PATH(utf8::find_base_path, "X:/foo/bar/baz/qux", "X:/foo/bar/baz");
+  TEST_PATH(utf8::find_base_path, "X:/foo/bar/baz/qux/", "X:/foo/bar/baz");
+  TEST_PATH(utf8::find_base_path, "X:/ほげ/ぴよ/ふが/ほげら/ほげほげ",
+            "X:/ほげ/ぴよ/ふが/ほげら");
+  TEST_PATH(utf8::find_base_path, "X:/ほげ/ぴよ/ふが/ほげら/ほげほげ/",
+            "X:/ほげ/ぴよ/ふが/ほげら");
+}
+
+// TODO(gibbed): find_base_guest_path
+
+TEST_CASE("UTF-8 Canonicalize Path", "[utf8]") {
+  TEST_PATH(utf8::canonicalize_path, "foo/bar/baz/qux", "foo/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "foo/bar/baz/qux/", "foo/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "foo/./baz/qux", "foo/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "foo/./baz/qux/", "foo/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "foo/../baz/qux", "baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "foo/../baz/qux/", "baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "foo/./baz/../qux", "foo/qux");
+  TEST_PATH(utf8::canonicalize_path, "foo/./baz/../qux/", "foo/qux");
+  TEST_PATH(utf8::canonicalize_path, "foo/./../baz/qux", "baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "foo/./../baz/qux/", "baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "./bar/baz/qux", "bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "./bar/baz/qux/", "bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "../bar/baz/qux", "bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "../bar/baz/qux/", "bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "ほげ/ぴよ/./ふが/../ほげら/ほげほげ",
+            "ほげ/ぴよ/ほげら/ほげほげ");
+  TEST_PATH(utf8::canonicalize_path, "ほげ/ぴよ/./ふが/../ほげら/ほげほげ/",
+            "ほげ/ぴよ/ほげら/ほげほげ");
+  TEST_PATH(utf8::canonicalize_path, "/foo/bar/baz/qux", "/foo/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/foo/bar/baz/qux/", "/foo/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/foo/./baz/qux", "/foo/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/foo/./baz/qux/", "/foo/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/foo/../baz/qux", "/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/foo/../baz/qux/", "/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/foo/./baz/../qux", "/foo/qux");
+  TEST_PATH(utf8::canonicalize_path, "/foo/./baz/../qux/", "/foo/qux");
+  TEST_PATH(utf8::canonicalize_path, "/foo/./../baz/qux", "/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/foo/./../baz/qux/", "/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/./bar/baz/qux", "/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/./bar/baz/qux/", "/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/../bar/baz/qux", "/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/../bar/baz/qux/", "/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "/ほげ/ぴよ/./ふが/../ほげら/ほげほげ",
+            "/ほげ/ぴよ/ほげら/ほげほげ");
+  TEST_PATH(utf8::canonicalize_path, "/ほげ/ぴよ/./ふが/../ほげら/ほげほげ/",
+            "/ほげ/ぴよ/ほげら/ほげほげ");
+  TEST_PATH(utf8::canonicalize_path, "X:/foo/bar/baz/qux",
+            "X:/foo/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/foo/bar/baz/qux/",
+            "X:/foo/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/foo/./baz/qux", "X:/foo/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/foo/./baz/qux/", "X:/foo/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/foo/../baz/qux", "X:/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/foo/../baz/qux/", "X:/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/foo/./baz/../qux", "X:/foo/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/foo/./baz/../qux/", "X:/foo/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/foo/./../baz/qux", "X:/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/foo/./../baz/qux/", "X:/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/./bar/baz/qux", "X:/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/./bar/baz/qux/", "X:/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/../bar/baz/qux", "X:/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/../bar/baz/qux/", "X:/bar/baz/qux");
+  TEST_PATH(utf8::canonicalize_path, "X:/ほげ/ぴよ/./ふが/../ほげら/ほげほげ",
+            "X:/ほげ/ぴよ/ほげら/ほげほげ");
+  TEST_PATH(utf8::canonicalize_path, "X:/ほげ/ぴよ/./ふが/../ほげら/ほげほげ/",
+            "X:/ほげ/ぴよ/ほげら/ほげほげ");
+}
+
+// TODO(gibbed): canonicalize_guest_path
+
 }  // namespace xe::base::test
--- a/src/xenia/base/threading_posix.cc
+++ b/src/xenia/base/threading_posix.cc
@ -155,29 +155,36 @@ bool SetTlsValue(TlsHandle handle, uintptr_t value) {
 class PosixHighResolutionTimer : public HighResolutionTimer {
 public:
  explicit PosixHighResolutionTimer(std::function<void()> callback)
-      : callback_(std::move(callback)), timer_(nullptr) {}
+      : callback_(std::move(callback)), valid_(false) {}
  ~PosixHighResolutionTimer() override {
-    if (timer_) timer_delete(timer_);
+    if (valid_) timer_delete(timer_);
  }

  bool Initialize(std::chrono::milliseconds period) {
+    if (valid_) {
+      // Double initialization
+      assert_always();
+      return false;
+    }
    // Create timer
    sigevent sev{};
    sev.sigev_notify = SIGEV_SIGNAL;
    sev.sigev_signo = GetSystemSignal(SignalType::kHighResolutionTimer);
    sev.sigev_value.sival_ptr = (void*)&callback_;
-    if (timer_create(CLOCK_REALTIME, &sev, &timer_) == -1) return false;
+    if (timer_create(CLOCK_MONOTONIC, &sev, &timer_) == -1) return false;

    // Start timer
    itimerspec its{};
    its.it_value = DurationToTimeSpec(period);
    its.it_interval = its.it_value;
-    return timer_settime(timer_, 0, &its, nullptr) != -1;
+    valid_ = timer_settime(timer_, 0, &its, nullptr) != -1;
+    return valid_;
  }

 private:
  std::function<void()> callback_;
  timer_t timer_;
+  bool valid_;  // all values for timer_t are legal so we need this
 };

 std::unique_ptr<HighResolutionTimer> HighResolutionTimer::CreateRepeating(
@ -187,7 +194,7 @@ std::unique_ptr<HighResolutionTimer> HighResolutionTimer::CreateRepeating(
  if (!timer->Initialize(period)) {
    return nullptr;
  }
-  return std::unique_ptr<HighResolutionTimer>(timer.release());
+  return std::move(timer);
 }

 class PosixConditionBase {
@ -419,7 +426,7 @@ class PosixCondition<Timer> : public PosixConditionBase {
      sev.sigev_notify = SIGEV_SIGNAL;
      sev.sigev_signo = GetSystemSignal(SignalType::kTimer);
      sev.sigev_value.sival_ptr = this;
-      if (timer_create(CLOCK_REALTIME, &sev, &timer_) == -1) return false;
+      if (timer_create(CLOCK_MONOTONIC, &sev, &timer_) == -1) return false;
    }

    // Start timer
@ -728,31 +735,44 @@ class PosixCondition<Thread> : public PosixConditionBase {
  }

  void Terminate(int exit_code) {
+    bool is_current_thread = pthread_equal(pthread_self(), thread_) != 0;
    {
      std::unique_lock<std::mutex> lock(state_mutex_);
+      if (state_ == State::kFinished) {
+        if (is_current_thread) {
+          // This is really bad. Some thread must have called Terminate() on us
+          // just before we decided to terminate ourselves
+          assert_always();
+          for (;;) {
+            // Wait for pthread_cancel() to actually happen.
+          }
+        }
+        return;
+      }
      state_ = State::kFinished;
    }

-    std::lock_guard<std::mutex> lock(mutex_);
-
-    // Sometimes the thread can call terminate twice before stopping
-    if (thread_ == 0) return;
-    auto thread = thread_;
-
-    exit_code_ = exit_code;
-    signaled_ = true;
-    cond_.notify_all();
+    {
+      std::lock_guard<std::mutex> lock(mutex_);

+      exit_code_ = exit_code;
+      signaled_ = true;
+      cond_.notify_all();
+    }
+    if (is_current_thread) {
+      pthread_exit(reinterpret_cast<void*>(exit_code));
+    } else {
 #ifdef XE_PLATFORM_ANDROID
-    if (pthread_kill(thread, GetSystemSignal(SignalType::kThreadTerminate)) !=
-        0) {
-      assert_always();
-    }
+      if (pthread_kill(thread_,
+                       GetSystemSignal(SignalType::kThreadTerminate)) != 0) {
+        assert_always();
+      }
 #else
-    if (pthread_cancel(thread) != 0) {
-      assert_always();
-    }
+      if (pthread_cancel(thread_) != 0) {
+        assert_always();
+      }
 #endif
+    }
  }

  void WaitStarted() const {
@ -778,7 +798,6 @@ class PosixCondition<Thread> : public PosixConditionBase {
  inline void post_execution() override {
    if (thread_) {
      pthread_join(thread_, nullptr);
-      thread_ = 0;
    }
  }
  pthread_t thread_;
@ -1115,13 +1134,12 @@ Thread* Thread::GetCurrentThread() {
 void Thread::Exit(int exit_code) {
  if (current_thread_) {
    current_thread_->Terminate(exit_code);
-    // Sometimes the current thread keeps running after being cancelled.
-    // Prevent other calls from this thread from using current_thread_.
-    current_thread_ = nullptr;
  } else {
    // Should only happen with the main thread
    pthread_exit(reinterpret_cast<void*>(exit_code));
  }
+  // Function must not return
+  assert_always();
 }

 void set_name(const std::string_view name) {
--- a/src/xenia/base/threading_win.cc
+++ b/src/xenia/base/threading_win.cc
@ -111,30 +111,34 @@ bool SetTlsValue(TlsHandle handle, uintptr_t value) {
 class Win32HighResolutionTimer : public HighResolutionTimer {
 public:
  Win32HighResolutionTimer(std::function<void()> callback)
-      : callback_(callback) {}
+      : callback_(std::move(callback)) {}
  ~Win32HighResolutionTimer() override {
-    if (handle_) {
+    if (valid_) {
      DeleteTimerQueueTimer(nullptr, handle_, INVALID_HANDLE_VALUE);
      handle_ = nullptr;
    }
  }

  bool Initialize(std::chrono::milliseconds period) {
-    return CreateTimerQueueTimer(
-               &handle_, nullptr,
-               [](PVOID param, BOOLEAN timer_or_wait_fired) {
-                 auto timer =
-                     reinterpret_cast<Win32HighResolutionTimer*>(param);
-                 timer->callback_();
-               },
-               this, 0, DWORD(period.count()), WT_EXECUTEINTIMERTHREAD)
-               ? true
-               : false;
+    if (valid_) {
+      // Double initialization
+      assert_always();
+      return false;
+    }
+    valid_ = !!CreateTimerQueueTimer(
+        &handle_, nullptr,
+        [](PVOID param, BOOLEAN timer_or_wait_fired) {
+          auto timer = reinterpret_cast<Win32HighResolutionTimer*>(param);
+          timer->callback_();
+        },
+        this, 0, DWORD(period.count()), WT_EXECUTEINTIMERTHREAD);
+    return valid_;
  }

 private:
-  HANDLE handle_ = nullptr;
  std::function<void()> callback_;
+  HANDLE handle_ = nullptr;
+  bool valid_ = false;  // Documentation does not state which HANDLE is invalid
 };

 std::unique_ptr<HighResolutionTimer> HighResolutionTimer::CreateRepeating(
@ -143,7 +147,7 @@ std::unique_ptr<HighResolutionTimer> HighResolutionTimer::CreateRepeating(
  if (!timer->Initialize(period)) {
    return nullptr;
  }
-  return std::unique_ptr<HighResolutionTimer>(timer.release());
+  return std::move(timer);
 }

 template <typename T>
--- a/src/xenia/base/utf8.cc
+++ b/src/xenia/base/utf8.cc
@ -19,9 +19,7 @@
 namespace utfcpp = utf8;

 using citer = std::string_view::const_iterator;
-using criter = std::string_view::const_reverse_iterator;
 using utf8_citer = utfcpp::iterator<std::string_view::const_iterator>;
-using utf8_criter = utfcpp::iterator<std::string_view::const_reverse_iterator>;

 namespace xe::utf8 {

@ -54,25 +52,10 @@ std::pair<utf8_citer, utf8_citer> make_citer(const utf8_citer begin,
          utf8_citer(end.base(), begin.base(), end.base())};
 }

-std::pair<utf8_criter, utf8_criter> make_criter(const std::string_view view) {
-  return {utf8_criter(view.crbegin(), view.crbegin(), view.crend()),
-          utf8_criter(view.crend(), view.crbegin(), view.crend())};
-}
-
-std::pair<utf8_criter, utf8_criter> make_criter(const utf8_criter begin,
-                                                const utf8_criter end) {
-  return {utf8_criter(begin.base(), begin.base(), end.base()),
-          utf8_criter(end.base(), begin.base(), end.base())};
-}
-
 size_t byte_length(utf8_citer begin, utf8_citer end) {
  return size_t(std::distance(begin.base(), end.base()));
 }

-size_t byte_length(utf8_criter begin, utf8_criter end) {
-  return size_t(std::distance(begin.base(), end.base()));
-}
-
 size_t count(const std::string_view view) {
  return size_t(utfcpp::distance(view.cbegin(), view.cend()));
 }
@ -435,21 +418,23 @@ bool ends_with(const std::string_view haystack, const std::string_view needle) {
    return false;
  }

-  auto [haystack_begin, haystack_end] = make_criter(haystack);
-  auto [needle_begin, needle_end] = make_criter(needle);
+  auto [haystack_begin, haystack_end] = make_citer(haystack);
+  auto [needle_begin, needle_end] = make_citer(needle);
  auto needle_count = count(needle);

-  auto it = haystack_begin;
+  auto it = haystack_end;
  auto end = it;
-  for (size_t i = 0; i < needle_count; ++i) {
-    if (end == haystack_end) {
+  --it;
+
+  for (size_t i = 1; i < needle_count; ++i) {
+    if (it == haystack_begin) {
      // not enough room in target for search
      return false;
    }
-    ++end;
+    --it;
  }

-  auto [sub_start, sub_end] = make_criter(it, end);
+  auto [sub_start, sub_end] = make_citer(it, end);
  return std::equal(needle_begin, needle_end, sub_start, sub_end);
 }

@ -461,21 +446,23 @@ bool ends_with_case(const std::string_view haystack,
    return false;
  }

-  auto [haystack_begin, haystack_end] = make_criter(haystack);
-  auto [needle_begin, needle_end] = make_criter(needle);
+  auto [haystack_begin, haystack_end] = make_citer(haystack);
+  auto [needle_begin, needle_end] = make_citer(needle);
  auto needle_count = count(needle);

-  auto it = haystack_begin;
+  auto it = haystack_end;
  auto end = it;
+  --it;
+
-  for (size_t i = 0; i < needle_count; ++i) {
-    if (end == haystack_end) {
+  for (size_t i = 1; i < needle_count; ++i) {  // one step already taken above
+    if (it == haystack_begin) {
      // not enough room in target for search
      return false;
    }
-    ++end;
+    --it;
  }

-  auto [sub_start, sub_end] = make_criter(it, end);
+  auto [sub_start, sub_end] = make_citer(it, end);
  return std::equal(needle_begin, needle_end, sub_start, sub_end,
                    equal_ascii_case);
 }
@ -492,7 +479,9 @@ std::string join_paths(const std::string_view left_path,
    return std::string(left_path);
  }

-  auto [it, end] = make_criter(left_path);
+  utf8_citer it;
+  std::tie(std::ignore, it) = make_citer(left_path);
+  --it;

  std::string result = std::string(left_path);
  if (*it != static_cast<uint32_t>(separator)) {
@ -501,7 +490,20 @@ std::string join_paths(const std::string_view left_path,
  return result + std::string(right_path);
 }

-std::string join_paths(std::vector<std::string_view> paths,
+std::string join_paths(const std::vector<std::string>& paths,
+                       char32_t separator) {
+  std::string result;
+  auto it = paths.cbegin();
+  if (it != paths.cend()) {
+    result = *it++;
+    for (; it != paths.cend(); ++it) {
+      result = join_paths(result, *it, separator);
+    }
+  }
+  return result;
+}
+
+std::string join_paths(const std::vector<std::string_view>& paths,
                       char32_t separator) {
  std::string result;
  auto it = paths.cbegin();
@ -528,8 +530,20 @@ std::string fix_path_separators(const std::string_view path,
  std::string result;
  auto it = path_begin;
  auto last = it;
+
+  auto is_separator = [old_separator, new_separator](char32_t c) {
+    return c == uint32_t(old_separator) || c == uint32_t(new_separator);
+  };
+
+  // Begins with a separator
+  if (is_separator(*it)) {
+    utfcpp::append(new_separator, result);
+    ++it;
+    last = it;
+  }
+
  for (;;) {
-    it = std::find(it, path_end, uint32_t(old_separator));
+    it = std::find_if(it, path_end, is_separator);
    if (it == path_end) {
      break;
    }
@ -563,25 +577,40 @@ std::string find_name_from_path(const std::string_view path,
    return std::string();
  }

-  auto [begin, end] = make_criter(path);
+  auto [begin, end] = make_citer(path);

-  auto it = begin;
+  auto it = end;
+  --it;
+
+  // path is padded with separator
  size_t padding = 0;
  if (*it == uint32_t(separator)) {
-    ++it;
+    if (it == begin) {
+      return std::string();
+    }
+    --it;
    padding = 1;
  }

-  if (it == end) {
+  // path is just separator
+  if (it == begin) {
    return std::string();
  }

-  it = std::find(it, end, uint32_t(separator));
-  if (it == end) {
+  // search for separator
+  while (it != begin) {
+    if (*it == uint32_t(separator)) {
+      break;
+    }
+    --it;
+  }
+
+  // no separator -- copy entire string (except trailing separator)
+  if (it == begin) {
    return std::string(path.substr(0, path.size() - padding));
  }

-  auto length = byte_length(begin, it);
+  auto length = byte_length(std::next(it), end);
  auto offset = path.length() - length;
  return std::string(path.substr(offset, length - padding));
 }
@ -593,20 +622,25 @@ std::string find_base_name_from_path(const std::string_view path,
    return std::string();
  }

-  auto [begin, end] = make_criter(name);
+  auto [begin, end] = make_citer(name);

-  auto it = std::find(begin, end, uint32_t('.'));
-  if (it == end) {
+  auto it = end;
+  --it;
+
+  while (it != begin) {
+    if (*it == uint32_t('.')) {
+      break;
+    }
+    --it;
+  }
+
+  if (it == begin) {
    return name;
  }

-  it++;
-  if (it == end) {
-    return std::string();
-  }
-
-  auto length = name.length() - byte_length(begin, it);
-  return std::string(name.substr(0, length));
+  auto length = byte_length(it, end);
+  auto offset = name.length() - length;
+  return std::string(name.substr(0, offset));
 }

 std::string find_base_path(const std::string_view path, char32_t separator) {
@ -614,25 +648,33 @@ std::string find_base_path(const std::string_view path, char32_t separator) {
    return std::string();
  }

-  auto [begin, end] = make_criter(path);
+  auto [begin, end] = make_citer(path);

-  auto it = begin;
+  auto it = end;
+  --it;
+
+  // skip trailing separator
  if (*it == uint32_t(separator)) {
-    ++it;
+    if (it == begin) {
+      return std::string();
+    }
+    --it;
  }

-  it = std::find(it, end, uint32_t(separator));
-  if (it == end) {
+  while (it != begin) {
+    if (*it == uint32_t(separator)) {
+      break;
+    }
+    --it;
+  }
+
+  if (it == begin) {
    return std::string();
  }

-  ++it;
-  if (it == end) {
-    return std::string();
-  }
-
-  auto length = path.length() - byte_length(begin, it);
-  return std::string(path.substr(0, length));
+  auto length = byte_length(it, end);
+  auto offset = path.length() - length;
+  return std::string(path.substr(0, offset));
 }

 std::string canonicalize_path(const std::string_view path, char32_t separator) {
--- a/src/xenia/base/utf8.h
+++ b/src/xenia/base/utf8.h
@ -68,7 +68,10 @@ std::string join_paths(const std::string_view left_path,
                       const std::string_view right_path,
                       char32_t separator = kPathSeparator);

-std::string join_paths(std::vector<std::string_view> paths,
+std::string join_paths(const std::vector<std::string>& paths,
+                       char32_t separator = kPathSeparator);
+
+std::string join_paths(const std::vector<std::string_view>& paths,
                       char32_t separator = kPathSeparator);

 inline std::string join_paths(
@ -86,7 +89,12 @@ inline std::string join_guest_paths(const std::string_view left_path,
  return join_paths(left_path, right_path, kGuestPathSeparator);
 }

-inline std::string join_guest_paths(std::vector<std::string_view> paths) {
+inline std::string join_guest_paths(const std::vector<std::string>& paths) {
+  return join_paths(paths, kGuestPathSeparator);
+}
+
+inline std::string join_guest_paths(
+    const std::vector<std::string_view>& paths) {
  return join_paths(paths, kGuestPathSeparator);
 }

--- a/src/xenia/base/vec128.h
+++ b/src/xenia/base/vec128.h
@ -106,18 +106,6 @@ typedef struct alignas(16) vec128_s {
    };
  };

-  vec128_s() = default;
-  vec128_s(const vec128_s& other) {
-    high = other.high;
-    low = other.low;
-  }
-
-  vec128_s& operator=(const vec128_s& b) {
-    high = b.high;
-    low = b.low;
-    return *this;
-  }
-
  bool operator==(const vec128_s& b) const {
    return low == b.low && high == b.high;
  }
--- a/src/xenia/config.cc
+++ b/src/xenia/config.cc
@ -11,6 +11,7 @@

 #include "third_party/cpptoml/include/cpptoml.h"
 #include "third_party/fmt/include/fmt/format.h"
+#include "xenia/base/assert.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/filesystem.h"
 #include "xenia/base/logging.h"
@ -29,6 +30,13 @@ std::shared_ptr<cpptoml::table> ParseFile(
 }

 CmdVar(config, "", "Specifies the target config to load.");
+
+DEFINE_uint32(
+    defaults_date, 0,
+    "Do not modify - internal version of the default values in the config, for "
+    "seamless updates if default value of any option is changed.",
+    "Config");
+
 namespace config {
 std::string config_name = "xenia.config.toml";
 std::filesystem::path config_folder;
@ -46,8 +54,19 @@ std::shared_ptr<cpptoml::table> ParseConfig(
  }
 }

-void ReadConfig(const std::filesystem::path& file_path) {
+void ReadConfig(const std::filesystem::path& file_path,
+                bool update_if_no_version_stored) {
+  if (!cvar::ConfigVars) {
+    return;
+  }
  const auto config = ParseConfig(file_path);
+  // Loading an actual global config file that exists - if there's no
+  // defaults_date in it, it's very old (before updating was added at all, thus
+  // all defaults need to be updated).
+  auto defaults_date_cvar =
+      dynamic_cast<cvar::ConfigVar<uint32_t>*>(cv::cv_defaults_date);
+  assert_not_null(defaults_date_cvar);
+  defaults_date_cvar->SetConfigValue(0);
  for (auto& it : *cvar::ConfigVars) {
    auto config_var = static_cast<cvar::IConfigVar*>(it.second);
    auto config_key = config_var->category() + "." + config_var->name();
@ -55,10 +74,17 @@ void ReadConfig(const std::filesystem::path& file_path) {
      config_var->LoadConfigValue(config->get_qualified(config_key));
    }
  }
+  uint32_t config_defaults_date = defaults_date_cvar->GetTypedConfigValue();
+  if (update_if_no_version_stored || config_defaults_date) {
+    cvar::IConfigVarUpdate::ApplyUpdates(config_defaults_date);
+  }
  XELOGI("Loaded config: {}", xe::path_to_utf8(file_path));
 }

 void ReadGameConfig(const std::filesystem::path& file_path) {
+  if (!cvar::ConfigVars) {
+    return;
+  }
  const auto config = ParseConfig(file_path);
  for (auto& it : *cvar::ConfigVars) {
    auto config_var = static_cast<cvar::IConfigVar*>(it.second);
@ -71,9 +97,18 @@ void ReadGameConfig(const std::filesystem::path& file_path) {
 }

 void SaveConfig() {
+  // All cvar defaults have been updated on loading - store the current date.
+  auto defaults_date_cvar =
+      dynamic_cast<cvar::ConfigVar<uint32_t>*>(cv::cv_defaults_date);
+  assert_not_null(defaults_date_cvar);
+  defaults_date_cvar->SetConfigValue(
+      cvar::IConfigVarUpdate::GetLastUpdateDate());
+
  std::vector<cvar::IConfigVar*> vars;
-  for (const auto& s : *cvar::ConfigVars) {
-    vars.push_back(s.second);
+  if (cvar::ConfigVars) {
+    for (const auto& s : *cvar::ConfigVars) {
+      vars.push_back(s.second);
+    }
  }
  std::sort(vars.begin(), vars.end(), [](auto a, auto b) {
    if (a->category() < b->category()) return true;
@ -167,7 +202,12 @@ void SetupConfig(const std::filesystem::path& config_folder) {
  if (!cvars::config.empty()) {
    config_path = xe::to_path(cvars::config);
    if (std::filesystem::exists(config_path)) {
-      ReadConfig(config_path);
+      // An external config file may contain only explicit overrides - in this
+      // case, it will likely not contain the defaults version; don't update
+      // from version 0 in this case. Or, it may be a full config - in this
+      // case, if it's recent enough (created at least in 2021), it will contain
+      // the version number - update the defaults in it.
+      ReadConfig(config_path, false);
      return;
    }
  }
@ -176,10 +216,11 @@ void SetupConfig(const std::filesystem::path& config_folder) {
  if (!config_folder.empty()) {
    config_path = config_folder / config_name;
    if (std::filesystem::exists(config_path)) {
-      ReadConfig(config_path);
+      ReadConfig(config_path, true);
    }
-    // we only want to save the config if the user is using the default
-    // config, we don't want to override a user created specific config
+    // Re-save the loaded config to present the most up-to-date list of
+    // parameters to the user, if new options were added, descriptions were
+    // updated, or default values were changed.
    SaveConfig();
  }
 }
--- a/src/xenia/cpp.hint
+++ b/src/xenia/cpp.hint
@ -0,0 +1,16 @@
+// Hint files help the Visual Studio IDE interpret Visual C++ identifiers
+// such as names of functions and macros.
+// For more information see https://go.microsoft.com/fwlink/?linkid=865984
+
+#define DECLARE_XAM_EXPORT_(name, category, tags)
+#define DECLARE_XAM_EXPORT1(name, category, tag)
+#define DECLARE_XAM_EXPORT2(name, category, tag1, tag2)
+
+#define DECLARE_XBDM_EXPORT_(name, category, tags)
+#define DECLARE_XBDM_EXPORT1(name, category, tag)
+
+#define DECLARE_XBOXKRNL_EXPORT_(name, category, tags)
+#define DECLARE_XBOXKRNL_EXPORT1(name, category, tag)
+#define DECLARE_XBOXKRNL_EXPORT2(name, category, tag1, tag2)
+#define DECLARE_XBOXKRNL_EXPORT3(name, category, tag1, tag2, tag3)
+#define DECLARE_XBOXKRNL_EXPORT4(name, category, tag1, tag2, tag3, tag4)
--- a/src/xenia/cpu/backend/x64/x64_backend.cc
+++ b/src/xenia/cpu/backend/x64/x64_backend.cc
@ -519,7 +519,7 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
 }

 // X64Emitter handles actually resolving functions.
-extern "C" uint64_t ResolveFunction(void* raw_context, uint32_t target_address);
+uint64_t ResolveFunction(void* raw_context, uint64_t target_address);

 ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
  // ebx = target PPC address
@ -548,7 +548,7 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {

  mov(rcx, rsi);  // context
  mov(rdx, rbx);
-  mov(rax, uint64_t(&ResolveFunction));
+  mov(rax, reinterpret_cast<uint64_t>(&ResolveFunction));
  call(rax);

  EmitLoadVolatileRegs();
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@ -382,15 +382,14 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
 }

 // This is used by the X64ThunkEmitter's ResolveFunctionThunk.
-extern "C" uint64_t ResolveFunction(void* raw_context,
-                                    uint64_t target_address) {
+uint64_t ResolveFunction(void* raw_context, uint64_t target_address) {
  auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);

  // TODO(benvanik): required?
  assert_not_zero(target_address);

-  auto fn =
-      thread_state->processor()->ResolveFunction((uint32_t)target_address);
+  auto fn = thread_state->processor()->ResolveFunction(
+      static_cast<uint32_t>(target_address));
  assert_not_null(fn);
  auto x64_fn = static_cast<X64Function*>(fn);
  uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());
@ -801,7 +800,7 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
  if (!v.low && !v.high) {
    // 0000...
    vpxor(dest, dest);
-  } else if (v.low == ~0ull && v.high == ~0ull) {
+  } else if (v.low == ~uint64_t(0) && v.high == ~uint64_t(0)) {
    // 1111...
    vpcmpeqb(dest, dest);
  } else {
@ -818,10 +817,10 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
    float f;
    uint32_t i;
  } x = {v};
-  if (!v) {
-    // 0
+  if (!x.i) {
+    // +0.0f (but not -0.0f because it may be used to flip the sign via xor).
    vpxor(dest, dest);
-  } else if (x.i == ~0U) {
+  } else if (x.i == ~uint32_t(0)) {
    // 1111...
    vpcmpeqb(dest, dest);
  } else {
@ -837,10 +836,10 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) {
    double d;
    uint64_t i;
  } x = {v};
-  if (!v) {
-    // 0
+  if (!x.i) {
+    // +0.0 (but not -0.0 because it may be used to flip the sign via xor).
    vpxor(dest, dest);
-  } else if (x.i == ~0ULL) {
+  } else if (x.i == ~uint64_t(0)) {
    // 1111...
    vpcmpeqb(dest, dest);
  } else {
--- a/src/xenia/cpu/backend/x64/x64_op.h
+++ b/src/xenia/cpu/backend/x64/x64_op.h
@ -105,8 +105,7 @@ struct Op : OpBase {

 struct VoidOp : Op<VoidOp, KEY_TYPE_X> {
 protected:
-  template <typename T, KeyType KEY_TYPE>
-  friend struct Op;
+  friend struct Op<VoidOp, KEY_TYPE_X>;
  template <hir::Opcode OPCODE, typename... Ts>
  friend struct I;
  void Load(const Instr::Op& op) {}
@ -116,8 +115,7 @@ struct OffsetOp : Op<OffsetOp, KEY_TYPE_O> {
  uint64_t value;

 protected:
-  template <typename T, KeyType KEY_TYPE>
-  friend struct Op;
+  friend struct Op<OffsetOp, KEY_TYPE_O>;
  template <hir::Opcode OPCODE, typename... Ts>
  friend struct I;
  void Load(const Instr::Op& op) { this->value = op.offset; }
@ -127,8 +125,7 @@ struct SymbolOp : Op<SymbolOp, KEY_TYPE_S> {
  Function* value;

 protected:
-  template <typename T, KeyType KEY_TYPE>
-  friend struct Op;
+  friend struct Op<SymbolOp, KEY_TYPE_S>;
  template <hir::Opcode OPCODE, typename... Ts>
  friend struct I;
  bool Load(const Instr::Op& op) {
@ -141,8 +138,7 @@ struct LabelOp : Op<LabelOp, KEY_TYPE_L> {
  hir::Label* value;

 protected:
-  template <typename T, KeyType KEY_TYPE>
-  friend struct Op;
+  friend struct Op<LabelOp, KEY_TYPE_L>;
  template <hir::Opcode OPCODE, typename... Ts>
  friend struct I;
  void Load(const Instr::Op& op) { this->value = op.label; }
--- a/src/xenia/cpu/compiler/passes/data_flow_analysis_pass.cc
+++ b/src/xenia/cpu/compiler/passes/data_flow_analysis_pass.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -73,14 +73,14 @@ void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder,
  // Stash for value map. We may want to maintain this during building.
  auto arena = builder->arena();
  auto value_map = reinterpret_cast<Value**>(
-      arena->Alloc(sizeof(Value*) * max_value_estimate));
+      arena->Alloc(sizeof(Value*) * max_value_estimate, alignof(Value*)));

  // Allocate incoming bitvectors for use by blocks. We don't need outgoing
  // because they are only used during the block iteration.
  // Mapped by block ordinal.
  // TODO(benvanik): cache this list, grow as needed, etc.
-  auto incoming_bitvectors =
-      (llvm::BitVector**)arena->Alloc(sizeof(llvm::BitVector*) * block_count);
+  auto incoming_bitvectors = (llvm::BitVector**)arena->Alloc(
+      sizeof(llvm::BitVector*) * block_count, alignof(llvm::BitVector*));
  for (auto n = 0u; n < block_count; n++) {
    incoming_bitvectors[n] = new llvm::BitVector(max_value_estimate);
  }
--- a/src/xenia/cpu/compiler/passes/finalization_pass.cc
+++ b/src/xenia/cpu/compiler/passes/finalization_pass.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -45,7 +45,7 @@ bool FinalizationPass::Run(HIRBuilder* builder) {
    while (label) {
      if (!label->name) {
        const size_t label_len = 6 + 4;
-        char* name = reinterpret_cast<char*>(arena->Alloc(label_len + 1));
+        char* name = reinterpret_cast<char*>(arena->Alloc(label_len + 1, 1));
        assert_true(label->id <= 9999);
        auto end = fmt::format_to_n(name, label_len, "_label{}", label->id);
        name[end.size] = '\0';
--- a/src/xenia/cpu/export_resolver.h
+++ b/src/xenia/cpu/export_resolver.h
@ -39,7 +39,7 @@ enum class ExportCategory : uint8_t {
 };

 struct ExportTag {
-  typedef uint32_t type;
+  using type = uint32_t;

  // packed like so:
  // ll...... cccccccc ........ ..bihssi
--- a/src/xenia/cpu/hir/hir_builder.cc
+++ b/src/xenia/cpu/hir/hir_builder.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -739,7 +739,7 @@ void HIRBuilder::Comment(std::string_view value) {
    return;
  }
  auto size = value.size();
-  auto p = reinterpret_cast<char*>(arena_->Alloc(size + 1));
+  auto p = reinterpret_cast<char*>(arena_->Alloc(size + 1, 1));
  std::memcpy(p, value.data(), size);
  p[size] = '\0';
  Instr* i = AppendInstr(OPCODE_COMMENT_info, 0);
@ -752,7 +752,7 @@ void HIRBuilder::Comment(const StringBuffer& value) {
    return;
  }
  auto size = value.length();
-  auto p = reinterpret_cast<char*>(arena_->Alloc(size + 1));
+  auto p = reinterpret_cast<char*>(arena_->Alloc(size + 1, 1));
  std::memcpy(p, value.buffer(), size);
  p[size] = '\0';
  Instr* i = AppendInstr(OPCODE_COMMENT_info, 0);
--- a/src/xenia/cpu/hir/hir_builder.h
+++ b/src/xenia/cpu/hir/hir_builder.h
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -75,7 +75,7 @@ class HIRBuilder {
  template <typename... Args>
  void CommentFormat(const std::string_view format, const Args&... args) {
    static const uint32_t kMaxCommentSize = 1024;
-    char* p = reinterpret_cast<char*>(arena_->Alloc(kMaxCommentSize));
+    char* p = reinterpret_cast<char*>(arena_->Alloc(kMaxCommentSize, 1));
    auto result = fmt::format_to_n(p, kMaxCommentSize - 1, format, args...);
    p[result.size] = '\0';
    size_t rewind = kMaxCommentSize - 1 - result.size;
--- a/src/xenia/cpu/ppc/ppc_hir_builder.cc
+++ b/src/xenia/cpu/ppc/ppc_hir_builder.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -104,8 +104,8 @@ bool PPCHIRBuilder::Emit(GuestFunction* function, uint32_t flags) {
  // instruction may have a label assigned to it if it hasn't been hit
  // yet.
  size_t list_size = instr_count_ * sizeof(void*);
-  instr_offset_list_ = (Instr**)arena_->Alloc(list_size);
-  label_list_ = (Label**)arena_->Alloc(list_size);
+  instr_offset_list_ = (Instr**)arena_->Alloc(list_size, alignof(void*));
+  label_list_ = (Label**)arena_->Alloc(list_size, alignof(void*));
  std::memset(instr_offset_list_, 0, list_size);
  std::memset(label_list_, 0, list_size);

@ -244,7 +244,7 @@ void PPCHIRBuilder::AnnotateLabel(uint32_t address, Label* label) {
  char name_buffer[13];
  auto format_result = fmt::format_to_n(name_buffer, 12, "loc_{:08X}", address);
  name_buffer[format_result.size] = '\0';
-  label->name = (char*)arena_->Alloc(sizeof(name_buffer));
+  label->name = (char*)arena_->Alloc(sizeof(name_buffer), 1);
  memcpy(label->name, name_buffer, sizeof(name_buffer));
 }

--- a/src/xenia/cpu/processor.cc
+++ b/src/xenia/cpu/processor.cc
@ -432,12 +432,12 @@ void Processor::LowerIrql(Irql old_value) {
 }

 bool Processor::Save(ByteStream* stream) {
-  stream->Write('PROC');
+  stream->Write(kProcessorSaveSignature);
  return true;
 }

 bool Processor::Restore(ByteStream* stream) {
-  if (stream->Read<uint32_t>() != 'PROC') {
+  if (stream->Read<uint32_t>() != kProcessorSaveSignature) {
    XELOGE("Processor::Restore - Invalid magic value!");
    return false;
  }
--- a/src/xenia/cpu/processor.h
+++ b/src/xenia/cpu/processor.h
@ -34,6 +34,8 @@ DECLARE_bool(debug);
 namespace xe {
 namespace cpu {

+constexpr fourcc_t kProcessorSaveSignature = make_fourcc("PROC");
+
 class Breakpoint;
 class StackWalker;
 class XexModule;
--- a/src/xenia/cpu/testing/util.h
+++ b/src/xenia/cpu/testing/util.h
@ -20,7 +20,7 @@
 #include "xenia/cpu/processor.h"
 #include "xenia/cpu/test_module.h"

-#include "third_party/catch/single_include/catch.hpp"
+#include "third_party/catch/include/catch.hpp"

 #define XENIA_TEST_X64 1

--- a/src/xenia/cpu/xex_module.cc
+++ b/src/xenia/cpu/xex_module.cc
@ -249,12 +249,11 @@ int XexModule::ApplyPatch(XexModule* module) {

  // Patch base XEX header
  uint32_t original_image_size = module->image_size();
-  uint32_t header_target_size = patch_header->delta_headers_target_offset +
-                                patch_header->delta_headers_source_size;
+  uint32_t header_target_size = patch_header->size_of_target_headers;

  if (!header_target_size) {
-    header_target_size =
-        patch_header->size_of_target_headers;  // unsure which is more correct..
+    header_target_size = patch_header->delta_headers_target_offset +
+                         patch_header->delta_headers_source_size;
  }

  size_t mem_size = module->xex_header_mem_.size();
@ -299,6 +298,9 @@ int XexModule::ApplyPatch(XexModule* module) {
    module->xex_header_mem_.resize(header_target_size);
  }

+  // Update security info context with latest security info data
+  module->ReadSecurityInfo();
+
  uint32_t new_image_size = module->image_size();

  // Check if we need to alloc new memory for the patched xex
@ -446,14 +448,9 @@ int XexModule::ApplyPatch(XexModule* module) {
      }
    }

-    // byteswap versions because of bitfields...
    xex2_version source_ver, target_ver;
-    source_ver.value =
-        xe::byte_swap<uint32_t>(patch_header->source_version.value);
-
-    target_ver.value =
-        xe::byte_swap<uint32_t>(patch_header->target_version.value);
-
+    source_ver = patch_header->source_version();
+    target_ver = patch_header->target_version();
    XELOGI(
        "XEX patch applied successfully: base version: {}.{}.{}.{}, new "
        "version: {}.{}.{}.{}",
@ -867,25 +864,7 @@ int XexModule::ReadPEHeaders() {
  return 0;
 }

-bool XexModule::Load(const std::string_view name, const std::string_view path,
-                     const void* xex_addr, size_t xex_length) {
-  auto src_header = reinterpret_cast<const xex2_header*>(xex_addr);
-
-  if (src_header->magic == 'XEX1') {
-    xex_format_ = kFormatXex1;
-  } else if (src_header->magic == 'XEX2') {
-    xex_format_ = kFormatXex2;
-  } else {
-    return false;
-  }
-
-  assert_false(loaded_);
-  loaded_ = true;
-
-  // Read in XEX headers
-  xex_header_mem_.resize(src_header->header_size);
-  std::memcpy(xex_header_mem_.data(), src_header, src_header->header_size);
-
+void XexModule::ReadSecurityInfo() {
  if (xex_format_ == kFormatXex1) {
    const xex1_security_info* xex1_sec_info =
        reinterpret_cast<const xex1_security_info*>(
@ -913,6 +892,29 @@ bool XexModule::Load(const std::string_view name, const std::string_view path,
    security_info_.page_descriptor_count = xex2_sec_info->page_descriptor_count;
    security_info_.page_descriptors = xex2_sec_info->page_descriptors;
  }
+}
+
+bool XexModule::Load(const std::string_view name, const std::string_view path,
+                     const void* xex_addr, size_t xex_length) {
+  auto src_header = reinterpret_cast<const xex2_header*>(xex_addr);
+
+  if (src_header->magic == kXEX1Signature) {
+    xex_format_ = kFormatXex1;
+  } else if (src_header->magic == kXEX2Signature) {
+    xex_format_ = kFormatXex2;
+  } else {
+    return false;
+  }
+
+  assert_false(loaded_);
+  loaded_ = true;
+
+  // Read in XEX headers
+  xex_header_mem_.resize(src_header->header_size);
+  std::memcpy(xex_header_mem_.data(), src_header, src_header->header_size);
+
+  // Read/convert XEX1/XEX2 security info to a common format
+  ReadSecurityInfo();

  auto sec_header = xex_security_info();

@ -1104,8 +1106,8 @@ bool XexModule::SetupLibraryImports(const std::string_view name,
  ImportLibrary library_info;
  library_info.name = base_name;
  library_info.id = library->id;
-  library_info.version.value = library->version.value;
-  library_info.min_version.value = library->version_min.value;
+  library_info.version.value = library->version().value;
+  library_info.min_version.value = library->version_min().value;

  // Imports are stored as {import descriptor, thunk addr, import desc, ...}
  // Even thunks have an import descriptor (albeit unused/useless)
--- a/src/xenia/cpu/xex_module.h
+++ b/src/xenia/cpu/xex_module.h
@ -25,6 +25,10 @@ class KernelState;
 namespace xe {
 namespace cpu {

+constexpr fourcc_t kXEX1Signature = make_fourcc("XEX1");
+constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2");
+constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F');
+
 class Runtime;

 class XexModule : public xe::cpu::Module {
@ -170,6 +174,8 @@ class XexModule : public xe::cpu::Module {
  std::unique_ptr<Function> CreateFunction(uint32_t address) override;

 private:
+  void ReadSecurityInfo();
+
  int ReadImage(const void* xex_addr, size_t xex_length, bool use_dev_key);
  int ReadImageUncompressed(const void* xex_addr, size_t xex_length);
  int ReadImageBasicCompressed(const void* xex_addr, size_t xex_length);
--- a/src/xenia/emulator.cc
+++ b/src/xenia/emulator.cc
@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
@ -68,7 +68,8 @@ Emulator::Emulator(const std::filesystem::path& command_line,
      storage_root_(storage_root),
      content_root_(content_root),
      cache_root_(cache_root),
-      game_title_(),
+      title_name_(),
+      title_version_(),
      display_window_(nullptr),
      memory_(),
      audio_system_(),
@ -78,7 +79,7 @@ Emulator::Emulator(const std::filesystem::path& command_line,
      file_system_(),
      kernel_state_(),
      main_thread_(),
-      title_id_(0),
+      title_id_(std::nullopt),
      paused_(false),
      restoring_(false),
      restore_fence_() {}
@ -246,8 +247,9 @@ X_STATUS Emulator::TerminateTitle() {
  }

  kernel_state_->TerminateTitle();
-  title_id_ = 0;
-  game_title_ = "";
+  title_id_ = std::nullopt;
+  title_name_ = "";
+  title_version_ = "";
  on_terminate();
  return X_STATUS_SUCCESS;
 }
@ -418,8 +420,11 @@ bool Emulator::SaveToFile(const std::filesystem::path& path) {

  // Save the emulator state to a file
  ByteStream stream(map->data(), map->size());
-  stream.Write('XSAV');
-  stream.Write(title_id_);
+  stream.Write(kEmulatorSaveSignature);
+  stream.Write(title_id_.has_value());
+  if (title_id_.has_value()) {
+    stream.Write(title_id_.value());
+  }

  // It's important we don't hold the global lock here! XThreads need to step
  // forward (possibly through guarded regions) without worry!
@ -449,12 +454,19 @@ bool Emulator::RestoreFromFile(const std::filesystem::path& path) {

  auto lock = global_critical_region::AcquireDirect();
  ByteStream stream(map->data(), map->size());
-  if (stream.Read<uint32_t>() != 'XSAV') {
+  if (stream.Read<uint32_t>() != kEmulatorSaveSignature) {
    return false;
  }

-  auto title_id = stream.Read<uint32_t>();
-  if (title_id != title_id_) {
+  // The title ID is stored as a presence flag followed by the value (only if
+  // present), mirroring SaveToFile.
+  std::optional<uint32_t> title_id;
+  if (stream.Read<bool>()) {
+    title_id = stream.Read<uint32_t>();
+  }
+  // Compare the optionals as a whole - calling value() on an empty optional
+  // (no title on either side) would throw std::bad_optional_access.
+  if (title_id_ != title_id) {
    // Swapping between titles is unsupported at the moment.
    assert_always();
    return false;
@ -642,11 +654,28 @@ std::string Emulator::FindLaunchModule() {
  return path + default_module;
 }

+static std::string format_version(xex2_version version) {
+  // Copy the bit fields to plain integers - fmt::format can't take bit fields.
+  const uint32_t major = version.major;
+  const uint32_t minor = version.minor;
+  const uint32_t build = version.build;
+  const uint32_t qfe = version.qfe;
+  // Use the shortest form that doesn't drop a non-zero qfe or build component.
+  if (qfe) {
+    return fmt::format("{}.{}.{}.{}", major, minor, build, qfe);
+  }
+  if (build) {
+    return fmt::format("{}.{}.{}", major, minor, build);
+  }
+  return fmt::format("{}.{}", major, minor);
+}
+
 X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
                                  const std::string_view module_path) {
  // Reset state.
-  title_id_ = 0;
-  game_title_ = "";
+  title_id_ = std::nullopt;
+  title_name_ = "";
+  title_version_ = "";
  display_window_->SetIcon(nullptr, 0);

  // Allow xam to request module loads.
@ -662,8 +691,15 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
  // Grab the current title ID.
  xex2_opt_execution_info* info = nullptr;
  module->GetOptHeader(XEX_HEADER_EXECUTION_INFO, &info);
-  if (info) {
+
+  if (!info) {
+    title_id_ = 0;
+  } else {
    title_id_ = info->title_id;
+    auto title_version = info->version();
+    if (title_version.value != 0) {
+      title_version_ = format_version(title_version);
+    }
  }

  // Try and load the resource database (xex only).
@ -677,7 +713,12 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
      kernel::util::XdbfGameData db(
          module->memory()->TranslateVirtual(resource_data), resource_size);
      if (db.is_valid()) {
-        game_title_ = db.title();
+        // TODO(gibbed): get title respective to user locale.
+        title_name_ = db.title(XLanguage::kEnglish);
+        if (title_name_.empty()) {
+          // If English title is unavailable, get the title in default locale.
+          title_name_ = db.title();
+        }
        auto icon_block = db.icon();
        if (icon_block) {
          display_window_->SetIcon(icon_block.buffer, icon_block.size);
@ -691,7 +732,8 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
  // playing before the video can be seen if doing this in parallel with the
  // main thread.
  on_shader_storage_initialization(true);
-  graphics_system_->InitializeShaderStorage(cache_root_, title_id_, true);
+  graphics_system_->InitializeShaderStorage(cache_root_, title_id_.value(),
+                                            true);
  on_shader_storage_initialization(false);

  auto main_thread = kernel_state_->LaunchModule(module);
@ -699,7 +741,7 @@ X_STATUS Emulator::CompleteLaunch(const std::filesystem::path& path,
    return X_STATUS_UNSUCCESSFUL;
  }
  main_thread_ = main_thread;
-  on_launch(title_id_, game_title_);
+  on_launch(title_id_.value(), title_name_);

  return X_STATUS_SUCCESS;
 }
--- a/src/xenia/emulator.h
+++ b/src/xenia/emulator.h
@ -11,6 +11,7 @@
 #define XENIA_EMULATOR_H_

 #include <functional>
+#include <optional>
 #include <string>

 #include "xenia/base/delegate.h"
@ -43,6 +44,8 @@ class Window;

 namespace xe {

+constexpr fourcc_t kEmulatorSaveSignature = make_fourcc("XSAV");
+
 // The main type that runs the whole emulator.
 // This is responsible for initializing and managing all the various subsystems.
 class Emulator {
@ -65,14 +68,19 @@ class Emulator {
  // Folder files safe to remove without significant side effects are stored in.
  const std::filesystem::path& cache_root() const { return cache_root_; }

-  // Title of the game in the default language.
-  const std::string& game_title() const { return game_title_; }
+  // Name of the title (English if available, otherwise the default language).
+  const std::string& title_name() const { return title_name_; }
+
+  // Version of the title as a string.
+  const std::string& title_version() const { return title_version_; }

  // Currently running title ID
-  uint32_t title_id() const { return title_id_; }
+  uint32_t title_id() const {
+    return title_id_.value_or(0);  // 0 if no title ID is set.
+  }

  // Are we currently running a title?
-  bool is_title_open() const { return title_id_ != 0; }
+  bool is_title_open() const { return title_id_.has_value(); }

  // Window used for displaying graphical output.
  ui::Window* display_window() const { return display_window_; }
@ -172,7 +180,8 @@ class Emulator {
  std::filesystem::path content_root_;
  std::filesystem::path cache_root_;

-  std::string game_title_;
+  std::string title_name_;
+  std::string title_version_;

  ui::Window* display_window_;

@ -188,7 +197,7 @@ class Emulator {

  std::unique_ptr<kernel::KernelState> kernel_state_;
  kernel::object_ref<kernel::XThread> main_thread_;
-  uint32_t title_id_;  // Currently running title ID
+  std::optional<uint32_t> title_id_;  // Currently running title ID

  bool paused_;
  bool restoring_;
--- a/src/xenia/gpu/command_processor.cc
+++ b/src/xenia/gpu/command_processor.cc
@ -257,22 +257,21 @@ bool CommandProcessor::SetupContext() { return true; }

 void CommandProcessor::ShutdownContext() { context_.reset(); }

-void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t log2_size) {
+void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t size_log2) {
  read_ptr_index_ = 0;
  primary_buffer_ptr_ = ptr;
-  primary_buffer_size_ = 1 << log2_size;
+  primary_buffer_size_ = uint32_t(1) << (size_log2 + 3);
 }

 void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr,
-                                                  uint32_t block_size) {
+                                                  uint32_t block_size_log2) {
  // CP_RB_RPTR_ADDR Ring Buffer Read Pointer Address 0x70C
  // ptr = RB_RPTR_ADDR, pointer to write back the address to.
  read_ptr_writeback_ptr_ = ptr;
  // CP_RB_CNTL Ring Buffer Control 0x704
-  // block_size = RB_BLKSZ, number of quadwords read between updates of the
-  //              read pointer.
-  read_ptr_update_freq_ =
-      static_cast<uint32_t>(pow(2.0, static_cast<double>(block_size)) / 4);
+  // block_size_log2 = RB_BLKSZ, log2 of the number of quadwords read between
+  //                   updates of the read pointer.
+  read_ptr_update_freq_ = uint32_t(1) << block_size_log2 >> 2;
 }

 void CommandProcessor::UpdateWritePointer(uint32_t value) {
@ -825,8 +824,8 @@ bool CommandProcessor::ExecutePacketType3_XE_SWAP(RingBuffer* reader,
  // VdSwap will post this to tell us we need to swap the screen/fire an
  // interrupt.
  // 63 words here, but only the first has any data.
-  uint32_t magic = reader->ReadAndSwap<uint32_t>();
-  assert_true(magic == 'SWAP');
+  uint32_t magic = reader->ReadAndSwap<fourcc_t>();
+  assert_true(magic == kSwapSignature);

  // TODO(benvanik): only swap frontbuffer ptr.
  uint32_t frontbuffer_ptr = reader->ReadAndSwap<uint32_t>();
@ -1146,6 +1145,8 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_EXT(RingBuffer* reader,
 bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader,
                                                          uint32_t packet,
                                                          uint32_t count) {
+  // Set by D3D as BE but struct ABI is LE
+  const uint32_t kQueryFinished = xe::byte_swap(0xFFFFFEED);
  assert_true(count == 1);
  uint32_t initiator = reader->ReadAndSwap<uint32_t>();
  // Writeback initiator.
@ -1161,10 +1162,13 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader,
            register_file_->values[XE_GPU_REG_RB_SAMPLE_COUNT_ADDR].u32);
    // 0xFFFFFEED is written to this two locations by D3D only on D3DISSUE_END
    // and used to detect a finished query.
-    bool isEnd = pSampleCounts->ZPass_A == xe::byte_swap(0xFFFFFEED) &&
-                 pSampleCounts->ZPass_B == xe::byte_swap(0xFFFFFEED);
+    bool is_end_via_z_pass = pSampleCounts->ZPass_A == kQueryFinished &&
+                             pSampleCounts->ZPass_B == kQueryFinished;
+    // Older versions of D3D also check for ZFail (the first Gears of War).
+    bool is_end_via_z_fail = pSampleCounts->ZFail_A == kQueryFinished &&
+                             pSampleCounts->ZFail_B == kQueryFinished;
    std::memset(pSampleCounts, 0, sizeof(xe_gpu_depth_sample_counts));
-    if (isEnd) {
+    if (is_end_via_z_pass || is_end_via_z_fail) {
      pSampleCounts->ZPass_A = fake_sample_count;
      pSampleCounts->Total_A = fake_sample_count;
    }
@ -1173,40 +1177,77 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader,
  return true;
 }

-bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingBuffer* reader,
-                                                    uint32_t packet,
-                                                    uint32_t count) {
-  // initiate fetch of index buffer and draw
-  // if dword0 != 0, this is a conditional draw based on viz query.
+bool CommandProcessor::ExecutePacketType3Draw(RingBuffer* reader,
+                                              uint32_t packet,
+                                              const char* opcode_name,
+                                              uint32_t viz_query_condition,
+                                              uint32_t count_remaining) {
+  // if viz_query_condition != 0, this is a conditional draw based on viz query.
  // This ID matches the one issued in PM4_VIZ_QUERY
-  uint32_t dword0 = reader->ReadAndSwap<uint32_t>();  // viz query info
-  // uint32_t viz_id = dword0 & 0x3F;
+  // uint32_t viz_id = viz_query_condition & 0x3F;
  // when true, render conditionally based on query result
-  // uint32_t viz_use = dword0 & 0x100;
+  // uint32_t viz_use = viz_query_condition & 0x100;

+  assert_not_zero(count_remaining);
+  if (!count_remaining) {
+    XELOGE("{}: Packet too small, can't read VGT_DRAW_INITIATOR", opcode_name);
+    return false;
+  }
  reg::VGT_DRAW_INITIATOR vgt_draw_initiator;
  vgt_draw_initiator.value = reader->ReadAndSwap<uint32_t>();
+  --count_remaining;
  WriteRegister(XE_GPU_REG_VGT_DRAW_INITIATOR, vgt_draw_initiator.value);

+  bool success = true;
+  // TODO(Triang3l): Remove IndexBufferInfo and replace handling of all this
+  // with PrimitiveProcessor when the old Vulkan renderer is removed.
  bool is_indexed = false;
  IndexBufferInfo index_buffer_info;
  switch (vgt_draw_initiator.source_select) {
    case xenos::SourceSelect::kDMA: {
      // Indexed draw.
      is_indexed = true;
-      index_buffer_info.guest_base = reader->ReadAndSwap<uint32_t>();
-      uint32_t index_size = reader->ReadAndSwap<uint32_t>();
-      index_buffer_info.endianness =
-          static_cast<xenos::Endian>(index_size >> 30);
-      index_size &= 0x00FFFFFF;
+
+      // Two separate bounds checks so if there's only one missing register
+      // value out of two, one uint32_t will be skipped in the command buffer,
+      // not two.
+      assert_not_zero(count_remaining);
+      if (!count_remaining) {
+        XELOGE("{}: Packet too small, can't read VGT_DMA_BASE", opcode_name);
+        return false;
+      }
+      uint32_t vgt_dma_base = reader->ReadAndSwap<uint32_t>();
+      --count_remaining;
+      WriteRegister(XE_GPU_REG_VGT_DMA_BASE, vgt_dma_base);
+      reg::VGT_DMA_SIZE vgt_dma_size;
+      assert_not_zero(count_remaining);
+      if (!count_remaining) {
+        XELOGE("{}: Packet too small, can't read VGT_DMA_SIZE", opcode_name);
+        return false;
+      }
+      vgt_dma_size.value = reader->ReadAndSwap<uint32_t>();
+      --count_remaining;
+      WriteRegister(XE_GPU_REG_VGT_DMA_SIZE, vgt_dma_size.value);
+
+      uint32_t index_size_bytes =
+          vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16
+              ? sizeof(uint16_t)
+              : sizeof(uint32_t);
+      // The base address must already be word-aligned according to the R6xx
+      // documentation, but mask it to the index size anyway for safety.
+      index_buffer_info.guest_base = vgt_dma_base & ~(index_size_bytes - 1);
+      index_buffer_info.endianness = vgt_dma_size.swap_mode;
      index_buffer_info.format = vgt_draw_initiator.index_size;
-      index_size *=
-          (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32) ? 4 : 2;
-      index_buffer_info.length = index_size;
+      index_buffer_info.length = vgt_dma_size.num_words * index_size_bytes;
      index_buffer_info.count = vgt_draw_initiator.num_indices;
    } break;
    case xenos::SourceSelect::kImmediate: {
      // TODO(Triang3l): VGT_IMMED_DATA.
+      XELOGE(
+          "{}: Using immediate vertex indices, which are not supported yet. "
+          "Report the game to Xenia developers!",
+          opcode_name, uint32_t(vgt_draw_initiator.source_select));
+      success = false;
      assert_always();
    } break;
    case xenos::SourceSelect::kAutoIndex: {
@ -1215,71 +1256,65 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingBuffer* reader,
      index_buffer_info.length = 0;
    } break;
    default: {
-      // Invalid source select.
-      assert_always();
+      // Invalid source selection.
+      success = false;
+      assert_unhandled_case(vgt_draw_initiator.source_select);
    } break;
  }

-  auto viz_query = register_file_->Get<reg::PA_SC_VIZ_QUERY>();
-  if (viz_query.viz_query_ena && viz_query.kill_pix_post_hi_z) {
-    // TODO(Triang3l): Don't drop the draw call completely if the vertex shader
-    // has memexport.
-    // TODO(Triang3l || JoelLinn): Handle this properly in the render backends.
-    return true;
+  // Skip to the next command, for example, if there are immediate indexes that
+  // we don't support yet.
+  reader->AdvanceRead(count_remaining * sizeof(uint32_t));
+
+  if (success) {
+    auto viz_query = register_file_->Get<reg::PA_SC_VIZ_QUERY>();
+    if (!(viz_query.viz_query_ena && viz_query.kill_pix_post_hi_z)) {
+      // TODO(Triang3l): Don't drop the draw call completely if the vertex
+      // shader has memexport.
+      // TODO(Triang3l || JoelLinn): Handle this properly in the render
+      // backends.
+      success = IssueDraw(
+          vgt_draw_initiator.prim_type, vgt_draw_initiator.num_indices,
+          is_indexed ? &index_buffer_info : nullptr,
+          xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
+                                     vgt_draw_initiator.prim_type));
+      if (!success) {
+        XELOGE("{}({}, {}, {}): Failed in backend", opcode_name,
+               vgt_draw_initiator.num_indices,
+               uint32_t(vgt_draw_initiator.prim_type),
+               uint32_t(vgt_draw_initiator.source_select));
+      }
+    }
  }

-  bool success =
-      IssueDraw(vgt_draw_initiator.prim_type, vgt_draw_initiator.num_indices,
-                is_indexed ? &index_buffer_info : nullptr,
-                xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
-                                           vgt_draw_initiator.prim_type));
-  if (!success) {
-    XELOGE("PM4_DRAW_INDX({}, {}, {}): Failed in backend",
-           vgt_draw_initiator.num_indices,
-           uint32_t(vgt_draw_initiator.prim_type),
-           uint32_t(vgt_draw_initiator.source_select));
-  }
+  return success;
+}

-  return true;
+bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingBuffer* reader,
+                                                    uint32_t packet,
+                                                    uint32_t count) {
+  // PM4_DRAW_INDX - "initiate fetch of index buffer and draw".
+  // Generally used by Xbox 360 Direct3D 9 for kDMA and kAutoIndex sources.
+  // The first dword is the viz query token; everything after it is handled
+  // by the draw code shared with PM4_DRAW_INDX_2.
+  assert_not_zero(count);
+  if (count == 0) {
+    XELOGE("PM4_DRAW_INDX: Packet too small, can't read the viz query token");
+    return false;
+  }
+  const uint32_t viz_query_token = reader->ReadAndSwap<uint32_t>();
+  const uint32_t dwords_after_token = count - 1;
+  return ExecutePacketType3Draw(reader, packet, "PM4_DRAW_INDX",
+                                viz_query_token, dwords_after_token);
 }

 bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingBuffer* reader,
                                                      uint32_t packet,
                                                      uint32_t count) {
-  // draw using supplied indices in packet
-  reg::VGT_DRAW_INITIATOR vgt_draw_initiator;
-  vgt_draw_initiator.value = reader->ReadAndSwap<uint32_t>();
-  WriteRegister(XE_GPU_REG_VGT_DRAW_INITIATOR, vgt_draw_initiator.value);
-  assert_true(vgt_draw_initiator.source_select ==
-              xenos::SourceSelect::kAutoIndex);
-  // Index buffer unused as automatic.
-  // uint32_t indices_size =
-  //     vgt_draw_initiator.num_indices *
-  //         (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32 ? 4
-  //                                                                      : 2);
-  // uint32_t index_ptr = reader->ptr();
-  // TODO(Triang3l): VGT_IMMED_DATA.
-  reader->AdvanceRead((count - 1) * sizeof(uint32_t));
-
-  auto viz_query = register_file_->Get<reg::PA_SC_VIZ_QUERY>();
-  if (viz_query.viz_query_ena && viz_query.kill_pix_post_hi_z) {
-    // TODO(Triang3l): Don't drop the draw call completely if the vertex shader
-    // has memexport.
-    // TODO(Triang3l || JoelLinn): Handle this properly in the render backends.
-    return true;
-  }
-
-  bool success = IssueDraw(
-      vgt_draw_initiator.prim_type, vgt_draw_initiator.num_indices, nullptr,
-      xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
-                                 vgt_draw_initiator.prim_type));
-  if (!success) {
-    XELOGE("PM4_DRAW_INDX_IMM({}, {}): Failed in backend",
-           vgt_draw_initiator.num_indices,
-           uint32_t(vgt_draw_initiator.prim_type));
-  }
-
-  return true;
+  // "draw using supplied indices in packet"
+  // Generally used by Xbox 360 Direct3D 9 for kAutoIndex source.
+  // No viz query token.
+  return ExecutePacketType3Draw(reader, packet, "PM4_DRAW_INDX_2", 0, count);
 }

 bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingBuffer* reader,
--- a/src/xenia/gpu/command_processor.h
+++ b/src/xenia/gpu/command_processor.h
@ -144,8 +144,8 @@ class CommandProcessor {

  virtual void RestoreEdramSnapshot(const void* snapshot) = 0;

-  void InitializeRingBuffer(uint32_t ptr, uint32_t page_count);
-  void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size);
+  void InitializeRingBuffer(uint32_t ptr, uint32_t size_log2);
+  void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size_log2);

  void UpdateWritePointer(uint32_t value);

@ -218,6 +218,10 @@ class CommandProcessor {
                                          uint32_t count);
  bool ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader, uint32_t packet,
                                          uint32_t count);
+  bool ExecutePacketType3Draw(RingBuffer* reader, uint32_t packet,
+                              const char* opcode_name,
+                              uint32_t viz_query_condition,
+                              uint32_t count_remaining);
  bool ExecutePacketType3_DRAW_INDX(RingBuffer* reader, uint32_t packet,
                                    uint32_t count);
  bool ExecutePacketType3_DRAW_INDX_2(RingBuffer* reader, uint32_t packet,
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h
@ -20,11 +20,11 @@
 #include "xenia/base/assert.h"
 #include "xenia/gpu/command_processor.h"
 #include "xenia/gpu/d3d12/d3d12_graphics_system.h"
+#include "xenia/gpu/d3d12/d3d12_primitive_processor.h"
+#include "xenia/gpu/d3d12/d3d12_render_target_cache.h"
 #include "xenia/gpu/d3d12/d3d12_shared_memory.h"
 #include "xenia/gpu/d3d12/deferred_command_list.h"
 #include "xenia/gpu/d3d12/pipeline_cache.h"
-#include "xenia/gpu/d3d12/primitive_converter.h"
-#include "xenia/gpu/d3d12/render_target_cache.h"
 #include "xenia/gpu/d3d12/texture_cache.h"
 #include "xenia/gpu/draw_util.h"
 #include "xenia/gpu/dxbc_shader.h"
@ -89,7 +89,7 @@ class D3D12CommandProcessor : public CommandProcessor {
  // there are 4 render targets bound with the same EDRAM base (clearly not
  // correct usage), but the shader only clears 1, and then EDRAM buffer stores
  // conflict with each other.
-  uint32_t GetCurrentColorMask(const Shader* pixel_shader) const;
+  uint32_t GetCurrentColorMask(uint32_t shader_writes_color_targets) const;

  void PushTransitionBarrier(
      ID3D12Resource* resource, D3D12_RESOURCE_STATES old_state,
@ -131,7 +131,7 @@ class D3D12CommandProcessor : public CommandProcessor {
  // descriptors, this must only be used to allocate SRVs, otherwise it won't
  // work on Nvidia Fermi (root signature creation will fail)!
  bool RequestOneUseSingleViewDescriptors(
-      uint32_t count, ui::d3d12::util::DescriptorCPUGPUHandlePair* handles_out);
+      uint32_t count, ui::d3d12::util::DescriptorCpuGpuHandlePair* handles_out);
  // These are needed often, so they are always allocated.
  enum class SystemBindlessView : uint32_t {
    kSharedMemoryRawSRV,
@ -149,6 +149,7 @@ class D3D12CommandProcessor : public CommandProcessor {
    kEdramR32G32B32A32UintSRV,
    kEdramRawUAV,
    kEdramR32UintUAV,
+    kEdramR32G32UintUAV,
    kEdramR32G32B32A32UintUAV,

    kGammaRampNormalSRV,
@ -164,16 +165,18 @@ class D3D12CommandProcessor : public CommandProcessor {

    kCount,
  };
-  ui::d3d12::util::DescriptorCPUGPUHandlePair GetSystemBindlessViewHandlePair(
+  ui::d3d12::util::DescriptorCpuGpuHandlePair GetSystemBindlessViewHandlePair(
      SystemBindlessView view) const;
-  ui::d3d12::util::DescriptorCPUGPUHandlePair
+  ui::d3d12::util::DescriptorCpuGpuHandlePair
  GetSharedMemoryUintPow2BindlessSRVHandlePair(
      uint32_t element_size_bytes_pow2) const;
-  ui::d3d12::util::DescriptorCPUGPUHandlePair
+  ui::d3d12::util::DescriptorCpuGpuHandlePair
  GetSharedMemoryUintPow2BindlessUAVHandlePair(
      uint32_t element_size_bytes_pow2) const;
-  ui::d3d12::util::DescriptorCPUGPUHandlePair
+  ui::d3d12::util::DescriptorCpuGpuHandlePair
  GetEdramUintPow2BindlessSRVHandlePair(uint32_t element_size_bytes_pow2) const;
+  ui::d3d12::util::DescriptorCpuGpuHandlePair
+  GetEdramUintPow2BindlessUAVHandlePair(uint32_t element_size_bytes_pow2) const;

  // Returns a single temporary GPU-side buffer within a submission for tasks
  // like texture untiling and resolving.
@ -185,19 +188,20 @@ class D3D12CommandProcessor : public CommandProcessor {
  void ReleaseScratchGPUBuffer(ID3D12Resource* buffer,
                               D3D12_RESOURCE_STATES new_state);

-  // Sets the current SSAA sample positions, needs to be done before setting
-  // render targets or copying to depth render targets.
-  void SetSamplePositions(xenos::MsaaSamples sample_positions);
-
  // Returns a pipeline with deferred creation by its handle. May return nullptr
  // if failed to create the pipeline.
  ID3D12PipelineState* GetD3D12PipelineByHandle(void* handle) const {
    return pipeline_cache_->GetD3D12PipelineByHandle(handle);
  }

-  // Sets the current pipeline to a compute one. This is for cache invalidation
-  // primarily. A submission must be open.
-  void SetComputePipeline(ID3D12PipelineState* pipeline);
+  // Sets the current cached values to external ones. This is for cache
+  // invalidation primarily. A submission must be open.
+  void SetExternalPipeline(ID3D12PipelineState* pipeline);
+  void SetExternalGraphicsRootSignature(ID3D12RootSignature* root_signature);
+  void SetViewport(const D3D12_VIEWPORT& viewport);
+  void SetScissorRect(const D3D12_RECT& scissor_rect);
+  void SetStencilReference(uint32_t stencil_ref);
+  void SetPrimitiveTopology(D3D12_PRIMITIVE_TOPOLOGY primitive_topology);

  // For the pipeline cache to call when binding layout UIDs may be reused.
  void NotifyShaderBindingsLayoutUIDsInvalidated();
@ -351,12 +355,13 @@ class D3D12CommandProcessor : public CommandProcessor {
  void UpdateFixedFunctionState(const draw_util::ViewportInfo& viewport_info,
                                const draw_util::Scissor& scissor,
                                bool primitive_polygonal);
-  void UpdateSystemConstantValues(
-      bool shared_memory_is_uav, bool primitive_polygonal,
-      uint32_t line_loop_closing_index, xenos::Endian index_endian,
-      const draw_util::ViewportInfo& viewport_info, uint32_t pixel_size_x,
-      uint32_t pixel_size_y, uint32_t used_texture_mask, uint32_t color_mask,
-      const RenderTargetCache::PipelineRenderTarget render_targets[4]);
+  void UpdateSystemConstantValues(bool shared_memory_is_uav,
+                                  bool primitive_polygonal,
+                                  uint32_t line_loop_closing_index,
+                                  xenos::Endian index_endian,
+                                  const draw_util::ViewportInfo& viewport_info,
+                                  uint32_t used_texture_mask,
+                                  uint32_t color_mask);
  bool UpdateBindings(const D3D12Shader* vertex_shader,
                      const D3D12Shader* pixel_shader,
                      ID3D12RootSignature* root_signature);
@ -418,10 +423,8 @@ class D3D12CommandProcessor : public CommandProcessor {
  // bindful - mainly because of CopyDescriptorsSimple, which takes the majority
  // of UpdateBindings time, and that's outside the emulator's control even).
  bool bindless_resources_used_ = false;
-  // Should a rasterizer-ordered UAV of the EDRAM buffer with format conversion
-  // and blending performed in pixel shaders be used instead of host render
-  // targets.
-  bool edram_rov_used_ = false;
+
+  std::unique_ptr<D3D12RenderTargetCache> render_target_cache_;

  std::unique_ptr<ui::d3d12::D3D12UploadBufferPool> constant_buffer_pool_;

@ -487,14 +490,12 @@ class D3D12CommandProcessor : public CommandProcessor {

  std::unique_ptr<D3D12SharedMemory> shared_memory_;

+  std::unique_ptr<D3D12PrimitiveProcessor> primitive_processor_;
+
  std::unique_ptr<PipelineCache> pipeline_cache_;

  std::unique_ptr<TextureCache> texture_cache_;

-  std::unique_ptr<RenderTargetCache> render_target_cache_;
-
-  std::unique_ptr<PrimitiveConverter> primitive_converter_;
-
  // Mip 0 contains the normal gamma ramp (256 entries), mip 1 contains the PWL
  // ramp (128 entries). DXGI_FORMAT_R10G10B10A2_UNORM 1D.
  ID3D12Resource* gamma_ramp_texture_ = nullptr;
@ -508,10 +509,9 @@ class D3D12CommandProcessor : public CommandProcessor {
  static constexpr uint32_t kSwapTextureWidth = 1280;
  static constexpr uint32_t kSwapTextureHeight = 720;
  std::pair<uint32_t, uint32_t> GetSwapTextureSize() const {
-    if (texture_cache_->IsResolutionScale2X()) {
-      return std::make_pair(kSwapTextureWidth * 2, kSwapTextureHeight * 2);
-    }
-    return std::make_pair(kSwapTextureWidth, kSwapTextureHeight);
+    uint32_t resolution_scale = texture_cache_->GetDrawResolutionScale();
+    return std::make_pair(kSwapTextureWidth * resolution_scale,
+                          kSwapTextureHeight * resolution_scale);
  }
  ID3D12Resource* swap_texture_ = nullptr;
  D3D12_PLACED_SUBRESOURCE_FOOTPRINT swap_texture_copy_footprint_;
@ -549,9 +549,6 @@ class D3D12CommandProcessor : public CommandProcessor {
  bool ff_blend_factor_update_needed_;
  bool ff_stencil_ref_update_needed_;

-  // Current SSAA sample positions (to be updated by the render target cache).
-  xenos::MsaaSamples current_sample_positions_;
-
  // Currently bound pipeline, either a graphics pipeline from the pipeline
  // cache (with potentially deferred creation - current_external_pipeline_ is
  // nullptr in this case) or a non-Xenos graphics or compute pipeline
--- a/src/xenia/gpu/d3d12/d3d12_graphics_system.cc
+++ b/src/xenia/gpu/d3d12/d3d12_graphics_system.cc
@ -9,6 +9,8 @@

 #include "xenia/gpu/d3d12/d3d12_graphics_system.h"

+#include <algorithm>
+
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/gpu/d3d12/d3d12_command_processor.h"
@ -20,10 +22,12 @@ namespace xe {
 namespace gpu {
 namespace d3d12 {

-// Generated with `xb gendxbc`.
-#include "xenia/gpu/shaders/bytecode/d3d12_5_1/fullscreen_vs.h"
+// Generated with `xb buildshaders`.
+namespace shaders {
+#include "xenia/gpu/shaders/bytecode/d3d12_5_1/fullscreen_tc_vs.h"
 #include "xenia/gpu/shaders/bytecode/d3d12_5_1/stretch_gamma_ps.h"
 #include "xenia/gpu/shaders/bytecode/d3d12_5_1/stretch_ps.h"
+}  // namespace shaders

 D3D12GraphicsSystem::D3D12GraphicsSystem() {}

@ -138,10 +142,10 @@ X_STATUS D3D12GraphicsSystem::Setup(cpu::Processor* processor,
  // Create the stretch pipelines.
  D3D12_GRAPHICS_PIPELINE_STATE_DESC stretch_pipeline_desc = {};
  stretch_pipeline_desc.pRootSignature = stretch_root_signature_;
-  stretch_pipeline_desc.VS.pShaderBytecode = fullscreen_vs;
-  stretch_pipeline_desc.VS.BytecodeLength = sizeof(fullscreen_vs);
-  stretch_pipeline_desc.PS.pShaderBytecode = stretch_ps;
-  stretch_pipeline_desc.PS.BytecodeLength = sizeof(stretch_ps);
+  stretch_pipeline_desc.VS.pShaderBytecode = shaders::fullscreen_tc_vs;
+  stretch_pipeline_desc.VS.BytecodeLength = sizeof(shaders::fullscreen_tc_vs);
+  stretch_pipeline_desc.PS.pShaderBytecode = shaders::stretch_ps;
+  stretch_pipeline_desc.PS.BytecodeLength = sizeof(shaders::stretch_ps);
  // The shader will set alpha to 1, don't use output-merger to preserve it.
  stretch_pipeline_desc.BlendState.RenderTarget[0].RenderTargetWriteMask =
      D3D12_COLOR_WRITE_ENABLE_ALL;
@ -165,8 +169,8 @@ X_STATUS D3D12GraphicsSystem::Setup(cpu::Processor* processor,
    return X_STATUS_UNSUCCESSFUL;
  }
  stretch_pipeline_desc.pRootSignature = stretch_gamma_root_signature_;
-  stretch_pipeline_desc.PS.pShaderBytecode = stretch_gamma_ps;
-  stretch_pipeline_desc.PS.BytecodeLength = sizeof(stretch_gamma_ps);
+  stretch_pipeline_desc.PS.pShaderBytecode = shaders::stretch_gamma_ps;
+  stretch_pipeline_desc.PS.BytecodeLength = sizeof(shaders::stretch_gamma_ps);
  if (FAILED(device->CreateGraphicsPipelineState(
          &stretch_pipeline_desc, IID_PPV_ARGS(&stretch_gamma_pipeline_)))) {
    XELOGE(
--- a/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_primitive_processor.cc
@ -0,0 +1,173 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include "xenia/gpu/d3d12/d3d12_primitive_processor.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "xenia/base/assert.h"
+#include "xenia/base/logging.h"
+#include "xenia/gpu/d3d12/d3d12_command_processor.h"
+#include "xenia/gpu/d3d12/deferred_command_list.h"
+#include "xenia/ui/d3d12/d3d12_provider.h"
+#include "xenia/ui/d3d12/d3d12_util.h"
+
+namespace xe {
+namespace gpu {
+namespace d3d12 {
+
+D3D12PrimitiveProcessor::~D3D12PrimitiveProcessor() {
+  // Passing true makes Shutdown release only this class's D3D12 objects and
+  // skip ShutdownCommon.
+  Shutdown(true);
+}
+
+bool D3D12PrimitiveProcessor::Initialize() {
+  // Common setup goes first; if it fails, release everything and report
+  // the failure to the caller.
+  const bool common_initialized = InitializeCommon(true, false, false, true);
+  if (!common_initialized) {
+    Shutdown();
+    return false;
+  }
+
+  // The page size of the per-frame index buffer pool is the larger of the
+  // minimum required converted index buffer size and the default page size.
+  const size_t pool_page_size =
+      std::max(size_t(kMinRequiredConvertedIndexBufferSize),
+               ui::GraphicsUploadBufferPool::kDefaultPageSize);
+  frame_index_buffer_pool_ = std::make_unique<ui::d3d12::D3D12UploadBufferPool>(
+      command_processor_.GetD3D12Context().GetD3D12Provider(), pool_page_size);
+  return true;
+}
+
+void D3D12PrimitiveProcessor::Shutdown(bool from_destructor) {
+  // Release the D3D12 objects owned by this class.
+  frame_index_buffer_pool_.reset();
+  builtin_index_buffer_upload_.Reset();
+  builtin_index_buffer_gpu_address_ = 0;
+  builtin_index_buffer_.Reset();
+
+  // The common part is shut down only on explicit calls, not when this is
+  // invoked from the destructor.
+  if (from_destructor) {
+    return;
+  }
+  ShutdownCommon();
+}
+
+void D3D12PrimitiveProcessor::CompletedSubmissionUpdated() {
+  // Nothing to release if the staging buffer is already gone.
+  if (!builtin_index_buffer_upload_) {
+    return;
+  }
+  // Keep the staging buffer until the submission recorded for its copy
+  // has been completed.
+  if (command_processor_.GetCompletedSubmission() <
+      builtin_index_buffer_upload_submission_) {
+    return;
+  }
+  builtin_index_buffer_upload_.Reset();
+}
+
+void D3D12PrimitiveProcessor::BeginSubmission() {
+  // UINT64_MAX marks a staging buffer whose copy hasn't been recorded yet.
+  const bool upload_pending =
+      builtin_index_buffer_upload_ &&
+      builtin_index_buffer_upload_submission_ == UINT64_MAX;
+  if (!upload_pending) {
+    return;
+  }
+
+  // Deferred barriers don't have to be submitted before the copy:
+  // builtin_index_buffer_ has never been used, so it's still in its initial
+  // state, and builtin_index_buffer_upload_ lives in an upload heap, which
+  // makes it GENERIC_READ.
+  ID3D12Resource* const destination = builtin_index_buffer_.Get();
+  command_processor_.GetDeferredCommandList().D3DCopyResource(
+      destination, builtin_index_buffer_upload_.Get());
+  command_processor_.PushTransitionBarrier(destination,
+                                           D3D12_RESOURCE_STATE_COPY_DEST,
+                                           D3D12_RESOURCE_STATE_INDEX_BUFFER);
+
+  // Remember which submission carries the copy so the staging buffer can be
+  // released once that submission is completed.
+  builtin_index_buffer_upload_submission_ =
+      command_processor_.GetCurrentSubmission();
+}
+
+void D3D12PrimitiveProcessor::BeginFrame() {
+  // Let the per-frame index buffer pool reclaim based on the last frame
+  // the command processor reports as completed.
+  const auto completed_frame = command_processor_.GetCompletedFrame();
+  frame_index_buffer_pool_->Reclaim(completed_frame);
+}
+
+void D3D12PrimitiveProcessor::EndFrame() {
+  // Per-frame state ends here: clear the common per-frame cache, then forget
+  // the GPU addresses handed out for this frame's converted index buffers
+  // (the backend handles indexing frame_index_buffers_ become invalid).
+  ClearPerFrameCache();
+  frame_index_buffers_.clear();
+}
+
+bool D3D12PrimitiveProcessor::InitializeBuiltin16BitIndexBuffer(
+    uint32_t index_count, std::function<void(uint16_t*)> fill_callback) {
+  // Creates the built-in index buffer in a default heap together with an
+  // upload-heap staging copy whose contents are written by fill_callback.
+  // The copy between the two is recorded later, in BeginSubmission.
+  // Returns false, leaving the members untouched, if any D3D12 call fails.
+  assert_not_zero(index_count);
+  assert_null(builtin_index_buffer_);
+  assert_null(builtin_index_buffer_upload_);
+
+  const ui::d3d12::D3D12Provider& d3d12_provider =
+      command_processor_.GetD3D12Context().GetD3D12Provider();
+  ID3D12Device* d3d12_device = d3d12_provider.GetDevice();
+
+  // The same description is used for both the GPU and the staging buffer.
+  D3D12_RESOURCE_DESC buffer_desc;
+  ui::d3d12::util::FillBufferResourceDesc(
+      buffer_desc, UINT64(sizeof(uint16_t) * index_count),
+      D3D12_RESOURCE_FLAG_NONE);
+
+  // Buffer the draws will read from, created as a copy destination.
+  Microsoft::WRL::ComPtr<ID3D12Resource> gpu_buffer;
+  HRESULT gpu_buffer_result = d3d12_device->CreateCommittedResource(
+      &ui::d3d12::util::kHeapPropertiesDefault,
+      d3d12_provider.GetHeapFlagCreateNotZeroed(), &buffer_desc,
+      D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&gpu_buffer));
+  if (FAILED(gpu_buffer_result)) {
+    XELOGE(
+        "D3D12 primitive processor: Failed to create the built-in index "
+        "buffer GPU resource with {} 16-bit indices",
+        index_count);
+    return false;
+  }
+
+  // CPU-writable staging buffer.
+  Microsoft::WRL::ComPtr<ID3D12Resource> staging_buffer;
+  HRESULT staging_buffer_result = d3d12_device->CreateCommittedResource(
+      &ui::d3d12::util::kHeapPropertiesUpload,
+      d3d12_provider.GetHeapFlagCreateNotZeroed(), &buffer_desc,
+      D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+      IID_PPV_ARGS(&staging_buffer));
+  if (FAILED(staging_buffer_result)) {
+    XELOGE(
+        "D3D12 primitive processor: Failed to create the built-in index "
+        "buffer upload resource with {} 16-bit indices",
+        index_count);
+    return false;
+  }
+
+  // Map with an empty read range, as the CPU only writes here.
+  const D3D12_RANGE empty_read_range = {};
+  void* mapped_indices = nullptr;
+  if (FAILED(staging_buffer->Map(0, &empty_read_range, &mapped_indices))) {
+    XELOGE(
+        "D3D12 primitive processor: Failed to map the built-in index buffer "
+        "upload resource with {} 16-bit indices",
+        index_count);
+    return false;
+  }
+  fill_callback(static_cast<uint16_t*>(mapped_indices));
+  staging_buffer->Unmap(0, nullptr);
+
+  // Everything succeeded - take ownership of both buffers.
+  builtin_index_buffer_ = std::move(gpu_buffer);
+  builtin_index_buffer_gpu_address_ =
+      builtin_index_buffer_->GetGPUVirtualAddress();
+  builtin_index_buffer_upload_ = std::move(staging_buffer);
+  // UINT64_MAX schedules the upload for the first submission.
+  builtin_index_buffer_upload_submission_ = UINT64_MAX;
+  return true;
+}
+
+void* D3D12PrimitiveProcessor::RequestHostConvertedIndexBufferForCurrentFrame(
+    xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,
+    uint32_t coalignment_original_address, size_t& backend_handle_out) {
+  // Allocates space for index_count host-converted indices of the given
+  // format in the per-frame upload pool.
+  //
+  // Returns the CPU pointer the converted indices must be written to, or
+  // nullptr if the pool request failed (backend_handle_out is not modified in
+  // that case). On success, backend_handle_out receives the position of the
+  // buffer's GPU address in frame_index_buffers_, which is what
+  // GetConvertedIndexBufferGpuAddress takes.
+  size_t index_size = format == xenos::IndexFormat::kInt16 ? sizeof(uint16_t)
+                                                           : sizeof(uint32_t);
+  D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
+  // With SIMD co-alignment, XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE extra bytes
+  // are requested so the start can be shifted below. The allocation is
+  // aligned to the index size.
+  uint8_t* mapping = frame_index_buffer_pool_->Request(
+      command_processor_.GetCurrentFrame(),
+      index_size * index_count +
+          (coalign_for_simd ? XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE : 0),
+      index_size, nullptr, nullptr, &gpu_address);
+  if (!mapping) {
+    // The return type is a pointer - report the failure as a null pointer
+    // rather than relying on a bool-to-pointer conversion.
+    return nullptr;
+  }
+  if (coalign_for_simd) {
+    // Shift the CPU pointer and the GPU address by the same offset so they
+    // keep referring to the same bytes.
+    ptrdiff_t coalignment_offset =
+        GetSimdCoalignmentOffset(mapping, coalignment_original_address);
+    mapping += coalignment_offset;
+    gpu_address = D3D12_GPU_VIRTUAL_ADDRESS(gpu_address + coalignment_offset);
+  }
+  backend_handle_out = frame_index_buffers_.size();
+  frame_index_buffers_.push_back(gpu_address);
+  return mapping;
+}
+
+}  // namespace d3d12
+}  // namespace gpu
+}  // namespace xe
--- a/src/xenia/gpu/d3d12/d3d12_primitive_processor.h
+++ b/src/xenia/gpu/d3d12/d3d12_primitive_processor.h
@ -0,0 +1,90 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_D3D12_D3D12_PRIMITIVE_PROCESSOR_H_
+#define XENIA_GPU_D3D12_D3D12_PRIMITIVE_PROCESSOR_H_
+
+#include <cstdint>
+#include <deque>
+#include <memory>
+
+#include "xenia/base/assert.h"
+#include "xenia/gpu/primitive_processor.h"
+#include "xenia/ui/d3d12/d3d12_api.h"
+#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
+
+namespace xe {
+namespace gpu {
+namespace d3d12 {
+
+class D3D12CommandProcessor;
+
+// Direct3D 12 backend of the PrimitiveProcessor: owns the built-in index
+// buffer (with its one-time upload) and the per-frame pool from which
+// host-converted index buffers are allocated.
+class D3D12PrimitiveProcessor final : public PrimitiveProcessor {
+ public:
+  D3D12PrimitiveProcessor(const RegisterFile& register_file, Memory& memory,
+                          TraceWriter& trace_writer,
+                          SharedMemory& shared_memory,
+                          D3D12CommandProcessor& command_processor)
+      : PrimitiveProcessor(register_file, memory, trace_writer, shared_memory),
+        command_processor_(command_processor) {}
+  ~D3D12PrimitiveProcessor();
+
+  bool Initialize();
+  void Shutdown(bool from_destructor = false);
+  void ClearCache() {
+    // The pool exists only between a successful Initialize and Shutdown, so
+    // there's nothing to clear outside that window.
+    if (frame_index_buffer_pool_) {
+      frame_index_buffer_pool_->ClearCache();
+    }
+  }
+
+  void CompletedSubmissionUpdated();
+  void BeginSubmission();
+  void BeginFrame();
+  void EndFrame();
+
+  // Returns the GPU address of the built-in index buffer plus the byte
+  // offset GetBuiltinIndexBufferOffsetBytes gives for the handle.
+  D3D12_GPU_VIRTUAL_ADDRESS GetBuiltinIndexBufferGpuAddress(
+      size_t handle) const {
+    assert_not_null(builtin_index_buffer_);
+    return D3D12_GPU_VIRTUAL_ADDRESS(builtin_index_buffer_gpu_address_ +
+                                     GetBuiltinIndexBufferOffsetBytes(handle));
+  }
+  // Returns the GPU address stored for a handle written to backend_handle_out
+  // by RequestHostConvertedIndexBufferForCurrentFrame. Handles are cleared in
+  // EndFrame, so they are usable only within the frame they were issued in.
+  D3D12_GPU_VIRTUAL_ADDRESS GetConvertedIndexBufferGpuAddress(
+      size_t handle) const {
+    assert_true(handle < frame_index_buffers_.size());
+    return frame_index_buffers_[handle];
+  }
+
+ protected:
+  bool InitializeBuiltin16BitIndexBuffer(
+      uint32_t index_count,
+      std::function<void(uint16_t*)> fill_callback) override;
+
+  void* RequestHostConvertedIndexBufferForCurrentFrame(
+      xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,
+      uint32_t coalignment_original_address,
+      size_t& backend_handle_out) override;
+
+ private:
+  D3D12CommandProcessor& command_processor_;
+
+  Microsoft::WRL::ComPtr<ID3D12Resource> builtin_index_buffer_;
+  D3D12_GPU_VIRTUAL_ADDRESS builtin_index_buffer_gpu_address_ = 0;
+  // Temporary buffer copied to builtin_index_buffer_ at the beginning of the
+  // first submission, destroyed once the submission in which it was uploaded
+  // is completed.
+  Microsoft::WRL::ComPtr<ID3D12Resource> builtin_index_buffer_upload_;
+  // UINT64_MAX means not uploaded yet and needs uploading in the first
+  // submission (if the upload buffer exists at all).
+  uint64_t builtin_index_buffer_upload_submission_ = UINT64_MAX;
+
+  std::unique_ptr<ui::d3d12::D3D12UploadBufferPool> frame_index_buffer_pool_;
+  // Indexed by the backend handles.
+  std::deque<D3D12_GPU_VIRTUAL_ADDRESS> frame_index_buffers_;
+};
+
+}  // namespace d3d12
+}  // namespace gpu
+}  // namespace xe
+
+#endif  // XENIA_GPU_D3D12_D3D12_PRIMITIVE_PROCESSOR_H_
--- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
--- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h
+++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h
@ -0,0 +1,841 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_D3D12_D3D12_RENDER_TARGET_CACHE_H_
+#define XENIA_GPU_D3D12_D3D12_RENDER_TARGET_CACHE_H_
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "xenia/base/assert.h"
+#include "xenia/gpu/d3d12/d3d12_shared_memory.h"
+#include "xenia/gpu/d3d12/texture_cache.h"
+#include "xenia/gpu/draw_util.h"
+#include "xenia/gpu/render_target_cache.h"
+#include "xenia/gpu/trace_writer.h"
+#include "xenia/gpu/xenos.h"
+#include "xenia/memory.h"
+#include "xenia/ui/d3d12/d3d12_cpu_descriptor_pool.h"
+#include "xenia/ui/d3d12/d3d12_provider.h"
+#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
+#include "xenia/ui/d3d12/d3d12_util.h"
+
+namespace xe {
+namespace gpu {
+namespace d3d12 {
+
+class D3D12CommandProcessor;
+
+class D3D12RenderTargetCache final : public RenderTargetCache {
+ public:
+  D3D12RenderTargetCache(const RegisterFile& register_file,
+                         D3D12CommandProcessor& command_processor,
+                         TraceWriter& trace_writer,
+                         bool bindless_resources_used)
+      : RenderTargetCache(register_file),
+        command_processor_(command_processor),
+        trace_writer_(trace_writer),
+        bindless_resources_used_(bindless_resources_used) {}
+  ~D3D12RenderTargetCache() override;
+
+  bool Initialize();
+  void Shutdown(bool from_destructor = false);
+
+  void CompletedSubmissionUpdated();
+  void BeginSubmission();
+
+  Path GetPath() const override { return path_; }
+
+  uint32_t GetResolutionScale() const override { return resolution_scale_; }
+
+  bool Update(bool is_rasterization_done,
+              uint32_t shader_writes_color_targets) override;
+
+  void InvalidateCommandListRenderTargets() {
+    are_current_command_list_render_targets_valid_ = false;
+  }
+
+  bool msaa_2x_supported() const { return msaa_2x_supported_; }
+
+  void WriteEdramRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
+  void WriteEdramRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
+  void WriteEdramUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
+                                       uint32_t element_size_bytes_pow2);
+  void WriteEdramUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
+                                       uint32_t element_size_bytes_pow2);
+
+  // Performs the resolve to a shared memory area according to the current
+  // register values, and also clears the render targets if needed. Must be in a
+  // frame for calling.
+  bool Resolve(const Memory& memory, D3D12SharedMemory& shared_memory,
+               TextureCache& texture_cache, uint32_t& written_address_out,
+               uint32_t& written_length_out);
+
+  // Returns true if any downloads were submitted to the command processor.
+  bool InitializeTraceSubmitDownloads();
+  void InitializeTraceCompleteDownloads();
+  void RestoreEdramSnapshot(const void* snapshot);
+
+  // For host render targets.
+
+  bool gamma_render_target_as_srgb() const {
+    return gamma_render_target_as_srgb_;
+  }
+
+  // Using R16G16[B16A16]_SNORM, which are -1...1, not the needed -32...32.
+  // Persistent data doesn't depend on this, so can be overridden by per-game
+  // configuration.
+  bool IsFixed16TruncatedToMinus1To1() const {
+    return GetPath() == Path::kHostRenderTargets &&
+           !cvars::snorm16_render_target_full_range;
+  }
+
+  DepthFloat24Conversion depth_float24_conversion() const {
+    return depth_float24_conversion_;
+  }
+
+  DXGI_FORMAT GetColorResourceDXGIFormat(
+      xenos::ColorRenderTargetFormat format) const;
+  DXGI_FORMAT GetColorDrawDXGIFormat(
+      xenos::ColorRenderTargetFormat format) const;
+  DXGI_FORMAT GetColorOwnershipTransferDXGIFormat(
+      xenos::ColorRenderTargetFormat format,
+      bool* is_integer_out = nullptr) const;
+  static DXGI_FORMAT GetDepthResourceDXGIFormat(
+      xenos::DepthRenderTargetFormat format);
+  static DXGI_FORMAT GetDepthDSVDXGIFormat(
+      xenos::DepthRenderTargetFormat format);
+  static DXGI_FORMAT GetDepthSRVDepthDXGIFormat(
+      xenos::DepthRenderTargetFormat format);
+  static DXGI_FORMAT GetDepthSRVStencilDXGIFormat(
+      xenos::DepthRenderTargetFormat format);
+
+ protected:
+  class D3D12RenderTarget final : public RenderTarget {
+   public:
+    // descriptor_draw_srgb is only used for k_8_8_8_8 render targets when host
+    // sRGB (gamma_render_target_as_srgb) is used. descriptor_load_separate is
+    // present when the DXGI formats are different for drawing and bit-exact
+    // loading (for NaN pattern preservation across EDRAM tile ownership
+    // transfers in floating-point formats, and to distinguish between two -1
+    // representations in snorm formats).
+    D3D12RenderTarget(
+        RenderTargetKey key, D3D12RenderTargetCache& render_target_cache,
+        ID3D12Resource* resource,
+        ui::d3d12::D3D12CpuDescriptorPool::Descriptor&& descriptor_draw,
+        ui::d3d12::D3D12CpuDescriptorPool::Descriptor&& descriptor_draw_srgb,
+        ui::d3d12::D3D12CpuDescriptorPool::Descriptor&&
+            descriptor_load_separate,
+        ui::d3d12::D3D12CpuDescriptorPool::Descriptor&& descriptor_srv,
+        ui::d3d12::D3D12CpuDescriptorPool::Descriptor&& descriptor_srv_stencil,
+        D3D12_RESOURCE_STATES resource_state)
+        : RenderTarget(key),
+          render_target_cache_(render_target_cache),
+          resource_(resource),
+          descriptor_draw_(std::move(descriptor_draw)),
+          descriptor_draw_srgb_(std::move(descriptor_draw_srgb)),
+          descriptor_load_separate_(std::move(descriptor_load_separate)),
+          descriptor_srv_(std::move(descriptor_srv)),
+          descriptor_srv_stencil_(std::move(descriptor_srv_stencil)),
+          resource_state_(resource_state) {}
+
+    ID3D12Resource* resource() const { return resource_.Get(); }
+    const ui::d3d12::D3D12CpuDescriptorPool::Descriptor& descriptor_draw()
+        const {
+      return descriptor_draw_;
+    }
+    const ui::d3d12::D3D12CpuDescriptorPool::Descriptor& descriptor_draw_srgb()
+        const {
+      return descriptor_draw_srgb_;
+    }
+    const ui::d3d12::D3D12CpuDescriptorPool::Descriptor& descriptor_srv()
+        const {
+      return descriptor_srv_;
+    }
+    const ui::d3d12::D3D12CpuDescriptorPool::Descriptor&
+    descriptor_srv_stencil() const {
+      return descriptor_srv_stencil_;
+    }
+    const ui::d3d12::D3D12CpuDescriptorPool::Descriptor&
+    descriptor_load_separate() const {
+      return descriptor_load_separate_;
+    }
+
+    D3D12_RESOURCE_STATES SetResourceState(D3D12_RESOURCE_STATES new_state) {
+      // Records new_state as the tracked state and returns the state that was
+      // tracked before this call.
+      return std::exchange(resource_state_, new_state);
+    }
+
+    uint32_t temporary_srv_descriptor_index() const {
+      return temporary_srv_descriptor_index_;
+    }
+    void SetTemporarySRVDescriptorIndex(uint32_t index) {
+      temporary_srv_descriptor_index_ = index;
+    }
+    uint32_t temporary_srv_descriptor_index_stencil() const {
+      return temporary_srv_descriptor_index_stencil_;
+    }
+    void SetTemporarySRVDescriptorIndexStencil(uint32_t index) {
+      temporary_srv_descriptor_index_stencil_ = index;
+    }
+    uint32_t temporary_sort_index() const { return temporary_sort_index_; }
+    void SetTemporarySortIndex(uint32_t index) {
+      temporary_sort_index_ = index;
+    }
+
+   private:
+    D3D12RenderTargetCache& render_target_cache_;
+    Microsoft::WRL::ComPtr<ID3D12Resource> resource_;
+    ui::d3d12::D3D12CpuDescriptorPool::Descriptor descriptor_draw_;
+    ui::d3d12::D3D12CpuDescriptorPool::Descriptor descriptor_draw_srgb_;
+    ui::d3d12::D3D12CpuDescriptorPool::Descriptor descriptor_load_separate_;
+    // Texture SRV non-shader-visible descriptors, to prepare shader-visible
+    // descriptors faster, by copying rather than by creating every time.
+    // TODO(Triang3l): With bindless resources, persistently store them in the
+    // heap.
+    ui::d3d12::D3D12CpuDescriptorPool::Descriptor descriptor_srv_;
+    ui::d3d12::D3D12CpuDescriptorPool::Descriptor descriptor_srv_stencil_;
+    D3D12_RESOURCE_STATES resource_state_;
+    // Temporary storage for indices in operations like transfers and dumps.
+    uint32_t temporary_srv_descriptor_index_ = UINT32_MAX;
+    uint32_t temporary_srv_descriptor_index_stencil_ = UINT32_MAX;
+    uint32_t temporary_sort_index_ = 0;
+  };
+
+  uint32_t GetMaxRenderTargetWidth() const override {
+    return D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+  }
+  uint32_t GetMaxRenderTargetHeight() const override {
+    return D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+  }
+
+  xenos::ColorRenderTargetFormat GetHostRelevantColorFormat(
+      xenos::ColorRenderTargetFormat format) const override;
+
+  RenderTarget* CreateRenderTarget(RenderTargetKey key) override;
+
+  bool IsHostDepthEncodingDifferent(
+      xenos::DepthRenderTargetFormat format) const override;
+
+  void RequestPixelShaderInterlockBarrier() override;
+
+ private:
+  enum class EdramBufferModificationStatus {
+    // No uncommitted ROV/UAV writes.
+    kUnmodified,
+    // Need to commit before the next ROV usage with overlap.
+    kAsROV,
+    // Need to commit before any next ROV usage.
+    kAsUAV,
+  };
+  void TransitionEdramBuffer(D3D12_RESOURCE_STATES new_state);
+  void MarkEdramBufferModified(
+      EdramBufferModificationStatus modification_status =
+          EdramBufferModificationStatus::kAsUAV);
+  void CommitEdramBufferUAVWrites(EdramBufferModificationStatus commit_status =
+                                      EdramBufferModificationStatus::kAsROV);
+
+  D3D12CommandProcessor& command_processor_;
+  TraceWriter& trace_writer_;
+  bool bindless_resources_used_;
+
+  Path path_ = Path::kHostRenderTargets;
+  uint32_t resolution_scale_ = 1;
+
+  // For host render targets, an EDRAM-sized scratch buffer for:
+  // - Guest render target data copied from host render targets during copying
+  //   in resolves and in frame trace creation.
+  // - Host float32 depth in ownership transfers when the host depth texture and
+  //   the destination are the same.
+  // For rasterizer-ordered view, the buffer containing the EDRAM data.
+  // (Note that if a hybrid RTV / DSV + ROV approach to color render targets is
+  //  added, which is, however, unlikely as it would have very complicated
+  //  interaction with depth / stencil testing, host depth will need to be
+  //  copied to a different buffer - the same range may have ROV-owned color and
+  //  host float32 depth at the same time).
+  ID3D12Resource* edram_buffer_ = nullptr;
+  D3D12_RESOURCE_STATES edram_buffer_state_;
+  EdramBufferModificationStatus edram_buffer_modification_status_ =
+      EdramBufferModificationStatus::kUnmodified;
+
+  // Non-shader-visible descriptor heap containing pre-created SRV and UAV
+  // descriptors of the EDRAM buffer, for faster binding (by copying rather
+  // than creation).
+  enum class EdramBufferDescriptorIndex : uint32_t {
+    kRawSRV,
+    kR32UintSRV,
+    kR32G32UintSRV,
+    kR32G32B32A32UintSRV,
+    kRawUAV,
+    kR32UintUAV,
+    kR32G32UintUAV,
+    kR32G32B32A32UintUAV,
+
+    kCount,
+  };
+  ID3D12DescriptorHeap* edram_buffer_descriptor_heap_ = nullptr;
+  D3D12_CPU_DESCRIPTOR_HANDLE edram_buffer_descriptor_heap_start_;
+
+  // Resolve copying root signature and pipelines.
+  // Parameter 0 - draw_util::ResolveCopyShaderConstants or its ::DestRelative.
+  // Parameter 1 - destination (shared memory or a part of it).
+  // Parameter 2 - source (EDRAM).
+  ID3D12RootSignature* resolve_copy_root_signature_ = nullptr;
+  static const std::pair<const void*, size_t>
+      kResolveCopyShaders[size_t(draw_util::ResolveCopyShaderIndex::kCount)];
+  ID3D12PipelineState* resolve_copy_pipelines_[size_t(
+      draw_util::ResolveCopyShaderIndex::kCount)] = {};
+
+  // For traces.
+  ID3D12Resource* edram_snapshot_download_buffer_ = nullptr;
+  std::unique_ptr<ui::d3d12::D3D12UploadBufferPool>
+      edram_snapshot_restore_pool_;
+
+  // For host render targets.
+
+  enum TransferCBVRegister : uint32_t {
+    kTransferCBVRegisterStencilMask,
+    kTransferCBVRegisterAddress,
+    kTransferCBVRegisterHostDepthAddress,
+  };
+  enum TransferSRVRegister : uint32_t {
+    kTransferSRVRegisterColor,
+    kTransferSRVRegisterDepth,
+    kTransferSRVRegisterStencil,
+    kTransferSRVRegisterHostDepth,
+    kTransferSRVRegisterCount,
+  };
+  enum TransferUsedRootParameter : uint32_t {
+    // Changed 8 times per transfer.
+    kTransferUsedRootParameterStencilMaskConstant,
+    kTransferUsedRootParameterColorSRV,
+    // Mutually exclusive with ColorSRV.
+    kTransferUsedRootParameterDepthSRV,
+    // Mutually exclusive with ColorSRV.
+    kTransferUsedRootParameterStencilSRV,
+    // May happen to be the same for different sources.
+    kTransferUsedRootParameterAddressConstant,
+    kTransferUsedRootParameterHostDepthSRV,
+    kTransferUsedRootParameterHostDepthAddressConstant,
+    kTransferUsedRootParameterCount,
+
+    kTransferUsedRootParameterStencilMaskConstantBit =
+        uint32_t(1) << kTransferUsedRootParameterStencilMaskConstant,
+    kTransferUsedRootParameterColorSRVBit =
+        uint32_t(1) << kTransferUsedRootParameterColorSRV,
+    kTransferUsedRootParameterDepthSRVBit =
+        uint32_t(1) << kTransferUsedRootParameterDepthSRV,
+    kTransferUsedRootParameterStencilSRVBit =
+        uint32_t(1) << kTransferUsedRootParameterStencilSRV,
+    kTransferUsedRootParameterAddressConstantBit =
+        uint32_t(1) << kTransferUsedRootParameterAddressConstant,
+    kTransferUsedRootParameterHostDepthSRVBit =
+        uint32_t(1) << kTransferUsedRootParameterHostDepthSRV,
+    kTransferUsedRootParameterHostDepthAddressConstantBit =
+        uint32_t(1) << kTransferUsedRootParameterHostDepthAddressConstant,
+
+    kTransferUsedRootParametersDescriptorMask =
+        kTransferUsedRootParameterColorSRVBit |
+        kTransferUsedRootParameterDepthSRVBit |
+        kTransferUsedRootParameterStencilSRVBit |
+        kTransferUsedRootParameterHostDepthSRVBit,
+  };
+  enum class TransferRootSignatureIndex {
+    kColor,
+    kDepth,
+    kDepthStencil,
+    kColorToStencilBit,
+    kStencilToStencilBit,
+    kColorAndHostDepth,
+    kDepthAndHostDepth,
+    kDepthStencilAndHostDepth,
+    kCount,
+  };
+  static const uint32_t
+      kTransferUsedRootParameters[size_t(TransferRootSignatureIndex::kCount)];
+  enum class TransferMode : uint32_t {
+    // 1 SRV (color texture), source constant.
+    kColorToDepth,
+    // 1 SRV (color texture), source constant.
+    kColorToColor,
+
+    // 1 or 2 SRVs (depth texture, stencil texture if SV_StencilRef is
+    // supported), source constant.
+    kDepthToDepth,
+    // 2 SRVs (depth texture, stencil texture), source constant.
+    kDepthToColor,
+
+    // 1 SRV (color texture), mask constant (most frequently changed, 8 times
+    // per transfer), source constant.
+    kColorToStencilBit,
+    // 1 SRV (stencil texture), mask constant, source constant.
+    kDepthToStencilBit,
+
+    // Two-source modes, using the host depth if it, when converted to the guest
+    // format, matches what's in the owner source (not modified, keep host
+    // precision), or the guest data otherwise (significantly modified, possibly
+    // cleared). Stencil for SV_StencilRef is always taken from the guest
+    // source.
+
+    // 2 SRVs (color texture, host depth texture or buffer), source constant,
+    // host depth source constant.
+    kColorAndHostDepthToDepth,
+    // When using different source and destination depth formats. 2 or 3 SRVs
+    // (depth texture, stencil texture if SV_StencilRef is supported, host depth
+    // texture or buffer), source constant, host depth source constant.
+    kDepthAndHostDepthToDepth,
+
+    kCount,
+  };
+  enum class TransferOutput {
+    kColor,
+    kDepth,
+    // With this output, kTransferCBVRegisterStencilMask is used.
+    kStencilBit,
+  };
+  struct TransferModeInfo {
+    TransferOutput output;
+    TransferRootSignatureIndex root_signature_no_stencil_ref;
+    TransferRootSignatureIndex root_signature_with_stencil_ref;
+  };
+  static const TransferModeInfo kTransferModes[size_t(TransferMode::kCount)];
+
+  union TransferShaderKey {
+    struct {
+      xenos::MsaaSamples dest_msaa_samples : xenos::kMsaaSamplesBits;
+      uint32_t dest_host_relevant_format : xenos::kRenderTargetFormatBits;
+      xenos::MsaaSamples source_msaa_samples : xenos::kMsaaSamplesBits;
+      // Always 1x when host_depth_source_is_copy is true not to create the same
+      // pipeline for different MSAA sample counts as it doesn't matter in this
+      // case.
+      xenos::MsaaSamples host_depth_source_msaa_samples
+          : xenos::kMsaaSamplesBits;
+      uint32_t source_host_relevant_format : xenos::kRenderTargetFormatBits;
+      // If host depth is also fetched, whether it's pre-copied to the EDRAM
+      // buffer (but since it's just a scratch buffer, with tiles laid out
+      // linearly with the same pitch as in the original render target; also no
+      // swapping of 40-sample columns as opposed to the host render target -
+      // this is done only for the color source).
+      uint32_t host_depth_source_is_copy : 1;
+
+      // Last bits because this affects the root signature - after sorting,
+      // change it as few times as possible. Depth buffers have an additional
+      // depth SRV.
+      static_assert(size_t(TransferMode::kCount) <= (size_t(1) << 3));
+      TransferMode mode : 3;
+    };
+    uint32_t key = 0;
+    struct Hasher {
+      size_t operator()(const TransferShaderKey& key) const {
+        return std::hash<uint32_t>{}(key.key);
+      }
+    };
+    bool operator==(const TransferShaderKey& other_key) const {
+      return key == other_key.key;
+    }
+    bool operator!=(const TransferShaderKey& other_key) const {
+      return !(*this == other_key);
+    }
+    bool operator<(const TransferShaderKey& other_key) const {
+      return key < other_key.key;
+    }
+  };
+
+  union TransferAddressConstant {
+    struct {
+      // All in tiles.
+      uint32_t dest_pitch : xenos::kEdramPitchTilesBits;
+      uint32_t source_pitch : xenos::kEdramPitchTilesBits;
+      // Safe to use 12 bits for signed difference - no ownership transfer can
+      // ever occur between render targets with EDRAM base >= 2048 as this would
+      // result in 0-length spans. 10 + 10 + 12 is exactly 32, any more bits,
+      // and more root 32-bit constants will be used.
+      // Destination base in tiles minus source base in tiles (not vice versa
+      // because this is a transform of the coordinate system, not addresses
+      // themselves).
+      // 0 for host_depth_source_is_copy (ignored in this case as
+      // destination == source anyway).
+      int32_t source_to_dest : xenos::kEdramBaseTilesBits;
+    };
+    uint32_t constant = 0;
+    bool operator==(const TransferAddressConstant& other_constant) const {
+      return constant == other_constant.constant;
+    }
+    bool operator!=(const TransferAddressConstant& other_constant) const {
+      return !(*this == other_constant);
+    }
+  };
+  static_assert(sizeof(TransferAddressConstant) == sizeof(uint32_t));
+
+  struct TransferInvocation {
+    Transfer transfer;
+    TransferShaderKey shader_key;
+    TransferInvocation(const Transfer& transfer,
+                       const TransferShaderKey& shader_key)
+        : transfer(transfer), shader_key(shader_key) {}
+    bool operator<(const TransferInvocation& other_invocation) const {
+      // TODO(Triang3l): See if it may be better to sort by the source in the
+      // first place, especially when reading the same data multiple times (like
+      // to write the stencil bits after depth) for better read locality.
+      // Sort by the shader key primarily to reduce pipeline state (context)
+      // switches.
+      if (shader_key != other_invocation.shader_key) {
+        return shader_key < other_invocation.shader_key;
+      }
+      // Host depth render targets are changed rarely if they exist, won't save
+      // many binding changes, ignore them for simplicity (their existence is
+      // caught by the shader key change).
+      assert_not_null(transfer.source);
+      assert_not_null(other_invocation.transfer.source);
+      uint32_t source_index =
+          static_cast<const D3D12RenderTarget*>(transfer.source)
+              ->temporary_sort_index();
+      uint32_t other_source_index = static_cast<const D3D12RenderTarget*>(
+                                        other_invocation.transfer.source)
+                                        ->temporary_sort_index();
+      if (source_index != other_source_index) {
+        return source_index < other_source_index;
+      }
+      return transfer.start_tiles < other_invocation.transfer.start_tiles;
+    }
+    bool CanBeMergedIntoOneDraw(
+        const TransferInvocation& other_invocation) const {
+      return shader_key == other_invocation.shader_key &&
+             transfer.AreSourcesSame(other_invocation.transfer);
+    }
+  };
+
+  union HostDepthStoreRectangleConstant {
+    struct {
+      // - 1 because the maximum is 0x1FFF / 8, not 0x2000 / 8.
+      uint32_t x_pixels_div_8 : xenos::kResolveSizeBits - 1 -
+                                xenos::kResolveAlignmentPixelsLog2;
+      uint32_t y_pixels_div_8 : xenos::kResolveSizeBits - 1 -
+                                xenos::kResolveAlignmentPixelsLog2;
+      uint32_t width_pixels_div_8_minus_1 : xenos::kResolveSizeBits - 1 -
+                                            xenos::kResolveAlignmentPixelsLog2;
+    };
+    uint32_t constant = 0;
+  };
+  static_assert(sizeof(HostDepthStoreRectangleConstant) == sizeof(uint32_t));
+
+  union HostDepthStoreRenderTargetConstant {
+    struct {
+      uint32_t pitch_tiles : xenos::kEdramPitchTilesBits;
+      // 1 to 3.
+      uint32_t resolution_scale : 2;
+      // For native 2x MSAA vs. 2x over 4x.
+      uint32_t second_sample_index : 2;
+    };
+    uint32_t constant = 0;
+  };
+  static_assert(sizeof(HostDepthStoreRenderTargetConstant) == sizeof(uint32_t));
+
+  enum {
+    kHostDepthStoreRootParameterRectangleConstant,
+    kHostDepthStoreRootParameterRenderTargetConstant,
+    kHostDepthStoreRootParameterSource,
+    kHostDepthStoreRootParameterDest,
+    kHostDepthStoreRootParameterCount,
+  };
+
+  union DumpPipelineKey {
+    struct {
+      xenos::MsaaSamples msaa_samples : 2;
+      uint32_t host_relevant_format : 4;
+      // Last bit because this affects the root signature - after sorting, only
+      // change it at most once. Depth buffers have an additional stencil SRV.
+      uint32_t is_depth : 1;
+    };
+    uint32_t key = 0;
+    struct Hasher {
+      size_t operator()(const DumpPipelineKey& key) const {
+        return std::hash<uint32_t>{}(key.key);
+      }
+    };
+    bool operator==(const DumpPipelineKey& other_key) const {
+      return key == other_key.key;
+    }
+    bool operator!=(const DumpPipelineKey& other_key) const {
+      return !(*this == other_key);
+    }
+    bool operator<(const DumpPipelineKey& other_key) const {
+      return key < other_key.key;
+    }
+
+    xenos::ColorRenderTargetFormat GetColorFormat() const {
+      assert_false(is_depth);
+      return xenos::ColorRenderTargetFormat(host_relevant_format);
+    }
+    xenos::DepthRenderTargetFormat GetDepthFormat() const {
+      assert_true(is_depth);
+      return xenos::DepthRenderTargetFormat(host_relevant_format);
+    }
+  };
+
+  union DumpOffsets {
+    struct {
+      // Absolute index of the first thread group's tile within the source
+      // texture.
+      uint32_t first_group_tile_source_relative : xenos::kEdramBaseTilesBits;
+      uint32_t source_base_tiles : xenos::kEdramBaseTilesBits;
+    };
+    uint32_t offsets = 0;
+    bool operator==(const DumpOffsets& other_offsets) const {
+      return offsets == other_offsets.offsets;
+    }
+    bool operator!=(const DumpOffsets& other_offsets) const {
+      return !(*this == other_offsets);
+    }
+  };
+  static_assert(sizeof(DumpOffsets) == sizeof(uint32_t));
+
+  union DumpPitches {
+    struct {
+      // Both in tiles.
+      uint32_t source_pitch : xenos::kEdramPitchTilesBits;
+      uint32_t dest_pitch : xenos::kEdramPitchTilesBits;
+    };
+    uint32_t pitches = 0;
+    bool operator==(const DumpPitches& other_pitches) const {
+      return pitches == other_pitches.pitches;
+    }
+    bool operator!=(const DumpPitches& other_pitches) const {
+      return !(*this == other_pitches);
+    }
+  };
+  static_assert(sizeof(DumpPitches) == sizeof(uint32_t));
+
+  enum DumpCbuffer : uint32_t {
+    kDumpCbufferOffsets,
+    kDumpCbufferPitches,
+    kDumpCbufferCount,
+  };
+
+  enum DumpRootParameter : uint32_t {
+    // May be changed multiple times for the same source.
+    kDumpRootParameterOffsets,
+    // One resolve may need multiple sources.
+    kDumpRootParameterSource,
+
+    // May be different for different sources.
+    kDumpRootParameterColorPitches = kDumpRootParameterSource + 1,
+    // Only changed between 32bpp and 64bpp.
+    kDumpRootParameterColorEdram,
+
+    kDumpRootParameterColorCount,
+
+    // Same change frequency as the source (though currently the command
+    // processor can't contiguously allocate multiple descriptors with bindless,
+    // when such functionality is added, switch to one root signature).
+    kDumpRootParameterDepthStencil = kDumpRootParameterSource + 1,
+    kDumpRootParameterDepthPitches,
+    kDumpRootParameterDepthEdram,
+
+    kDumpRootParameterDepthCount,
+  };
+
+  struct DumpInvocation {
+    ResolveCopyDumpRectangle rectangle;
+    DumpPipelineKey pipeline_key;
+    DumpInvocation(const ResolveCopyDumpRectangle& rectangle,
+                   const DumpPipelineKey& pipeline_key)
+        : rectangle(rectangle), pipeline_key(pipeline_key) {}
+    bool operator<(const DumpInvocation& other_invocation) const {
+      // Sort by the pipeline key first to reduce pipeline state switches.
+      if (pipeline_key != other_invocation.pipeline_key) {
+        return pipeline_key < other_invocation.pipeline_key;
+      }
+      assert_not_null(rectangle.render_target);
+      uint32_t render_target_index =
+          static_cast<const D3D12RenderTarget*>(rectangle.render_target)
+              ->temporary_sort_index();
+      const ResolveCopyDumpRectangle& other_rectangle =
+          other_invocation.rectangle;
+      assert_not_null(other_rectangle.render_target);
+      uint32_t other_render_target_index =
+          static_cast<const D3D12RenderTarget*>(other_rectangle.render_target)
+              ->temporary_sort_index();
+      if (render_target_index != other_render_target_index) {
+        return render_target_index < other_render_target_index;
+      }
+      if (rectangle.row_first != other_rectangle.row_first) {
+        return rectangle.row_first < other_rectangle.row_first;
+      }
+      return rectangle.row_first_start < other_rectangle.row_first_start;
+    }
+  };
+
+  // Returns:
+  // - A pointer to 1 pipeline for writing color or depth (or stencil via
+  //   SV_StencilRef).
+  // - A pointer to 8 pipelines for writing stencil by discarding samples
+  //   depending on whether they have one bit set, from 1 << 0 to 1 << 7.
+  // - Null if failed to create.
+  ID3D12PipelineState* const* GetOrCreateTransferPipelines(
+      TransferShaderKey key);
+
+  static TransferMode GetTransferMode(bool dest_is_stencil_bit,
+                                      bool dest_is_depth, bool source_is_depth,
+                                      bool source_has_host_depth) {
+    assert_true(dest_is_depth ||
+                (!dest_is_stencil_bit && !source_has_host_depth));
+    if (dest_is_stencil_bit) {  // Host depth isn't consulted for this output.
+      return source_is_depth ? TransferMode::kDepthToStencilBit
+                             : TransferMode::kColorToStencilBit;
+    }
+    if (!dest_is_depth) {  // Color destination.
+      return source_is_depth ? TransferMode::kDepthToColor
+                             : TransferMode::kColorToColor;
+    }
+    if (source_has_host_depth) {  // Two-source depth destination modes.
+      return source_is_depth ? TransferMode::kDepthAndHostDepthToDepth
+                             : TransferMode::kColorAndHostDepthToDepth;
+    }
+    return source_is_depth ? TransferMode::kDepthToDepth
+                           : TransferMode::kColorToDepth;
+  }
+
+  // Do ownership transfers for render targets - each render target / vector may
+  // be null / empty in case there's nothing to do for them.
+  // resolve_clear_rectangle is expected to be provided by
+  // PrepareHostRenderTargetsResolveClear which should do all the needed size
+  // bound checks.
+  void PerformTransfersAndResolveClears(
+      uint32_t render_target_count, RenderTarget* const* render_targets,
+      const std::vector<Transfer>* render_target_transfers,
+      const uint64_t* render_target_resolve_clear_values = nullptr,
+      const Transfer::Rectangle* resolve_clear_rectangle = nullptr);
+
+  // Accepts an array of (1 + xenos::kMaxColorRenderTargets) render targets,
+  // first depth, then color.
+  void SetCommandListRenderTargets(
+      RenderTarget* const* depth_and_color_render_targets);
+
+  ID3D12PipelineState* GetOrCreateDumpPipeline(DumpPipelineKey key);
+
+  // Writes contents of host render targets within rectangles from
+  // ResolveInfo::GetCopyEdramTileSpan to edram_buffer_.
+  void DumpRenderTargets(uint32_t dump_base, uint32_t dump_row_length_used,
+                         uint32_t dump_rows, uint32_t dump_pitch);
+
+  bool use_stencil_reference_output_ = false;
+
+  bool gamma_render_target_as_srgb_ = false;
+
+  DepthFloat24Conversion depth_float24_conversion_ =
+      DepthFloat24Conversion::kOnCopy;
+
+  bool msaa_2x_supported_ = false;
+
+  std::shared_ptr<ui::d3d12::D3D12CpuDescriptorPool> descriptor_pool_color_;
+  std::shared_ptr<ui::d3d12::D3D12CpuDescriptorPool> descriptor_pool_depth_;
+  std::shared_ptr<ui::d3d12::D3D12CpuDescriptorPool> descriptor_pool_srv_;
+  ui::d3d12::D3D12CpuDescriptorPool::Descriptor null_rtv_descriptor_ss_;
+  ui::d3d12::D3D12CpuDescriptorPool::Descriptor null_rtv_descriptor_ms_;
+
+  // Possible tile ownership transfer paths:
+  // - To color:
+  //   - From color: 1 SRV (color).
+  //   - From depth: 2 SRVs (depth, stencil).
+  // - To depth / stencil (with SV_StencilRef):
+  //   - From color: 1 SRV (color).
+  //   - From depth: 2 SRVs (depth, stencil).
+  //   - From color and float32 depth: 2 SRVs (color with stencil, depth).
+  //     - Different depth buffer: depth SRV is a texture.
+  //     - Same depth buffer: depth SRV is a buffer (pre-copied).
+  // - To depth (no SV_StencilRef):
+  //   - From color: 1 SRV (color).
+  //   - From depth: 1 SRV (depth).
+  //   - From color and float32 depth: 2 SRVs (color, depth).
+  //     - Different depth buffer: depth SRV is a texture.
+  //     - Same depth buffer: depth SRV is a buffer (pre-copied).
+  // - To stencil (no SV_StencilRef):
+  //   - From color: 1 SRV (color).
+  //   - From depth: 1 SRV (stencil).
+
+  const RenderTarget* const*
+      current_command_list_render_targets_[1 + xenos::kMaxColorRenderTargets];
+  uint32_t are_current_command_list_render_targets_srgb_ = 0;
+  bool are_current_command_list_render_targets_valid_ = false;
+
+  // Temporary storage for descriptors used in PerformTransfersAndResolveClears
+  // and DumpRenderTargets.
+  std::vector<D3D12_CPU_DESCRIPTOR_HANDLE> current_temporary_descriptors_cpu_;
+  std::vector<ui::d3d12::util::DescriptorCpuGpuHandlePair>
+      current_temporary_descriptors_gpu_;
+
+  ID3D12RootSignature* host_depth_store_root_signature_ = nullptr;
+  ID3D12PipelineState*
+      host_depth_store_pipelines_[size_t(xenos::MsaaSamples::k4X) + 1] = {};
+
+  std::unique_ptr<ui::d3d12::D3D12UploadBufferPool>
+      transfer_vertex_buffer_pool_;
+
+  ID3D12RootSignature* transfer_root_signatures_[size_t(
+      TransferRootSignatureIndex::kCount)] = {};
+  std::unordered_map<TransferShaderKey, ID3D12PipelineState*,
+                     TransferShaderKey::Hasher>
+      transfer_pipelines_;
+  std::unordered_map<TransferShaderKey, std::array<ID3D12PipelineState*, 8>,
+                     TransferShaderKey::Hasher>
+      transfer_stencil_bit_pipelines_;
+
+  // Temporary storage for PerformTransfersAndResolveClears.
+  std::vector<TransferInvocation> current_transfer_invocations_;
+
+  // Temporary storage for DumpRenderTargets.
+  std::vector<ResolveCopyDumpRectangle> dump_rectangles_;
+  std::vector<DumpInvocation> dump_invocations_;
+
+  ID3D12RootSignature* dump_root_signature_color_ = nullptr;
+  ID3D12RootSignature* dump_root_signature_depth_ = nullptr;
+  // Compute pipelines for copying host render target contents to the EDRAM
+  // buffer. May be null if failed to create.
+  std::unordered_map<DumpPipelineKey, ID3D12PipelineState*,
+                     DumpPipelineKey::Hasher>
+      dump_pipelines_;
+
+  // Parameter 0 - 2 root constants (red, green).
+  ID3D12RootSignature* uint32_rtv_clear_root_signature_ = nullptr;
+  // [32 or 32_32][MSAA samples].
+  ID3D12PipelineState*
+      uint32_rtv_clear_pipelines_[2][size_t(xenos::MsaaSamples::k4X) + 1] = {};
+
+  std::vector<Transfer> clear_transfers_[2];
+
+  // Temporary storage for DXBC building.
+  std::vector<uint32_t> built_shader_;
+
+  // For rasterizer-ordered view (pixel shader interlock).
+
+  static const std::pair<const void*, size_t> kResolveROVClear32bppShaders[3];
+  static const std::pair<const void*, size_t> kResolveROVClear64bppShaders[3];
+
+  ID3D12RootSignature* resolve_rov_clear_root_signature_ = nullptr;
+  // Clearing 32bpp color or depth.
+  ID3D12PipelineState* resolve_rov_clear_32bpp_pipeline_ = nullptr;
+  // Clearing 64bpp color.
+  ID3D12PipelineState* resolve_rov_clear_64bpp_pipeline_ = nullptr;
+};
+
+}  // namespace d3d12
+}  // namespace gpu
+}  // namespace xe
+
+#endif  // XENIA_GPU_D3D12_D3D12_RENDER_TARGET_CACHE_H_
--- a/src/xenia/gpu/d3d12/d3d12_shader.cc
+++ b/src/xenia/gpu/d3d12/d3d12_shader.cc
@ -99,7 +99,7 @@ void D3D12Shader::D3D12Translation::DisassembleDxbcAndDxil(
 }

 Shader::Translation* D3D12Shader::CreateTranslationInstance(
-    uint32_t modification) {
+    uint64_t modification) {
  return new D3D12Translation(*this, modification);
 }

--- a/src/xenia/gpu/d3d12/d3d12_shader.h
+++ b/src/xenia/gpu/d3d12/d3d12_shader.h
@ -23,7 +23,7 @@ class D3D12Shader : public DxbcShader {
 public:
  class D3D12Translation : public DxbcTranslation {
   public:
-    D3D12Translation(D3D12Shader& shader, uint32_t modification)
+    D3D12Translation(D3D12Shader& shader, uint64_t modification)
        : DxbcTranslation(shader, modification) {}

    void DisassembleDxbcAndDxil(const ui::d3d12::D3D12Provider& provider,
@ -60,7 +60,7 @@ class D3D12Shader : public DxbcShader {
  }

 protected:
-  Translation* CreateTranslationInstance(uint32_t modification) override;
+  Translation* CreateTranslationInstance(uint64_t modification) override;

 private:
  std::atomic_flag binding_layout_user_uids_set_up_ = ATOMIC_FLAG_INIT;
--- a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc
+++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc
@ -192,6 +192,11 @@ void D3D12SharedMemory::CompletedSubmissionUpdated() {
  upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission());
 }

+void D3D12SharedMemory::BeginSubmission() {
+  // ExecuteCommandLists is a full UAV barrier.
+  buffer_uav_writes_commit_needed_ = false;
+}
+
 void D3D12SharedMemory::CommitUAVWritesAndTransitionBuffer(
    D3D12_RESOURCE_STATES new_state) {
  if (buffer_state_ == new_state) {
@ -421,7 +426,7 @@ bool D3D12SharedMemory::UploadRanges(
        return false;
      }
      MakeRangeValid(upload_range_start << page_size_log2(),
-                     uint32_t(upload_buffer_size), false);
+                     uint32_t(upload_buffer_size), false, false);
      std::memcpy(
          upload_buffer_mapping,
          memory().TranslatePhysical(upload_range_start << page_size_log2()),
--- a/src/xenia/gpu/d3d12/d3d12_shared_memory.h
+++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h
@ -43,6 +43,7 @@ class D3D12SharedMemory : public SharedMemory {
  }

  void CompletedSubmissionUpdated();
+  void BeginSubmission();

  // RequestRange may transition the buffer to copy destination - call it before
  // UseForReading or UseForWriting.
--- a/src/xenia/gpu/d3d12/deferred_command_list.cc
+++ b/src/xenia/gpu/d3d12/deferred_command_list.cc
@ -40,6 +40,23 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
    stream += kCommandHeaderSizeElements;
    stream_remaining -= kCommandHeaderSizeElements;
    switch (header.command) {
+      case Command::kD3DClearDepthStencilView: {
+        auto& args =
+            *reinterpret_cast<const ClearDepthStencilViewHeader*>(stream);
+        command_list->ClearDepthStencilView(
+            args.depth_stencil_view, args.clear_flags, args.depth, args.stencil,
+            args.num_rects,
+            args.num_rects ? reinterpret_cast<const D3D12_RECT*>(&args + 1)
+                           : nullptr);
+      } break;
+      case Command::kD3DClearRenderTargetView: {
+        auto& args =
+            *reinterpret_cast<const ClearRenderTargetViewHeader*>(stream);
+        command_list->ClearRenderTargetView(
+            args.render_target_view, args.color_rgba, args.num_rects,
+            args.num_rects ? reinterpret_cast<const D3D12_RECT*>(&args + 1)
+                           : nullptr);
+      } break;
      case Command::kD3DClearUnorderedAccessViewUint: {
        auto& args =
            *reinterpret_cast<const ClearUnorderedAccessViewHeader*>(stream);
@ -64,11 +81,12 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
        auto& args = *reinterpret_cast<const CopyTextureArguments*>(stream);
        command_list->CopyTextureRegion(&args.dst, 0, 0, 0, &args.src, nullptr);
      } break;
-      case Command::kCopyTextureRegion: {
+      case Command::kD3DCopyTextureRegion: {
        auto& args =
-            *reinterpret_cast<const CopyTextureRegionArguments*>(stream);
-        command_list->CopyTextureRegion(&args.dst, args.dst_x, args.dst_y,
-                                        args.dst_z, &args.src, &args.src_box);
+            *reinterpret_cast<const D3DCopyTextureRegionArguments*>(stream);
+        command_list->CopyTextureRegion(
+            &args.dst, args.dst_x, args.dst_y, args.dst_z, &args.src,
+            args.has_src_box ? &args.src_box : nullptr);
      } break;
      case Command::kD3DDispatch: {
        if (current_pipeline_state != nullptr) {
@ -107,6 +125,17 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list,
        command_list->IASetPrimitiveTopology(
            *reinterpret_cast<const D3D12_PRIMITIVE_TOPOLOGY*>(stream));
      } break;
+      case Command::kD3DIASetVertexBuffers: {
+        static_assert(alignof(D3D12_VERTEX_BUFFER_VIEW) <= alignof(uintmax_t));
+        auto& args =
+            *reinterpret_cast<const D3DIASetVertexBuffersHeader*>(stream);
+        command_list->IASetVertexBuffers(
+            args.start_slot, args.num_views,
+            reinterpret_cast<const D3D12_VERTEX_BUFFER_VIEW*>(
+                reinterpret_cast<const uint8_t*>(stream) +
+                xe::align(sizeof(D3DIASetVertexBuffersHeader),
+                          alignof(D3D12_VERTEX_BUFFER_VIEW))));
+      } break;
      case Command::kD3DOMSetBlendFactor: {
        command_list->OMSetBlendFactor(reinterpret_cast<const FLOAT*>(stream));
      } break;
--- a/src/xenia/gpu/d3d12/deferred_command_list.h
+++ b/src/xenia/gpu/d3d12/deferred_command_list.h
@ -15,6 +15,7 @@
 #include <cstring>
 #include <vector>

+#include "xenia/base/assert.h"
 #include "xenia/base/math.h"
 #include "xenia/ui/d3d12/d3d12_api.h"

@ -33,6 +34,47 @@ class DeferredCommandList {
  void Execute(ID3D12GraphicsCommandList* command_list,
               ID3D12GraphicsCommandList1* command_list_1);

+  D3D12_RECT* ClearDepthStencilViewAllocatedRects(
+      D3D12_CPU_DESCRIPTOR_HANDLE depth_stencil_view,
+      D3D12_CLEAR_FLAGS clear_flags, FLOAT depth, UINT8 stencil,
+      UINT num_rects) {
+    auto args = reinterpret_cast<ClearDepthStencilViewHeader*>(WriteCommand(
+        Command::kD3DClearDepthStencilView,
+        sizeof(ClearDepthStencilViewHeader) + num_rects * sizeof(D3D12_RECT)));
+    args->depth_stencil_view = depth_stencil_view;
+    args->clear_flags = clear_flags;
+    args->depth = depth;
+    args->stencil = stencil;
+    args->num_rects = num_rects;
+    return num_rects ? reinterpret_cast<D3D12_RECT*>(args + 1) : nullptr;
+  }
+
+  void D3DClearDepthStencilView(D3D12_CPU_DESCRIPTOR_HANDLE depth_stencil_view,
+                                D3D12_CLEAR_FLAGS clear_flags, FLOAT depth,
+                                UINT8 stencil, UINT num_rects,
+                                const D3D12_RECT* rects) {
+    D3D12_RECT* allocated_rects = ClearDepthStencilViewAllocatedRects(
+        depth_stencil_view, clear_flags, depth, stencil, num_rects);
+    if (num_rects) {
+      assert_not_null(allocated_rects);
+      std::memcpy(allocated_rects, rects, num_rects * sizeof(D3D12_RECT));
+    }
+  }
+
+  void D3DClearRenderTargetView(D3D12_CPU_DESCRIPTOR_HANDLE render_target_view,
+                                const FLOAT color_rgba[4], UINT num_rects,
+                                const D3D12_RECT* rects) {
+    auto args = reinterpret_cast<ClearRenderTargetViewHeader*>(WriteCommand(
+        Command::kD3DClearRenderTargetView,
+        sizeof(ClearRenderTargetViewHeader) + num_rects * sizeof(D3D12_RECT)));
+    args->render_target_view = render_target_view;
+    std::memcpy(args->color_rgba, color_rgba, 4 * sizeof(FLOAT));
+    args->num_rects = num_rects;
+    if (num_rects != 0) {
+      std::memcpy(args + 1, rects, num_rects * sizeof(D3D12_RECT));
+    }
+  }
+
  void D3DClearUnorderedAccessViewUint(
      D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle_in_current_heap,
      D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle, ID3D12Resource* resource,
@ -79,18 +121,25 @@ class DeferredCommandList {
    std::memcpy(&args.src, &src, sizeof(D3D12_TEXTURE_COPY_LOCATION));
  }

-  void CopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION& dst, UINT dst_x,
-                         UINT dst_y, UINT dst_z,
-                         const D3D12_TEXTURE_COPY_LOCATION& src,
-                         const D3D12_BOX& src_box) {
-    auto& args = *reinterpret_cast<CopyTextureRegionArguments*>(WriteCommand(
-        Command::kCopyTextureRegion, sizeof(CopyTextureRegionArguments)));
-    std::memcpy(&args.dst, &dst, sizeof(D3D12_TEXTURE_COPY_LOCATION));
+  void D3DCopyTextureRegion(const D3D12_TEXTURE_COPY_LOCATION* dst, UINT dst_x,
+                            UINT dst_y, UINT dst_z,
+                            const D3D12_TEXTURE_COPY_LOCATION* src,
+                            const D3D12_BOX* src_box) {
+    assert_not_null(dst);
+    assert_not_null(src);
+    auto& args = *reinterpret_cast<D3DCopyTextureRegionArguments*>(WriteCommand(
+        Command::kD3DCopyTextureRegion, sizeof(D3DCopyTextureRegionArguments)));
+    std::memcpy(&args.dst, dst, sizeof(D3D12_TEXTURE_COPY_LOCATION));
    args.dst_x = dst_x;
    args.dst_y = dst_y;
    args.dst_z = dst_z;
-    std::memcpy(&args.src, &src, sizeof(D3D12_TEXTURE_COPY_LOCATION));
-    args.src_box = src_box;
+    std::memcpy(&args.src, src, sizeof(D3D12_TEXTURE_COPY_LOCATION));
+    if (src_box) {
+      args.has_src_box = true;
+      args.src_box = *src_box;
+    } else {
+      args.has_src_box = false;
+    }
  }

  void D3DDispatch(UINT thread_group_count_x, UINT thread_group_count_y,
@ -147,6 +196,23 @@ class DeferredCommandList {
    arg = primitive_topology;
  }

+  void D3DIASetVertexBuffers(UINT start_slot, UINT num_views,
+                             const D3D12_VERTEX_BUFFER_VIEW* views) {
+    if (num_views == 0) {
+      return;
+    }
+    static_assert(alignof(D3D12_VERTEX_BUFFER_VIEW) <= alignof(uintmax_t));
+    const size_t header_size = xe::align(sizeof(D3DIASetVertexBuffersHeader),
+                                         alignof(D3D12_VERTEX_BUFFER_VIEW));
+    auto args = reinterpret_cast<D3DIASetVertexBuffersHeader*>(WriteCommand(
+        Command::kD3DIASetVertexBuffers,
+        header_size + num_views * sizeof(D3D12_VERTEX_BUFFER_VIEW)));
+    args->start_slot = start_slot;
+    args->num_views = num_views;
+    std::memcpy(reinterpret_cast<uint8_t*>(args) + header_size, views,
+                sizeof(D3D12_VERTEX_BUFFER_VIEW) * num_views);
+  }
+
  void D3DOMSetBlendFactor(const FLOAT blend_factor[4]) {
    auto args = reinterpret_cast<FLOAT*>(
        WriteCommand(Command::kD3DOMSetBlendFactor, 4 * sizeof(FLOAT)));
@ -333,16 +399,19 @@ class DeferredCommandList {

 private:
  enum class Command {
+    kD3DClearDepthStencilView,
+    kD3DClearRenderTargetView,
    kD3DClearUnorderedAccessViewUint,
    kD3DCopyBufferRegion,
    kD3DCopyResource,
    kCopyTexture,
-    kCopyTextureRegion,
+    kD3DCopyTextureRegion,
    kD3DDispatch,
    kD3DDrawIndexedInstanced,
    kD3DDrawInstanced,
    kD3DIASetIndexBuffer,
    kD3DIASetPrimitiveTopology,
+    kD3DIASetVertexBuffers,
    kD3DOMSetBlendFactor,
    kD3DOMSetRenderTargets,
    kD3DOMSetStencilRef,
@ -370,12 +439,26 @@ class DeferredCommandList {
  static constexpr size_t kCommandHeaderSizeElements =
      (sizeof(CommandHeader) + sizeof(uintmax_t) - 1) / sizeof(uintmax_t);

+  struct ClearDepthStencilViewHeader {
+    D3D12_CPU_DESCRIPTOR_HANDLE depth_stencil_view;
+    D3D12_CLEAR_FLAGS clear_flags;
+    FLOAT depth;
+    UINT8 stencil;
+    UINT num_rects;
+  };
+
+  struct ClearRenderTargetViewHeader {
+    D3D12_CPU_DESCRIPTOR_HANDLE render_target_view;
+    FLOAT color_rgba[4];
+    UINT num_rects;
+  };
+
  struct ClearUnorderedAccessViewHeader {
    D3D12_GPU_DESCRIPTOR_HANDLE view_gpu_handle_in_current_heap;
    D3D12_CPU_DESCRIPTOR_HANDLE view_cpu_handle;
    ID3D12Resource* resource;
    union {
-      float values_float[4];
+      FLOAT values_float[4];
      UINT values_uint[4];
    };
    UINT num_rects;
@ -399,13 +482,14 @@ class DeferredCommandList {
    D3D12_TEXTURE_COPY_LOCATION src;
  };

-  struct CopyTextureRegionArguments {
+  struct D3DCopyTextureRegionArguments {
    D3D12_TEXTURE_COPY_LOCATION dst;
    UINT dst_x;
    UINT dst_y;
    UINT dst_z;
    D3D12_TEXTURE_COPY_LOCATION src;
    D3D12_BOX src_box;
+    bool has_src_box;
  };

  struct D3DDispatchArguments {
@ -429,6 +513,11 @@ class DeferredCommandList {
    UINT start_instance_location;
  };

+  struct D3DIASetVertexBuffersHeader {
+    UINT start_slot;
+    UINT num_views;
+  };
+
  struct D3DOMSetRenderTargetsArguments {
    uint8_t num_render_target_descriptors;
    bool rts_single_handle_to_descriptor_range;
--- a/src/xenia/gpu/d3d12/pipeline_cache.cc
+++ b/src/xenia/gpu/d3d12/pipeline_cache.cc
--- a/src/xenia/gpu/d3d12/pipeline_cache.h
+++ b/src/xenia/gpu/d3d12/pipeline_cache.h
@ -23,11 +23,13 @@

 #include "xenia/base/hash.h"
 #include "xenia/base/platform.h"
+#include "xenia/base/string_buffer.h"
 #include "xenia/base/threading.h"
+#include "xenia/gpu/d3d12/d3d12_render_target_cache.h"
 #include "xenia/gpu/d3d12/d3d12_shader.h"
-#include "xenia/gpu/d3d12/render_target_cache.h"
 #include "xenia/gpu/dxbc_shader_translator.h"
 #include "xenia/gpu/gpu_flags.h"
+#include "xenia/gpu/primitive_processor.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/ui/d3d12/d3d12_api.h"
@ -43,10 +45,9 @@ class PipelineCache {
  static constexpr size_t kLayoutUIDEmpty = 0;

  PipelineCache(D3D12CommandProcessor& command_processor,
-                const RegisterFile& register_file, bool bindless_resources_used,
-                bool edram_rov_used,
-                flags::DepthFloat24Conversion depth_float24_conversion,
-                uint32_t resolution_scale);
+                const RegisterFile& register_file,
+                const D3D12RenderTargetCache& render_target_cache,
+                bool bindless_resources_used);
  ~PipelineCache();

  bool Initialize();
@ -62,22 +63,28 @@ class PipelineCache {

  D3D12Shader* LoadShader(xenos::ShaderType shader_type,
                          const uint32_t* host_address, uint32_t dword_count);
+  // Analyze shader microcode on the translator thread.
+  void AnalyzeShaderUcode(Shader& shader) {
+    shader.AnalyzeUcode(ucode_disasm_buffer_);
+  }

-  // Retrieves the shader modifications for the current state, and returns
-  // whether they are valid.
-  bool GetCurrentShaderModifications(
-      DxbcShaderTranslator::Modification& vertex_shader_modification_out,
-      DxbcShaderTranslator::Modification& pixel_shader_modification_out) const;
-
-  // Translates shaders if needed, also making shader info up to date.
-  bool EnsureShadersTranslated(D3D12Shader::D3D12Translation* vertex_shader,
-                               D3D12Shader::D3D12Translation* pixel_shader);
+  // Retrieves the shader modification for the current state. The shader must
+  // have microcode analyzed.
+  DxbcShaderTranslator::Modification GetCurrentVertexShaderModification(
+      const Shader& shader,
+      Shader::HostVertexShaderType host_vertex_shader_type) const;
+  // Pixel shader counterpart of the above, with the same requirement.
+  DxbcShaderTranslator::Modification GetCurrentPixelShaderModification(
+      const Shader& shader) const;

+  // If draw_util::IsRasterizationPotentiallyDone is false, the pixel shader
+  // MUST be made nullptr BEFORE calling this!
  bool ConfigurePipeline(
      D3D12Shader::D3D12Translation* vertex_shader,
      D3D12Shader::D3D12Translation* pixel_shader,
-      xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format,
-      const RenderTargetCache::PipelineRenderTarget render_targets[5],
+      const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
+      uint32_t bound_depth_and_color_render_target_bits,
+      const uint32_t* bound_depth_and_color_render_target_formats,
      void** pipeline_handle_out, ID3D12RootSignature** root_signature_out);

  // Returns a pipeline with deferred creation by its handle. May return nullptr
@ -93,9 +100,7 @@ class PipelineCache {
    uint32_t ucode_dword_count : 31;
    xenos::ShaderType type : 1;

-    reg::SQ_PROGRAM_CNTL sq_program_cntl;
-
-    static constexpr uint32_t kVersion = 0x20201207;
+    static constexpr uint32_t kVersion = 0x20201219;
  });

  // Update PipelineDescription::kVersion if any of the Pipeline* enums are
@ -138,6 +143,8 @@ class PipelineCache {
    kNone,
    kFront,
    kBack,
+    // Special case, handled via disabling the pixel shader and depth / stencil.
+    kDisableRasterization,
  };

  enum class PipelineBlendFactor : uint32_t {
@ -171,10 +178,10 @@ class PipelineCache {

  XEPACKEDSTRUCT(PipelineDescription, {
    uint64_t vertex_shader_hash;
+    uint64_t vertex_shader_modification;
    // 0 if drawing without a pixel shader.
    uint64_t pixel_shader_hash;
-    uint32_t vertex_shader_modification;
-    uint32_t pixel_shader_modification;
+    uint64_t pixel_shader_modification;

    int32_t depth_bias;
    float depth_bias_slope_scaled;
@ -189,12 +196,12 @@ class PipelineCache {
    PipelineCullMode cull_mode : 2;                   // 9
    uint32_t front_counter_clockwise : 1;             // 10
    uint32_t depth_clip : 1;                          // 11
-    uint32_t rov_msaa : 1;                            // 12
-    xenos::DepthRenderTargetFormat depth_format : 1;  // 13
-    xenos::CompareFunction depth_func : 3;            // 16
-    uint32_t depth_write : 1;                         // 17
-    uint32_t stencil_enable : 1;                      // 18
-    uint32_t stencil_read_mask : 8;                   // 26
+    xenos::MsaaSamples host_msaa_samples : 2;         // 13
+    xenos::DepthRenderTargetFormat depth_format : 1;  // 14
+    xenos::CompareFunction depth_func : 3;            // 17
+    uint32_t depth_write : 1;                         // 18
+    uint32_t stencil_enable : 1;                      // 19
+    uint32_t stencil_read_mask : 8;                   // 27

    uint32_t stencil_write_mask : 8;                   // 8
    xenos::StencilOp stencil_front_fail_op : 3;        // 11
@ -206,9 +213,9 @@ class PipelineCache {
    xenos::StencilOp stencil_back_pass_op : 3;         // 29
    xenos::CompareFunction stencil_back_func : 3;      // 32

-    PipelineRenderTarget render_targets[4];
+    PipelineRenderTarget render_targets[xenos::kMaxColorRenderTargets];

-    static constexpr uint32_t kVersion = 0x20201207;
+    static constexpr uint32_t kVersion = 0x20210425;
  });

  XEPACKEDSTRUCT(PipelineStoredDescription, {
@ -223,27 +230,26 @@ class PipelineCache {
    PipelineDescription description;
  };

-  // Returns the host vertex shader type for the current draw if it's valid and
-  // supported, or Shader::HostVertexShaderType(-1) if not.
-  Shader::HostVertexShaderType GetCurrentHostVertexShaderTypeIfValid() const;
-
  D3D12Shader* LoadShader(xenos::ShaderType shader_type,
                          const uint32_t* host_address, uint32_t dword_count,
                          uint64_t data_hash);

  // Can be called from multiple threads.
-  bool TranslateShader(DxbcShaderTranslator& translator,
-                       D3D12Shader::D3D12Translation& translation,
-                       reg::SQ_PROGRAM_CNTL cntl,
-                       IDxbcConverter* dxbc_converter = nullptr,
-                       IDxcUtils* dxc_utils = nullptr,
-                       IDxcCompiler* dxc_compiler = nullptr);
+  bool TranslateAnalyzedShader(DxbcShaderTranslator& translator,
+                               D3D12Shader::D3D12Translation& translation,
+                               IDxbcConverter* dxbc_converter = nullptr,
+                               IDxcUtils* dxc_utils = nullptr,
+                               IDxcCompiler* dxc_compiler = nullptr);

+  // If draw_util::IsRasterizationPotentiallyDone is false, the pixel shader
+  // MUST be made nullptr BEFORE calling this! The shaders must be translated
+  // and valid.
  bool GetCurrentStateDescription(
      D3D12Shader::D3D12Translation* vertex_shader,
      D3D12Shader::D3D12Translation* pixel_shader,
-      xenos::PrimitiveType primitive_type, xenos::IndexFormat index_format,
-      const RenderTargetCache::PipelineRenderTarget render_targets[5],
+      const PrimitiveProcessor::ProcessingResult& primitive_processing_result,
+      uint32_t bound_depth_and_color_render_target_bits,
+      const uint32_t* bound_depth_and_color_render_target_formats,
      PipelineRuntimeDescription& runtime_description_out);

  ID3D12PipelineState* CreateD3D12Pipeline(
@ -251,13 +257,12 @@ class PipelineCache {

  D3D12CommandProcessor& command_processor_;
  const RegisterFile& register_file_;
+  const D3D12RenderTargetCache& render_target_cache_;
  bool bindless_resources_used_;
-  bool edram_rov_used_;
-  // 20e4 depth conversion mode to use for non-ROV output.
-  flags::DepthFloat24Conversion depth_float24_conversion_;
-  uint32_t resolution_scale_;

-  // Reusable shader translator.
+  // Temporary storage for AnalyzeUcode calls on the processor thread.
+  StringBuffer ucode_disasm_buffer_;
+  // Reusable shader translator for the processor thread.
  std::unique_ptr<DxbcShaderTranslator> shader_translator_;

  // Command processor thread DXIL conversion/disassembly interfaces, if DXIL
@ -332,8 +337,7 @@ class PipelineCache {
  std::condition_variable storage_write_request_cond_;
  // Storage thread input is protected with storage_write_request_lock_, and the
  // thread is notified about its change via storage_write_request_cond_.
-  std::deque<std::pair<const Shader*, reg::SQ_PROGRAM_CNTL>>
-      storage_write_shader_queue_;
+  std::deque<const Shader*> storage_write_shader_queue_;
  std::deque<PipelineStoredDescription> storage_write_pipeline_queue_;
  bool storage_write_flush_shaders_ = false;
  bool storage_write_flush_pipelines_ = false;
--- a/src/xenia/gpu/d3d12/primitive_converter.cc
+++ b/src/xenia/gpu/d3d12/primitive_converter.cc
@ -1,762 +0,0 @@
-/**
- ******************************************************************************
- * Xenia : Xbox 360 Emulator Research Project                                 *
- ******************************************************************************
- * Copyright 2018 Ben Vanik. All rights reserved.                             *
- * Released under the BSD license - see LICENSE in the root for more details. *
- ******************************************************************************
- */
-
-#include "xenia/gpu/d3d12/primitive_converter.h"
-
-#include <algorithm>
-
-#include "xenia/base/assert.h"
-#include "xenia/base/cvar.h"
-#include "xenia/base/logging.h"
-#include "xenia/base/math.h"
-#include "xenia/base/memory.h"
-#include "xenia/base/platform.h"
-#include "xenia/base/profiling.h"
-#include "xenia/gpu/d3d12/d3d12_command_processor.h"
-#include "xenia/ui/d3d12/d3d12_util.h"
-
-DEFINE_bool(d3d12_convert_quads_to_triangles, false,
-            "Convert quad lists to triangle lists on the CPU instead of using "
-            "a geometry shader. Not recommended for playing, for debugging "
-            "primarily (because PIX fails to display vertices when a geometry "
-            "shader is used), and this way quads can't be discarded correctly "
-            "when the game uses vertex kill functionality.",
-            "D3D12");
-
-namespace xe {
-namespace gpu {
-namespace d3d12 {
-
-PrimitiveConverter::PrimitiveConverter(D3D12CommandProcessor& command_processor,
-                                       const RegisterFile& register_file,
-                                       Memory& memory,
-                                       TraceWriter& trace_writer)
-    : command_processor_(command_processor),
-      register_file_(register_file),
-      memory_(memory),
-      trace_writer_(trace_writer) {
-  system_page_size_ = uint32_t(memory::page_size());
-}
-
-PrimitiveConverter::~PrimitiveConverter() { Shutdown(); }
-
-bool PrimitiveConverter::Initialize() {
-  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
-  auto device = provider.GetDevice();
-  D3D12_HEAP_FLAGS heap_flag_create_not_zeroed =
-      provider.GetHeapFlagCreateNotZeroed();
-
-  // There can be at most 65535 indices in a Xenos draw call (16 bit index
-  // count), but they can be up to 4 bytes large, and conversion can add more
-  // indices (almost triple the count for triangle strips or fans, for
-  // instance).
-  buffer_pool_ = std::make_unique<ui::d3d12::D3D12UploadBufferPool>(
-      provider, std::max(sizeof(uint32_t) * 3 * 65535,
-                         ui::d3d12::D3D12UploadBufferPool::kDefaultPageSize));
-
-  // Create the static index buffer for non-indexed drawing.
-  D3D12_RESOURCE_DESC static_ib_desc;
-  ui::d3d12::util::FillBufferResourceDesc(
-      static_ib_desc, kStaticIBTotalCount * sizeof(uint16_t),
-      D3D12_RESOURCE_FLAG_NONE);
-  if (FAILED(device->CreateCommittedResource(
-          &ui::d3d12::util::kHeapPropertiesUpload, heap_flag_create_not_zeroed,
-          &static_ib_desc, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
-          IID_PPV_ARGS(&static_ib_upload_)))) {
-    XELOGE(
-        "Failed to create the upload buffer for the primitive conversion "
-        "static index buffer");
-    Shutdown();
-    return false;
-  }
-  D3D12_RANGE static_ib_read_range;
-  static_ib_read_range.Begin = 0;
-  static_ib_read_range.End = 0;
-  void* static_ib_mapping;
-  if (FAILED(static_ib_upload_->Map(0, &static_ib_read_range,
-                                    &static_ib_mapping))) {
-    XELOGE(
-        "Failed to map the upload buffer for the primitive conversion "
-        "static index buffer");
-    Shutdown();
-    return false;
-  }
-  uint16_t* static_ib_data = reinterpret_cast<uint16_t*>(static_ib_mapping);
-  // Triangle fans as triangle lists.
-  // https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
-  // Ordered as (v1, v2, v0), (v2, v3, v0).
-  uint16_t* static_ib_data_pointer =
-      &static_ib_data[kStaticIBTriangleFanOffset];
-  for (uint32_t i = 2; i < kMaxNonIndexedVertices; ++i) {
-    *(static_ib_data_pointer++) = i - 1;
-    *(static_ib_data_pointer++) = i;
-    *(static_ib_data_pointer++) = 0;
-  }
-  static_ib_data_pointer = &static_ib_data[kStaticIBQuadOffset];
-  for (uint32_t i = 0; i < (kMaxNonIndexedVertices >> 2); ++i) {
-    uint32_t quad_index = i << 2;
-    *(static_ib_data_pointer++) = quad_index;
-    *(static_ib_data_pointer++) = quad_index + 1;
-    *(static_ib_data_pointer++) = quad_index + 2;
-    *(static_ib_data_pointer++) = quad_index;
-    *(static_ib_data_pointer++) = quad_index + 2;
-    *(static_ib_data_pointer++) = quad_index + 3;
-  }
-  static_ib_upload_->Unmap(0, nullptr);
-  // Not uploaded yet.
-  static_ib_upload_submission_ = UINT64_MAX;
-  if (FAILED(device->CreateCommittedResource(
-          &ui::d3d12::util::kHeapPropertiesDefault, heap_flag_create_not_zeroed,
-          &static_ib_desc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
-          IID_PPV_ARGS(&static_ib_)))) {
-    XELOGE("Failed to create the primitive conversion static index buffer");
-    Shutdown();
-    return false;
-  }
-  static_ib_gpu_address_ = static_ib_->GetGPUVirtualAddress();
-
-  memory_regions_invalidated_.store(0ull, std::memory_order_relaxed);
-  memory_invalidation_callback_handle_ =
-      memory_.RegisterPhysicalMemoryInvalidationCallback(
-          MemoryInvalidationCallbackThunk, this);
-
-  return true;
-}
-
-void PrimitiveConverter::Shutdown() {
-  if (memory_invalidation_callback_handle_ != nullptr) {
-    memory_.UnregisterPhysicalMemoryInvalidationCallback(
-        memory_invalidation_callback_handle_);
-    memory_invalidation_callback_handle_ = nullptr;
-  }
-  ui::d3d12::util::ReleaseAndNull(static_ib_);
-  ui::d3d12::util::ReleaseAndNull(static_ib_upload_);
-  buffer_pool_.reset();
-}
-
-void PrimitiveConverter::ClearCache() { buffer_pool_->ClearCache(); }
-
-void PrimitiveConverter::CompletedSubmissionUpdated() {
-  if (static_ib_upload_ && command_processor_.GetCompletedSubmission() >=
-                               static_ib_upload_submission_) {
-    // Completely uploaded - release the upload buffer.
-    static_ib_upload_->Release();
-    static_ib_upload_ = nullptr;
-  }
-}
-
-void PrimitiveConverter::BeginSubmission() {
-  // Got a command list now - upload and transition the static index buffer if
-  // needed.
-  if (static_ib_upload_ && static_ib_upload_submission_ == UINT64_MAX) {
-    command_processor_.GetDeferredCommandList().D3DCopyResource(
-        static_ib_, static_ib_upload_);
-    command_processor_.PushTransitionBarrier(static_ib_,
-                                             D3D12_RESOURCE_STATE_COPY_DEST,
-                                             D3D12_RESOURCE_STATE_INDEX_BUFFER);
-    static_ib_upload_submission_ = command_processor_.GetCurrentSubmission();
-  }
-}
-
-void PrimitiveConverter::BeginFrame() {
-  buffer_pool_->Reclaim(command_processor_.GetCompletedFrame());
-  converted_indices_cache_.clear();
-  memory_regions_used_ = 0;
-}
-
-xenos::PrimitiveType PrimitiveConverter::GetReplacementPrimitiveType(
-    xenos::PrimitiveType type) {
-  switch (type) {
-    case xenos::PrimitiveType::kTriangleFan:
-      return xenos::PrimitiveType::kTriangleList;
-    case xenos::PrimitiveType::kLineLoop:
-      return xenos::PrimitiveType::kLineStrip;
-    case xenos::PrimitiveType::kQuadList:
-      if (cvars::d3d12_convert_quads_to_triangles) {
-        return xenos::PrimitiveType::kTriangleList;
-      }
-      break;
-    default:
-      break;
-  }
-  return type;
-}
-
-PrimitiveConverter::ConversionResult PrimitiveConverter::ConvertPrimitives(
-    xenos::PrimitiveType source_type, uint32_t address, uint32_t index_count,
-    xenos::IndexFormat index_format, xenos::Endian index_endianness,
-    D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out, uint32_t& index_count_out) {
-  bool index_32bit = index_format == xenos::IndexFormat::kInt32;
-  const auto& regs = register_file_;
-  bool reset = regs.Get<reg::PA_SU_SC_MODE_CNTL>().multi_prim_ib_ena;
-  // Swap the reset index because we will be comparing unswapped values to it.
-  uint32_t reset_index = xenos::GpuSwap(
-      regs[XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX].u32, index_endianness);
-  // If the specified reset index is the same as the one used by Direct3D 12
-  // (0xFFFF or 0xFFFFFFFF - in the pipeline cache, we use the former for
-  // 16-bit and the latter for 32-bit indices), we can use the buffer directly.
-  uint32_t reset_index_host = index_32bit ? 0xFFFFFFFFu : 0xFFFFu;
-
-  // Degenerate line loops are just lines.
-  if (source_type == xenos::PrimitiveType::kLineLoop && index_count <= 2) {
-    source_type = xenos::PrimitiveType::kLineStrip;
-  }
-
-  // Check if need to convert at all.
-  if (source_type == xenos::PrimitiveType::kTriangleStrip ||
-      source_type == xenos::PrimitiveType::kLineStrip) {
-    if (!reset || reset_index == reset_index_host) {
-      return ConversionResult::kConversionNotNeeded;
-    }
-  } else if (source_type == xenos::PrimitiveType::kQuadList) {
-    if (!cvars::d3d12_convert_quads_to_triangles) {
-      return ConversionResult::kConversionNotNeeded;
-    }
-  } else if (source_type != xenos::PrimitiveType::kTriangleFan &&
-             source_type != xenos::PrimitiveType::kLineLoop) {
-    return ConversionResult::kConversionNotNeeded;
-  }
-
-#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
-  SCOPE_profile_cpu_f("gpu");
-#endif  // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
-
-  // Exit early for clearly empty draws, without even reading the memory.
-  uint32_t index_count_min;
-  if (source_type == xenos::PrimitiveType::kLineStrip ||
-      source_type == xenos::PrimitiveType::kLineLoop) {
-    index_count_min = 2;
-  } else if (source_type == xenos::PrimitiveType::kQuadList) {
-    index_count_min = 4;
-  } else {
-    index_count_min = 3;
-  }
-  if (index_count < index_count_min) {
-    return ConversionResult::kPrimitiveEmpty;
-  }
-
-  // Invalidate the cache if data behind any entry was modified.
-  if (memory_regions_invalidated_.exchange(0ull, std::memory_order_acquire) &
-      memory_regions_used_) {
-    converted_indices_cache_.clear();
-    memory_regions_used_ = 0;
-  }
-
-  address &= index_32bit ? 0x1FFFFFFC : 0x1FFFFFFE;
-  uint32_t index_size = index_32bit ? sizeof(uint32_t) : sizeof(uint16_t);
-  uint32_t index_buffer_size = index_size * index_count;
-  uint32_t address_last = address + index_size * (index_count - 1);
-
-  // Create the cache entry, currently only for the key.
-  ConvertedIndices converted_indices;
-  converted_indices.key.address = address;
-  converted_indices.key.source_type = source_type;
-  converted_indices.key.format = index_format;
-  converted_indices.key.count = index_count;
-  converted_indices.key.reset = reset ? 1 : 0;
-  converted_indices.reset_index = reset_index;
-
-  // Try to find the previously converted index buffer.
-  auto found_range =
-      converted_indices_cache_.equal_range(converted_indices.key.value);
-  for (auto iter = found_range.first; iter != found_range.second; ++iter) {
-    const ConvertedIndices& found_converted = iter->second;
-    if (reset && found_converted.reset_index != reset_index) {
-      continue;
-    }
-    if (found_converted.converted_index_count == 0) {
-      return ConversionResult::kPrimitiveEmpty;
-    }
-    if (!found_converted.gpu_address) {
-      return ConversionResult::kConversionNotNeeded;
-    }
-    gpu_address_out = found_converted.gpu_address;
-    index_count_out = found_converted.converted_index_count;
-    return ConversionResult::kConverted;
-  }
-
-  // Get the memory usage mask for cache invalidation.
-  // 1 bit = (512 / 64) MB = 8 MB.
-  uint64_t memory_regions_used_bits = ~((1ull << (address >> 23)) - 1);
-  if (address_last < (63 << 23)) {
-    memory_regions_used_bits = (1ull << ((address_last >> 23) + 1)) - 1;
-  }
-
-  union {
-    const void* source;
-    const uint8_t* source_8;
-    const uint16_t* source_16;
-    const uint32_t* source_32;
-    uintptr_t source_uintptr;
-  };
-  source = memory_.TranslatePhysical(address);
-
-  // Calculate the new index count, and also check if there's nothing to convert
-  // in the buffer (for instance, if not using actually primitive reset).
-  uint32_t converted_index_count = 0;
-  bool conversion_needed = false;
-  bool simd = false;
-  // Optimization specific to primitive types - if reset index not found in the
-  // source index buffer, can set this to false and use a faster way of copying.
-  bool reset_actually_used = reset;
-  if (source_type == xenos::PrimitiveType::kTriangleFan) {
-    // Triangle fans are not supported by Direct3D 12 at all.
-    conversion_needed = true;
-    trace_writer_.WriteMemoryRead(address, index_buffer_size);
-    if (reset) {
-      uint32_t current_fan_index_count = 0;
-      for (uint32_t i = 0; i < index_count; ++i) {
-        uint32_t index = index_format == xenos::IndexFormat::kInt32
-                             ? source_32[i]
-                             : source_16[i];
-        if (index == reset_index) {
-          current_fan_index_count = 0;
-          continue;
-        }
-        if (++current_fan_index_count >= 3) {
-          converted_index_count += 3;
-        }
-      }
-    } else {
-      converted_index_count = 3 * (index_count - 2);
-    }
-  } else if (source_type == xenos::PrimitiveType::kTriangleStrip ||
-             source_type == xenos::PrimitiveType::kLineStrip) {
-    converted_index_count = index_count;
-    // Check if the restart index is used at all in this buffer because reading
-    // vertices from a default heap is faster than from an upload heap.
-    conversion_needed = false;
-    trace_writer_.WriteMemoryRead(address, index_buffer_size);
-#if XE_ARCH_AMD64
-    // Will use SIMD to copy 16-byte blocks using _mm_or_si128.
-    simd = true;
-    union {
-      const void* check_source;
-      const uint32_t* check_source_16;
-      const uint32_t* check_source_32;
-      const __m128i* check_source_128;
-      uintptr_t check_source_uintptr;
-    };
-    check_source = source;
-    uint32_t check_indices_remaining = index_count;
-    alignas(16) uint64_t check_result[2];
-    if (index_format == xenos::IndexFormat::kInt32) {
-      while (check_indices_remaining != 0 && (check_source_uintptr & 15)) {
-        --check_indices_remaining;
-        if (*(check_source_32++) == reset_index) {
-          conversion_needed = true;
-          check_indices_remaining = 0;
-        }
-      }
-      __m128i check_reset_index_vector = _mm_set1_epi32(reset_index);
-      while (check_indices_remaining >= 4) {
-        check_indices_remaining -= 4;
-        _mm_store_si128(reinterpret_cast<__m128i*>(&check_result),
-                        _mm_cmpeq_epi32(_mm_load_si128(check_source_128++),
-                                        check_reset_index_vector));
-        if (check_result[0] || check_result[1]) {
-          conversion_needed = true;
-          check_indices_remaining = 0;
-        }
-      }
-      while (check_indices_remaining != 0) {
-        --check_indices_remaining;
-        if (*(check_source_32++) == reset_index) {
-          conversion_needed = true;
-          check_indices_remaining = 0;
-        }
-      }
-    } else {
-      while (check_indices_remaining != 0 && (check_source_uintptr & 15)) {
-        --check_indices_remaining;
-        if (*(check_source_16++) == reset_index) {
-          conversion_needed = true;
-          check_indices_remaining = 0;
-        }
-      }
-      __m128i check_reset_index_vector = _mm_set1_epi16(reset_index);
-      while (check_indices_remaining >= 8) {
-        check_indices_remaining -= 8;
-        _mm_store_si128(reinterpret_cast<__m128i*>(&check_result),
-                        _mm_cmpeq_epi16(_mm_load_si128(check_source_128++),
-                                        check_reset_index_vector));
-        if (check_result[0] || check_result[1]) {
-          conversion_needed = true;
-          check_indices_remaining = 0;
-        }
-      }
-      while (check_indices_remaining != 0) {
-        --check_indices_remaining;
-        if (*(check_source_16++) == reset_index) {
-          conversion_needed = true;
-          check_indices_remaining = 0;
-        }
-      }
-    }
-#else
-    if (index_format == xenos::IndexFormat::kInt32) {
-      for (uint32_t i = 0; i < index_count; ++i) {
-        if (source_32[i] == reset_index) {
-          conversion_needed = true;
-          break;
-        }
-      }
-    } else {
-      for (uint32_t i = 0; i < index_count; ++i) {
-        if (source_16[i] == reset_index) {
-          conversion_needed = true;
-          break;
-        }
-      }
-    }
-#endif  // XE_ARCH_AMD64
-  } else if (source_type == xenos::PrimitiveType::kLineLoop) {
-    conversion_needed = true;
-    trace_writer_.WriteMemoryRead(address, index_buffer_size);
-    if (reset) {
-      reset_actually_used = false;
-      uint32_t current_strip_index_count = 0;
-      for (uint32_t i = 0; i < index_count; ++i) {
-        uint32_t index = index_format == xenos::IndexFormat::kInt32
-                             ? source_32[i]
-                             : source_16[i];
-        if (index == reset_index) {
-          reset_actually_used = true;
-          // Loop strips with more than 2 vertices.
-          if (current_strip_index_count > 2) {
-            ++converted_index_count;
-          }
-          current_strip_index_count = 0;
-          continue;
-        }
-        // Start a new strip if 2 vertices, add one vertex if more.
-        if (++current_strip_index_count >= 2) {
-          converted_index_count += current_strip_index_count == 2 ? 2 : 1;
-        }
-      }
-    } else {
-      converted_index_count = index_count + 1;
-    }
-  } else if (source_type == xenos::PrimitiveType::kQuadList) {
-    conversion_needed = true;
-    trace_writer_.WriteMemoryRead(address, index_buffer_size);
-    converted_index_count = (index_count >> 2) * 6;
-  }
-  converted_indices.converted_index_count = converted_index_count;
-
-  // If nothing to convert, store this result so the check won't be happening
-  // again and again and exit.
-  if (!conversion_needed || converted_index_count == 0) {
-    converted_indices.gpu_address = 0;
-    converted_indices_cache_.emplace(converted_indices.key.value,
-                                     converted_indices);
-    memory_regions_used_ |= memory_regions_used_bits;
-    return converted_index_count == 0 ? ConversionResult::kPrimitiveEmpty
-                                      : ConversionResult::kConversionNotNeeded;
-  }
-
-  // Convert.
-
-  D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
-  void* target = AllocateIndices(index_format, converted_index_count,
-                                 simd ? address & 15 : 0, gpu_address);
-  if (target == nullptr) {
-    return ConversionResult::kFailed;
-  }
-
-  if (source_type == xenos::PrimitiveType::kTriangleFan) {
-    // https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
-    // Ordered as (v1, v2, v0), (v2, v3, v0).
-    if (reset) {
-      uint32_t current_fan_index_count = 0;
-      uint32_t current_fan_first_index = 0;
-      if (index_format == xenos::IndexFormat::kInt32) {
-        uint32_t* target_32 = reinterpret_cast<uint32_t*>(target);
-        for (uint32_t i = 0; i < index_count; ++i) {
-          uint32_t index = source_32[i];
-          if (index == reset_index) {
-            current_fan_index_count = 0;
-            continue;
-          }
-          if (current_fan_index_count == 0) {
-            current_fan_first_index = index;
-          }
-          if (++current_fan_index_count >= 3) {
-            *(target_32++) = source_32[i - 1];
-            *(target_32++) = index;
-            *(target_32++) = current_fan_first_index;
-          }
-        }
-      } else {
-        uint16_t* target_16 = reinterpret_cast<uint16_t*>(target);
-        for (uint32_t i = 0; i < index_count; ++i) {
-          uint16_t index = source_16[i];
-          if (index == reset_index) {
-            current_fan_index_count = 0;
-            continue;
-          }
-          if (current_fan_index_count == 0) {
-            current_fan_first_index = index;
-          }
-          if (++current_fan_index_count >= 3) {
-            *(target_16++) = source_16[i - 1];
-            *(target_16++) = index;
-            *(target_16++) = uint16_t(current_fan_first_index);
-          }
-        }
-      }
-    } else {
-      if (index_format == xenos::IndexFormat::kInt32) {
-        uint32_t* target_32 = reinterpret_cast<uint32_t*>(target);
-        for (uint32_t i = 2; i < index_count; ++i) {
-          *(target_32++) = source_32[i - 1];
-          *(target_32++) = source_32[i];
-          *(target_32++) = source_32[0];
-        }
-      } else {
-        uint16_t* target_16 = reinterpret_cast<uint16_t*>(target);
-        for (uint32_t i = 2; i < index_count; ++i) {
-          *(target_16++) = source_16[i - 1];
-          *(target_16++) = source_16[i];
-          *(target_16++) = source_16[0];
-        }
-      }
-    }
-  } else if (source_type == xenos::PrimitiveType::kTriangleStrip ||
-             source_type == xenos::PrimitiveType::kLineStrip) {
-#if XE_ARCH_AMD64
-    // Replace the reset index with the maximum representable value - vector OR
-    // gives 0 or 0xFFFF/0xFFFFFFFF, which is exactly what is needed.
-    // Allocations in the target index buffer are aligned with 16-byte
-    // granularity, and within 16-byte vectors, both the source and the target
-    // start at the same offset.
-    union {
-      const __m128i* source_aligned_128;
-      uintptr_t source_aligned_uintptr;
-    };
-    source_aligned_uintptr = source_uintptr & ~(uintptr_t(15));
-    union {
-      __m128i* target_aligned_128;
-      uintptr_t target_aligned_uintptr;
-    };
-    target_aligned_uintptr =
-        reinterpret_cast<uintptr_t>(target) & ~(uintptr_t(15));
-    uint32_t vector_count = (address_last >> 4) - (address >> 4) + 1;
-    if (index_format == xenos::IndexFormat::kInt32) {
-      __m128i reset_index_vector = _mm_set1_epi32(reset_index);
-      for (uint32_t i = 0; i < vector_count; ++i) {
-        __m128i indices_vector = _mm_load_si128(source_aligned_128++);
-        __m128i indices_are_reset_vector =
-            _mm_cmpeq_epi32(indices_vector, reset_index_vector);
-        _mm_store_si128(target_aligned_128++,
-                        _mm_or_si128(indices_vector, indices_are_reset_vector));
-      }
-    } else {
-      __m128i reset_index_vector = _mm_set1_epi16(reset_index);
-      for (uint32_t i = 0; i < vector_count; ++i) {
-        __m128i indices_vector = _mm_load_si128(source_aligned_128++);
-        __m128i indices_are_reset_vector =
-            _mm_cmpeq_epi16(indices_vector, reset_index_vector);
-        _mm_store_si128(target_aligned_128++,
-                        _mm_or_si128(indices_vector, indices_are_reset_vector));
-      }
-    }
-#else
-    if (index_format == xenos::IndexFormat::kInt32) {
-      for (uint32_t i = 0; i < index_count; ++i) {
-        uint32_t index = source_32[i];
-        reinterpret_cast<uint32_t*>(target)[i] =
-            index == reset_index ? 0xFFFFFFFFu : index;
-      }
-    } else {
-      for (uint32_t i = 0; i < index_count; ++i) {
-        uint16_t index = source_16[i];
-        reinterpret_cast<uint16_t*>(target)[i] =
-            index == reset_index ? 0xFFFFu : index;
-      }
-    }
-#endif  // XE_ARCH_AMD64
-  } else if (source_type == xenos::PrimitiveType::kLineLoop) {
-    if (reset_actually_used) {
-      uint32_t current_strip_index_count = 0;
-      uint32_t current_strip_first_index = 0;
-      if (index_format == xenos::IndexFormat::kInt32) {
-        uint32_t* target_32 = reinterpret_cast<uint32_t*>(target);
-        for (uint32_t i = 0; i < index_count; ++i) {
-          uint32_t index = source_32[i];
-          if (index == reset_index) {
-            if (current_strip_index_count > 2) {
-              *(target_32++) = current_strip_first_index;
-            }
-            current_strip_index_count = 0;
-            continue;
-          }
-          if (current_strip_index_count == 0) {
-            current_strip_first_index = index;
-          }
-          ++current_strip_index_count;
-          if (current_strip_index_count >= 2) {
-            if (current_strip_index_count == 2) {
-              *(target_32++) = current_strip_first_index;
-            }
-            *(target_32++) = index;
-          }
-        }
-      } else {
-        uint16_t* target_16 = reinterpret_cast<uint16_t*>(target);
-        for (uint32_t i = 0; i < index_count; ++i) {
-          uint16_t index = source_16[i];
-          if (index == reset_index) {
-            if (current_strip_index_count > 2) {
-              *(target_16++) = uint16_t(current_strip_first_index);
-            }
-            current_strip_index_count = 0;
-            continue;
-          }
-          if (current_strip_index_count == 0) {
-            current_strip_first_index = index;
-          }
-          ++current_strip_index_count;
-          if (current_strip_index_count >= 2) {
-            if (current_strip_index_count == 2) {
-              *(target_16++) = uint16_t(current_strip_first_index);
-            }
-            *(target_16++) = index;
-          }
-        }
-      }
-    } else {
-      std::memcpy(target, source, index_count * index_size);
-      if (converted_index_count > index_count) {
-        if (index_format == xenos::IndexFormat::kInt32) {
-          reinterpret_cast<uint32_t*>(target)[index_count] = source_32[0];
-        } else {
-          reinterpret_cast<uint16_t*>(target)[index_count] = source_16[0];
-        }
-      }
-    }
-  } else if (source_type == xenos::PrimitiveType::kQuadList) {
-    uint32_t quad_count = index_count >> 4;
-    if (index_format == xenos::IndexFormat::kInt32) {
-      uint32_t* target_32 = reinterpret_cast<uint32_t*>(target);
-      for (uint32_t i = 0; i < quad_count; ++i) {
-        uint32_t quad_index = i << 2;
-        *(target_32++) = source_32[quad_index];
-        *(target_32++) = source_32[quad_index + 1];
-        *(target_32++) = source_32[quad_index + 2];
-        *(target_32++) = source_32[quad_index];
-        *(target_32++) = source_32[quad_index + 2];
-        *(target_32++) = source_32[quad_index + 3];
-      }
-    } else {
-      uint16_t* target_16 = reinterpret_cast<uint16_t*>(target);
-      for (uint32_t i = 0; i < quad_count; ++i) {
-        uint32_t quad_index = i << 2;
-        *(target_16++) = source_16[quad_index];
-        *(target_16++) = source_16[quad_index + 1];
-        *(target_16++) = source_16[quad_index + 2];
-        *(target_16++) = source_16[quad_index];
-        *(target_16++) = source_16[quad_index + 2];
-        *(target_16++) = source_16[quad_index + 3];
-      }
-    }
-  }
-
-  // Cache and return the indices.
-  converted_indices.gpu_address = gpu_address;
-  converted_indices_cache_.emplace(converted_indices.key.value,
-                                   converted_indices);
-  memory_regions_used_ |= memory_regions_used_bits;
-  gpu_address_out = gpu_address;
-  index_count_out = converted_index_count;
-  return ConversionResult::kConverted;
-}
-
-void* PrimitiveConverter::AllocateIndices(
-    xenos::IndexFormat format, uint32_t count, uint32_t simd_offset,
-    D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out) {
-  if (count == 0) {
-    return nullptr;
-  }
-  uint32_t size =
-      count * (format == xenos::IndexFormat::kInt32 ? sizeof(uint32_t)
-                                                    : sizeof(uint16_t));
-  // 16-align all index data because SIMD is used to replace the reset index
-  // (without that, 4-alignment would be required anyway to mix 16-bit and
-  // 32-bit indices in one buffer page).
-  size = xe::align(size, uint32_t(16));
-  // Add some space to align SIMD register components the same way in the source
-  // and the buffer.
-  simd_offset &= 15;
-  if (simd_offset != 0) {
-    size += 16;
-  }
-  D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
-  uint8_t* mapping =
-      buffer_pool_->Request(command_processor_.GetCurrentFrame(), size, 16,
-                            nullptr, nullptr, &gpu_address);
-  if (mapping == nullptr) {
-    XELOGE("Failed to allocate space for {} converted {}-bit vertex indices",
-           count, format == xenos::IndexFormat::kInt32 ? 32 : 16);
-    return nullptr;
-  }
-  gpu_address_out = gpu_address + simd_offset;
-  return mapping + simd_offset;
-}
-
-std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryInvalidationCallback(
-    uint32_t physical_address_start, uint32_t length, bool exact_range) {
-  // 1 bit = (512 / 64) MB = 8 MB. Invalidate a region of this size.
-  uint32_t bit_index_first = physical_address_start >> 23;
-  uint32_t bit_index_last = (physical_address_start + length - 1) >> 23;
-  uint64_t bits = ~((1ull << bit_index_first) - 1);
-  if (bit_index_last < 63) {
-    bits &= (1ull << (bit_index_last + 1)) - 1;
-  }
-  memory_regions_invalidated_ |= bits;
-  return std::make_pair<uint32_t, uint32_t>(0, UINT32_MAX);
-}
-
-std::pair<uint32_t, uint32_t>
-PrimitiveConverter::MemoryInvalidationCallbackThunk(
-    void* context_ptr, uint32_t physical_address_start, uint32_t length,
-    bool exact_range) {
-  return reinterpret_cast<PrimitiveConverter*>(context_ptr)
-      ->MemoryInvalidationCallback(physical_address_start, length, exact_range);
-}
-
-D3D12_GPU_VIRTUAL_ADDRESS PrimitiveConverter::GetStaticIndexBuffer(
-    xenos::PrimitiveType source_type, uint32_t index_count,
-    uint32_t& index_count_out) const {
-  if (index_count > kMaxNonIndexedVertices) {
-    assert_always();
-    return D3D12_GPU_VIRTUAL_ADDRESS(0);
-  }
-  if (source_type == xenos::PrimitiveType::kTriangleFan) {
-    index_count_out = (std::max(index_count, uint32_t(2)) - 2) * 3;
-    return static_ib_gpu_address_ +
-           kStaticIBTriangleFanOffset * sizeof(uint16_t);
-  }
-  if (source_type == xenos::PrimitiveType::kQuadList &&
-      cvars::d3d12_convert_quads_to_triangles) {
-    index_count_out = (index_count >> 2) * 6;
-    return static_ib_gpu_address_ + kStaticIBQuadOffset * sizeof(uint16_t);
-  }
-  return D3D12_GPU_VIRTUAL_ADDRESS(0);
-}
-
-void PrimitiveConverter::InitializeTrace() {
-  // WriteMemoryRead must not be skipped.
-  converted_indices_cache_.clear();
-  memory_regions_used_ = 0;
-}
-
-}  // namespace d3d12
-}  // namespace gpu
-}  // namespace xe
--- a/src/xenia/gpu/d3d12/primitive_converter.h
+++ b/src/xenia/gpu/d3d12/primitive_converter.h
@ -1,189 +0,0 @@
-/**
- ******************************************************************************
- * Xenia : Xbox 360 Emulator Research Project                                 *
- ******************************************************************************
- * Copyright 2018 Ben Vanik. All rights reserved.                             *
- * Released under the BSD license - see LICENSE in the root for more details. *
- ******************************************************************************
- */
-
-#ifndef XENIA_GPU_D3D12_PRIMITIVE_CONVERTER_H_
-#define XENIA_GPU_D3D12_PRIMITIVE_CONVERTER_H_
-
-#include <atomic>
-#include <memory>
-#include <unordered_map>
-
-#include "xenia/gpu/register_file.h"
-#include "xenia/gpu/trace_writer.h"
-#include "xenia/gpu/xenos.h"
-#include "xenia/memory.h"
-#include "xenia/ui/d3d12/d3d12_context.h"
-#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
-
-namespace xe {
-namespace gpu {
-namespace d3d12 {
-
-class D3D12CommandProcessor;
-
-// Index buffer cache for primitive types not natively supported by Direct3D 12:
-// - Triangle and line strips with non-0xFFFF/0xFFFFFFFF reset index.
-// - Triangle fans.
-// - Line loops (only indexed ones - non-indexed are better handled in vertex
-//   shaders, otherwise a whole index buffer would have to be created for every
-//   vertex count value).
-// - Quad lists (for debugging since geometry shaders break PIX - as an
-//   alternative to the geometry shader).
-class PrimitiveConverter {
- public:
-  PrimitiveConverter(D3D12CommandProcessor& command_processor,
-                     const RegisterFile& register_file, Memory& memory,
-                     TraceWriter& trace_writer);
-  ~PrimitiveConverter();
-
-  bool Initialize();
-  void Shutdown();
-  void ClearCache();
-
-  void CompletedSubmissionUpdated();
-  void BeginSubmission();
-  void BeginFrame();
-
-  // Returns the primitive type that the original type will be converted to.
-  static xenos::PrimitiveType GetReplacementPrimitiveType(
-      xenos::PrimitiveType type);
-
-  enum class ConversionResult {
-    // Converted to a transient buffer.
-    kConverted,
-    // Conversion not required - use the index buffer in shared memory.
-    kConversionNotNeeded,
-    // No errors, but nothing to render.
-    kPrimitiveEmpty,
-    // Total failure of the draw call.
-    kFailed
-  };
-
-  // Converts an index buffer to the primitive type returned by
-  // GetReplacementPrimitiveType. If conversion has been performed, the returned
-  // buffer will be in the GENERIC_READ state (it's in an upload heap). Only
-  // writing to the outputs if returning kConverted. The restart index will be
-  // handled internally from the register values.
-  ConversionResult ConvertPrimitives(xenos::PrimitiveType source_type,
-                                     uint32_t address, uint32_t index_count,
-                                     xenos::IndexFormat index_format,
-                                     xenos::Endian index_endianness,
-                                     D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out,
-                                     uint32_t& index_count_out);
-
-  // Returns the 16-bit index buffer for drawing unsupported non-indexed
-  // primitives in INDEX_BUFFER state, for non-indexed drawing. Returns 0 if
-  // conversion is not available (can draw natively).
-  D3D12_GPU_VIRTUAL_ADDRESS GetStaticIndexBuffer(
-      xenos::PrimitiveType source_type, uint32_t index_count,
-      uint32_t& index_count_out) const;
-
-  // Callback for invalidating buffers mid-frame.
-  std::pair<uint32_t, uint32_t> MemoryInvalidationCallback(
-      uint32_t physical_address_start, uint32_t length, bool exact_range);
-
-  void InitializeTrace();
-
- private:
-  // simd_offset is source address & 15 - if SIMD is used, the source and the
-  // target must have the same alignment within one register. 0 is optimal when
-  // not using SIMD.
-  void* AllocateIndices(xenos::IndexFormat format, uint32_t count,
-                        uint32_t simd_offset,
-                        D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out);
-
-  static std::pair<uint32_t, uint32_t> MemoryInvalidationCallbackThunk(
-      void* context_ptr, uint32_t physical_address_start, uint32_t length,
-      bool exact_range);
-
-  D3D12CommandProcessor& command_processor_;
-  const RegisterFile& register_file_;
-  Memory& memory_;
-  TraceWriter& trace_writer_;
-
-  std::unique_ptr<ui::d3d12::D3D12UploadBufferPool> buffer_pool_;
-
-  // Static index buffers for emulating unsupported primitive types when drawing
-  // without an index buffer.
-  // CPU-side, used only for uploading - destroyed once the copy commands have
-  // been completed.
-  ID3D12Resource* static_ib_upload_ = nullptr;
-  uint64_t static_ib_upload_submission_;
-  // GPU-side - used for drawing.
-  ID3D12Resource* static_ib_ = nullptr;
-  D3D12_GPU_VIRTUAL_ADDRESS static_ib_gpu_address_;
-  // In PM4 draw packets, 16 bits are used for the vertex count.
-  static constexpr uint32_t kMaxNonIndexedVertices = 65535;
-  static constexpr uint32_t kStaticIBTriangleFanOffset = 0;
-  static constexpr uint32_t kStaticIBTriangleFanCount =
-      (kMaxNonIndexedVertices - 2) * 3;
-  static constexpr uint32_t kStaticIBQuadOffset =
-      kStaticIBTriangleFanOffset + kStaticIBTriangleFanCount;
-  static constexpr uint32_t kStaticIBQuadCount =
-      (kMaxNonIndexedVertices >> 2) * 6;
-  static constexpr uint32_t kStaticIBTotalCount =
-      kStaticIBQuadOffset + kStaticIBQuadCount;
-
-  // Not identifying the index buffer uniquely - reset index must also be
-  // checked if reset is enabled.
-  union ConvertedIndicesKey {
-    uint64_t value;
-    struct {
-      uint32_t address;                      // 32
-      xenos::PrimitiveType source_type : 6;  // 38
-      xenos::IndexFormat format : 1;         // 39
-      uint32_t count : 16;                   // 55
-      uint32_t reset : 1;                    // 56
-    };
-
-    // Clearing the unused bits.
-    ConvertedIndicesKey() : value(0) {}
-    ConvertedIndicesKey(const ConvertedIndicesKey& key) : value(key.value) {}
-    ConvertedIndicesKey& operator=(const ConvertedIndicesKey& key) {
-      value = key.value;
-      return *this;
-    }
-    bool operator==(const ConvertedIndicesKey& key) const {
-      return value == key.value;
-    }
-    bool operator!=(const ConvertedIndicesKey& key) const {
-      return value != key.value;
-    }
-  };
-
-  struct ConvertedIndices {
-    ConvertedIndicesKey key;
-    // If reset is enabled, this also must be checked to find cached indices.
-    uint32_t reset_index;
-
-    // Zero GPU address if conversion not needed or the resulting index buffer
-    // is empty.
-    D3D12_GPU_VIRTUAL_ADDRESS gpu_address;
-    // When conversion is not needed, this must be equal to the original index
-    // count.
-    uint32_t converted_index_count;
-  };
-
-  // Cache for a single frame.
-  std::unordered_multimap<uint64_t, ConvertedIndices> converted_indices_cache_;
-
-  // Very coarse cache invalidation - if something is modified in a 8 MB portion
-  // of the physical memory and converted indices are also there, invalidate all
-  // the cache.
-  uint64_t memory_regions_used_;
-  std::atomic<uint64_t> memory_regions_invalidated_ = 0;
-  void* memory_invalidation_callback_handle_ = nullptr;
-  uint32_t system_page_size_;
-};
-
-}  // namespace d3d12
-}  // namespace gpu
-}  // namespace xe
-
-#endif  // XENIA_GPU_D3D12_PRIMITIVE_CONVERTER_H_
--- a/src/xenia/gpu/d3d12/render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/render_target_cache.cc
--- a/src/xenia/gpu/d3d12/render_target_cache.h
+++ b/src/xenia/gpu/d3d12/render_target_cache.h
@ -1,574 +0,0 @@
-/**
- ******************************************************************************
- * Xenia : Xbox 360 Emulator Research Project                                 *
- ******************************************************************************
- * Copyright 2018 Ben Vanik. All rights reserved.                             *
- * Released under the BSD license - see LICENSE in the root for more details. *
- ******************************************************************************
- */
-
-#ifndef XENIA_GPU_D3D12_RENDER_TARGET_CACHE_H_
-#define XENIA_GPU_D3D12_RENDER_TARGET_CACHE_H_
-
-#include <memory>
-#include <unordered_map>
-
-#include "xenia/base/cvar.h"
-#include "xenia/gpu/d3d12/d3d12_shader.h"
-#include "xenia/gpu/d3d12/d3d12_shared_memory.h"
-#include "xenia/gpu/d3d12/texture_cache.h"
-#include "xenia/gpu/draw_util.h"
-#include "xenia/gpu/gpu_flags.h"
-#include "xenia/gpu/register_file.h"
-#include "xenia/gpu/trace_writer.h"
-#include "xenia/gpu/xenos.h"
-#include "xenia/memory.h"
-#include "xenia/ui/d3d12/d3d12_api.h"
-#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
-
-DECLARE_bool(d3d12_16bit_rtv_full_range);
-
-namespace xe {
-namespace gpu {
-namespace d3d12 {
-
-class D3D12CommandProcessor;
-
-// =============================================================================
-// How EDRAM is used by Xenos:
-// (Copied from the old version of the render target cache, so implementation
-//  info may differ from the way EDRAM is emulated now.)
-// =============================================================================
-//
-// On the 360 the render target is an opaque block of memory in EDRAM that's
-// only accessible via resolves. We use this to our advantage to simulate
-// something like it as best we can by having a shared backing memory with
-// a multitude of views for each tile location in EDRAM.
-//
-// This allows us to have the same base address write to the same memory
-// regardless of framebuffer format. Resolving then uses whatever format the
-// resolve requests straight from the backing memory.
-//
-// EDRAM is a beast and we only approximate it as best we can. Basically,
-// the 10MiB of EDRAM is composed of 2048 5120b tiles. Each tile is 80x16px.
-// +-----+-----+-----+---
-// |tile0|tile1|tile2|...  2048 times
-// +-----+-----+-----+---
-// Operations dealing with EDRAM deal in tile offsets, so base 0x100 is tile
-// offset 256, 256*5120=1310720b into the buffer. All rendering operations are
-// aligned to tiles so trying to draw at 256px wide will have a real width of
-// 320px by rounding up to the next tile.
-//
-// MSAA and other settings will modify the exact pixel sizes, like 4X makes
-// each tile effectively 40x8px / 2X makes each tile 80x8px, but they are still
-// all 5120b. As we try to emulate this we adjust our viewport when rendering to
-// stretch pixels as needed.
-//
-// It appears that games also take advantage of MSAA stretching tiles when doing
-// clears. Games will clear a view with 1/2X pitch/height and 4X MSAA and then
-// later draw to that view with 1X pitch/height and 1X MSAA.
-//
-// The good news is that games cannot read EDRAM directly but must use a copy
-// operation to get the data out. That gives us a chance to do whatever we
-// need to (re-tile, etc) only when requested.
-//
-// To approximate the tiled EDRAM layout we use a single large chunk of memory.
-// From this memory we create many VkImages (and VkImageViews) of various
-// formats and dimensions as requested by the game. These are used as
-// attachments during rendering and as sources during copies. They are also
-// heavily aliased - lots of images will reference the same locations in the
-// underlying EDRAM buffer. The only requirement is that there are no hazards
-// with specific tiles (reading/writing the same tile through different images)
-// and otherwise it should be ok *fingers crossed*.
-//
-// One complication is the copy/resolve process itself: we need to give back
-// the data asked for in the format desired and where it goes is arbitrary
-// (any address in physical memory). If the game is good we get resolves of
-// EDRAM into fixed base addresses with scissored regions. If the game is bad
-// we are broken.
-//
-// Resolves from EDRAM result in tiled textures - that's texture tiles, not
-// EDRAM tiles. If we wanted to ensure byte-for-byte correctness we'd need to
-// then tile the images as we wrote them out. For now, we just attempt to
-// get the (X, Y) in linear space and do that. This really comes into play
-// when multiple resolves write to the same texture or memory aliased by
-// multiple textures - which is common due to predicated tiling. The examples
-// below demonstrate what this looks like, but the important thing is that
-// we are aware of partial textures and overlapping regions.
-//
-// Example with multiple render targets:
-//   Two color targets of 256x256px tightly packed in EDRAM:
-//     color target 0: base 0x0, pitch 320, scissor 0,0, 256x256
-//       starts at tile 0, buffer offset 0
-//       contains 64 tiles (320/80)*(256/16)
-//     color target 1: base 0x40, pitch 320, scissor 256,0, 256x256
-//       starts at tile 64 (after color target 0), buffer offset 327680b
-//       contains 64 tiles
-//   In EDRAM each set of 64 tiles is contiguous:
-//     +------+------+   +------+------+------+
-//     |ct0.0 |ct0.1 |...|ct0.63|ct1.0 |ct1.1 |...
-//     +------+------+   +------+------+------+
-//   To render into these, we setup two VkImages:
-//     image 0: bound to buffer offset 0, 320x256x4=327680b
-//     image 1: bound to buffer offset 327680b, 320x256x4=327680b
-//   So when we render to them:
-//     +------+-+ scissored to 256x256, actually 320x256
-//     | .    | | <- . appears at some untiled offset in the buffer, but
-//     |      | |      consistent if aliased with the same format
-//     +------+-+
-//   In theory, this gives us proper aliasing in most cases.
-//
-// Example with horizontal predicated tiling:
-//   Trying to render 1024x576 @4X MSAA, splitting into two regions
-//   horizontally:
-//     +----------+
-//     | 1024x288 |
-//     +----------+
-//     | 1024x288 |
-//     +----------+
-//   EDRAM configured for 1056x288px with tile size 2112x576px (4X MSAA):
-//     color target 0: base 0x0, pitch 1080, 26x36 tiles
-//   First render (top):
-//     window offset 0,0
-//     scissor 0,0, 1024x288
-//   First resolve (top):
-//     RB_COPY_DEST_BASE    0x1F45D000
-//     RB_COPY_DEST_PITCH   pitch=1024, height=576
-//     vertices: 0,0, 1024,0, 1024,288
-//   Second render (bottom):
-//     window offset 0,-288
-//     scissor 0,288, 1024x288
-//   Second resolve (bottom):
-//     RB_COPY_DEST_BASE    0x1F57D000 (+1179648b)
-//     RB_COPY_DEST_PITCH   pitch=1024, height=576
-//       (exactly 1024x288*4b after first resolve)
-//     vertices: 0,288, 1024,288, 1024,576
-//   Resolving here is easy as the textures are contiguous in memory. We can
-//   snoop in the first resolve with the dest height to know the total size,
-//   and in the second resolve see that it overlaps and place it in the
-//   existing target.
-//
-// Example with vertical predicated tiling:
-//   Trying to render 1280x720 @2X MSAA, splitting into two regions
-//   vertically:
-//     +-----+-----+
-//     | 640 | 640 |
-//     |  x  |  x  |
-//     | 720 | 720 |
-//     +-----+-----+
-//   EDRAM configured for 640x736px with tile size 640x1472px (2X MSAA):
-//     color target 0: base 0x0, pitch 640, 8x92 tiles
-//   First render (left):
-//     window offset 0,0
-//     scissor 0,0, 640x720
-//   First resolve (left):
-//     RB_COPY_DEST_BASE    0x1BC6D000
-//     RB_COPY_DEST_PITCH   pitch=1280, height=720
-//     vertices: 0,0, 640,0, 640,720
-//   Second render (right):
-//     window offset -640,0
-//     scissor 640,0, 640x720
-//   Second resolve (right):
-//     RB_COPY_DEST_BASE    0x1BC81000 (+81920b)
-//     RB_COPY_DEST_PITCH   pitch=1280, height=720
-//     vertices: 640,0, 1280,0, 1280,720
-//   Resolving here is much more difficult as resolves are tiled and the right
-//   half of the texture is 81920b away:
-//     81920/4bpp=20480px, /32 (texture tile size)=640px
-//   We know the texture size with the first resolve and with the second we
-//   must check for overlap then compute the offset (in both X and Y).
-//
-// =============================================================================
-// Surface size:
-// =============================================================================
-//
-// XGSurfaceSize code in game executables calculates the size in tiles in the
-// following order:
-// 1) If MSAA is >=2x, multiply the height by 2.
-// 2) If MSAA is 4x, multiply the width by 2.
-// 3) 80x16-align multisampled width and height.
-// 4) Multiply width*height by 4 or 8 depending on the pixel format.
-// 5) Divide the byte size by 5120.
-// This means that when working with EDRAM surface sizes we should assume that a
-// multisampled surface is the same as a single-sampled surface with 2x height
-// and width - however, format size doesn't affect the dimensions. Surface pitch
-// in the surface info register is single-sampled.
-//
-// =============================================================================
-// Rasterizer-ordered view usage:
-// =============================================================================
-//
-// There is a separate output merger emulation path currently in development,
-// using rasterizer-ordered views for writing directly to the 10 MB EDRAM buffer
-// instead of the host output merger for render target output.
-//
-// The conventional method of implementing Xenos render targets via host render
-// targets has various flaws that may be impossible to fix:
-// - k_16_16 and k_16_16_16_16 have -32...32 range on Xenos, but there's no
-//   equivalent format on PC APIs. They may be emulated using snorm16 (by
-//   dividing shader color output by 32) or float32, however, blending behaves
-//   incorrectly for both. In the former case, multiplicative blending may not
-//   work correctly - 1 becomes 1/32, and instead of 1 * 1 = 1, you get
-//   1/32 * 1/32 = 1/1024. For 32-bit floats, additive blending result may go up
-//   to infinity.
-// - k_2_10_10_10_FLOAT has similar blending issues, though less prominent, when
-//   emulated via float16 render targets. In addition to a greater range for
-//   RGB (values can go up to 65504 and infinity rather than 31.875), alpha is
-//   represented totally differently - in k_2_10_10_10_FLOAT, it may have only
-//   4 values, and adding, for example, 0.1 to 0.333 will still result in 0.333,
-//   while with float16, it will be increasing, and the limit is infinity.
-// - Due to simultaneously bound host render targets being independent from each
-//   other, and because the height is unknown (and the viewport and scissor are
-//   not always present - D3DPT_RECTLIST is used very commonly, especially for
-//   clearing (Direct3D 9 Clear is implemented this way on the Xbox 360) and
-//   copying, and it's usually drawn without a viewport and with the scissor of
-//   the maximum possible size), there may be cases of simultaneously bound
-//   render targets overlapping each other in the EDRAM in a way that is
-//   difficult to resolve, and stores/loads may destroy data.
-//
-// =============================================================================
-// 2x width and height scaling implementation:
-// =============================================================================
-//
-// For ease of mapping EDRAM addresses, host pixels (top-left, top-right,
-// bottom-left, bottom-right) within EACH GUEST SAMPLE are stored consecutively,
-// this means that the address of each sample with 4x resolution enabled is 4x
-// the address of it without increased resolution - and you only need to add
-// (uint(SV_Position.y) * 2u + uint(SV_Position.x)) to the dword/qword index to
-// get each of the 4 host pixels for each sample.
-class RenderTargetCache {
- public:
-  // Direct3D 12 debug layer is giving errors that contradict each other when
-  // you use null RTV descriptors - if you set a valid format in RTVFormats in
-  // the pipeline state, it says that null descriptors can only be used if the
-  // format in the pipeline state is DXGI_FORMAT_UNKNOWN, however, if
-  // DXGI_FORMAT_UNKNOWN is set, it complains that the format in the pipeline
-  // state doesn't match the RTV format. So we have to make render target
-  // bindings consecutive and remap the output indices in pixel shaders.
-  struct PipelineRenderTarget {
-    uint32_t guest_render_target;
-    DXGI_FORMAT format;
-  };
-
-  RenderTargetCache(D3D12CommandProcessor& command_processor,
-                    const RegisterFile& register_file,
-                    TraceWriter& trace_writer, bool bindless_resources_used,
-                    bool edram_rov_used);
-  ~RenderTargetCache();
-
-  bool Initialize(const TextureCache& texture_cache);
-  void Shutdown();
-  void ClearCache();
-
-  flags::DepthFloat24Conversion depth_float24_conversion() const {
-    return depth_float24_conversion_;
-  }
-
-  void CompletedSubmissionUpdated();
-  void BeginSubmission();
-  void EndFrame();
-  // Called in the beginning of a draw call - may bind pipelines and change the
-  // view descriptor heap.
-  bool UpdateRenderTargets(const D3D12Shader* pixel_shader);
-  // Returns the host-to-guest mappings and host formats of currently bound
-  // render targets for pipeline creation and remapping in shaders. They are
-  // consecutive, and format DXGI_FORMAT_UNKNOWN terminates the list. Depth
-  // format is in the 5th render target.
-  const PipelineRenderTarget* GetCurrentPipelineRenderTargets() const {
-    return current_pipeline_render_targets_;
-  }
-
-  // Performs the resolve to a shared memory area according to the current
-  // register values, and also clears the EDRAM buffer if needed. Must be in a
-  // frame for calling.
-  bool Resolve(const Memory& memory, D3D12SharedMemory& shared_memory,
-               TextureCache& texture_cache, uint32_t& written_address_out,
-               uint32_t& written_length_out);
-
-  // Flushes the render targets to EDRAM and unbinds them, for instance, when
-  // the command processor takes over framebuffer bindings to draw something
-  // special. May change the CBV/SRV/UAV descriptor heap.
-  void FlushAndUnbindRenderTargets();
-  void WriteEdramRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
-  void WriteEdramRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
-  void WriteEdramUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
-                                       uint32_t element_size_bytes_pow2);
-  void WriteEdramUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
-                                       uint32_t element_size_bytes_pow2);
-
-  // Totally necessary to rely on the base format - Too Human switches between
-  // 2_10_10_10_FLOAT and 2_10_10_10_FLOAT_AS_16_16_16_16 every draw.
-  static xenos::ColorRenderTargetFormat GetBaseColorFormat(
-      xenos::ColorRenderTargetFormat format);
-  static DXGI_FORMAT GetColorDXGIFormat(xenos::ColorRenderTargetFormat format);
-  // Nvidia may have higher performance with 24-bit depth, AMD should have no
-  // performance difference, but with EDRAM loads/stores less conversion should
-  // be performed by the shaders if D24S8 is emulated as D24_UNORM_S8_UINT, and
-  // it's probably more accurate.
-  static DXGI_FORMAT GetDepthDXGIFormat(xenos::DepthRenderTargetFormat format) {
-    return format == xenos::DepthRenderTargetFormat::kD24FS8
-               ? DXGI_FORMAT_D32_FLOAT_S8X24_UINT
-               : DXGI_FORMAT_D24_UNORM_S8_UINT;
-  }
-
-  // Returns true if any downloads were submitted to the command processor.
-  bool InitializeTraceSubmitDownloads();
-  void InitializeTraceCompleteDownloads();
-  void RestoreEdramSnapshot(const void* snapshot);
-
- private:
-  enum class EdramLoadStoreMode {
-    kColor32bpp,
-    kColor64bpp,
-    kColor7e3,
-    kDepthUnorm,
-    kDepthFloat,
-    kDepthFloat24And32,
-
-    kCount
-  };
-
-  struct EdramLoadStoreModeInfo {
-    const void* load_shader;
-    size_t load_shader_size;
-    const WCHAR* load_pipeline_name;
-
-    const void* store_shader;
-    size_t store_shader_size;
-    const WCHAR* store_pipeline_name;
-  };
-
-  union RenderTargetKey {
-    struct {
-      // Supersampled (_ss - scaled 2x if needed) dimensions, divided by 80x16.
-      // The limit is 2560x2560 without AA, 2560x5120 with 2x AA, and 5120x5120
-      // with 4x AA, and twice as much (up to 10240x10240) with 2x resolution
-      // scaling.
-      uint32_t width_ss_div_80 : 8;    // 8
-      uint32_t height_ss_div_16 : 10;  // 18
-      uint32_t is_depth : 1;           // 19
-      uint32_t format : 4;             // 23
-    };
-    uint32_t value;
-
-    // Clearing the unused bits.
-    RenderTargetKey() : value(0) {}
-    RenderTargetKey(const RenderTargetKey& key) : value(key.value) {}
-    RenderTargetKey& operator=(const RenderTargetKey& key) {
-      value = key.value;
-      return *this;
-    }
-    bool operator==(const RenderTargetKey& key) const {
-      return value == key.value;
-    }
-    bool operator!=(const RenderTargetKey& key) const {
-      return value != key.value;
-    }
-  };
-
-  struct RenderTarget {
-    ID3D12Resource* resource;
-    D3D12_RESOURCE_STATES state;
-    D3D12_CPU_DESCRIPTOR_HANDLE handle;
-    RenderTargetKey key;
-#if 0
-    // The first 4 MB page in the heaps.
-    uint32_t heap_page_first;
-    // The number of 4 MB pages this render target uses.
-    uint32_t heap_page_count;
-#else
-    // Index of the render target when multiple render targets with the same key
-    // are bound simultaneously.
-    uint32_t instance;
-#endif
-    // Color/depth and stencil layouts.
-    D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprints[2];
-    // Buffer size needed to copy the render target to the EDRAM buffer.
-    uint32_t copy_buffer_size;
-  };
-
-  struct RenderTargetBinding {
-    // Whether this render target has been used since the last full update.
-    bool is_bound;
-    uint32_t edram_base;
-    // How many 16-pixel rows have already been drawn to the render target since
-    // the last full update.
-    uint32_t edram_dirty_rows;
-    union {
-      uint32_t format;
-      xenos::ColorRenderTargetFormat color_format;
-      xenos::DepthRenderTargetFormat depth_format;
-    };
-    RenderTarget* render_target;
-  };
-
-  uint32_t GetEdramBufferSize() const;
-
-  void TransitionEdramBuffer(D3D12_RESOURCE_STATES new_state);
-  void CommitEdramBufferUAVWrites(bool force);
-
-  void ClearBindings();
-
-#if 0
-  // Checks if the heap for the render target exists and tries to create it if
-  // it's not.
-  bool MakeHeapResident(uint32_t heap_index);
-#endif
-
-  // Creates a new RTV/DSV descriptor heap if needed to be able to allocate one
-  // descriptor in it.
-  bool EnsureRTVHeapAvailable(bool is_depth);
-
-  // Returns true if a render target with such key can be created.
-  static bool GetResourceDesc(RenderTargetKey key, D3D12_RESOURCE_DESC& desc);
-
-#if 0
-  RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key,
-                                         uint32_t heap_page_first);
-#else
-  RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key,
-                                         uint32_t instance);
-#endif
-
-  EdramLoadStoreMode GetLoadStoreMode(bool is_depth, uint32_t format) const;
-
-  // Must be in a frame to call. Stores the dirty areas of the currently bound
-  // render targets and marks them as clean.
-  void StoreRenderTargetsToEdram();
-
-  // Must be in a frame to call. Loads the render targets from the EDRAM buffer,
-  // filling all the rows the render target can hold.
-  void LoadRenderTargetsFromEdram(uint32_t render_target_count,
-                                  RenderTarget* const* render_targets,
-                                  const uint32_t* edram_bases);
-
-  D3D12CommandProcessor& command_processor_;
-  const RegisterFile& register_file_;
-  TraceWriter& trace_writer_;
-  bool bindless_resources_used_;
-  bool edram_rov_used_;
-
-  // 20e4 depth conversion mode to use for non-ROV output.
-  flags::DepthFloat24Conversion depth_float24_conversion_;
-
-  // Whether 1 guest pixel is rendered as 2x2 host pixels (currently only
-  // supported with ROV).
-  bool resolution_scale_2x_ = false;
-
-  // The EDRAM buffer allowing color and depth data to be reinterpreted.
-  ID3D12Resource* edram_buffer_ = nullptr;
-  D3D12_RESOURCE_STATES edram_buffer_state_;
-  // Whether there have been any outstanding UAV writes and a UAV barrier is
-  // needed before accessing the EDRAM buffer in an unordered way again.
-  bool edram_buffer_modified_ = false;
-
-  // Non-shader-visible descriptor heap containing pre-created SRV and UAV
-  // descriptors of the EDRAM buffer, for faster binding (via copying rather
-  // than creation).
-  enum class EdramBufferDescriptorIndex : uint32_t {
-    kRawSRV,
-    kR32UintSRV,
-    kR32G32UintSRV,
-    kR32G32B32A32UintSRV,
-    kRawUAV,
-    kR32UintUAV,
-    kR32G32B32A32UintUAV,
-
-    kCount,
-  };
-  ID3D12DescriptorHeap* edram_buffer_descriptor_heap_ = nullptr;
-  D3D12_CPU_DESCRIPTOR_HANDLE edram_buffer_descriptor_heap_start_;
-
-  // EDRAM root signatures.
-  ID3D12RootSignature* edram_load_store_root_signature_ = nullptr;
-  struct EdramLoadStoreRootConstants {
-    uint32_t rt_color_depth_offset;
-    uint32_t rt_color_depth_pitch;
-    uint32_t rt_stencil_offset;
-    uint32_t rt_stencil_pitch;
-    // 0:10 - EDRAM base in tiles.
-    // 11 - log2(vertical sample count), 0 for 1x AA, 1 for 2x/4x AA.
-    // 12 - log2(horizontal sample count), 0 for 1x/2x AA, 1 for 4x AA.
-    // 13 - whether 2x resolution scale is used.
-    // 14 - whether to apply the hack and duplicate the top/left
-    //      half-row/half-column to reduce the impact of half-pixel offset with
-    //      2x resolution scale (obsolete since the move to the new resolve
-    //      code).
-    // 15 - whether it's a depth render target.
-    // 16: - EDRAM pitch in tiles.
-    uint32_t base_samples_2x_depth_pitch;
-  };
-  // EDRAM pipelines for the RTV/DSV path.
-  static const EdramLoadStoreModeInfo
-      edram_load_store_mode_info_[size_t(EdramLoadStoreMode::kCount)];
-  ID3D12PipelineState*
-      edram_load_pipelines_[size_t(EdramLoadStoreMode::kCount)] = {};
-  // Store pipelines are not created with ROV.
-  ID3D12PipelineState*
-      edram_store_pipelines_[size_t(EdramLoadStoreMode::kCount)] = {};
-
-  // Resolve root signatures and pipelines.
-  ID3D12RootSignature* resolve_copy_root_signature_ = nullptr;
-  static const std::pair<const uint8_t*, size_t>
-      resolve_copy_shaders_[size_t(draw_util::ResolveCopyShaderIndex::kCount)];
-  ID3D12PipelineState* resolve_copy_pipelines_[size_t(
-      draw_util::ResolveCopyShaderIndex::kCount)] = {};
-  ID3D12RootSignature* resolve_clear_root_signature_ = nullptr;
-  // Clearing 32bpp color, depth with ROV, or unorm depth without ROV.
-  ID3D12PipelineState* resolve_clear_32bpp_pipeline_ = nullptr;
-  // Clearing 64bpp color.
-  ID3D12PipelineState* resolve_clear_64bpp_pipeline_ = nullptr;
-  // Clearing float depth without ROV, both the float24 and the host float32
-  // versions.
-  ID3D12PipelineState* resolve_clear_depth_24_32_pipeline_ = nullptr;
-
-  // FIXME(Triang3l): Investigate what's wrong with placed RTV/DSV aliasing on
-  // Nvidia Maxwell 1st generation and older.
-#if 0
-  // 48 MB heaps backing used render targets resources, created when needed.
-  // 24 MB proved to be not enough to store a single render target occupying the
-  // entire EDRAM - a 32-bit depth/stencil one - at some resolution.
-  // But we also need more than 32 MB to be able to resolve the entire EDRAM
-  // into a k_32_32_32_32_FLOAT texture.
-  // TODO(Triang3l): With 2x resolution scale, render targets can take 4x more
-  // memory - won't fit in this heap size. Resolution scale support was added
-  // when placed resources already have been disabled, however.
-  ID3D12Heap* heaps_[5] = {};
-  static constexpr uint32_t kHeap4MBPages = 12;
-#endif
-
-  static constexpr uint32_t kRenderTargetDescriptorHeapSize = 2048;
-  // Descriptor heap, for linear allocation of heaps and descriptors.
-  struct RenderTargetDescriptorHeap {
-    ID3D12DescriptorHeap* heap;
-    D3D12_CPU_DESCRIPTOR_HANDLE start_handle;
-    // When descriptors_used is >= kRenderTargetDescriptorHeapSize, a new heap
-    // must be allocated and linked to the one that became full now.
-    uint32_t descriptors_used;
-    RenderTargetDescriptorHeap* previous;
-  };
-  RenderTargetDescriptorHeap* descriptor_heaps_color_ = nullptr;
-  RenderTargetDescriptorHeap* descriptor_heaps_depth_ = nullptr;
-
-  std::unordered_multimap<uint32_t, RenderTarget*> render_targets_;
-
-  uint32_t current_surface_pitch_ = 0;
-  xenos::MsaaSamples current_msaa_samples_ = xenos::MsaaSamples::k1X;
-  // current_edram_max_rows_ is for RTV/DSV only (render target texture size).
-  uint32_t current_edram_max_rows_ = 0;
-  RenderTargetBinding current_bindings_[5] = {};
-  bool apply_to_command_list_ = true;
-
-  PipelineRenderTarget current_pipeline_render_targets_[5];
-
-  // For traces only.
-  ID3D12Resource* edram_snapshot_download_buffer_ = nullptr;
-  std::unique_ptr<ui::d3d12::D3D12UploadBufferPool>
-      edram_snapshot_restore_pool_;
-};
-
-}  // namespace d3d12
-}  // namespace gpu
-}  // namespace xe
-
-#endif  // XENIA_GPU_D3D12_RENDER_TARGET_CACHE_H_
--- a/src/xenia/gpu/d3d12/texture_cache.cc
+++ b/src/xenia/gpu/d3d12/texture_cache.cc
--- a/src/xenia/gpu/d3d12/texture_cache.h
+++ b/src/xenia/gpu/d3d12/texture_cache.h
@ -10,18 +10,24 @@
 #ifndef XENIA_GPU_D3D12_TEXTURE_CACHE_H_
 #define XENIA_GPU_D3D12_TEXTURE_CACHE_H_

+#include <array>
 #include <atomic>
 #include <cstring>
 #include <unordered_map>
 #include <utility>
+#include <vector>

+#include "xenia/base/assert.h"
+#include "xenia/base/hash.h"
 #include "xenia/base/mutex.h"
 #include "xenia/gpu/d3d12/d3d12_shader.h"
 #include "xenia/gpu/d3d12/d3d12_shared_memory.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/texture_info.h"
+#include "xenia/gpu/texture_util.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/ui/d3d12/d3d12_api.h"
+#include "xenia/ui/d3d12/d3d12_provider.h"

 namespace xe {
 namespace gpu {
@ -37,7 +43,7 @@ class D3D12CommandProcessor;
 // found in game executables explaining the valid usage of BaseAddress when
 // streaming the largest LOD (it says games should not use 0 as the base address
 // when the largest LOD isn't loaded, but rather, either allocate a valid
-// address for it or make it the same as MipAddress):
+// address for it or make it the same as mip_address):
 // - If the texture has a base address, but no mip address, it's not mipmapped -
 //   the host texture has only the largest level too.
 // - If the texture has different non-zero base address and mip address, a host
@ -51,77 +57,59 @@ class D3D12CommandProcessor;
 //   the mip address, a mipmapped texture is created, but min/max LOD is clamped
 //   to the lower bound of 1 - the game is expected to do that anyway until the
 //   largest LOD is loaded.
-//   TODO(Triang3l): Check if there are any games with BaseAddress==MipAddress
-//   but min or max LOD being 0, especially check Modern Warfare 2/3.
-//   TODO(Triang3l): Attach the largest LOD to existing textures with a valid
-//   MipAddress but no BaseAddress to save memory because textures are streamed
-//   this way anyway.
+// TODO(Triang3l): Attach the largest LOD to existing textures with a valid
+// mip_address but no base ever used yet (no base_address) to save memory
+// because textures are streamed this way anyway.
 class TextureCache {
-  union TextureKey {
-    struct {
-      // Physical 4 KB page with the base mip level, disregarding A/C/E address
-      // range prefix.
-      uint32_t base_page : 17;             // 17 total
-      xenos::DataDimension dimension : 2;  // 19
-      uint32_t width : 13;                 // 32
+  struct TextureKey {
+    // Physical 4 KB page with the base mip level, disregarding A/C/E address
+    // range prefix.
+    uint32_t base_page : 17;             // 17 total
+    xenos::DataDimension dimension : 2;  // 19
+    uint32_t width : 13;                 // 32

-      uint32_t height : 13;      // 45
-      uint32_t tiled : 1;        // 46
-      uint32_t packed_mips : 1;  // 47
-      // Physical 4 KB page with mip 1 and smaller.
-      uint32_t mip_page : 17;  // 64
+    uint32_t height : 13;      // 45
+    uint32_t tiled : 1;        // 46
+    uint32_t packed_mips : 1;  // 47
+    // Physical 4 KB page with mip 1 and smaller.
+    uint32_t mip_page : 17;  // 64
+
+    // Layers for stacked and 3D, 6 for cube, 1 for other dimensions.
+    uint32_t depth : 10;              // 74
+    uint32_t pitch : 9;               // 83
+    uint32_t mip_max_level : 4;       // 87
+    xenos::TextureFormat format : 6;  // 93
+    xenos::Endian endianness : 2;     // 95
+    // Whether this texture is signed and has a different host representation
+    // than an unsigned view of the same guest texture.
+    uint32_t signed_separate : 1;  // 96
+
+    // Whether this texture is a 2x-scaled resolve target.
+    uint32_t scaled_resolve : 1;  // 97

-      // Layers for stacked and 3D, 6 for cube, 1 for other dimensions.
-      uint32_t depth : 10;              // 74
-      uint32_t mip_max_level : 4;       // 78
-      xenos::TextureFormat format : 6;  // 84
-      xenos::Endian endianness : 2;     // 86
-      // Whether this texture is signed and has a different host representation
-      // than an unsigned view of the same guest texture.
-      uint32_t signed_separate : 1;  // 87
-      // Whether this texture is a 2x-scaled resolve target.
-      uint32_t scaled_resolve : 1;  // 88
-    };
-    struct {
-      // The key used for unordered_multimap lookup. Single uint32_t instead of
-      // a uint64_t so XXH hash can be calculated in a stable way due to no
-      // padding.
-      uint32_t map_key[2];
-      // The key used to identify one texture within unordered_multimap buckets.
-      uint32_t bucket_key;
-    };
    TextureKey() { MakeInvalid(); }
    TextureKey(const TextureKey& key) {
-      SetMapKey(key.GetMapKey());
-      bucket_key = key.bucket_key;
+      std::memcpy(this, &key, sizeof(*this));
    }
    TextureKey& operator=(const TextureKey& key) {
-      SetMapKey(key.GetMapKey());
-      bucket_key = key.bucket_key;
+      std::memcpy(this, &key, sizeof(*this));
      return *this;
    }
-    bool operator==(const TextureKey& key) const {
-      return GetMapKey() == key.GetMapKey() && bucket_key == key.bucket_key;
-    }
-    bool operator!=(const TextureKey& key) const {
-      return GetMapKey() != key.GetMapKey() || bucket_key != key.bucket_key;
-    }
-    uint64_t GetMapKey() const {
-      return uint64_t(map_key[0]) | (uint64_t(map_key[1]) << 32);
-    }
-    void SetMapKey(uint64_t key) {
-      map_key[0] = uint32_t(key);
-      map_key[1] = uint32_t(key >> 32);
-    }
    bool IsInvalid() const {
-      // Zero base and zero width is enough for a binding to be invalid.
-      return map_key[0] == 0;
+      // Zero size is enough for a binding to be invalid (not possible on the
+      // real GPU since dimensions minus 1 are stored).
+      return !width;
    }
    void MakeInvalid() {
-      // Reset all for a stable hash.
-      SetMapKey(0);
-      bucket_key = 0;
+      // Zero everything, including the padding, for a stable hash.
+      std::memset(this, 0, sizeof(*this));
    }
+
+    using Hasher = xe::hash::XXHasher<TextureKey>;
+    bool operator==(const TextureKey& key) const {
+      return !std::memcmp(this, &key, sizeof(*this));
+    }
+    bool operator!=(const TextureKey& key) const { return !(*this == key); }
  };

 public:
@ -168,16 +156,18 @@ class TextureCache {
  };

  TextureCache(D3D12CommandProcessor& command_processor,
-               const RegisterFile& register_file, bool bindless_resources_used,
-               D3D12SharedMemory& shared_memory);
+               const RegisterFile& register_file,
+               D3D12SharedMemory& shared_memory, bool bindless_resources_used,
+               uint32_t draw_resolution_scale);
  ~TextureCache();

-  bool Initialize(bool edram_rov_used);
+  bool Initialize();
  void Shutdown();
  void ClearCache();

  void TextureFetchConstantWritten(uint32_t index);

+  void BeginSubmission();
  void BeginFrame();
  void EndFrame();

@ -196,19 +186,29 @@ class TextureCache {
  bool AreActiveTextureSRVKeysUpToDate(
      const TextureSRVKey* keys,
      const D3D12Shader::TextureBinding* host_shader_bindings,
-      uint32_t host_shader_binding_count) const;
+      size_t host_shader_binding_count) const;
  // Exports the current binding data to texture SRV keys so they can be stored
  // for checking whether subsequent draw calls can keep using the same
  // bindings. Write host_shader_binding_count keys.
  void WriteActiveTextureSRVKeys(
      TextureSRVKey* keys,
      const D3D12Shader::TextureBinding* host_shader_bindings,
-      uint32_t host_shader_binding_count) const;
+      size_t host_shader_binding_count) const;
  // Returns the post-swizzle signedness of a currently bound texture (must be
  // called after RequestTextures).
  uint8_t GetActiveTextureSwizzledSigns(uint32_t index) const {
    return texture_bindings_[index].swizzled_signs;
  }
+  bool IsActiveTextureResolved(uint32_t index) const {
+    const TextureBinding& binding = texture_bindings_[index];
+    if (binding.texture && binding.texture->IsResolved()) {
+      return true;
+    }
+    if (binding.texture_signed && binding.texture_signed->IsResolved()) {
+      return true;
+    }
+    return false;
+  }
  void WriteActiveTextureBindfulSRV(
      const D3D12Shader::TextureBinding& host_shader_binding,
      D3D12_CPU_DESCRIPTOR_HANDLE handle);
@ -221,26 +221,37 @@ class TextureCache {
                    D3D12_CPU_DESCRIPTOR_HANDLE handle) const;

  void MarkRangeAsResolved(uint32_t start_unscaled, uint32_t length_unscaled);
-
-  bool IsResolutionScale2X() const { return scaled_resolve_buffer_ != nullptr; }
-  ID3D12Resource* GetScaledResolveBuffer() const {
-    return scaled_resolve_buffer_;
-  }
-  // Ensures the buffer tiles backing the range are resident.
-  bool EnsureScaledResolveBufferResident(uint32_t start_unscaled,
-                                         uint32_t length_unscaled);
-  void UseScaledResolveBufferForReading();
-  void UseScaledResolveBufferForWriting();
-  void MarkScaledResolveBufferUAVWritesCommitNeeded() {
-    if (scaled_resolve_buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
-      scaled_resolve_buffer_uav_writes_commit_needed_ = true;
+  static uint32_t GetMaxDrawResolutionScale(
+      const ui::d3d12::D3D12Provider& provider) {
+    // 31 because 2 GB buffers are used.
+    if (provider.GetTiledResourcesTier() < D3D12_TILED_RESOURCES_TIER_1 ||
+        provider.GetVirtualAddressBitsPerResource() < 31) {
+      return 1;
    }
+    return kMaxDrawResolutionScale;
+  }
+  uint32_t GetDrawResolutionScale() const { return draw_resolution_scale_; }
+  // Ensures the tiles backing the range in the buffers are allocated.
+  bool EnsureScaledResolveMemoryCommitted(uint32_t start_unscaled,
+                                          uint32_t length_unscaled);
+  // Makes the specified range of up to 1-2 GB currently accessible on the GPU.
+  // One draw call can access only at most one range - the same memory is
+  // accessible through different buffers based on the range needed, so aliasing
+  // barriers are required.
+  bool MakeScaledResolveRangeCurrent(uint32_t start_unscaled,
+                                     uint32_t length_unscaled);
+  // These functions create a view of the range specified in the last successful
+  // MakeScaledResolveRangeCurrent call because that function must be called
+  // before this.
+  void CreateCurrentScaledResolveRangeUintPow2SRV(
+      D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2);
+  void CreateCurrentScaledResolveRangeUintPow2UAV(
+      D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2);
+  void TransitionCurrentScaledResolveRange(D3D12_RESOURCE_STATES new_state);
+  void MarkCurrentScaledResolveRangeUAVWritesCommitNeeded() {
+    assert_true(draw_resolution_scale_ > 1);
+    GetCurrentScaledResolveBuffer().SetUAVBarrierPending();
  }
-  // Can't address more than 512 MB on Nvidia, so an offset is required.
-  void CreateScaledResolveBufferUintPow2UAV(D3D12_CPU_DESCRIPTOR_HANDLE handle,
-                                            uint32_t guest_address_bytes,
-                                            uint32_t guest_length_bytes,
-                                            uint32_t element_size_bytes_pow2);

  // Returns the ID3D12Resource of the front buffer texture (in
  // PIXEL_SHADER_RESOURCE state), or nullptr in case of failure, and writes the
@ -251,6 +262,8 @@ class TextureCache {
      xenos::TextureFormat& format_out);

 private:
+  static constexpr uint32_t kMaxDrawResolutionScale = 3;
+
  enum class LoadMode {
    k8bpb,
    k16bpb,
@ -281,7 +294,82 @@ class TextureCache {
    kUnknown = kCount
  };

-  struct LoadModeInfo {
+  struct LoadShaderInfo {
+    // Rules of data access in load shaders:
+    // - Source reading (from the shared memory or the scaled resolve buffer):
+    //   - Guest data may be stored in a sparsely-allocated buffer, or, in
+    //     Direct3D 12 terms, a tiled buffer. This means that some regions of
+    //     the buffer may not be mapped. On tiled resources tier 1 hardware,
+    //     accessing unmapped tiles results in undefined behavior, including a
+    //     GPU page fault and device removal. So, shaders must not try to access
+    //     potentially unmapped regions (that are outside the texture memory
+    //     extents calculated on the CPU, taking into account that Xenia can't
+    //     overestimate texture sizes freely since it must not try to upload
+    //     unallocated pages on the CPU).
+    //   - Buffer tiles have 64 KB size on Direct3D 12. Vulkan has its own
+    //     alignment requirements for sparse binding. But overall, we're
+    //     allocating pretty large regions.
+    //   - Resolution scaling disabled:
+    //     - Shared memory allocates regions of power of two sizes that map
+    //       directly to the same portions of the 512 MB of the console's
+    //       physical memory. So, a 64 KB-aligned host buffer region is also 64
+    //       KB-aligned in the guest address space.
+    //     - Tiled textures: 32x32x4-block tiles are always resident each as a
+    //       whole. If the width is bigger than the pitch, the overflowing
+    //       32x32x4 tiles are also loaded as entire tiles. We do not have
+    //       separate shaders for 2D and 3D. So, for tiled textures, it's safe
+    //       to consider that if any location within a 32x32-aligned portion is
+    //       within the texture bounds, the entire 32x32 portion also can be
+    //       read.
+    //     - Linear textures: Pitch is aligned to 256 bytes. Row count, however,
+    //       is not aligned to anything (unless the mip tail is being loaded).
+    //       The overflowing last row in case `width > pitch`, however, is made
+    //       resident up to the last texel in it. But row start alignment is
+    //       256, which is a power of two, and is smaller than the Direct3D 12
+    //       tile size of 64 KB. So, if any block within a 256-aligned region is
+    //       within the texture bounds, without resolution scaling, reading from
+    //       any location in that 256-aligned region is safe.
+    //     - Since we use the same shaders for tiled and linear textures (as
+    //       well as 1D textures), this means that without resolution scaling,
+    //       it's safe to access a min(256 bytes, 32 blocks)-aligned portion
+    //       along X, but only within the same row of blocks, with bounds
+    //       checking only for such portion as a whole, but without additional
+    //       bounds checking inside of it.
+    //     - Therefore, it's recommended that shaders read power-of-two amounts
+    //       of blocks (so there will naturally be some alignment to some power
+    //       of two), and this way, each thread may read at most 16 16bpb blocks
+    //       or at most 32 8bpb or smaller blocks within a single
+    //       `if (x < width)` for the whole aligned range of the same length.
+    //   - Resolution scaling enabled:
+    //     - For simplicity, unlike in the shared memory, buffer tile boundaries
+    //       are not aligned to powers of 2 the same way as guest addresses are.
+    //       While for 2x resolution scaling it still happens to be the case
+    //       because `host address = guest address << 1`, for 3x, it's not - a
+    //       64 KB host tile would represent 7281.777 guest bytes (though we
+    //       scale texels, not bytes, but that's what it would be for k_8
+    //       textures).
+    //     - The above would affect the `width > pitch` case for linear
+    //       textures, requiring overestimating the width in calculation of the
+    //       range of the tiles to map, while not doing this overestimation on
+    //       the guest memory extent calculation side (otherwise it may result
+    //       in attempting to upload unallocated memory on the CPU). For
+    //       example, let's look at an extreme case of a 369x28 k_8 texture
+    //       with pitch of 256 bytes. The last row, in guest memory, would be
+    //       loaded from the [7168, 7281) range, or, with 3x3 resolution
+    //       scaling, from bytes [64512, 65529). However, if we try to
+    //       unconditionally load 2 pixels, like the texture is 370x28, we will
+    //       be accessing the bytes [64512, 65538). But bytes 65536 and 65537
+    //       will be in another 64 KB tile, which may be not mapped yet.
+    //       However, none of this is an issue for one simple reason - resolving
+    //       is only possible to tiled textures, so linear textures will never
+    //       be resolution-scaled.
+    //     - Tiled textures have potentially referenced guest 32x32-block tiles
+    //       loaded in their entirety. So, just like for unscaled textures, if
+    //       any block within a tile is available, the entire tile is as well.
+    // - Destination writing (to the linear buffer):
+    //   - host_x_blocks_per_thread specifies how many pixels can be written
+    //     without bounds checking within increments of that amount - the pitch
+    //     of the destination buffer is manually overaligned if needed.
    const void* shader;
    size_t shader_size;
    // Log2 of the sizes, in bytes, of the source (guest) SRV and the
@ -289,11 +377,15 @@ class TextureCache {
    // may copy multiple blocks per one invocation.
    uint32_t srv_bpe_log2;
    uint32_t uav_bpe_log2;
-    // Optional shader for loading 2x-scaled resolve targets.
-    const void* shader_2x;
-    size_t shader_2x_size;
-    uint32_t srv_bpe_log2_2x;
-    uint32_t uav_bpe_log2_2x;
+    // Number of guest blocks (or texels for uncompressed) along X axis written
+    // by every compute shader thread - rows in the upload buffer are padded to
+    // at least this amount.
+    uint32_t host_x_blocks_per_thread;
+  };
+
+  struct LoadModeInfo {
+    // For different drawing resolution scales.
+    LoadShaderInfo shaders[kMaxDrawResolutionScale];
  };

  struct HostFormat {
@ -341,24 +433,19 @@ class TextureCache {
    ID3D12Resource* resource;
    uint64_t resource_size;
    D3D12_RESOURCE_STATES state;
+    // Whether the most up-to-date base / mips contain pages with data from a
+    // resolve operation (rather than from the CPU or memexport), primarily for
+    // choosing between piecewise linear gamma and sRGB when the former is
+    // emulated with the latter.
+    bool base_resolved;
+    bool mips_resolved;

    uint64_t last_usage_frame;
    uint64_t last_usage_time;
    Texture* used_previous;
    Texture* used_next;

-    // Byte size of the top guest mip level.
-    uint32_t base_size;
-    // Byte size of mips between 1 and key.mip_max_level, containing all array
-    // slices.
-    uint32_t mip_size;
-    // Offsets of all the array slices on a mip level relative to mips_address
-    // (0 for mip 0, it's relative to base_address then, and for mip 1).
-    uint32_t mip_offsets[14];
-    // Byte sizes of an array slice on each mip level.
-    uint32_t slice_sizes[14];
-    // Row pitches on each mip level (for linear layout mainly).
-    uint32_t pitches[14];
+    texture_util::TextureGuestLayout guest_layout;

    // For bindful - indices in the non-shader-visible descriptor cache for
    // copying to the shader-visible heap (much faster than recreating, which,
@ -375,6 +462,14 @@ class TextureCache {
    bool base_in_sync;
    // Whether the recent mip data has been loaded from the memory.
    bool mips_in_sync;
+
+    bool IsResolved() const { return base_resolved || mips_resolved; }
+    uint32_t GetGuestBaseSize() const {
+      return guest_layout.base.level_data_extent_bytes;
+    }
+    uint32_t GetGuestMipsSize() const {
+      return guest_layout.mips_total_extent_bytes;
+    }
  };

  struct SRVDescriptorCachePage {
@ -385,24 +480,24 @@ class TextureCache {

  struct LoadConstants {
    // vec4 0.
+    uint32_t is_tiled_3d_endian;
    // Base offset in bytes.
-    uint32_t guest_base;
-    // For linear textures - row byte pitch.
-    uint32_t guest_pitch;
-    // In blocks - and for mipmaps, it's also power-of-two-aligned.
-    uint32_t guest_storage_width_height[2];
+    uint32_t guest_offset;
+    // For tiled textures - row pitch in blocks, aligned to 32.
+    // For linear textures - row pitch in bytes.
+    uint32_t guest_pitch_aligned;
+    // For 3D textures only (ignored otherwise) - aligned to 32.
+    uint32_t guest_z_stride_block_rows_aligned;

    // vec4 1.
+    // If this is a packed mip tail, this is aligned to tile dimensions.
    uint32_t size_blocks[3];
-    uint32_t is_3d_endian;
+    // Base offset in bytes.
+    uint32_t host_offset;

    // vec4 2.
-    // Base offset in bytes.
-    uint32_t host_base;
    uint32_t host_pitch;
    uint32_t height_texels;
-
-    static constexpr uint32_t kGuestPitchTiled = UINT32_MAX;
  };

  struct TextureBinding {
@ -427,6 +522,66 @@ class TextureCache {
    }
  };

+  static uint32_t GetMaxHostTextureWidthHeight(xenos::DataDimension dimension) {
+    switch (dimension) {
+      case xenos::DataDimension::k1D:
+      case xenos::DataDimension::k2DOrStacked:
+        // 1D and 2D are emulated as 2D arrays.
+        return D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+      case xenos::DataDimension::k3D:
+        return D3D12_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+      case xenos::DataDimension::kCube:
+        return D3D12_REQ_TEXTURECUBE_DIMENSION;
+      default:
+        assert_unhandled_case(dimension);
+        return 0;
+    }
+  }
+  static uint32_t GetMaxHostTextureDepth(xenos::DataDimension dimension) {
+    switch (dimension) {
+      case xenos::DataDimension::k1D:
+      case xenos::DataDimension::k2DOrStacked:
+        // 1D and 2D are emulated as 2D arrays.
+        return D3D12_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION;
+      case xenos::DataDimension::k3D:
+        return D3D12_REQ_TEXTURE3D_U_V_OR_W_DIMENSION;
+      case xenos::DataDimension::kCube:
+        return D3D12_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION / 6 * 6;
+      default:
+        assert_unhandled_case(dimension);
+        return 0;
+    }
+  }
+
+  class ScaledResolveVirtualBuffer {
+   public:
+    ScaledResolveVirtualBuffer(ID3D12Resource* resource,
+                               D3D12_RESOURCE_STATES resource_state)
+        : resource_(resource), resource_state_(resource_state) {}
+    ID3D12Resource* resource() const { return resource_.Get(); }
+    D3D12_RESOURCE_STATES SetResourceState(D3D12_RESOURCE_STATES new_state) {
+      D3D12_RESOURCE_STATES old_state = resource_state_;
+      if (old_state == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
+        uav_barrier_pending_ = false;
+      }
+      resource_state_ = new_state;
+      return old_state;
+    }
+    // After writing through a UAV.
+    void SetUAVBarrierPending() {
+      if (resource_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
+        uav_barrier_pending_ = true;
+      }
+    }
+    // After an aliasing barrier (which is even stronger than an UAV barrier).
+    void ClearUAVBarrierPending() { uav_barrier_pending_ = false; }
+
+   private:
+    Microsoft::WRL::ComPtr<ID3D12Resource> resource_;
+    D3D12_RESOURCE_STATES resource_state_;
+    bool uav_barrier_pending_ = false;
+  };
+
  // Whether the signed version of the texture has a different representation on
  // the host than its unsigned version (for example, if it's a fixed-point
  // texture emulated with a larger host pixel format).
@ -522,6 +677,42 @@ class TextureCache {
  // an error.
  void ClearBindings();

+  size_t GetScaledResolveBufferCount() const {
+    assert_true(draw_resolution_scale_ > 1);
+    // Make sure any range up to 1 GB is accessible through 1 or 2 buffers.
+    // 2x2 scale buffers - just one 2 GB buffer for all 2 GB.
+    // 3x3 scale buffers - 4 buffers:
+    //  +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5
+    // |___________________|___________________|
+    //           |___________________|______________|
+    // Buffer N has an offset of N * 1 GB in the scaled resolve address space.
+    // The logic is:
+    // - 2 GB can be accessed through a [0 GB ... 2 GB) buffer - only need one.
+    // - 2.1 GB needs [0 GB ... 2 GB) and [1 GB ... 2.1 GB) - two buffers.
+    // - 3 GB needs [0 GB ... 2 GB) and [1 GB ... 3 GB) - two buffers.
+    // - 3.1 GB needs [0 GB ... 2 GB), [1 GB ... 3 GB) and [2 GB ... 3.1 GB) -
+    //   three buffers.
+    uint64_t address_space_size =
+        uint64_t(SharedMemory::kBufferSize) *
+        (draw_resolution_scale_ * draw_resolution_scale_);
+    return size_t((address_space_size - 1) >> 30);
+  }
+  // Returns indices of two scaled resolve virtual buffers that the location in
+  // memory may be accessible through. May be the same if it's a location near
+  // the beginning or the end of the address represented only by one buffer.
+  std::array<size_t, 2> GetPossibleScaledResolveBufferIndices(
+      uint64_t address_scaled) const {
+    assert_true(draw_resolution_scale_ > 1);
+    size_t address_gb = size_t(address_scaled >> 30);
+    size_t max_index = GetScaledResolveBufferCount() - 1;
+    // In different cases for 3x3:
+    //  +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5
+    // |12________2________|1_________2________|
+    //           |1_________2________|1_________12__|
+    return std::array<size_t, 2>{
+        std::min(address_gb, max_index),
+        std::min(std::max(address_gb, size_t(1)) - size_t(1), max_index)};
+  }
  // Checks if there are any pages that contain scaled resolve data within the
  // range.
  bool IsRangeScaledResolved(uint32_t start_unscaled, uint32_t length_unscaled);
@ -534,6 +725,18 @@ class TextureCache {
  void ScaledResolveGlobalWatchCallback(uint32_t address_first,
                                        uint32_t address_last,
                                        bool invalidated_by_gpu);
+  // The index is also the gigabyte offset of the buffer from the start of the
+  // scaled physical memory address space.
+  size_t GetCurrentScaledResolveBufferIndex() const {
+    return scaled_resolve_1gb_buffer_indices_
+        [scaled_resolve_current_range_start_scaled_ >> 30];
+  }
+  ScaledResolveVirtualBuffer& GetCurrentScaledResolveBuffer() {
+    ScaledResolveVirtualBuffer* scaled_resolve_buffer =
+        scaled_resolve_2gb_buffers_[GetCurrentScaledResolveBufferIndex()];
+    assert_not_null(scaled_resolve_buffer);
+    return *scaled_resolve_buffer;
+  }

  static const HostFormat host_formats_[64];

@ -541,16 +744,16 @@ class TextureCache {

  D3D12CommandProcessor& command_processor_;
  const RegisterFile& register_file_;
-  bool bindless_resources_used_;
  D3D12SharedMemory& shared_memory_;
+  bool bindless_resources_used_;

  static const LoadModeInfo load_mode_info_[];
  ID3D12RootSignature* load_root_signature_ = nullptr;
  ID3D12PipelineState* load_pipelines_[size_t(LoadMode::kCount)] = {};
-  // Load pipelines for 2x-scaled resolved targets.
-  ID3D12PipelineState* load_pipelines_2x_[size_t(LoadMode::kCount)] = {};
+  // Load pipelines for resolution-scaled resolve targets.
+  ID3D12PipelineState* load_pipelines_scaled_[size_t(LoadMode::kCount)] = {};

-  std::unordered_multimap<uint64_t, Texture*> textures_;
+  std::unordered_map<TextureKey, Texture*, TextureKey::Hasher> textures_;
  uint64_t textures_total_size_ = 0;
  Texture* texture_used_first_ = nullptr;
  Texture* texture_used_last_ = nullptr;
@ -592,37 +795,73 @@ class TextureCache {
  };
  uint8_t unsupported_format_features_used_[64];

-  // The 2 GB tiled buffer for resolved data with 2x resolution scale.
-  static constexpr uint32_t kScaledResolveBufferSizeLog2 = 31;
-  static constexpr uint32_t kScaledResolveBufferSize =
-      1u << kScaledResolveBufferSizeLog2;
-  ID3D12Resource* scaled_resolve_buffer_ = nullptr;
-  D3D12_RESOURCE_STATES scaled_resolve_buffer_state_ =
-      D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
-  bool scaled_resolve_buffer_uav_writes_commit_needed_ = false;
+  uint32_t draw_resolution_scale_ = 1;
+  // The tiled buffer for resolved data with resolution scaling.
+  // Because on Direct3D 12 (at least on Windows 10 2004) typed SRV or UAV
+  // creation fails for offsets above 4 GB, a single tiled 4.5 GB buffer can't
+  // be used for 3x resolution scaling.
+  // Instead, "sliding window" buffers allowing to access a single range of up
+  // to 1 GB (or up to 2 GB, depending on the low bits) at any moment are used.
+  // Parts of 4.5 GB address space can be accessed through 2 GB buffers as:
+  //  +0.0 +0.5 +1.0 +1.5 +2.0 +2.5 +3.0 +3.5 +4.0 +4.5
+  // |___________________|___________________|      or
+  //           |___________________|______________|
+  // (2 GB is also the amount of scaled physical memory with 2x resolution
+  // scale, and older Intel GPUs, while supporting tiled resources, only support
+  // 31 virtual address bits per resource).
+  // Index is first gigabyte. Only including buffers containing over 1 GB
+  // (because otherwise the data will be fully contained in another).
+  // Size is calculated the same as in GetScaledResolveBufferCount.
+  ScaledResolveVirtualBuffer*
+      scaled_resolve_2gb_buffers_[(uint64_t(SharedMemory::kBufferSize) *
+                                       (kMaxDrawResolutionScale *
+                                        kMaxDrawResolutionScale) -
+                                   1) >>
+                                  30] = {};
  // Not very big heaps (16 MB) because they are needed pretty sparsely. One
-  // scaled 1280x720x32bpp texture is slighly bigger than 14 MB.
+  // 2x-scaled 1280x720x32bpp texture is slightly bigger than 14 MB.
  static constexpr uint32_t kScaledResolveHeapSizeLog2 = 24;
  static constexpr uint32_t kScaledResolveHeapSize =
-      1 << kScaledResolveHeapSizeLog2;
+      uint32_t(1) << kScaledResolveHeapSizeLog2;
  static_assert(
      (kScaledResolveHeapSize % D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES) == 0,
      "Scaled resolve heap size must be a multiple of Direct3D tile size");
+  static_assert(
+      kScaledResolveHeapSizeLog2 <= SharedMemory::kBufferSizeLog2,
+      "Scaled resolve heaps are assumed to be wholly mappable irrespective of "
+      "resolution scale, never truncated, for example, if the scaled resolve "
+      "address space is 4.5 GB, but the heap size is 1 GB");
+  static_assert(
+      kScaledResolveHeapSizeLog2 <= 30,
+      "Scaled resolve heaps are assumed to only be wholly mappable to up to "
+      "two 2 GB buffers");
  // Resident portions of the tiled buffer.
-  ID3D12Heap* scaled_resolve_heaps_[kScaledResolveBufferSize >>
-                                    kScaledResolveHeapSizeLog2] = {};
+  std::vector<ID3D12Heap*> scaled_resolve_heaps_;
  // Number of currently resident portions of the tiled buffer, for profiling.
  uint32_t scaled_resolve_heap_count_ = 0;
  // Global watch for scaled resolve data invalidation.
  SharedMemory::GlobalWatchHandle scaled_resolve_global_watch_handle_ = nullptr;
+  // Current scaled resolve state.
+  // For aliasing barrier placement, last owning buffer index for each of 1 GB.
+  size_t
+      scaled_resolve_1gb_buffer_indices_[(uint64_t(SharedMemory::kBufferSize) *
+                                              kMaxDrawResolutionScale *
+                                              kMaxDrawResolutionScale +
+                                          ((uint32_t(1) << 30) - 1)) >>
+                                         30];
+  // Range used in the last successful MakeScaledResolveRangeCurrent call.
+  uint64_t scaled_resolve_current_range_start_scaled_;
+  uint64_t scaled_resolve_current_range_length_scaled_;

  xe::global_critical_region global_critical_region_;
  // Bit vector storing whether each 4 KB physical memory page contains scaled
-  // resolve data. uint32_t rather than uint64_t because parts of it are sent to
-  // shaders.
+  // resolve data. uint32_t rather than uint64_t because parts of it can be sent
+  // to shaders.
  uint32_t* scaled_resolve_pages_ = nullptr;
  // Second level of the bit vector for faster rejection of non-scaled textures.
-  uint64_t scaled_resolve_pages_l2_[(512 << 20) >> (12 + 5 + 6)];
+  // >> 12 for 4 KB pages, >> 5 for uint32_t level 1 bits, >> 6 for uint64_t
+  // level 2 bits.
+  uint64_t scaled_resolve_pages_l2_[SharedMemory::kBufferSize >> (12 + 5 + 6)];
 };

 }  // namespace d3d12
--- a/src/xenia/gpu/draw_util.cc
+++ b/src/xenia/gpu/draw_util.cc
--- a/src/xenia/gpu/draw_util.h
+++ b/src/xenia/gpu/draw_util.h
@ -16,6 +16,7 @@
 #include "xenia/base/assert.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
+#include "xenia/gpu/shader.h"
 #include "xenia/gpu/trace_writer.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/memory.h"
@ -33,16 +34,144 @@ namespace draw_util {
 // for use with the top-left rasterization rule later.
 int32_t FloatToD3D11Fixed16p8(float f32);

+// Polygonal primitive types (not including points and lines) are rasterized as
+// triangles, have front and back faces, and also support face culling and fill
+// modes (polymode_front_ptype, polymode_back_ptype). Other primitive types are
+// always "front" (but don't support front face and back face culling, according
+// to OpenGL and Vulkan specifications - even if glCullFace is
+// GL_FRONT_AND_BACK, points and lines are still drawn), and may in some cases
+// use the "para" registers instead of "front" or "back" (for "parallelogram" -
+// like poly_offset_para_enable).
+constexpr bool IsPrimitivePolygonal(bool vgt_output_path_is_tessellation_enable,
+                                    xenos::PrimitiveType type) {
+  if (vgt_output_path_is_tessellation_enable &&
+      (type == xenos::PrimitiveType::kTrianglePatch ||
+       type == xenos::PrimitiveType::kQuadPatch)) {
+    // For patch primitive types, the major mode is always explicit, so just
+    // checking if VGT_OUTPUT_PATH_CNTL::path_select is kTessellationEnable is
+    // enough.
+    return true;
+  }
+  switch (type) {
+    case xenos::PrimitiveType::kTriangleList:
+    case xenos::PrimitiveType::kTriangleFan:
+    case xenos::PrimitiveType::kTriangleStrip:
+    case xenos::PrimitiveType::kTriangleWithWFlags:
+    case xenos::PrimitiveType::kQuadList:
+    case xenos::PrimitiveType::kQuadStrip:
+    case xenos::PrimitiveType::kPolygon:
+      return true;
+    default:
+      break;
+  }
+  // TODO(Triang3l): Investigate how kRectangleList should be treated - possibly
+  // actually drawn as two polygons on the console, however, the current
+  // geometry shader doesn't care about the winding order - allowing backface
+  // culling for rectangles currently breaks Gears of War 2.
+  return false;
+}
+
+inline bool IsPrimitivePolygonal(const RegisterFile& regs) {
+  return IsPrimitivePolygonal(
+      regs.Get<reg::VGT_OUTPUT_PATH_CNTL>().path_select ==
+          xenos::VGTOutputPath::kTessellationEnable,
+      regs.Get<reg::VGT_DRAW_INITIATOR>().prim_type);
+}
+
+// Whether with the current state, any samples to rasterize (for any reason, not
+// only to write something to a render target, but also to do sample counting or
+// pixel shader memexport) can be generated. Finally dropping draw calls can
+// only be done if the vertex shader doesn't memexport. Checks mostly special
+// cases (for both the guest and usual host implementations), not everything
+// like whether viewport / scissor are empty (until this truly matters in any
+// game, of course).
+bool IsRasterizationPotentiallyDone(const RegisterFile& regs,
+                                    bool primitive_polygonal);
+
+// Direct3D 10.1+ standard sample positions, also used in Vulkan, for
+// calculations related to host MSAA, in 1/16th of a pixel.
+extern const int8_t kD3D10StandardSamplePositions2x[2][2];
+extern const int8_t kD3D10StandardSamplePositions4x[4][2];
+
+inline reg::RB_DEPTHCONTROL GetDepthControlForCurrentEdramMode(
+    const RegisterFile& regs) {
+  xenos::ModeControl edram_mode = regs.Get<reg::RB_MODECONTROL>().edram_mode;
+  if (edram_mode != xenos::ModeControl::kColorDepth &&
+      edram_mode != xenos::ModeControl::kDepth) {
+    // Both depth and stencil disabled (EDRAM depth and stencil ignored).
+    reg::RB_DEPTHCONTROL disabled;
+    disabled.value = 0;
+    return disabled;
+  }
+  return regs.Get<reg::RB_DEPTHCONTROL>();
+}
+
+constexpr float GetD3D10PolygonOffsetScale(
+    xenos::DepthRenderTargetFormat depth_format, bool float24_as_0_to_0_5) {
+  if (depth_format == xenos::DepthRenderTargetFormat::kD24S8) {
+    return float(1 << 24);
+  }
+  // 20 explicit + 1 implicit (1.) mantissa bits.
+  // 2^20 is not enough for Call of Duty 4 retail version's first mission F.N.G.
+  // shooting range floor (with the number 1) on Direct3D 12. Tested on Nvidia
+  // GeForce GTX 1070, the exact formula (taking into account the 0...1 to
+  // 0...0.5 remapping described below) used for testing is
+  // `int(ceil(offset * 2^20 * 0.5)) * sign(offset)`. With 2^20 * 0.5, there
+  // are various kinds of stripes dependending on the view angle in that
+  // location. With 2^21 * 0.5, the issue is not present.
+  constexpr float kFloat24Scale = float(1 << 21);
+  // 0...0.5 range may be used on the host to represent the 0...1 guest depth
+  // range to be able to copy all possible encodings, which are [0, 2), via a
+  // [0, 1] depth output variable, during EDRAM contents reinterpretation.
+  // This is done by scaling the viewport depth bounds by 0.5. However, the
+  // depth bias is applied after the viewport. This adjustment is only needed
+  // for the constant bias - for slope-scaled, the derivatives of Z are
+  // calculated after the viewport as well, and will already include the 0.5
+  // scaling from the viewport.
+  return float24_as_0_to_0_5 ? kFloat24Scale * 0.5f : kFloat24Scale;
+}
+
+inline bool DoesCoverageDependOnAlpha(reg::RB_COLORCONTROL rb_colorcontrol) {
+  return (rb_colorcontrol.alpha_test_enable &&
+          rb_colorcontrol.alpha_func != xenos::CompareFunction::kAlways) ||
+         rb_colorcontrol.alpha_to_mask_enable;
+}
+
+// Whether the pixel shader can be disabled on the host to speed up depth
+// pre-passes and shadowmaps. The shader must have its ucode analyzed. If
+// IsRasterizationPotentiallyDone, this shouldn't be called, and assumed false
+// instead. Helps reject the pixel shader in some cases - memexport draws in
+// Halo 3, and also most of some 1-point draws not covering anything done for
+// some reason in different games with a leftover pixel shader from the previous
+// draw, but with SQ_PROGRAM_CNTL destroyed, reducing the number of
+// unpredictable unneeded translations of random shaders with different host
+// modification bits, such as register count and depth format-related (though
+// shaders with side effects on depth or memory export will still be preserved).
+bool IsPixelShaderNeededWithRasterization(const Shader& shader,
+                                          const RegisterFile& regs);
+
 struct ViewportInfo {
-  // The returned viewport will always be in the positive quarter-plane for
-  // simplicity of clamping to the maximum size supported by the host, negative
-  // offset will be applied via ndc_offset.
-  float left;
-  float top;
-  float width;
-  float height;
+  // Offset from render target UV = 0 to +UV.
+  // For simplicity of cropping to the maximum size on the host; to match the
+  // Direct3D 12 clipping / scissoring behavior with a fractional viewport, to
+  // floor(TopLeftXY) ... floor(TopLeftXY + WidthHeight), on the real AMD, Intel
+  // and Nvidia hardware (not WARP); as well as to hide the differences between
+  // 0 and 8+ viewportSubPixelBits on Vulkan, and to prevent any numerical error
+  // in bound checking in host APIs, viewport bounds are returned as integers.
+  // Also they're returned as non-negative, also to make it easier to crop (so
+  // Vulkan maxViewportDimensions and viewportBoundsRange don't have to be
+  // handled separately - maxViewportDimensions is greater than or equal to the
+  // largest framebuffer image size, so it's safe, and viewportBoundsRange is
+  // always bigger than maxViewportDimensions). All fractional offsetting,
+  // including the half-pixel offset, and cropping are handled via ndc_scale and
+  // ndc_offset.
+  uint32_t xy_offset[2];
+  // Extent can be zero for an empty viewport - host APIs not supporting empty
+  // viewports need to use an empty scissor rectangle.
+  uint32_t xy_extent[2];
  float z_min;
  float z_max;
+  // The scale is applied before the offset (like using multiply-add).
  float ndc_scale[3];
  float ndc_offset[3];
 };
@ -50,19 +179,31 @@ struct ViewportInfo {
 // a viewport, plus values to multiply-add the returned position by, usable on
 // host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the
 // Direct3D clip space with 0...W Z rather than -W...W.
-void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x,
-                         float pixel_size_y, bool origin_bottom_left,
-                         float x_max, float y_max, bool allow_reverse_z,
-                         bool convert_z_to_float24,
+void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale,
+                         bool origin_bottom_left, uint32_t x_max,
+                         uint32_t y_max, bool allow_reverse_z,
+                         bool convert_z_to_float24, bool full_float24_in_0_to_1,
+                         bool pixel_shader_writes_depth,
                         ViewportInfo& viewport_info_out);

 struct Scissor {
-  uint32_t left;
-  uint32_t top;
-  uint32_t width;
-  uint32_t height;
+  // Offset from render target UV = 0 to +UV.
+  uint32_t offset[2];
+  // Extent can be zero.
+  uint32_t extent[2];
 };
-void GetScissor(const RegisterFile& regs, Scissor& scissor_out);
+void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
+                bool clamp_to_surface_pitch = true);
+
+// Scales, and shift amounts of the upper 32 bits of the 32x32=64-bit
+// multiplication result, for fast division and multiplication by
+// EDRAM-tile-related amounts.
+constexpr uint32_t kDivideScale3 = 0xAAAAAAABu;
+constexpr uint32_t kDivideUpperShift3 = 1;
+constexpr uint32_t kDivideScale5 = 0xCCCCCCCDu;
+constexpr uint32_t kDivideUpperShift5 = 2;
+constexpr uint32_t kDivideScale15 = 0x88888889u;
+constexpr uint32_t kDivideUpperShift15 = 3;

 // To avoid passing values that the shader won't understand (even though
 // Direct3D 9 shouldn't pass them anyway).
@ -75,11 +216,11 @@ xenos::CopySampleSelect SanitizeCopySampleSelect(

 union ResolveEdramPackedInfo {
  struct {
-    // With offset to the 160x32 region that local_x/y_div_8 are relative to,
-    // and with 32bpp/64bpp taken into account.
+    // With 32bpp/64bpp taken into account.
    uint32_t pitch_tiles : xenos::kEdramPitchTilesBits;
    xenos::MsaaSamples msaa_samples : xenos::kMsaaSamplesBits;
    uint32_t is_depth : 1;
+    // With offset to the 160x32 region that local_x/y_div_8 are relative to.
    uint32_t base_tiles : xenos::kEdramBaseTilesBits;
    uint32_t format : xenos::kRenderTargetFormatBits;
    uint32_t format_is_64bpp : 1;
@ -122,26 +263,58 @@ union ResolveAddressPackedInfo {
 static_assert(sizeof(ResolveAddressPackedInfo) <= sizeof(uint32_t),
              "ResolveAddressPackedInfo must be packable in uint32_t");

+// Returns tiles actually covered by a resolve area. Row length used is width of
+// the area in tiles, but the pitch between rows is edram_info.pitch_tiles.
+void GetResolveEdramTileSpan(ResolveEdramPackedInfo edram_info,
+                             ResolveAddressPackedInfo address_info,
+                             uint32_t& base_out, uint32_t& row_length_used_out,
+                             uint32_t& rows_out);
+
+union ResolveCopyDestPitchPackedInfo {
+  struct {
+    // 0...16384/32.
+    uint32_t pitch_aligned_div_32 : xenos::kTexture2DCubeMaxWidthHeightLog2 +
+                                    2 - xenos::kTextureTileWidthHeightLog2;
+    uint32_t height_aligned_div_32 : xenos::kTexture2DCubeMaxWidthHeightLog2 +
+                                     2 - xenos::kTextureTileWidthHeightLog2;
+  };
+  uint32_t packed;
+};
+static_assert(sizeof(ResolveCopyDestPitchPackedInfo) <= sizeof(uint32_t),
+              "ResolveCopyDestPitchPackedInfo must be packable in uint32_t");
+
 // For backends with Shader Model 5-like compute, host shaders to use to perform
 // copying in resolve operations.
 enum class ResolveCopyShaderIndex {
  kFast32bpp1x2xMSAA,
  kFast32bpp4xMSAA,
  kFast32bpp2xRes,
+  kFast32bpp3xRes1x2xMSAA,
+  kFast32bpp3xRes4xMSAA,
  kFast64bpp1x2xMSAA,
  kFast64bpp4xMSAA,
  kFast64bpp2xRes,
+  kFast64bpp3xRes,

  kFull8bpp,
  kFull8bpp2xRes,
+  kFull8bpp3xRes,
  kFull16bpp,
  kFull16bpp2xRes,
+  kFull16bppFrom32bpp3xRes,
+  kFull16bppFrom64bpp3xRes,
  kFull32bpp,
  kFull32bpp2xRes,
+  kFull32bppFrom32bpp3xRes,
+  kFull32bppFrom64bpp3xRes,
  kFull64bpp,
  kFull64bpp2xRes,
+  kFull64bppFrom32bpp3xRes,
+  kFull64bppFrom64bpp3xRes,
  kFull128bpp,
  kFull128bpp2xRes,
+  kFull128bppFrom32bpp3xRes,
+  kFull128bppFrom64bpp3xRes,

  kCount,
  kUnknown = kCount,
@ -182,7 +355,7 @@ struct ResolveCopyShaderConstants {
    ResolveEdramPackedInfo edram_info;
    ResolveAddressPackedInfo address_info;
    reg::RB_COPY_DEST_INFO dest_info;
-    reg::RB_COPY_DEST_PITCH dest_pitch;
+    ResolveCopyDestPitchPackedInfo dest_pitch_aligned;
  };
  DestRelative dest_relative;
  uint32_t dest_base;
@ -202,15 +375,23 @@ struct ResolveClearShaderConstants {
 struct ResolveInfo {
  reg::RB_COPY_CONTROL rb_copy_control;

-  // color_edram_info and depth_edram_info are set up if copying or clearing
-  // color and depth respectively, according to RB_COPY_CONTROL.
-  ResolveEdramPackedInfo color_edram_info;
+  // depth_edram_info / depth_original_base and color_edram_info /
+  // color_original_base are set up if copying or clearing depth and color
+  // respectively, according to RB_COPY_CONTROL.
  ResolveEdramPackedInfo depth_edram_info;
+  ResolveEdramPackedInfo color_edram_info;
+  // Original bases, without adjustment to a 160x32 region for packed offsets,
+  // for locating host render targets to perform clears if host render targets
+  // are used for EDRAM emulation - the same as the base that the render target
+  // will likely use for drawing next, to prevent unneeded tile ownership
+  // transfers between clears and first usage if clearing a subregion.
+  uint32_t depth_original_base;
+  uint32_t color_original_base;

  ResolveAddressPackedInfo address;

-  reg::RB_COPY_DEST_INFO rb_copy_dest_info;
-  reg::RB_COPY_DEST_PITCH rb_copy_dest_pitch;
+  reg::RB_COPY_DEST_INFO copy_dest_info;
+  ResolveCopyDestPitchPackedInfo copy_dest_pitch_aligned;

  // Memory range that will potentially be modified by copying, with
  // address.local_x/y_div_8 & 31 being the origin relative to it.
@ -228,6 +409,16 @@ struct ResolveInfo {
    return rb_copy_control.copy_src_select >= xenos::kMaxColorRenderTargets;
  }

+  // See GetResolveEdramTileSpan documentation for explanation.
+  void GetCopyEdramTileSpan(uint32_t& base_out, uint32_t& row_length_used_out,
+                            uint32_t& rows_out, uint32_t& pitch_out) const {
+    ResolveEdramPackedInfo edram_info =
+        IsCopyingDepth() ? depth_edram_info : color_edram_info;
+    GetResolveEdramTileSpan(edram_info, address, base_out, row_length_used_out,
+                            rows_out);
+    pitch_out = edram_info.pitch_tiles;
+  }
+
  ResolveCopyShaderIndex GetCopyShader(
      uint32_t resolution_scale, ResolveCopyShaderConstants& constants_out,
      uint32_t& group_count_x_out, uint32_t& group_count_y_out) const;
@ -241,23 +432,10 @@ struct ResolveInfo {
  }

  void GetDepthClearShaderConstants(
-      bool has_float32_copy, ResolveClearShaderConstants& constants_out) const {
+      ResolveClearShaderConstants& constants_out) const {
    assert_true(IsClearingDepth());
    constants_out.rt_specific.clear_value[0] = rb_depth_clear;
-    if (has_float32_copy) {
-      float depth32;
-      uint32_t depth24 = rb_depth_clear >> 8;
-      if (xenos::DepthRenderTargetFormat(depth_edram_info.format) ==
-          xenos::DepthRenderTargetFormat::kD24S8) {
-        depth32 = depth24 * float(1.0f / 16777215.0f);
-      } else {
-        depth32 = xenos::Float20e4To32(depth24);
-      }
-      constants_out.rt_specific.clear_value[1] =
-          *reinterpret_cast<const uint32_t*>(&depth32);
-    } else {
-      constants_out.rt_specific.clear_value[1] = rb_depth_clear;
-    }
+    constants_out.rt_specific.clear_value[1] = rb_depth_clear;
    constants_out.rt_specific.edram_info = depth_edram_info;
    constants_out.address_info = address;
  }
@ -266,9 +444,8 @@ struct ResolveInfo {
      ResolveClearShaderConstants& constants_out) const {
    assert_true(IsClearingColor());
    // Not doing -32...32 to -1...1 clamping here as a hack for k_16_16 and
-    // k_16_16_16_16 blending emulation when using traditional host render
-    // targets as it would be inconsistent with the usual way of clearing with a
-    // quad.
+    // k_16_16_16_16 blending emulation when using host render targets as it
+    // would be inconsistent with the usual way of clearing with a depth quad.
    // TODO(Triang3l): Check which 32-bit portion is in which register.
    constants_out.rt_specific.clear_value[0] = rb_color_clear;
    constants_out.rt_specific.clear_value[1] = rb_color_clear_lo;
@ -295,13 +472,14 @@ struct ResolveInfo {
 };

 // Returns false if there was an error obtaining the info making it totally
-// invalid. edram_16_as_minus_1_to_1 is false if 16_16 and 16_16_16_16 color
-// render target formats are properly emulated as -32...32, true if emulated as
-// snorm, with range limited to -1...1, but with correct blending within that
-// range.
+// invalid. fixed_16_truncated_to_minus_1_to_1 is false if 16_16 and 16_16_16_16
+// color render target formats are properly emulated as -32...32, true if
+// emulated as snorm, with range limited to -1...1, but with correct blending
+// within that range.
 bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
                    TraceWriter& trace_writer, uint32_t resolution_scale,
-                    bool edram_16_as_minus_1_to_1, ResolveInfo& info_out);
+                    bool fixed_16_truncated_to_minus_1_to_1,
+                    ResolveInfo& info_out);

 // Taking user configuration - stretching or letterboxing, overscan region to
 // crop to fill while maintaining the aspect ratio - into account, returns the
--- a/src/xenia/gpu/dxbc.h
+++ b/src/xenia/gpu/dxbc.h
--- a/src/xenia/gpu/dxbc_shader.cc
+++ b/src/xenia/gpu/dxbc_shader.cc
@ -19,7 +19,7 @@ DxbcShader::DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash,
    : Shader(shader_type, data_hash, dword_ptr, dword_count) {}

 Shader::Translation* DxbcShader::CreateTranslationInstance(
-    uint32_t modification) {
+    uint64_t modification) {
  return new DxbcTranslation(*this, modification);
 }

--- a/src/xenia/gpu/dxbc_shader.h
+++ b/src/xenia/gpu/dxbc_shader.h
@ -10,6 +10,7 @@
 #ifndef XENIA_GPU_DXBC_SHADER_H_
 #define XENIA_GPU_DXBC_SHADER_H_

+#include <atomic>
 #include <vector>

 #include "xenia/gpu/dxbc_shader_translator.h"
@ -23,13 +24,17 @@ class DxbcShader : public Shader {
 public:
  class DxbcTranslation : public Translation {
   public:
-    DxbcTranslation(DxbcShader& shader, uint32_t modification)
+    DxbcTranslation(DxbcShader& shader, uint64_t modification)
        : Translation(shader, modification) {}
  };

  DxbcShader(xenos::ShaderType shader_type, uint64_t data_hash,
             const uint32_t* dword_ptr, uint32_t dword_count);

+  // Resource bindings are gathered after the successful translation of any
+  // modification for simplicity of translation (and they don't depend on
+  // modification bits).
+
  static constexpr uint32_t kMaxTextureBindingIndexBits =
      DxbcShaderTranslator::kMaxTextureBindingIndexBits;
  static constexpr uint32_t kMaxTextureBindings =
@ -43,11 +48,13 @@ class DxbcShader : public Shader {
    bool is_signed;
  };
  // Safe to hash and compare with memcmp for layout hashing.
-  const TextureBinding* GetTextureBindings(uint32_t& count_out) const {
-    count_out = uint32_t(texture_bindings_.size());
-    return texture_bindings_.data();
+  const std::vector<TextureBinding>& GetTextureBindingsAfterTranslation()
+      const {
+    return texture_bindings_;
+  }
+  const uint32_t GetUsedTextureMaskAfterTranslation() const {
+    return used_texture_mask_;
  }
-  const uint32_t GetUsedTextureMask() const { return used_texture_mask_; }

  static constexpr uint32_t kMaxSamplerBindingIndexBits =
      DxbcShaderTranslator::kMaxSamplerBindingIndexBits;
@ -61,17 +68,18 @@ class DxbcShader : public Shader {
    xenos::TextureFilter mip_filter;
    xenos::AnisoFilter aniso_filter;
  };
-  const SamplerBinding* GetSamplerBindings(uint32_t& count_out) const {
-    count_out = uint32_t(sampler_bindings_.size());
-    return sampler_bindings_.data();
+  const std::vector<SamplerBinding>& GetSamplerBindingsAfterTranslation()
+      const {
+    return sampler_bindings_;
  }

 protected:
-  Translation* CreateTranslationInstance(uint32_t modification) override;
+  Translation* CreateTranslationInstance(uint64_t modification) override;

 private:
  friend class DxbcShaderTranslator;

+  std::atomic_flag bindings_setup_entered_ = ATOMIC_FLAG_INIT;
  std::vector<TextureBinding> texture_bindings_;
  std::vector<SamplerBinding> sampler_bindings_;
  uint32_t used_texture_mask_ = 0;
--- a/src/xenia/gpu/dxbc_shader_translator.cc
+++ b/src/xenia/gpu/dxbc_shader_translator.cc
--- a/src/xenia/gpu/dxbc_shader_translator.h
+++ b/src/xenia/gpu/dxbc_shader_translator.h
--- a/src/xenia/gpu/dxbc_shader_translator_alu.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_alu.cc
--- a/src/xenia/gpu/dxbc_shader_translator_fetch.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_fetch.cc
--- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc
@ -7,6 +7,8 @@
 ******************************************************************************
 */

+#include "xenia/base/assert.h"
+#include "xenia/gpu/draw_util.h"
 #include "xenia/gpu/dxbc_shader_translator.h"

 namespace xe {
@ -15,7 +17,7 @@ using namespace ucode;

 void DxbcShaderTranslator::ExportToMemory_PackFixed32(
    const uint32_t* eM_temps, uint32_t eM_count, const uint32_t bits[4],
-    const DxbcSrc& is_integer, const DxbcSrc& is_signed) {
+    const dxbc::Src& is_integer, const dxbc::Src& is_signed) {
  // Will insert with BFI - sign extension of red will be overwritten, not
  // truncated.
  assert_not_zero(bits[0]);
@ -26,64 +28,64 @@ void DxbcShaderTranslator::ExportToMemory_PackFixed32(
      mask |= 1 << i;
    }
  }
-  DxbcOpIf(true, is_signed);
+  a_.OpIf(true, is_signed);
  {
    float range[4];
    for (uint32_t i = 0; i < 4; ++i) {
      range[i] = bits[i] ? float((uint32_t(1) << (bits[i] - 1)) - 1) : 0.0f;
    }
-    DxbcSrc range_src(DxbcSrc::LP(range));
-    DxbcOpIf(false, is_integer);
+    dxbc::Src range_src(dxbc::Src::LP(range));
+    a_.OpIf(false, is_integer);
    for (uint32_t i = 0; i < eM_count; ++i) {
      uint32_t eM_temp = eM_temps[i];
-      DxbcOpMul(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp), range_src);
+      a_.OpMul(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp), range_src);
    }
-    DxbcOpEndIf();
+    a_.OpEndIf();
    for (uint32_t i = 0; i < eM_count; ++i) {
-      DxbcDest eM_dest(DxbcDest::R(eM_temps[i], mask));
-      DxbcSrc eM_src(DxbcSrc::R(eM_temps[i]));
+      dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[i], mask));
+      dxbc::Src eM_src(dxbc::Src::R(eM_temps[i]));
      // TODO(Triang3l): NaN should become zero, not -range.
-      DxbcOpMax(eM_dest, eM_src, -range_src);
-      DxbcOpMin(eM_dest, eM_src, range_src);
+      a_.OpMax(eM_dest, eM_src, -range_src);
+      a_.OpMin(eM_dest, eM_src, range_src);
    }
  }
-  DxbcOpElse();
+  a_.OpElse();
  {
    float range[4];
    for (uint32_t i = 0; i < 4; ++i) {
      range[i] = float((uint32_t(1) << bits[i]) - 1);
    }
-    DxbcSrc range_src(DxbcSrc::LP(range));
-    DxbcOpIf(false, is_integer);
+    dxbc::Src range_src(dxbc::Src::LP(range));
+    a_.OpIf(false, is_integer);
    for (uint32_t i = 0; i < eM_count; ++i) {
      uint32_t eM_temp = eM_temps[i];
-      DxbcOpMul(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp), range_src);
+      a_.OpMul(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp), range_src);
    }
-    DxbcOpEndIf();
+    a_.OpEndIf();
    for (uint32_t i = 0; i < eM_count; ++i) {
-      DxbcDest eM_dest(DxbcDest::R(eM_temps[i], mask));
-      DxbcSrc eM_src(DxbcSrc::R(eM_temps[i]));
-      DxbcOpMax(eM_dest, eM_src, DxbcSrc::LF(0.0f));
-      DxbcOpMin(eM_dest, eM_src, range_src);
+      dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[i], mask));
+      dxbc::Src eM_src(dxbc::Src::R(eM_temps[i]));
+      a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(0.0f));
+      a_.OpMin(eM_dest, eM_src, range_src);
    }
  }
-  DxbcOpEndIf();
+  a_.OpEndIf();
  for (uint32_t i = 0; i < eM_count; ++i) {
    uint32_t eM_temp = eM_temps[i];
    // Round to the nearest integer, according to the rules of handling integer
    // formats in Direct3D.
    // TODO(Triang3l): Round by adding +-0.5, not with round_ne.
-    DxbcOpRoundNE(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp));
-    DxbcOpFToI(DxbcDest::R(eM_temp, mask), DxbcSrc::R(eM_temp));
-    DxbcDest eM_packed_dest(DxbcDest::R(eM_temp, 0b0001));
-    DxbcSrc eM_packed_src(DxbcSrc::R(eM_temp, DxbcSrc::kXXXX));
+    a_.OpRoundNE(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp));
+    a_.OpFToI(dxbc::Dest::R(eM_temp, mask), dxbc::Src::R(eM_temp));
+    dxbc::Dest eM_packed_dest(dxbc::Dest::R(eM_temp, 0b0001));
+    dxbc::Src eM_packed_src(dxbc::Src::R(eM_temp, dxbc::Src::kXXXX));
    uint32_t offset = bits[0];
    for (uint32_t j = 1; j < 4; ++j) {
      if (!bits[j]) {
        continue;
      }
-      DxbcOpBFI(eM_packed_dest, DxbcSrc::LU(bits[j]), DxbcSrc::LU(offset),
-                DxbcSrc::R(eM_temp).Select(j), eM_packed_src);
+      a_.OpBFI(eM_packed_dest, dxbc::Src::LU(bits[j]), dxbc::Src::LU(offset),
+               dxbc::Src::R(eM_temp).Select(j), eM_packed_src);
      offset += bits[j];
    }
  }
@ -99,44 +101,94 @@ void DxbcShaderTranslator::ExportToMemory() {
  uint32_t control_temp = PushSystemTemp();

  // Safety check if the shared memory is bound as UAV.
-  system_constants_used_ |= 1ull << kSysConst_Flags_Index;
-  DxbcOpAnd(DxbcDest::R(control_temp, 0b0001),
-            DxbcSrc::CB(cbuffer_index_system_constants_,
-                        uint32_t(CbufferRegister::kSystemConstants),
-                        kSysConst_Flags_Vec)
-                .Select(kSysConst_Flags_Comp),
-            DxbcSrc::LU(kSysFlag_SharedMemoryIsUAV));
+  a_.OpUBFE(dxbc::Dest::R(control_temp, 0b0001), dxbc::Src::LU(1),
+            dxbc::Src::LU(kSysFlag_SharedMemoryIsUAV_Shift),
+            LoadFlagsSystemConstant());
+  // Open the `if` with the uniform condition for the shared memory buffer being
+  // bound as a UAV (more fine-grained checks are vector and likely divergent).
+  a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX));
+
+  // Check more fine-grained limitations.
+  // The flag in control_temp.x can be 0 or 1 for simplicity, not necessarily
+  // 0 or 0xFFFFFFFF.
+  bool inner_condition_provided = false;
  if (is_pixel_shader()) {
-    // Disable memexport in pixel shaders with supersampling since VPOS is
-    // ambiguous.
-    if (edram_rov_used_) {
-      system_constants_used_ |= 1ull
-                                << kSysConst_EdramResolutionSquareScale_Index;
-      DxbcOpULT(DxbcDest::R(control_temp, 0b0010),
-                DxbcSrc::CB(cbuffer_index_system_constants_,
-                            uint32_t(CbufferRegister::kSystemConstants),
-                            kSysConst_EdramResolutionSquareScale_Vec)
-                    .Select(kSysConst_EdramResolutionSquareScale_Comp),
-                DxbcSrc::LU(2));
-      DxbcOpAnd(DxbcDest::R(control_temp, 0b0001),
-                DxbcSrc::R(control_temp, DxbcSrc::kXXXX),
-                DxbcSrc::R(control_temp, DxbcSrc::kYYYY));
-    } else {
-      // Enough to check just Y because it's scaled for both 2x and 4x.
-      system_constants_used_ |= 1ull << kSysConst_SampleCountLog2_Index;
-      DxbcOpMovC(DxbcDest::R(control_temp, 0b0001),
-                 DxbcSrc::CB(cbuffer_index_system_constants_,
-                             uint32_t(CbufferRegister::kSystemConstants),
-                             kSysConst_SampleCountLog2_Vec)
-                     .Select(kSysConst_SampleCountLog2_Comp + 1),
-                 DxbcSrc::LU(0), DxbcSrc::R(control_temp, DxbcSrc::kXXXX));
+    if (draw_resolution_scale_ > 1) {
+      // Only do memexport for one host pixel in a guest pixel.
+      // For 2x - (1, 1) because it's covered with half-pixel offset that
+      // becomes full-pixel.
+      // For 3x - also (1, 1) because it's still covered with half-pixel offset,
+      // but close to the center.
+      in_position_used_ |= 0b0011;
+      a_.OpFToU(
+          dxbc::Dest::R(control_temp, 0b0110),
+          dxbc::Src::V(uint32_t(InOutRegister::kPSInPosition), 0b0100 << 2));
+      switch (draw_resolution_scale_) {
+        case 2:
+          a_.OpAnd(dxbc::Dest::R(control_temp, 0b0110),
+                   dxbc::Src::R(control_temp), dxbc::Src::LU(1));
+          // No need to do IEq - already 1 for right / bottom, 0 for left / top.
+          break;
+        case 3:
+          // xy % 3 == 1.
+          for (uint32_t i = 1; i <= 2; ++i) {
+            a_.OpUMul(dxbc::Dest::R(control_temp, 0b1000), dxbc::Dest::Null(),
+                      dxbc::Src::R(control_temp).Select(i),
+                      dxbc::Src::LU(draw_util::kDivideScale3));
+            a_.OpUShR(dxbc::Dest::R(control_temp, 0b1000),
+                      dxbc::Src::R(control_temp, dxbc::Src::kWWWW),
+                      dxbc::Src::LU(draw_util::kDivideUpperShift3));
+            a_.OpIMAd(dxbc::Dest::R(control_temp, 1 << i),
+                      dxbc::Src::R(control_temp, dxbc::Src::kWWWW),
+                      dxbc::Src::LI(-3), dxbc::Src::R(control_temp).Select(i));
+          }
+          a_.OpIEq(dxbc::Dest::R(control_temp, 0b0110),
+                   dxbc::Src::R(control_temp), dxbc::Src::LU(1));
+          break;
+        default:
+          assert_unhandled_case(draw_resolution_scale_);
+      }
+      a_.OpAnd(dxbc::Dest::R(control_temp,
+                             inner_condition_provided ? 0b0010 : 0b0001),
+               dxbc::Src::R(control_temp, dxbc::Src::kYYYY),
+               dxbc::Src::R(control_temp, dxbc::Src::kZZZZ));
+      if (inner_condition_provided) {
+        // Merge with the previous condition in control_temp.x.
+        a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001),
+                 dxbc::Src::R(control_temp, dxbc::Src::kXXXX),
+                 dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
+      }
+      inner_condition_provided = true;
+    }
+    // With sample-rate shading (with float24 conversion), only do memexport
+    // from one sample (as the shader is invoked multiple times for a pixel),
+    // if SV_SampleIndex == firstbit_lo(SV_Coverage). For zero coverage,
+    // firstbit_lo returns 0xFFFFFFFF.
+    if (IsSampleRate()) {
+      a_.OpFirstBitLo(dxbc::Dest::R(control_temp, 0b0010),
+                      dxbc::Src::VCoverage());
+      a_.OpIEq(
+          dxbc::Dest::R(control_temp,
+                        inner_condition_provided ? 0b0010 : 0b0001),
+          dxbc::Src::V(uint32_t(InOutRegister::kPSInFrontFaceAndSampleIndex),
+                       dxbc::Src::kYYYY),
+          dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
+      if (inner_condition_provided) {
+        // Merge with the previous condition in control_temp.x.
+        a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001),
+                 dxbc::Src::R(control_temp, dxbc::Src::kXXXX),
+                 dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
+      }
+      inner_condition_provided = true;
    }
  }
-  // Check if memexport can be done.
-  DxbcOpIf(true, DxbcSrc::R(control_temp, DxbcSrc::kXXXX));
+  // Open the inner (vector) conditional if needed.
+  if (inner_condition_provided) {
+    a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX));
+  }
  // control_temp.x is now free.

-  for (uint32_t i = 0; i < kMaxMemExports; ++i) {
+  for (uint32_t i = 0; i < Shader::kMaxMemExports; ++i) {
    uint32_t eA_temp = system_temps_memexport_address_[i];
    if (eA_temp == UINT32_MAX) {
      // Export not used.
@ -160,21 +212,21 @@ void DxbcShaderTranslator::ExportToMemory() {
    }

    // Swap red and blue if needed.
-    DxbcOpAnd(DxbcDest::R(control_temp, 0b0001),
-              DxbcSrc::R(eA_temp, DxbcSrc::kZZZZ),
-              DxbcSrc::LU(uint32_t(1) << 19));
+    a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001),
+             dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ),
+             dxbc::Src::LU(uint32_t(1) << 19));
    for (uint32_t j = 0; j < eM_count; ++j) {
      uint32_t eM_temp = eM_temps[j];
-      DxbcOpMovC(DxbcDest::R(eM_temp, 0b0101),
-                 DxbcSrc::R(control_temp, DxbcSrc::kXXXX),
-                 DxbcSrc::R(eM_temp, 0b000010), DxbcSrc::R(eM_temp));
+      a_.OpMovC(dxbc::Dest::R(eM_temp, 0b0101),
+                dxbc::Src::R(control_temp, dxbc::Src::kXXXX),
+                dxbc::Src::R(eM_temp, 0b000010), dxbc::Src::R(eM_temp));
    }

    // Initialize element size in control_temp.x to 4 bytes as this is the most
    // common size.
-    DxbcDest element_size_dest(DxbcDest::R(control_temp, 0b0001));
-    DxbcSrc element_size_src(DxbcSrc::R(control_temp, DxbcSrc::kXXXX));
-    DxbcOpMov(element_size_dest, DxbcSrc::LU(4));
+    dxbc::Dest element_size_dest(dxbc::Dest::R(control_temp, 0b0001));
+    dxbc::Src element_size_src(dxbc::Src::R(control_temp, dxbc::Src::kXXXX));
+    a_.OpMov(element_size_dest, dxbc::Src::LU(4));

    // Each eM should get a packed value in the destination format now.

@ -182,285 +234,293 @@ void DxbcShaderTranslator::ExportToMemory() {
    // Y - signedness if fixed-point.
    // Z - fractional/integer if fixed-point.
    // W - color format.
-    DxbcOpUBFE(DxbcDest::R(control_temp, 0b1110), DxbcSrc::LU(0, 1, 1, 6),
-               DxbcSrc::LU(0, 16, 17, 8), DxbcSrc::R(eA_temp, DxbcSrc::kZZZZ));
-    DxbcSrc is_signed(DxbcSrc::R(control_temp, DxbcSrc::kYYYY));
-    DxbcSrc is_integer(DxbcSrc::R(control_temp, DxbcSrc::kZZZZ));
+    a_.OpUBFE(dxbc::Dest::R(control_temp, 0b1110), dxbc::Src::LU(0, 1, 1, 6),
+              dxbc::Src::LU(0, 16, 17, 8),
+              dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ));
+    dxbc::Src is_signed(dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
+    dxbc::Src is_integer(dxbc::Src::R(control_temp, dxbc::Src::kZZZZ));
    // Convert and pack the format.
-    DxbcOpSwitch(DxbcSrc::R(control_temp, DxbcSrc::kWWWW));
+    a_.OpSwitch(dxbc::Src::R(control_temp, dxbc::Src::kWWWW));
    // control_temp.w is now free.
    {
      // k_8_8_8_8
      // k_8_8_8_8_AS_16_16_16_16
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8)));
-      DxbcOpCase(
-          DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16)));
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_8_8_8_8)));
+      a_.OpCase(dxbc::Src::LU(
+          uint32_t(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16)));
      {
        uint32_t bits[4] = {8, 8, 8, 8};
        ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer,
                                   is_signed);
      }
-      DxbcOpBreak();
+      a_.OpBreak();

      // k_2_10_10_10
      // k_2_10_10_10_AS_16_16_16_16
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_2_10_10_10)));
-      DxbcOpCase(DxbcSrc::LU(
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_2_10_10_10)));
+      a_.OpCase(dxbc::Src::LU(
          uint32_t(xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16)));
      {
        uint32_t bits[4] = {10, 10, 10, 2};
        ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer,
                                   is_signed);
      }
-      DxbcOpBreak();
+      a_.OpBreak();

      // k_10_11_11
      // k_10_11_11_AS_16_16_16_16
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_10_11_11)));
-      DxbcOpCase(
-          DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16)));
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_10_11_11)));
+      a_.OpCase(dxbc::Src::LU(
+          uint32_t(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16)));
      {
        uint32_t bits[4] = {11, 11, 10};
        ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer,
                                   is_signed);
      }
-      DxbcOpBreak();
+      a_.OpBreak();

      // k_11_11_10
      // k_11_11_10_AS_16_16_16_16
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_11_11_10)));
-      DxbcOpCase(
-          DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16)));
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_11_11_10)));
+      a_.OpCase(dxbc::Src::LU(
+          uint32_t(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16)));
      {
        uint32_t bits[4] = {10, 11, 11};
        ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer,
                                   is_signed);
      }
-      DxbcOpBreak();
+      a_.OpBreak();

      // k_16_16
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_16_16)));
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16)));
      {
        uint32_t bits[4] = {16, 16};
        ExportToMemory_PackFixed32(eM_temps, eM_count, bits, is_integer,
                                   is_signed);
      }
-      DxbcOpBreak();
+      a_.OpBreak();

      // k_16_16_16_16
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16)));
-      DxbcOpMov(element_size_dest, DxbcSrc::LU(8));
-      DxbcOpIf(true, is_signed);
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16)));
+      a_.OpMov(element_size_dest, dxbc::Src::LU(8));
+      a_.OpIf(true, is_signed);
      {
-        DxbcOpIf(false, is_integer);
+        a_.OpIf(false, is_integer);
        for (uint32_t j = 0; j < eM_count; ++j) {
          uint32_t eM_temp = eM_temps[j];
-          DxbcOpMul(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp),
-                    DxbcSrc::LF(32767.0f));
+          a_.OpMul(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp),
+                   dxbc::Src::LF(32767.0f));
        }
-        DxbcOpEndIf();
+        a_.OpEndIf();
        for (uint32_t j = 0; j < eM_count; ++j) {
-          DxbcDest eM_dest(DxbcDest::R(eM_temps[j]));
-          DxbcSrc eM_src(DxbcSrc::R(eM_temps[j]));
+          dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j]));
+          dxbc::Src eM_src(dxbc::Src::R(eM_temps[j]));
          // TODO(Triang3l): NaN should become zero, not -range.
-          DxbcOpMax(eM_dest, eM_src, DxbcSrc::LF(-32767.0f));
-          DxbcOpMin(eM_dest, eM_src, DxbcSrc::LF(32767.0f));
+          a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(-32767.0f));
+          a_.OpMin(eM_dest, eM_src, dxbc::Src::LF(32767.0f));
        }
      }
-      DxbcOpElse();
+      a_.OpElse();
      {
-        DxbcOpIf(false, is_integer);
+        a_.OpIf(false, is_integer);
        for (uint32_t j = 0; j < eM_count; ++j) {
          uint32_t eM_temp = eM_temps[j];
-          DxbcOpMul(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp),
-                    DxbcSrc::LF(65535.0f));
+          a_.OpMul(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp),
+                   dxbc::Src::LF(65535.0f));
        }
-        DxbcOpEndIf();
+        a_.OpEndIf();
        for (uint32_t j = 0; j < eM_count; ++j) {
-          DxbcDest eM_dest(DxbcDest::R(eM_temps[j]));
-          DxbcSrc eM_src(DxbcSrc::R(eM_temps[j]));
-          DxbcOpMax(eM_dest, eM_src, DxbcSrc::LF(0.0f));
-          DxbcOpMin(eM_dest, eM_src, DxbcSrc::LF(65535.0f));
+          dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j]));
+          dxbc::Src eM_src(dxbc::Src::R(eM_temps[j]));
+          a_.OpMax(eM_dest, eM_src, dxbc::Src::LF(0.0f));
+          a_.OpMin(eM_dest, eM_src, dxbc::Src::LF(65535.0f));
        }
      }
-      DxbcOpEndIf();
+      a_.OpEndIf();
      for (uint32_t j = 0; j < eM_count; ++j) {
        uint32_t eM_temp = eM_temps[j];
        // Round to the nearest integer, according to the rules of handling
        // integer formats in Direct3D.
        // TODO(Triang3l): Round by adding +-0.5, not with round_ne.
-        DxbcOpRoundNE(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp));
-        DxbcOpFToI(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp));
-        DxbcOpBFI(DxbcDest::R(eM_temp, 0b0011), DxbcSrc::LU(16),
-                  DxbcSrc::LU(16), DxbcSrc::R(eM_temp, 0b1101),
-                  DxbcSrc::R(eM_temp, 0b1000));
+        a_.OpRoundNE(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp));
+        a_.OpFToI(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp));
+        a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::LU(16),
+                 dxbc::Src::LU(16), dxbc::Src::R(eM_temp, 0b1101),
+                 dxbc::Src::R(eM_temp, 0b1000));
      }
-      DxbcOpBreak();
+      a_.OpBreak();

      // k_16_16_FLOAT
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_16_16_FLOAT)));
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_FLOAT)));
      for (uint32_t j = 0; j < eM_count; ++j) {
        uint32_t eM_temp = eM_temps[j];
-        DxbcOpF32ToF16(DxbcDest::R(eM_temp, 0b0011), DxbcSrc::R(eM_temp));
-        DxbcOpBFI(DxbcDest::R(eM_temp, 0b0001), DxbcSrc::LU(16),
-                  DxbcSrc::LU(16), DxbcSrc::R(eM_temp, DxbcSrc::kYYYY),
-                  DxbcSrc::R(eM_temp, DxbcSrc::kXXXX));
+        a_.OpF32ToF16(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::R(eM_temp));
+        a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0001), dxbc::Src::LU(16),
+                 dxbc::Src::LU(16), dxbc::Src::R(eM_temp, dxbc::Src::kYYYY),
+                 dxbc::Src::R(eM_temp, dxbc::Src::kXXXX));
      }
-      DxbcOpBreak();
+      a_.OpBreak();

      // k_16_16_16_16_FLOAT
-      DxbcOpCase(
-          DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16_FLOAT)));
-      DxbcOpMov(element_size_dest, DxbcSrc::LU(8));
+      a_.OpCase(
+          dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_16_16_16_16_FLOAT)));
+      a_.OpMov(element_size_dest, dxbc::Src::LU(8));
      for (uint32_t j = 0; j < eM_count; ++j) {
        uint32_t eM_temp = eM_temps[j];
-        DxbcOpF32ToF16(DxbcDest::R(eM_temp), DxbcSrc::R(eM_temp));
-        DxbcOpBFI(DxbcDest::R(eM_temp, 0b0011), DxbcSrc::LU(16),
-                  DxbcSrc::LU(16), DxbcSrc::R(eM_temp, 0b1101),
-                  DxbcSrc::R(eM_temp, 0b1000));
+        a_.OpF32ToF16(dxbc::Dest::R(eM_temp), dxbc::Src::R(eM_temp));
+        a_.OpBFI(dxbc::Dest::R(eM_temp, 0b0011), dxbc::Src::LU(16),
+                 dxbc::Src::LU(16), dxbc::Src::R(eM_temp, 0b1101),
+                 dxbc::Src::R(eM_temp, 0b1000));
      }
-      DxbcOpBreak();
+      a_.OpBreak();

      // k_32_FLOAT
      // Already in the destination format, 4 bytes per element already
      // selected.

      // k_32_32_FLOAT
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_32_32_FLOAT)));
-      DxbcOpMov(element_size_dest, DxbcSrc::LU(8));
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_FLOAT)));
+      a_.OpMov(element_size_dest, dxbc::Src::LU(8));
      // Already in the destination format.
-      DxbcOpBreak();
+      a_.OpBreak();

      // k_32_32_32_32_FLOAT
-      DxbcOpCase(
-          DxbcSrc::LU(uint32_t(xenos::ColorFormat::k_32_32_32_32_FLOAT)));
-      DxbcOpMov(element_size_dest, DxbcSrc::LU(16));
+      a_.OpCase(
+          dxbc::Src::LU(uint32_t(xenos::ColorFormat::k_32_32_32_32_FLOAT)));
+      a_.OpMov(element_size_dest, dxbc::Src::LU(16));
      // Already in the destination format.
-      DxbcOpBreak();
+      a_.OpBreak();
    }
-    DxbcOpEndSwitch();
+    a_.OpEndSwitch();
    // control_temp.yz are now free.

    // Do endian swap.
    {
-      DxbcDest endian_dest(DxbcDest::R(control_temp, 0b0010));
-      DxbcSrc endian_src(DxbcSrc::R(control_temp, DxbcSrc::kYYYY));
+      dxbc::Dest endian_dest(dxbc::Dest::R(control_temp, 0b0010));
+      dxbc::Src endian_src(dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
      // Extract endianness into control_temp.y.
-      DxbcOpAnd(endian_dest, DxbcSrc::R(eA_temp, DxbcSrc::kZZZZ),
-                DxbcSrc::LU(0b111));
+      a_.OpAnd(endian_dest, dxbc::Src::R(eA_temp, dxbc::Src::kZZZZ),
+               dxbc::Src::LU(0b111));

      // Change 8-in-64 and 8-in-128 to 8-in-32.
      for (uint32_t j = 0; j < 2; ++j) {
-        DxbcOpIEq(DxbcDest::R(control_temp, 0b0100), endian_src,
-                  DxbcSrc::LU(uint32_t(j ? xenos::Endian128::k8in128
-                                         : xenos::Endian128::k8in64)));
+        a_.OpIEq(dxbc::Dest::R(control_temp, 0b0100), endian_src,
+                 dxbc::Src::LU(uint32_t(j ? xenos::Endian128::k8in128
+                                          : xenos::Endian128::k8in64)));
        for (uint32_t k = 0; k < eM_count; ++k) {
          uint32_t eM_temp = eM_temps[k];
-          DxbcOpMovC(DxbcDest::R(eM_temp),
-                     DxbcSrc::R(control_temp, DxbcSrc::kZZZZ),
-                     DxbcSrc::R(eM_temp, j ? 0b00011011 : 0b10110001),
-                     DxbcSrc::R(eM_temp));
+          a_.OpMovC(dxbc::Dest::R(eM_temp),
+                    dxbc::Src::R(control_temp, dxbc::Src::kZZZZ),
+                    dxbc::Src::R(eM_temp, j ? 0b00011011 : 0b10110001),
+                    dxbc::Src::R(eM_temp));
        }
-        DxbcOpMovC(endian_dest, DxbcSrc::R(control_temp, DxbcSrc::kZZZZ),
-                   DxbcSrc::LU(uint32_t(xenos::Endian128::k8in32)), endian_src);
+        a_.OpMovC(endian_dest, dxbc::Src::R(control_temp, dxbc::Src::kZZZZ),
+                  dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)),
+                  endian_src);
      }

      uint32_t swap_temp = PushSystemTemp();
-      DxbcDest swap_temp_dest(DxbcDest::R(swap_temp));
-      DxbcSrc swap_temp_src(DxbcSrc::R(swap_temp));
+      dxbc::Dest swap_temp_dest(dxbc::Dest::R(swap_temp));
+      dxbc::Src swap_temp_src(dxbc::Src::R(swap_temp));

      // 8-in-16 or one half of 8-in-32.
-      DxbcOpSwitch(endian_src);
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::Endian128::k8in16)));
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::Endian128::k8in32)));
+      a_.OpSwitch(endian_src);
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in16)));
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)));
      for (uint32_t j = 0; j < eM_count; ++j) {
-        DxbcDest eM_dest(DxbcDest::R(eM_temps[j]));
-        DxbcSrc eM_src(DxbcSrc::R(eM_temps[j]));
+        dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j]));
+        dxbc::Src eM_src(dxbc::Src::R(eM_temps[j]));
        // Temp = X0Z0.
-        DxbcOpAnd(swap_temp_dest, eM_src, DxbcSrc::LU(0x00FF00FF));
+        a_.OpAnd(swap_temp_dest, eM_src, dxbc::Src::LU(0x00FF00FF));
        // eM = YZW0.
-        DxbcOpUShR(eM_dest, eM_src, DxbcSrc::LU(8));
+        a_.OpUShR(eM_dest, eM_src, dxbc::Src::LU(8));
        // eM = Y0W0.
-        DxbcOpAnd(eM_dest, eM_src, DxbcSrc::LU(0x00FF00FF));
+        a_.OpAnd(eM_dest, eM_src, dxbc::Src::LU(0x00FF00FF));
        // eM = YXWZ.
-        DxbcOpUMAd(eM_dest, swap_temp_src, DxbcSrc::LU(256), eM_src);
+        a_.OpUMAd(eM_dest, swap_temp_src, dxbc::Src::LU(256), eM_src);
      }
-      DxbcOpBreak();
-      DxbcOpEndSwitch();
+      a_.OpBreak();
+      a_.OpEndSwitch();

      // 16-in-32 or another half of 8-in-32.
-      DxbcOpSwitch(endian_src);
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::Endian128::k8in32)));
-      DxbcOpCase(DxbcSrc::LU(uint32_t(xenos::Endian128::k16in32)));
+      a_.OpSwitch(endian_src);
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k8in32)));
+      a_.OpCase(dxbc::Src::LU(uint32_t(xenos::Endian128::k16in32)));
      for (uint32_t j = 0; j < eM_count; ++j) {
-        DxbcDest eM_dest(DxbcDest::R(eM_temps[j]));
-        DxbcSrc eM_src(DxbcSrc::R(eM_temps[j]));
+        dxbc::Dest eM_dest(dxbc::Dest::R(eM_temps[j]));
+        dxbc::Src eM_src(dxbc::Src::R(eM_temps[j]));
        // Temp = ZW00.
-        DxbcOpUShR(swap_temp_dest, eM_src, DxbcSrc::LU(16));
+        a_.OpUShR(swap_temp_dest, eM_src, dxbc::Src::LU(16));
        // eM = ZWXY.
-        DxbcOpBFI(eM_dest, DxbcSrc::LU(16), DxbcSrc::LU(16), eM_src,
-                  swap_temp_src);
+        a_.OpBFI(eM_dest, dxbc::Src::LU(16), dxbc::Src::LU(16), eM_src,
+                 swap_temp_src);
      }
-      DxbcOpBreak();
-      DxbcOpEndSwitch();
+      a_.OpBreak();
+      a_.OpEndSwitch();

      // Release swap_temp.
      PopSystemTemp();
    }
    // control_temp.yz are now free.

-    DxbcDest address_dest(DxbcDest::R(eA_temp, 0b0001));
-    DxbcSrc address_src(DxbcSrc::R(eA_temp, DxbcSrc::kXXXX));
+    dxbc::Dest address_dest(dxbc::Dest::R(eA_temp, 0b0001));
+    dxbc::Src address_src(dxbc::Src::R(eA_temp, dxbc::Src::kXXXX));
    // Multiply the base address by dword size, also dropping the 0x40000000
    // bit.
-    DxbcOpIShL(address_dest, address_src, DxbcSrc::LU(2));
+    a_.OpIShL(address_dest, address_src, dxbc::Src::LU(2));
    // Drop the exponent in the element index.
-    DxbcOpAnd(DxbcDest::R(eA_temp, 0b0010), DxbcSrc::R(eA_temp, DxbcSrc::kYYYY),
-              DxbcSrc::LU((1 << 23) - 1));
+    a_.OpAnd(dxbc::Dest::R(eA_temp, 0b0010),
+             dxbc::Src::R(eA_temp, dxbc::Src::kYYYY),
+             dxbc::Src::LU((1 << 23) - 1));
    // Add the offset of the first written element to the base address.
-    DxbcOpUMAd(address_dest, DxbcSrc::R(eA_temp, DxbcSrc::kYYYY),
-               element_size_src, address_src);
+    a_.OpUMAd(address_dest, dxbc::Src::R(eA_temp, dxbc::Src::kYYYY),
+              element_size_src, address_src);
    // Do the writes.
-    DxbcSrc eM_written_src(
-        DxbcSrc::R(system_temp_memexport_written_).Select(i >> 2));
+    dxbc::Src eM_written_src(
+        dxbc::Src::R(system_temp_memexport_written_).Select(i >> 2));
    uint32_t eM_written_base = 1u << ((i & 3) << 3);
    for (uint32_t j = 0; j < eM_count; ++j) {
      // Go to the next eM#.
      uint32_t eM_relative_offset = eM_offsets[j] - (j ? eM_offsets[j - 1] : 0);
      if (eM_relative_offset) {
        if (eM_relative_offset == 1) {
-          DxbcOpIAdd(address_dest, element_size_src, address_src);
+          a_.OpIAdd(address_dest, element_size_src, address_src);
        } else {
-          DxbcOpUMAd(address_dest, DxbcSrc::LU(eM_relative_offset),
-                     element_size_src, address_src);
+          a_.OpUMAd(address_dest, dxbc::Src::LU(eM_relative_offset),
+                    element_size_src, address_src);
        }
      }
      // Check if the eM# was actually written to on the execution path.
-      DxbcOpAnd(DxbcDest::R(control_temp, 0b0010), eM_written_src,
-                DxbcSrc::LU(eM_written_base << eM_offsets[j]));
-      DxbcOpIf(true, DxbcSrc::R(control_temp, DxbcSrc::kYYYY));
+      a_.OpAnd(dxbc::Dest::R(control_temp, 0b0010), eM_written_src,
+               dxbc::Src::LU(eM_written_base << eM_offsets[j]));
+      a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kYYYY));
      // Write the element of the needed size.
-      DxbcSrc eM_src(DxbcSrc::R(eM_temps[j]));
-      DxbcOpSwitch(element_size_src);
+      dxbc::Src eM_src(dxbc::Src::R(eM_temps[j]));
+      a_.OpSwitch(element_size_src);
      for (uint32_t k = 1; k <= 4; k <<= 1) {
-        DxbcOpCase(DxbcSrc::LU(k * 4));
+        a_.OpCase(dxbc::Src::LU(k * 4));
        if (uav_index_shared_memory_ == kBindingIndexUnallocated) {
          uav_index_shared_memory_ = uav_count_++;
        }
-        DxbcOpStoreRaw(
-            DxbcDest::U(uav_index_shared_memory_,
-                        uint32_t(UAVRegister::kSharedMemory), (1 << k) - 1),
+        a_.OpStoreRaw(
+            dxbc::Dest::U(uav_index_shared_memory_,
+                          uint32_t(UAVRegister::kSharedMemory), (1 << k) - 1),
            address_src, eM_src);
-        DxbcOpBreak();
+        a_.OpBreak();
      }
-      DxbcOpEndSwitch();
-      DxbcOpEndIf();
+      a_.OpEndSwitch();
+      a_.OpEndIf();
    }
    // control_temp.y is now free.
  }

-  // Close the memexport possibility check.
-  DxbcOpEndIf();
+  // Close the inner memexport possibility conditional.
+  if (inner_condition_provided) {
+    a_.OpEndIf();
+  }
+
+  // Close the outer memexport possibility conditional.
+  a_.OpEndIf();

  // Release control_temp.
  PopSystemTemp();
--- a/src/xenia/gpu/dxbc_shader_translator_om.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_om.cc
--- a/src/xenia/gpu/gpu_flags.cc
+++ b/src/xenia/gpu/gpu_flags.cc
@ -40,63 +40,9 @@ DEFINE_bool(
    "be fully covered when MSAA is used with fullscreen passes.",
    "GPU");

-DEFINE_string(
-    depth_float24_conversion, "",
-    "Method for converting 32-bit Z values to 20e4 floating point when using "
-    "host depth buffers without native 20e4 support (when not using rasterizer-"
-    "ordered views / fragment shader interlocks to perform depth testing "
-    "manually).\n"
-    "Use: [any, on_copy, truncate, round]\n"
-    " on_copy:\n"
-    "  Do depth testing at host precision, converting when copying between "
-    "host depth buffers and the EDRAM buffer to support reinterpretation, "
-    "maintaining two copies, in both host and 20e4 formats, for reloading data "
-    "to host depth buffers when it wasn't overwritten.\n"
-    "  + Highest performance, allows early depth test and writing.\n"
-    "  + Host MSAA is possible with pixel-rate shading where supported.\n"
-    "  - EDRAM > RAM > EDRAM depth buffer round trip done in certain games "
-    "(such as GTA IV) destroys precision irreparably, causing artifacts if "
-    "another rendering pass is done after the EDRAM reupload.\n"
-    " truncate:\n"
-    "  Convert to 20e4 directly in pixel shaders, always rounding down.\n"
-    "  + Good performance, conservative early depth test is possible.\n"
-    "  + No precision loss when anything changes in the storage of the depth "
-    "buffer, EDRAM > RAM > EDRAM copying preserves precision.\n"
-    "  - Rounding mode is incorrect, sometimes giving results smaller than "
-    "they should be - may cause inaccuracy especially in edge cases when the "
-    "game wants to write an exact value.\n"
-    "  - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
-    " round:\n"
-    "  Convert to 20e4 directly in pixel shaders, correctly rounding to the "
-    "nearest even.\n"
-    "  + Highest accuracy.\n"
-    "  - Significantly limited performance, early depth test is not possible.\n"
-    "  - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
-    " Any other value:\n"
-    "  Choose what is considered the most optimal (currently \"on_copy\").",
-    "GPU");
-
 DEFINE_int32(query_occlusion_fake_sample_count, 1000,
             "If set to -1 no sample counts are written, games may hang. Else, "
             "the sample count of every tile will be incremented on every "
             "EVENT_WRITE_ZPD by this number. Setting this to 0 means "
             "everything is reported as occluded.",
             "GPU");
-
-namespace xe {
-namespace gpu {
-namespace flags {
-
-DepthFloat24Conversion GetDepthFloat24Conversion() {
-  if (cvars::depth_float24_conversion == "truncate") {
-    return DepthFloat24Conversion::kOnOutputTruncating;
-  }
-  if (cvars::depth_float24_conversion == "round") {
-    return DepthFloat24Conversion::kOnOutputRounding;
-  }
-  return DepthFloat24Conversion::kOnCopy;
-}
-
-}  // namespace flags
-}  // namespace gpu
-}  // namespace xe
--- a/src/xenia/gpu/gpu_flags.h
+++ b/src/xenia/gpu/gpu_flags.h
@ -22,69 +22,6 @@ DECLARE_bool(gpu_allow_invalid_fetch_constants);

 DECLARE_bool(half_pixel_offset);

-DECLARE_string(depth_float24_conversion);
-
 DECLARE_int32(query_occlusion_fake_sample_count);

-namespace xe {
-namespace gpu {
-namespace flags {
-
-enum class DepthFloat24Conversion {
-  // Doing depth test at the host precision, converting to 20e4 to support
-  // reinterpretation, but keeping a separate EDRAM view containing depth values
-  // in the host format. When copying from the EDRAM buffer to host depth
-  // buffers, writing the stored host pixel if stored_f24 == to_f24(stored_host)
-  // (otherwise it was overwritten by something else, like clearing, or a color
-  // buffer; this is inexact though, and will incorrectly load pixels that were
-  // overwritten by something else in the EDRAM, but turned out to have the same
-  // value on the guest as before - an outdated host-precision value will be
-  // loaded in these cases instead).
-  //
-  // EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM round
-  // trip destroys precision beyond repair.
-  //
-  // Full host early Z and MSAA with pixel-rate shading are supported.
-  kOnCopy,
-  // Converting the depth to the closest host value representable exactly as a
-  // 20e4 float in pixel shaders, to support invariance in cases when the guest
-  // reuploads a previously resolved depth buffer to the EDRAM, rounding towards
-  // zero (which contradicts the rounding used by the Direct3D 9 reference
-  // rasterizer, but allows less-than-or-equal pixel shader depth output to be
-  // used to preserve most of early Z culling when the game is using reversed
-  // depth, which is the usual way of doing depth testing on the Xbox 360 and of
-  // utilizing the advantages of a floating-point encoding).
-  //
-  // With MSAA, pixel shaders must run at sample frequency - otherwise, if the
-  // depth is the same for the entire pixel, intersections of polygons cannot be
-  // antialiased.
-  //
-  // Important usage note: When using this mode, bounds of the fixed-function
-  // viewport must be converted to and back from float24 too (preferably using
-  // correct rounding to the nearest even, to reduce the error already caused by
-  // truncation rather than to amplify it). This ensures that clamping to the
-  // viewport bounds, which happens after the pixel shader even if it overwrites
-  // the resulting depth, is never done to a value not representable as float24
-  // (for example, if the minimum Z is a number too small to be represented as
-  // float24, but not zero, it won't be possible to write what should become
-  // 0x000000 to the depth buffer). Note that this may add some error to the
-  // depth values from the rasterizer; however, modifying Z in the vertex shader
-  // to make interpolated depth values would cause clipping to be done to
-  // different bounds, which may be more undesirable, especially in cases when Z
-  // is explicitly set to a value like 0 or W (in such cases, the adjusted
-  // polygon may go outside 0...W in clip space and disappear).
-  kOnOutputTruncating,
-  // Similar to kOnOutputTruncating, but rounding to the nearest even, more
-  // correctly, however, because the resulting depth can be bigger than the
-  // original host value, early depth testing can't be used at all. Same
-  // viewport usage rules apply.
-  kOnOutputRounding,
-};
-
-DepthFloat24Conversion GetDepthFloat24Conversion();
-
-}  // namespace flags
-}  // namespace gpu
-}  // namespace xe
-
 #endif  // XENIA_GPU_GPU_FLAGS_H_
--- a/src/xenia/gpu/graphics_system.cc
+++ b/src/xenia/gpu/graphics_system.cc
@ -221,13 +221,13 @@ void GraphicsSystem::WriteRegister(uint32_t addr, uint32_t value) {
  register_file_.values[r].u32 = value;
 }

-void GraphicsSystem::InitializeRingBuffer(uint32_t ptr, uint32_t log2_size) {
-  command_processor_->InitializeRingBuffer(ptr, log2_size + 0x3);
+void GraphicsSystem::InitializeRingBuffer(uint32_t ptr, uint32_t size_log2) {
+  command_processor_->InitializeRingBuffer(ptr, size_log2);
 }

 void GraphicsSystem::EnableReadPointerWriteBack(uint32_t ptr,
-                                                uint32_t block_size) {
-  command_processor_->EnableReadPointerWriteBack(ptr, block_size);
+                                                uint32_t block_size_log2) {
+  command_processor_->EnableReadPointerWriteBack(ptr, block_size_log2);
 }

 void GraphicsSystem::SetInterruptCallback(uint32_t callback,
--- a/src/xenia/gpu/graphics_system.h
+++ b/src/xenia/gpu/graphics_system.h
@ -55,8 +55,9 @@ class GraphicsSystem {
    return command_processor_.get();
  }

-  virtual void InitializeRingBuffer(uint32_t ptr, uint32_t log2_size);
-  virtual void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size);
+  virtual void InitializeRingBuffer(uint32_t ptr, uint32_t size_log2);
+  virtual void EnableReadPointerWriteBack(uint32_t ptr,
+                                          uint32_t block_size_log2);

  virtual void SetInterruptCallback(uint32_t callback, uint32_t user_data);
  void DispatchInterruptCallback(uint32_t source, uint32_t cpu);
--- a/src/xenia/gpu/primitive_processor.cc
+++ b/src/xenia/gpu/primitive_processor.cc
--- a/src/xenia/gpu/primitive_processor.h
+++ b/src/xenia/gpu/primitive_processor.h
@ -0,0 +1,869 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2021 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_PRIMITIVE_PROCESSOR_H_
+#define XENIA_GPU_PRIMITIVE_PROCESSOR_H_
+
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <deque>
+#include <functional>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+
+#include "xenia/base/assert.h"
+#include "xenia/base/cvar.h"
+#include "xenia/base/math.h"
+#include "xenia/base/mutex.h"
+#include "xenia/base/platform.h"
+#include "xenia/gpu/register_file.h"
+#include "xenia/gpu/shader.h"
+#include "xenia/gpu/shared_memory.h"
+#include "xenia/gpu/trace_writer.h"
+#include "xenia/gpu/xenos.h"
+#include "xenia/memory.h"
+
+#if XE_ARCH_AMD64
+// 128-bit SSSE3-level (SSE2+ for integer comparison, SSSE3 for pshufb) or AVX
+// (256-bit AVX only got integer operations such as comparison in AVX2, which is
+// above the minimum requirements of Xenia).
+#include <tmmintrin.h>
+#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 16
+#elif XE_ARCH_ARM64
+#include <arm_neon.h>
+#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 16
+#else
+#define XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE 0
+#endif  // XE_ARCH
+
+// The idea behind this config variable is to force both indirection without
+// primitive reset and pre-masking / pre-swapping with primitive reset,
+// therefore this is supposed to be checked only by the host if it supports
+// indirection. It's pretty pointless to do only half of this on backends that
+// support full 32-bit indices unconditionally.
+DECLARE_bool(ignore_32bit_vertex_index_support);
+
+namespace xe {
+namespace gpu {
+
+// Normalizes primitive data in various ways for use with Direct3D 12 and Vulkan
+// (down to its minimum requirements plus the portability subset).
+//
+// This solves various issues:
+// - Triangle fans not supported on Direct3D 10+ and the Vulkan portability
+//   subset.
+//   - Converts to triangle lists, both with and without primitive reset.
+// - Line loops are not supported on Direct3D 12 or Vulkan.
+//   - Converts to line strips.
+// - Quads not reproducible with line lists with adjacency without geometry
+//   shaders (some Vulkan implementations), as well as being hard to debug in
+//   PIX due to "catastrophic failures".
+//   - Converts to triangle lists.
+// - Vulkan requiring 0xFFFF primitive restart index for 16-bit indices and
+//   0xFFFFFFFF for 32-bit (Direct3D 12 slightly relaxes this, allowing 0xFFFF
+//   for 32-bit also, but it's of no use to Xenia since guest indices are
+//   usually big-endian). Also, only the 24 lower bits of the vertex index are
+//   used on the guest (tested on an Adreno 200 phone with drawing, though not
+//   with primitive restart as OpenGL ES 2.0 doesn't expose it), so the upper 8
+//   bits likely shouldn't have an effect on primitive restart (guest reset
+//   index 0xFFFFFF likely working for 0xFFFFFF, 0xFFFFFFFF, and 254 more
+//   indices), while Vulkan and Direct3D 12 require exactly 0xFFFFFFFF.
+//   - For 16-bit indices with guest reset index other than 0xFFFF (passing
+//     0xFFFF directly to the host is fine because it's the same irrespective of
+//     endianness), there are two possible solutions:
+//     - If the index buffer doesn't otherwise contain 0xFFFF (since it's a
+//       valid vertex index in this case), replacing the primitive reset index
+//       with 0xFFFF in the 16-bit buffer.
+//     - If the index buffer contains any usage of 0xFFFF as a real vertex
+//       index, converting the index buffer to 32-bit, and replacing the
+//       primitive reset index with 0xFFFFFFFF.
+//   - For 32-bit indices, there are two paths:
+//     - If the guest reset index is 0xFFFFFF, and the index buffer actually
+//       uses only 0xFFFFFFFF for reset, using it without changes.
+//     - If the guest uses something other than 0xFFFFFFFF for primitive reset,
+//       replacing elements with (index & 0xFFFFFF) == reset_index with
+//       0xFFFFFFFF.
+// - Some Vulkan implementations only support 24-bit indices. The guests usually
+//   pass big-endian vertices, so we need all 32 bits (as the least significant
+//   bits will be in 24...31) to perform the byte swapping. For this reason, we
+//   load 32-bit indices indirectly, doing non-indexed draws and fetching the
+//   indices from the shared memory. This, however, is not compatible with
+//   primitive restart.
+//   - Pre-swapping, masking to 24 bits, and converting the reset index to
+//     0xFFFFFFFF, resulting in an index buffer that can be used directly.
+
+class PrimitiveProcessor {
+ public:
+  // Source of the index data the host must use for a processed draw.
+  enum ProcessedIndexBufferType {
+    // Auto-indexed on the host.
+    kNone,
+    // GPU DMA, from the shared memory.
+    // For 32-bit, indirection is needed if the host only supports 24-bit
+    // indices (even for non-endian-swapped, as the GPU should be ignoring the
+    // upper 8 bits completely, rather than exhibiting undefined behavior).
+    kGuest,
+    // Converted and stored in the primitive converter for the current draw
+    // command. For 32-bit indices, if the host doesn't support all 32 bits,
+    // this kind of an index buffer will always be pre-masked and pre-swapped.
+    kHostConverted,
+    // Auto-indexed on the guest, but with an adapter index buffer on the host.
+    kHostBuiltin,
+  };
+
+  // Description of the host draw produced by Process for the current guest
+  // draw.
+  struct ProcessingResult {
+    xenos::PrimitiveType guest_primitive_type;
+    xenos::PrimitiveType host_primitive_type;
+    // Includes whether tessellation is enabled (not kVertex) and the type of
+    // tessellation.
+    Shader::HostVertexShaderType host_vertex_shader_type;
+    // Only used for non-kVertex host_vertex_shader_type. For kAdaptive, the
+    // index buffer is always from the guest and fully 32-bit, and contains the
+    // floating-point tessellation factors.
+    xenos::TessellationMode tessellation_mode;
+    // TODO(Triang3l): If important, split into the index count and the actual
+    // index buffer size, using zeros for out-of-bounds indices.
+    uint32_t host_draw_vertex_count;
+    // NOTE(review): presumably related to closing line loops converted to line
+    // strips - confirm against the Process implementation.
+    uint32_t line_loop_closing_index;
+    ProcessedIndexBufferType index_buffer_type;
+    uint32_t guest_index_base;
+    xenos::IndexFormat host_index_format;
+    xenos::Endian host_index_endian;
+    // The reset index, if enabled, is always 0xFFFF for host_index_format
+    // kInt16 and 0xFFFFFFFF for kInt32.
+    bool host_primitive_reset_enabled;
+    // Backend-specific handle for the index buffer valid for the current draw,
+    // only valid for index_buffer_type kHostConverted and kHostBuiltin.
+    size_t host_index_buffer_handle;
+    // Whether the host vertex shader type is any of the tessellation ones
+    // (anything other than kVertex).
+    bool IsTessellated() const {
+      return host_vertex_shader_type != Shader::HostVertexShaderType::kVertex;
+    }
+  };
+
+  virtual ~PrimitiveProcessor();
+
+  bool AreFull32BitVertexIndicesUsed() const {
+    return full_32bit_vertex_indices_used_;
+  }
+  bool IsConvertingTriangleFansToLists() const {
+    return convert_triangle_fans_to_lists_;
+  }
+  bool IsConvertingLineLoopsToStrips() const {
+    return convert_line_loops_to_strips_;
+  }
+  // Quad lists may be emulated as line lists with adjacency and a geometry
+  // shader, but geometry shaders must be supported for this.
+  bool IsConvertingQuadListsToTriangleLists() const {
+    return convert_quad_lists_to_triangle_lists_;
+  }
+
+  // Submission must be open to call (may request the index buffer in the shared
+  // memory).
+  bool Process(ProcessingResult& result_out);
+
+  // Invalidates the cache within the range.
+  std::pair<uint32_t, uint32_t> MemoryInvalidationCallback(
+      uint32_t physical_address_start, uint32_t length, bool exact_range);
+
+ protected:
+  // For host-side index buffer creation, the biggest possibly needed contiguous
+  // allocation, in indices.
+  // - No conversion: up to 0xFFFF vertices (as the vertex count in
+  //   VGT_DRAW_INITIATOR is 16-bit).
+  // - Triangle fans to lists: since the 3rd vertex, every guest vertex creates
+  //   a triangle, thus the maximum is 3 * (UINT16_MAX - 2), or 0x2FFF7.
+  //   Primitive reset can only slow down the amplification - the 3 vertices
+  //   after a reset add 1 host vertex each, not 3 each.
+  // - Line loops to strips: adding 1 vertex if there are at least 2 vertices in
+  //   the original primitive, either replacing the primitive reset index with
+  //   this new closing vertex, or in case of the final primitive, just adding a
+  //   vertex - thus the absolute limit is UINT16_MAX + 1, or 0x10000.
+  // - Quad lists to triangle lists: vertices are processed in groups of 4, each
+  //   group converted to 6 vertices, so the limit is 1.5 * 0xFFFC, or 0x17FFA.
+  // Thus, the maximum vertex count is defined by triangle fan to list
+  // conversion.
+  // Also include padding for co-alignment of the source and the destination for
+  // SIMD.
+  // In bytes: the worst-case converted index count (triangle fans to lists,
+  // 3 * (UINT16_MAX - 2) indices) stored as 32-bit indices, plus
+  // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE bytes of co-alignment padding. The
+  // padding is added to the size, not multiplied into it.
+  static constexpr uint32_t kMinRequiredConvertedIndexBufferSize =
+      sizeof(uint32_t) * (UINT16_MAX - 2) * 3 +
+      XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE;
+
+  PrimitiveProcessor(const RegisterFile& register_file, Memory& memory,
+                     TraceWriter& trace_writer, SharedMemory& shared_memory)
+      : register_file_(register_file),
+        memory_(memory),
+        trace_writer_(trace_writer),
+        shared_memory_(shared_memory) {}
+
+  // Call from the backend-specific initialization function.
+  // - full_32bit_vertex_indices_supported:
+  //   - If the backend supports 32-bit indices unconditionally, and doesn't
+  //     generate indirection logic in vertex shaders, pass hard-coded `true`.
+  //   - Otherwise:
+  //     - If the host doesn't support full 32-bit indices (but supports at
+  //       least 24-bit indices), pass `false`.
+  //     - If the host supports 32-bit indices, but the backend can handle both
+  //       cases, pass `cvars::ignore_32bit_vertex_index_support`, and
+  //       afterwards, check `AreFull32BitVertexIndicesUsed()` externally to see
+  //       if indirection may be needed.
+  //     - When full 32-bit indices are not supported, the host must be using
+  //       auto-indexed draws for 32-bit indices of ProcessedIndexBufferType
+  //       kGuest, while fetching the index data manually from the shared memory
+  //       buffer and endian-swapping it.
+  //     - Indirection, however, precludes primitive reset usage - so if
+  //       primitive reset is needed, the primitive processor will pre-swap and
+  //       pre-mask the index buffer so there are only host-endian 0x00###### or
+  //       0xFFFFFFFF values in it. In this case, a kHostConverted index buffer
+  //       is returned from Process, and indirection is not needed (and
+  //       impossible since the index buffer is not in the shared memory buffer
+  //       anymore), though byte swap is still needed as 16-bit indices may also
+  //       be kHostConverted, while they are completely unaffected by this. The
+  //       same applies to primitive type conversion - if it happens for 32-bit
+  //       guest indices, and kHostConverted is returned, they will be
+  //       pre-swapped and pre-masked.
+  // - triangle_fans_supported, line_loops_supported, quad_lists_supported:
+  //   - Pass true or false depending on whether the host actually supports
+  //     those guest primitive types directly or through geometry shader
+  //     emulation. Debug overriding will be resolved in the common code if
+  //     needed.
+  bool InitializeCommon(bool full_32bit_vertex_indices_supported,
+                        bool triangle_fans_supported, bool line_loops_supported,
+                        bool quad_lists_supported);
+  // If any primitive type conversion is needed for auto-indexed draws, called
+  // from InitializeCommon (thus only once in the primitive processor's
+  // lifetime) to set up the backend's index buffer containing indices for
+  // primitive type remapping. The backend must allocate a `sizeof(uint16_t) *
+  // index_count` buffer and call fill_callback for its mapping if creation is
+  // successful. 16-bit indices are enough even if the backend has primitive
+  // reset enabled all the time (Metal) as auto-indexed draws are limited to
+  // UINT16_MAX vertices, not UINT16_MAX + 1.
+  virtual bool InitializeBuiltin16BitIndexBuffer(
+      uint32_t index_count, std::function<void(uint16_t*)> fill_callback) = 0;
+  // Call last in implementation-specific shutdown, also callable from the
+  // destructor.
+  void ShutdownCommon();
+
+  // Call at boundaries of lifespans of converted data (between frames,
+  // preferably in the end of a frame so between the swap and the next draw,
+  // access violation handlers need to do less work).
+  void ClearPerFrameCache();
+
+  static constexpr size_t GetBuiltinIndexBufferOffsetBytes(size_t handle) {
+    // For simplicity, just using the handles as byte offsets.
+    return handle;
+  }
+
+  // The destination allocation must have XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
+  // excess bytes.
+  // Returns the number of bytes (0 to XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1)
+  // to advance host_index_ptr by so that its offset within a SIMD vector
+  // matches that of guest_index_base. Always 0 when SIMD is not used.
+  static ptrdiff_t GetSimdCoalignmentOffset(const void* host_index_ptr,
+                                            uint32_t guest_index_base) {
+#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
+    // Always moving the host pointer only forward into the allocation padding
+    // space of XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE bytes. Without relying on
+    // two's complement wrapping overflow behavior, the logic would look like:
+    // uintptr_t host_subalignment =
+    //     reinterpret_cast<uintptr_t>(host_index_ptr) &
+    //     (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1);
+    // uint32_t guest_subalignment = guest_index_base &
+    //                               (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1);
+    // if (guest_subalignment >= host_subalignment) {
+    //   return guest_subalignment - host_subalignment;
+    // }
+    // return XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE -
+    //        (host_subalignment - guest_subalignment);
+    return ptrdiff_t(
+        (guest_index_base - reinterpret_cast<uintptr_t>(host_index_ptr)) &
+        (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1));
+#else
+    return 0;
+#endif
+  }
+
+  // Requests a buffer to write the new transformed indices to. The lifetime of
+  // the returned buffer must be that of the current frame. Returns the mapping
+  // of the buffer to write to, or nullptr in case of failure, in addition to,
+  // if successful, a handle that can be used by the backend's command processor
+  // to access the backend-specific data for binding the buffer.
+  virtual void* RequestHostConvertedIndexBufferForCurrentFrame(
+      xenos::IndexFormat format, uint32_t index_count, bool coalign_for_simd,
+      uint32_t coalignment_original_address, size_t& backend_handle_out) = 0;
+
+ private:
+#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
+#if XE_ARCH_AMD64
+  // SSSE3 or AVX.
+  using SimdVectorU16 = __m128i;
+  using SimdVectorU32 = __m128i;
+  static SimdVectorU16 ReplicateU16(uint16_t value) {
+    return _mm_set1_epi16(int16_t(value));
+  }
+  static SimdVectorU32 ReplicateU32(uint32_t value) {
+    return _mm_set1_epi32(int32_t(value));
+  }
+  static SimdVectorU16 LoadAlignedVectorU16(const uint16_t* source) {
+    return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
+  }
+  static SimdVectorU32 LoadAlignedVectorU32(const uint32_t* source) {
+    return _mm_load_si128(reinterpret_cast<const __m128i*>(source));
+  }
+  static void StoreUnalignedVectorU16(uint16_t* dest, SimdVectorU16 source) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest), source);
+  }
+  static void StoreUnalignedVectorU32(uint32_t* dest, SimdVectorU32 source) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest), source);
+  }
+#elif XE_ARCH_ARM64
+  // NEON.
+  using SimdVectorU16 = uint16x8_t;
+  using SimdVectorU32 = uint32x4_t;
+  static SimdVectorU16 ReplicateU16(uint16_t value) {
+    return vdupq_n_u16(value);
+  }
+  static SimdVectorU32 ReplicateU32(uint32_t value) {
+    return vdupq_n_u32(value);
+  }
+  static SimdVectorU16 LoadAlignedVectorU16(const uint16_t* source) {
+#if XE_COMPILER_MSVC
+    return vld1q_u16_ex(source, sizeof(uint16x8_t) * CHAR_BIT);
+#else
+    return vld1q_u16(reinterpret_cast<const uint16_t*>(
+        __builtin_assume_aligned(source, sizeof(uint16x8_t))));
+#endif
+  }
+  // Loads a vector of 32-bit indices from `source`, which is assumed to be
+  // aligned to the vector size.
+  static SimdVectorU32 LoadAlignedVectorU32(const uint32_t* source) {
+#if XE_COMPILER_MSVC
+    // The alignment is passed in bits, using the 32-bit vector type for
+    // consistency with the non-MSVC path (the size is the same as uint16x8_t).
+    return vld1q_u32_ex(source, sizeof(uint32x4_t) * CHAR_BIT);
+#else
+    return vld1q_u32(reinterpret_cast<const uint32_t*>(
+        __builtin_assume_aligned(source, sizeof(uint32x4_t))));
+#endif
+  }
+  static void StoreUnalignedVectorU16(uint16_t* dest, SimdVectorU16 source) {
+    vst1q_u16(dest, source);
+  }
+  static void StoreUnalignedVectorU32(uint32_t* dest, SimdVectorU32 source) {
+    vst1q_u32(dest, source);
+  }
+#else
+#error SIMD vector types and constant loads not specified.
+#endif  // XE_ARCH
+  static_assert(
+      sizeof(SimdVectorU16) == XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE,
+      "XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE must reflect the vector size "
+      "actually used");
+  static_assert(
+      sizeof(SimdVectorU32) == XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE,
+      "XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE must reflect the vector size "
+      "actually used");
+  static constexpr uint32_t kSimdVectorU16Elements =
+      sizeof(SimdVectorU16) / sizeof(uint16_t);
+  static constexpr uint32_t kSimdVectorU32Elements =
+      sizeof(SimdVectorU32) / sizeof(uint32_t);
+#endif  // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
+
+  static bool IsResetUsed(const uint16_t* source, uint32_t count,
+                          uint16_t reset_index_guest_endian);
+  static void Get16BitResetIndexUsage(const uint16_t* source, uint32_t count,
+                                      uint16_t reset_index_guest_endian,
+                                      bool& is_reset_index_used_out,
+                                      bool& is_ffff_used_as_vertex_index_out);
+  static bool IsResetUsed(const uint32_t* source, uint32_t count,
+                          uint32_t reset_index_guest_endian,
+                          uint32_t low_bits_mask_guest_endian);
+  static void ReplaceResetIndex16To16(uint16_t* dest, const uint16_t* source,
+                                      uint32_t count,
+                                      uint16_t reset_index_guest_endian);
+  // For use when the reset index is not 0xFFFF, and 0xFFFF is also used as a
+  // valid index - keeps 0xFFFF as a real index and replaces the reset index
+  // with 0xFFFFFFFF instead.
+  static void ReplaceResetIndex16To24(uint32_t* dest, const uint16_t* source,
+                                      uint32_t count,
+                                      uint16_t reset_index_guest_endian);
+  // The reset index and the low 24 bits mask are taken explicitly because this
+  // function may be used two ways:
+  // - Passthrough - when the vertex shader swaps the indices (when 32-bit
+  //   indices are supported on the host), in this case HostSwap is kNone, but
+  //   the reset index and the guest low bits mask can be swapped according to
+  //   the guest endian.
+  // - Swapping for the host - when only 24 bits of an index are supported on
+  //   the host. In this case, masking and comparison are done before applying
+  //   HostSwap, but according to HostSwap, if needed, the data is swapped from
+  //   the PowerPC's big endianness to the host GPU little endianness that we
+  //   assume, which matches the Xenos's little endianness.
+  template <xenos::Endian HostSwap>
+  static void ReplaceResetIndex32To24(uint32_t* dest, const uint32_t* source,
+                                      uint32_t count,
+                                      uint32_t reset_index_guest_endian,
+                                      uint32_t low_bits_mask_guest_endian) {
+    // The Xbox 360's GPU only uses the low 24 bits of the index - masking.
+#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
+    // Scalar head - processing single indices until the source is aligned to
+    // the vector size, as the vector loop below uses aligned loads.
+    while (count && (reinterpret_cast<uintptr_t>(source) &
+                     (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1))) {
+      --count;
+      uint32_t index = *(source++) & low_bits_mask_guest_endian;
+      *(dest++) = index != reset_index_guest_endian
+                      ? xenos::GpuSwap(index, HostSwap)
+                      : UINT32_MAX;
+    }
+    if (count >= kSimdVectorU32Elements) {
+      SimdVectorU32 reset_index_guest_endian_simd =
+          ReplicateU32(reset_index_guest_endian);
+      SimdVectorU32 low_bits_mask_guest_endian_simd =
+          ReplicateU32(low_bits_mask_guest_endian);
+#if XE_ARCH_AMD64
+      // pshufb control performing HostSwap within each 32-bit lane, only
+      // initialized and used if swapping is needed.
+      __m128i host_swap_shuffle;
+      if constexpr (HostSwap != xenos::Endian::kNone) {
+        host_swap_shuffle = _mm_set_epi32(
+            int32_t(xenos::GpuSwap(uint32_t(0x0F0E0D0C), HostSwap)),
+            int32_t(xenos::GpuSwap(uint32_t(0x0B0A0908), HostSwap)),
+            int32_t(xenos::GpuSwap(uint32_t(0x07060504), HostSwap)),
+            int32_t(xenos::GpuSwap(uint32_t(0x03020100), HostSwap)));
+      }
+#endif  // XE_ARCH_AMD64
+      while (count >= kSimdVectorU32Elements) {
+        count -= kSimdVectorU32Elements;
+        // Comparison produces 0 or 0xFFFFFFFF in each 32-bit lane on SSE / AVX
+        // and Neon - we need 0xFFFFFFFF as the result for the primitive reset
+        // indices, so the result is `index | (index == reset_index)`. Swapping
+        // is done afterwards as 0xFFFFFFFF is the same in any byte order.
+        SimdVectorU32 source_simd = LoadAlignedVectorU32(source);
+        source += kSimdVectorU32Elements;
+        SimdVectorU32 result_simd;
+#if XE_ARCH_AMD64
+        source_simd =
+            _mm_and_si128(source_simd, low_bits_mask_guest_endian_simd);
+        result_simd = _mm_or_si128(
+            source_simd,
+            _mm_cmpeq_epi32(source_simd, reset_index_guest_endian_simd));
+        if constexpr (HostSwap != xenos::Endian::kNone) {
+          result_simd = _mm_shuffle_epi8(result_simd, host_swap_shuffle);
+        }
+#elif XE_ARCH_ARM64
+        source_simd = vandq_u32(source_simd, low_bits_mask_guest_endian_simd);
+        result_simd = vorrq_u32(
+            source_simd, vceqq_u32(source_simd, reset_index_guest_endian_simd));
+        if constexpr (HostSwap == xenos::Endian::k8in16) {
+          result_simd = vreinterpretq_u32_u8(
+              vrev16q_u8(vreinterpretq_u8_u32(result_simd)));
+        } else if constexpr (HostSwap == xenos::Endian::k8in32) {
+          result_simd = vreinterpretq_u32_u8(
+              vrev32q_u8(vreinterpretq_u8_u32(result_simd)));
+        } else if constexpr (HostSwap == xenos::Endian::k16in32) {
+          result_simd = vreinterpretq_u32_u16(
+              vrev32q_u16(vreinterpretq_u16_u32(result_simd)));
+        }
+#else
+#error SIMD ReplaceResetIndex32To24 not implemented.
+#endif  // XE_ARCH
+        // The destination is only co-aligned with the source on a best-effort
+        // basis by the caller, so the store doesn't assume alignment.
+        StoreUnalignedVectorU32(dest, result_simd);
+        dest += kSimdVectorU32Elements;
+      }
+    }
+#endif  // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
+    // Scalar tail (or the entire buffer if SIMD is not available).
+    while (count--) {
+      uint32_t index = *(source++) & low_bits_mask_guest_endian;
+      *(dest++) = index != reset_index_guest_endian
+                      ? xenos::GpuSwap(index, HostSwap)
+                      : UINT32_MAX;
+    }
+  }
+
+  // TODO(Triang3l): 16-bit > 32-bit primitive type conversion for Metal, where
+  // primitive reset is always enabled, if UINT16_MAX is used as a real vertex
+  // index.
+
+  struct PassthroughIndexTransform {
+    uint16_t operator()(uint16_t index) const { return index; }
+    uint32_t operator()(uint32_t index) const { return index; }
+  };
+  struct To24NonSwappingIndexTransform {
+    uint32_t operator()(uint32_t index) const {
+      return index & xenos::kVertexIndexMask;
+    }
+  };
+  struct To24Swapping8In16IndexTransform {
+    uint32_t operator()(uint32_t index) const {
+      return xenos::GpuSwap(index, xenos::Endian::k8in16) &
+             xenos::kVertexIndexMask;
+    }
+  };
+  struct To24Swapping8In32IndexTransform {
+    uint32_t operator()(uint32_t index) const {
+      return xenos::GpuSwap(index, xenos::Endian::k8in32) &
+             xenos::kVertexIndexMask;
+    }
+  };
+  struct To24Swapping16In32IndexTransform {
+    uint32_t operator()(uint32_t index) const {
+      return xenos::GpuSwap(index, xenos::Endian::k16in32) &
+             xenos::kVertexIndexMask;
+    }
+  };
+
+  // Triangle fans as triangle lists.
+  // Ordered as (v1, v2, v0), (v2, v3, v0) in Direct3D.
+  // https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
+  // Number of triangle list indices a fan of fan_index_count indices expands
+  // to: one triangle (3 indices) per fan vertex starting from the third one,
+  // nothing if the fan is too short to form a triangle.
+  static constexpr uint32_t GetTriangleFanListIndexCount(
+      uint32_t fan_index_count) {
+    if (fan_index_count <= 2) {
+      return 0;
+    }
+    const uint32_t triangle_count = fan_index_count - 2;
+    return triangle_count * 3;
+  }
+  // Expands a single triangle fan into a triangle list, writing
+  // GetTriangleFanListIndexCount(source_index_count) indices to dest. Each
+  // triangle is emitted as (previous, current, first), with every source index
+  // passed through index_transform exactly once.
+  template <typename Index, typename IndexTransform>
+  static void TriangleFanToList(Index* dest, const Index* source,
+                                uint32_t source_index_count,
+                                const IndexTransform& index_transform) {
+    // Fans shorter than one triangle produce nothing, consistently with
+    // GetTriangleFanListIndexCount.
+    if (source_index_count < 3) {
+      return;
+    }
+    const Index hub_index = index_transform(source[0]);
+    Index edge_start_index = index_transform(source[1]);
+    const Index* const source_end = source + source_index_count;
+    for (const Index* source_it = source + 2; source_it != source_end;
+         ++source_it) {
+      const Index edge_end_index = index_transform(*source_it);
+      dest[0] = edge_start_index;
+      dest[1] = edge_end_index;
+      dest[2] = hub_index;
+      dest += 3;
+      edge_start_index = edge_end_index;
+    }
+  }
+
+  // Number of line strip indices a loop of loop_index_count indices expands
+  // to - the loop plus one closing index, or nothing for less than 2 indices.
+  static constexpr uint32_t GetLineLoopStripIndexCount(
+      uint32_t loop_index_count) {
+    // Even if 2 vertices are supplied, two lines are still drawn between them.
+    // https://www.khronos.org/opengl/wiki/Primitive
+    // "You get n lines for n input vertices"
+    // "If the user only specifies 1 vertex, the drawing command is ignored"
+    if (loop_index_count < 2) {
+      return 0;
+    }
+    return loop_index_count + 1;
+  }
+  // Converts a single line loop to a line strip by copying the transformed
+  // indices and repeating the first one in the end, writing
+  // GetLineLoopStripIndexCount(source_index_count) indices to dest.
+  template <typename Index, typename IndexTransform>
+  static void LineLoopToStrip(Index* dest, const Index* source,
+                              uint32_t source_index_count,
+                              const IndexTransform& index_transform) {
+    // Nothing to draw with less than 2 indices, consistently with
+    // GetLineLoopStripIndexCount.
+    if (source_index_count < 2) {
+      return;
+    }
+    const Index closing_index = index_transform(*source);
+    Index* dest_write_ptr = dest;
+    *(dest_write_ptr++) = closing_index;
+    const Index* const source_end = source + source_index_count;
+    for (const Index* source_it = source + 1; source_it != source_end;
+         ++source_it) {
+      *(dest_write_ptr++) = index_transform(*source_it);
+    }
+    // Closing the loop with the already transformed first index.
+    *dest_write_ptr = closing_index;
+  }
+  static void LineLoopToStrip(uint16_t* dest, const uint16_t* source,
+                              uint32_t source_index_count,
+                              const PassthroughIndexTransform& index_transform);
+  static void LineLoopToStrip(uint32_t* dest, const uint32_t* source,
+                              uint32_t source_index_count,
+                              const PassthroughIndexTransform& index_transform);
+
+  // Number of triangle list indices a quad list expands to - 6 for every
+  // complete group of 4 indices, with an incomplete trailing quad dropped.
+  static constexpr uint32_t GetQuadListTriangleListIndexCount(
+      uint32_t quad_list_index_count) {
+    const uint32_t quad_count = quad_list_index_count / 4;
+    return quad_count * 6;
+  }
+  // Splits every complete quad (v0, v1, v2, v3) of the source into two
+  // triangles, (v0, v1, v2) and (v0, v2, v3), writing
+  // GetQuadListTriangleListIndexCount(source_index_count) indices to dest.
+  // Every source index is passed through index_transform exactly once.
+  template <typename Index, typename IndexTransform>
+  static void QuadListToTriangleList(Index* dest, const Index* source,
+                                     uint32_t source_index_count,
+                                     const IndexTransform& index_transform) {
+    for (uint32_t quads_remaining = source_index_count / 4; quads_remaining;
+         --quads_remaining, source += 4, dest += 6) {
+      // TODO(Triang3l): Find the correct order.
+      // First triangle - v0, v1, v2.
+      const Index shared_index_0 = index_transform(source[0]);
+      dest[0] = shared_index_0;
+      dest[1] = index_transform(source[1]);
+      const Index shared_index_2 = index_transform(source[2]);
+      dest[2] = shared_index_2;
+      // Second triangle - v0, v2, v3.
+      dest[3] = shared_index_0;
+      dest[4] = shared_index_2;
+      dest[5] = index_transform(source[3]);
+    }
+  }
+
+  // Pre-gathering the ranges allows for usage of the same functions for
+  // conversion with and without reset. In addition, this increases safety in
+  // weird cases - there won't be mismatch between the pre-calculation of the
+  // post-conversion index count and the actual conversion if the game for some
+  // reason modifies the index buffer between the two and adds or removes reset
+  // indices in it.
+  // One primitive (a run of indices between resets) of a guest draw, along
+  // with the number of host indices it is converted to.
+  struct SinglePrimitiveRange {
+    // Offset of the first index of the primitive in the guest index buffer, in
+    // indices.
+    uint32_t guest_offset;
+    // Number of guest indices in the primitive.
+    uint32_t guest_index_count;
+    // Number of indices written to the host index buffer for the primitive.
+    uint32_t host_index_count;
+
+    SinglePrimitiveRange(uint32_t guest_offset, uint32_t guest_index_count,
+                         uint32_t host_index_count)
+        : guest_offset{guest_offset},
+          guest_index_count{guest_index_count},
+          host_index_count{host_index_count} {}
+  };
+  static uint32_t GetMultiPrimitiveHostIndexCountAndRanges(
+      std::function<uint32_t(uint32_t)> single_primitive_guest_to_host_count,
+      const uint16_t* source, uint32_t source_index_count,
+      uint16_t reset_index_guest_endian,
+      std::deque<SinglePrimitiveRange>& ranges_append_out);
+  static uint32_t GetMultiPrimitiveHostIndexCountAndRanges(
+      std::function<uint32_t(uint32_t)> single_primitive_guest_to_host_count,
+      const uint32_t* source, uint32_t source_index_count,
+      uint32_t reset_index_guest_endian, uint32_t low_bits_mask_guest_endian,
+      std::deque<SinglePrimitiveRange>& ranges_append_out);
+
+  // Converts each pre-gathered primitive range of the source index buffer to
+  // the replacement host primitive type (triangle fans to triangle lists, line
+  // loops to line strips, quad lists to triangle lists), writing the results
+  // to dest one after another.
+  // - source: beginning of the guest index buffer, the guest_offset of each
+  //   range is relative to it, in indices.
+  // - index_transform: applied to every index read from the source.
+  // - ranges_beginning, ranges_end: iterators over SinglePrimitiveRange-like
+  //   elements (guest_offset, guest_index_count, host_index_count).
+  // Other primitive types are not handled (assertion).
+  template <typename Index, typename IndexTransform,
+            typename PrimitiveRangeIterator>
+  static void ConvertSinglePrimitiveRanges(
+      Index* dest, const Index* source,
+      xenos::PrimitiveType source_primitive_type,
+      const IndexTransform& index_transform,
+      PrimitiveRangeIterator ranges_beginning,
+      PrimitiveRangeIterator ranges_end) {
+    Index* dest_write_ptr = dest;
+    switch (source_primitive_type) {
+      case xenos::PrimitiveType::kTriangleFan:
+        for (PrimitiveRangeIterator range_it = ranges_beginning;
+             range_it != ranges_end; ++range_it) {
+          TriangleFanToList(dest_write_ptr, source + range_it->guest_offset,
+                            range_it->guest_index_count, index_transform);
+          // Advancing by the pre-calculated count rather than by what was
+          // actually written, so the layout always matches the pre-gathering.
+          dest_write_ptr += range_it->host_index_count;
+        }
+        break;
+      case xenos::PrimitiveType::kLineLoop:
+        for (PrimitiveRangeIterator range_it = ranges_beginning;
+             range_it != ranges_end; ++range_it) {
+          LineLoopToStrip(dest_write_ptr, source + range_it->guest_offset,
+                          range_it->guest_index_count, index_transform);
+          dest_write_ptr += range_it->host_index_count;
+        }
+        break;
+      case xenos::PrimitiveType::kQuadList:
+        for (PrimitiveRangeIterator range_it = ranges_beginning;
+             range_it != ranges_end; ++range_it) {
+          QuadListToTriangleList(dest_write_ptr,
+                                 source + range_it->guest_offset,
+                                 range_it->guest_index_count, index_transform);
+          dest_write_ptr += range_it->host_index_count;
+        }
+        break;
+      default:
+        assert_unhandled_case(source_primitive_type);
+    }
+  }
+
+  const RegisterFile& register_file_;
+  Memory& memory_;
+  TraceWriter& trace_writer_;
+  SharedMemory& shared_memory_;
+
+  bool full_32bit_vertex_indices_used_ = false;
+  bool convert_triangle_fans_to_lists_ = false;
+  bool convert_line_loops_to_strips_ = false;
+  bool convert_quad_lists_to_triangle_lists_ = false;
+
+  // Byte offsets used, for simplicity, directly as handles.
+  size_t builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX;
+  size_t builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX;
+
+  std::deque<SinglePrimitiveRange> single_primitive_ranges_;
+
+  // Caching for reuse of converted indices within a frame.
+
+  // 256 KB, as the largest possible guest index buffer - 0xFFFF 32-bit
+  // indices - is slightly smaller than 256 KB, thus cache entries need to store
+  // links within at most 2 buckets.
+  static constexpr uint32_t kCacheBucketSizeBytesLog2 = 18;
+  static constexpr uint32_t kCacheBucketSizeBytes =
+      uint32_t(1) << kCacheBucketSizeBytesLog2;
+  static constexpr uint32_t kCacheBucketCount =
+      xe::align(SharedMemory::kBufferSize, kCacheBucketSizeBytes) /
+      kCacheBucketSizeBytes;
+
+  // Identifies one processed guest index buffer range together with the
+  // parameters of the conversion. The fields alias the single 64-bit `key`,
+  // which is the value that is actually hashed and compared.
+  union CacheKey {
+    struct {
+      // Trailing numbers are the running total of bits used so far.
+      uint32_t base;                  // 32 total
+      uint32_t count : 16;            // 48
+      xenos::IndexFormat format : 1;  // 49
+      xenos::Endian endian : 2;       // 51
+      uint32_t is_reset_enabled : 1;  // 52
+      // kNone if not changing the type (like only processing the reset index).
+      xenos::PrimitiveType conversion_guest_primitive_type : 6;  // 58
+    };
+    uint64_t key = 0;
+
+    CacheKey() = default;
+    // Zeroes the whole key first and only then fills in the fields: the bit
+    // fields don't cover all 64 bits, and since hashing and equality use `key`
+    // as a whole, the leftover bits must have a defined value.
+    CacheKey(uint32_t base, uint32_t count, xenos::IndexFormat format,
+             xenos::Endian endian, bool is_reset_enabled,
+             xenos::PrimitiveType conversion_guest_primitive_type =
+                 xenos::PrimitiveType::kNone)
+        : key(0) {
+      this->base = base;
+      this->count = count;
+      this->format = format;
+      this->endian = endian;
+      this->is_reset_enabled = is_reset_enabled;
+      this->conversion_guest_primitive_type = conversion_guest_primitive_type;
+    }
+
+    // Hash functor for unordered containers - hashes the packed 64-bit key.
+    struct Hasher {
+      size_t operator()(const CacheKey& key) const {
+        return std::hash<uint64_t>{}(key.key);
+      }
+    };
+    bool operator==(const CacheKey& other_key) const {
+      return key == other_key.key;
+    }
+
+    // Size of the guest index data in bytes - `count` indices of 2 bytes each
+    // for kInt16, of 4 bytes each otherwise.
+    uint32_t GetSizeBytes() const {
+      return count * (format == xenos::IndexFormat::kInt16 ? sizeof(uint16_t)
+                                                           : sizeof(uint32_t));
+    }
+  };
+
+  // Subset of ConversionResult that can be reused for different primitive types
+  // if the same result is used irrespective of the type (like when only
+  // processing the reset index).
+  struct CachedResult {  // Held by CacheEntry::result and by CacheTransaction.
+    uint32_t host_draw_vertex_count;
+    ProcessedIndexBufferType index_buffer_type;
+    xenos::IndexFormat host_index_format;
+    xenos::Endian host_index_endian;
+    bool host_primitive_reset_enabled;
+    size_t host_index_buffer_handle;
+  };
+
+  // One cached conversion result, linked into the entry list of every bucket
+  // (at most 2) that its guest index range touches.
+  struct CacheEntry {
+    static_assert(
+        UINT16_MAX * sizeof(uint32_t) <=
+            (size_t(1) << kCacheBucketSizeBytesLog2),
+        "Assuming that primitive processor cache entries need to store to the "
+        "previous and to the next entries only within up to 2 buckets, so the "
+        "size of the cache buckets must be not smaller than the maximum guest "
+        "index buffer size");
+    union {
+      size_t free_next;
+      size_t buckets_prev[2];
+    };
+    size_t buckets_next[2];
+    CacheKey key;
+    CachedResult result;
+    // Number of buckets spanned by the bytes [key.base, key.base + size).
+    static uint32_t GetBucketCount(CacheKey key) {
+      const uint32_t first_bucket = key.base >> kCacheBucketSizeBytesLog2;
+      const uint32_t last_byte = key.base + (key.GetSizeBytes() - 1);
+      const uint32_t last_bucket = last_byte >> kCacheBucketSizeBytesLog2;
+      uint32_t count = last_bucket - first_bucket + 1;
+      assert_true(count <= 2,
+                  "Cache entries only store list links within two buckets");
+      return count;
+    }
+    // Same as the static overload, for this entry's own key.
+    uint32_t GetBucketCount() const { return GetBucketCount(key); }
+  };
+
+  // A cache transaction performs a few operations in a RAII-like way (so
+  // processing may return an error for any reason, and won't have to clean up
+  // cache_currently_processing_base_ / size_bytes_ explicitly):
+  // - Transaction initialization:
+  //   - Lookup of previously processed indices in the cache.
+  //   - If not found, beginning to add a new entry that is going to be
+  //     processed:
+  //     - Marking the range as currently being processed, for slightly safer
+  //       race condition handling if one happens - if invalidation happens
+  //       during the transaction (but outside a global critical region lock,
+  //       since processing may take a long time), the new cache entry won't be
+  //       stored as it will already be invalid at the time of the completion of
+  //       the transaction.
+  //     - Enabling an access callback for the range.
+  // - Setting the new result after processing (if not found in the cache
+  //   previously).
+  // - Transaction completion:
+  //   - If the range wasn't invalidated during the transaction, storing the new
+  //     entry in the cache.
+  // If an entry was found in the cache (GetFoundResult returns non-null), it
+  // MUST be used instead of processing - this class doesn't provide the
+  // possibility to replace existing entries.
+  class CacheTransaction final {
+   public:
+    CacheTransaction(PrimitiveProcessor& processor, CacheKey key);
+    // Returns the result located in the cache by the constructor, or nullptr if
+    // there was none and a new result has to be provided via SetNewResult.
+    const CachedResult* GetFoundResult() const {
+      if (result_type_ != ResultType::kExisting) {
+        return nullptr;
+      }
+      return &result_;
+    }
+    // Records the freshly processed result for this transaction.
+    void SetNewResult(const CachedResult& new_result) {
+      // Replacement of an existing entry is not allowed.
+      assert_true(result_type_ != ResultType::kExisting);
+      result_type_ = ResultType::kNewSet;
+      result_ = new_result;
+    }
+    ~CacheTransaction();
+
+   private:
+    PrimitiveProcessor& processor_;
+    // If key_.count == 0, this transaction shouldn't do anything - for empty
+    // ranges it's pointless, and it's unsafe to get the end pointer without
+    // special logic, and count == 0 is also used as a special indicator for
+    // vertex count below the cache usage threshold.
+    CacheKey key_;
+    CachedResult result_;
+    // Whether result_ is still missing, was supplied through SetNewResult, or
+    // was found already existing in the cache.
+    enum class ResultType {
+      kNewUnset,
+      kNewSet,
+      kExisting,
+    };
+    ResultType result_type_ = ResultType::kNewUnset;
+  };
+
+  std::deque<CacheEntry> cache_entry_pool_;
+
+  void* memory_invalidation_callback_handle_ = nullptr;
+
+  xe::global_critical_region global_critical_region_;
+  // Modified by both the processor and the invalidation callback.
+  std::unordered_map<CacheKey, size_t, CacheKey::Hasher> cache_map_;
+  // The conversion is performed while the lock is released since it may take a
+  // long time.
+  // If during the conversion the region currently being converted is
+  // invalidated, the current entry will not be added to the cache.
+  // Modified by the processor, read by the invalidation callback.
+  uint32_t cache_currently_processing_base_ = 0;
+  // 0 if not in a cache transaction that hasn't found an existing entry
+  // currently.
+  uint32_t cache_currently_processing_size_bytes_ = 0;
+  // Modified by both the processor and the invalidation callback.
+  size_t cache_bucket_free_first_entry_ = SIZE_MAX;
+  // Modified by both the processor and the invalidation callback.
+  uint64_t cache_buckets_non_empty_l1_[(kCacheBucketCount + 63) / 64] = {};
+  // For even faster handling of memory invalidation - whether any bit is set in
+  // each cache_buckets_non_empty_l1_.
+  // Modified by both the processor and the invalidation callback.
+  uint64_t cache_buckets_non_empty_l2_[(kCacheBucketCount + (64 * 64 - 1)) /
+                                       (64 * 64)] = {};
+  // Must be called in a global critical region.
+  // Refreshes the L2 bit for one 64-bucket group so that it is set exactly
+  // when the corresponding cache_buckets_non_empty_l1_ word is non-zero.
+  void UpdateCacheBucketsNonEmptyL2(
+      uint32_t bucket_index_div_64,
+      [[maybe_unused]] const std::unique_lock<std::recursive_mutex>&
+          global_lock) {
+    const uint64_t l2_bit = uint64_t(1) << (bucket_index_div_64 & 63);
+    uint64_t& l2_word = cache_buckets_non_empty_l2_[bucket_index_div_64 >> 6];
+    const bool l1_has_any_bit =
+        cache_buckets_non_empty_l1_[bucket_index_div_64] != 0;
+    l2_word = l1_has_any_bit ? (l2_word | l2_bit) : (l2_word & ~l2_bit);
+  }
+  // cache_buckets_non_empty_l1_ (along with cache_buckets_non_empty_l2_, which
+  // must be kept in sync) used for indication whether each entry is non-empty,
+  // for faster clearing (there's no special index here for an empty entry).
+  // Huge, so it's the last in the class.
+  // Modified by both the processor and the invalidation callback.
+  size_t cache_bucket_first_entries_[kCacheBucketCount];
+  static std::pair<uint32_t, uint32_t> MemoryInvalidationCallbackThunk(
+      void* context_ptr, uint32_t physical_address_start, uint32_t length,
+      bool exact_range);
+};
+
+}  // namespace gpu
+}  // namespace xe
+
+#endif  // XENIA_GPU_PRIMITIVE_PROCESSOR_H_
--- a/src/xenia/gpu/register_table.inc
+++ b/src/xenia/gpu/register_table.inc
@ -342,6 +342,8 @@ XE_GPU_REGISTER(0x2184, kDword, SQ_WRAPPING_1)

 // These three registers are set by the command processor.
 XE_GPU_REGISTER(0x21F9, kDword, VGT_EVENT_INITIATOR)
+XE_GPU_REGISTER(0x21FA, kDword, VGT_DMA_BASE)
+XE_GPU_REGISTER(0x21FB, kDword, VGT_DMA_SIZE)
 XE_GPU_REGISTER(0x21FC, kDword, VGT_DRAW_INITIATOR)
 XE_GPU_REGISTER(0x21FD, kDword, VGT_IMMED_DATA)

@ -419,6 +421,11 @@ XE_GPU_REGISTER(0x2323, kDword, RB_COPY_SURFACE_SLICE)
 XE_GPU_REGISTER(0x2324, kDword, RB_SAMPLE_COUNT_CTL)
 XE_GPU_REGISTER(0x2325, kDword, RB_SAMPLE_COUNT_ADDR)

+// Polygon offset scales and offsets are 32-bit floating-point.
+// "slope computed in subpixels (1/12 or 1/16)" - R5xx Acceleration.
+// But the correct scale for conversion of the slope scale (FRONT_BACK/SCALE)
+// from subpixels to pixels is likely 1/16 according to:
+// https://github.com/mesa3d/mesa/blob/54ad9b444c8e73da498211870e785239ad3ff1aa/src/gallium/drivers/radeonsi/si_state.c#L946
 XE_GPU_REGISTER(0x2380, kFloat, PA_SU_POLY_OFFSET_FRONT_SCALE)
 XE_GPU_REGISTER(0x2381, kFloat, PA_SU_POLY_OFFSET_FRONT_OFFSET)
 XE_GPU_REGISTER(0x2382, kFloat, PA_SU_POLY_OFFSET_BACK_SCALE)
--- a/src/xenia/gpu/registers.h
+++ b/src/xenia/gpu/registers.h
@ -13,12 +13,19 @@
 #include <cstdint>
 #include <cstdlib>

+#include "xenia/base/assert.h"
 #include "xenia/gpu/xenos.h"

 // Most registers can be found from:
 // https://github.com/UDOOboard/Kernel_Unico/blob/master/drivers/mxc/amd-gpu/include/reg/yamato/14/yamato_registers.h
 // Some registers were added on Adreno specifically and are not referenced in
 // game .pdb files and never set by games.
+
+// Only 32-bit types (uint32_t, int32_t, float or enums with uint32_t / int32_t
+// as the underlying type) are allowed in the bit fields here, as Visual C++
+// restarts packing when a field requires different alignment than the previous
+// one.
+
 namespace xe {
 namespace gpu {

@ -38,7 +45,7 @@ namespace reg {

 *******************************************************************************/

-union COHER_STATUS_HOST {
+union alignas(uint32_t) COHER_STATUS_HOST {
  struct {
    uint32_t matching_contexts : 8;      // +0
    uint32_t rb_copy_dest_base_ena : 1;  // +8
@ -60,8 +67,9 @@ union COHER_STATUS_HOST {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_COHER_STATUS_HOST;
 };
+static_assert_size(COHER_STATUS_HOST, sizeof(uint32_t));

-union WAIT_UNTIL {
+union alignas(uint32_t) WAIT_UNTIL {
  struct {
    uint32_t : 1;                    // +0
    uint32_t wait_re_vsync : 1;      // +1
@ -83,6 +91,7 @@ union WAIT_UNTIL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_WAIT_UNTIL;
 };
+static_assert_size(WAIT_UNTIL, sizeof(uint32_t));

 /*******************************************************************************
  ___ ___ ___  _   _ ___ _  _  ___ ___ ___
@ -92,11 +101,12 @@ union WAIT_UNTIL {

 *******************************************************************************/

-union SQ_PROGRAM_CNTL {
+union alignas(uint32_t) SQ_PROGRAM_CNTL {
  struct {
    // Note from a2xx.xml:
    // Only 0x3F worth of valid register values for VS_NUM_REG and PS_NUM_REG,
    // but high bit is set to indicate "0 registers used".
+    // (Register count = (num_reg & 0x80) ? 0 : (num_reg + 1))
    uint32_t vs_num_reg : 8;                           // +0
    uint32_t ps_num_reg : 8;                           // +8
    uint32_t vs_resource : 1;                          // +16
@ -111,8 +121,9 @@ union SQ_PROGRAM_CNTL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_SQ_PROGRAM_CNTL;
 };
+static_assert_size(SQ_PROGRAM_CNTL, sizeof(uint32_t));

-union SQ_CONTEXT_MISC {
+union alignas(uint32_t) SQ_CONTEXT_MISC {
  struct {
    uint32_t inst_pred_optimize : 1;          // +0
    uint32_t sc_output_screen_xy : 1;         // +1
@ -142,8 +153,9 @@ union SQ_CONTEXT_MISC {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_SQ_CONTEXT_MISC;
 };
+static_assert_size(SQ_CONTEXT_MISC, sizeof(uint32_t));

-union SQ_INTERPOLATOR_CNTL {
+union alignas(uint32_t) SQ_INTERPOLATOR_CNTL {
  struct {
    uint32_t param_shade : 16;  // +0
    // SampleLocation bits - 0 for centroid, 1 for center, if
@ -153,6 +165,7 @@ union SQ_INTERPOLATOR_CNTL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_SQ_INTERPOLATOR_CNTL;
 };
+static_assert_size(SQ_INTERPOLATOR_CNTL, sizeof(uint32_t));

 /*******************************************************************************
 __   _____ ___ _____ _____  __
@ -172,7 +185,17 @@ union SQ_INTERPOLATOR_CNTL {

 *******************************************************************************/

-union VGT_DRAW_INITIATOR {
+// Layout of the VGT_DMA_SIZE register: a 24-bit word count in the low bits and
+// a 2-bit endian swap mode in the top bits.
+union alignas(uint32_t) VGT_DMA_SIZE {
+  struct {
+    uint32_t num_words : 24;      // +0
+    uint32_t : 6;                 // +24
+    xenos::Endian swap_mode : 2;  // +30
+  };
+  uint32_t value;
+  static constexpr Register register_index = XE_GPU_REG_VGT_DMA_SIZE;
+};
+// Guards against the bit fields being packed into more than one 32-bit word.
+static_assert_size(VGT_DMA_SIZE, sizeof(uint32_t));
+
+union alignas(uint32_t) VGT_DRAW_INITIATOR {
  // Different than on A2xx and R6xx/R7xx.
  struct {
    xenos::PrimitiveType prim_type : 6;     // +0
@ -187,22 +210,88 @@ union VGT_DRAW_INITIATOR {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_VGT_DRAW_INITIATOR;
 };
+static_assert_size(VGT_DRAW_INITIATOR, sizeof(uint32_t));

-union VGT_OUTPUT_PATH_CNTL {
+// Unlike on R6xx (but closer to R5xx), and according to the Adreno 200 header,
+// the registers related to the vertex index are 24-bit. Vertex indices are
+// unsigned, and only the lower 24 bits of them are actually used by the GPU -
+// this has been verified on an Adreno 200 phone (LG Optimus L7) on OpenGL ES
+// using a GL_UNSIGNED_INT element array buffer with junk in the upper 8 bits
+// that had no effect on drawing.
+
+// The order of operations is primitive reset index checking -> offsetting ->
+// clamping.
+
+union alignas(uint32_t) VGT_MULTI_PRIM_IB_RESET_INDX {
+  struct {
+    // The upper 8 bits of the value from the index buffer are confirmed to be
+    // ignored. So, though this specifically is untested (because
+    // GL_PRIMITIVE_RESTART_FIXED_INDEX was added only in OpenGL ES 3.0, though
+    // it behaves conceptually close to our expectations anyway - uses the
+    // 0xFFFFFFFF restart index while GL_MAX_ELEMENT_INDEX may be 0xFFFFFF),
+    // the restart index check likely only involves the lower 24 bit of the
+    // vertex index - therefore, if reset_indx is 0xFFFFFF, likely 0xFFFFFF,
+    // 0x1FFFFFF, 0xFFFFFFFF all cause primitive reset.
+    uint32_t reset_indx : 24;
+  };
+  uint32_t value;
+  static constexpr Register register_index =
+      XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX;
+};
+static_assert_size(VGT_MULTI_PRIM_IB_RESET_INDX, sizeof(uint32_t));
+
+union alignas(uint32_t) VGT_INDX_OFFSET {
+  struct {
+    // Unlike R5xx's VAP_INDEX_OFFSET, which is signed 25-bit, this is 24-bit -
+    // and signedness doesn't matter as index calculations are done in 24-bit
+    // integers, and ((0xFFFFFE + 3) & 0xFFFFFF) == 1 anyway, just like
+    // ((0xFFFFFFFE + 3) & 0xFFFFFF) == 1 if we treated it as signed by
+    // sign-extending on the host. Direct3D 9 just writes BaseVertexIndex as a
+    // signed int32 to the entire register, but the upper 8 bits are ignored
+    // anyway, and that has no effect on offsets that fit in 24 bits.
+    uint32_t indx_offset : 24;
+  };
+  uint32_t value;
+  static constexpr Register register_index = XE_GPU_REG_VGT_INDX_OFFSET;
+};
+static_assert_size(VGT_INDX_OFFSET, sizeof(uint32_t));
+
+union alignas(uint32_t) VGT_MIN_VTX_INDX {
+  struct {
+    uint32_t min_indx : 24;  // +0
+  };
+  uint32_t value;
+  static constexpr Register register_index = XE_GPU_REG_VGT_MIN_VTX_INDX;
+};
+static_assert_size(VGT_MIN_VTX_INDX, sizeof(uint32_t));
+
+union alignas(uint32_t) VGT_MAX_VTX_INDX {
+  struct {
+    // Usually 0xFFFF or 0xFFFFFF.
+    uint32_t max_indx : 24;  // +0
+  };
+  uint32_t value;
+  static constexpr Register register_index = XE_GPU_REG_VGT_MAX_VTX_INDX;
+};
+static_assert_size(VGT_MAX_VTX_INDX, sizeof(uint32_t));
+
+union alignas(uint32_t) VGT_OUTPUT_PATH_CNTL {
  struct {
    xenos::VGTOutputPath path_select : 2;  // +0
  };
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_VGT_OUTPUT_PATH_CNTL;
 };
+static_assert_size(VGT_OUTPUT_PATH_CNTL, sizeof(uint32_t));

-union VGT_HOS_CNTL {
+union alignas(uint32_t) VGT_HOS_CNTL {
  struct {
    xenos::TessellationMode tess_mode : 2;  // +0
  };
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_VGT_HOS_CNTL;
 };
+static_assert_size(VGT_HOS_CNTL, sizeof(uint32_t));

 /*******************************************************************************
  ___ ___ ___ __  __ ___ _____ _____   _____
@ -217,7 +306,7 @@ union VGT_HOS_CNTL {

 *******************************************************************************/

-union PA_SU_POINT_MINMAX {
+union alignas(uint32_t) PA_SU_POINT_MINMAX {
  struct {
    // Radius, 12.4 fixed point.
    uint32_t min_size : 16;  // +0
@ -226,8 +315,9 @@ union PA_SU_POINT_MINMAX {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_SU_POINT_MINMAX;
 };
+static_assert_size(PA_SU_POINT_MINMAX, sizeof(uint32_t));

-union PA_SU_POINT_SIZE {
+union alignas(uint32_t) PA_SU_POINT_SIZE {
  struct {
    // 1/2 width or height, 12.4 fixed point.
    uint32_t height : 16;  // +0
@ -236,14 +326,19 @@ union PA_SU_POINT_SIZE {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_SU_POINT_SIZE;
 };
+static_assert_size(PA_SU_POINT_SIZE, sizeof(uint32_t));

 // Setup Unit / Scanline Converter mode cntl
-union PA_SU_SC_MODE_CNTL {
+union alignas(uint32_t) PA_SU_SC_MODE_CNTL {
  struct {
    uint32_t cull_front : 1;  // +0
    uint32_t cull_back : 1;   // +1
    // 0 - front is CCW, 1 - front is CW.
-    uint32_t face : 1;                            // +2
+    uint32_t face : 1;  // +2
+    // The game Fuse uses poly_mode 2 for triangles, which is "reserved" on R6xx
+    // and not defined on Adreno 2xx, but polymode_front/back_ptype are 0
+    // (points) in this case in Fuse, which should not be respected for
+    // non-kDualMode as the game wants to draw filled triangles.
    xenos::PolygonModeEnable poly_mode : 2;       // +3
    xenos::PolygonType polymode_front_ptype : 3;  // +5
    xenos::PolygonType polymode_back_ptype : 3;   // +8
@ -267,9 +362,10 @@ union PA_SU_SC_MODE_CNTL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_SU_SC_MODE_CNTL;
 };
+static_assert_size(PA_SU_SC_MODE_CNTL, sizeof(uint32_t));

 // Setup Unit Vertex Control
-union PA_SU_VTX_CNTL {
+union alignas(uint32_t) PA_SU_VTX_CNTL {
  struct {
    uint32_t pix_center : 1;  // +0 1 = half pixel offset (OpenGL).
    uint32_t round_mode : 2;  // +1
@ -278,8 +374,9 @@ union PA_SU_VTX_CNTL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_SU_VTX_CNTL;
 };
+static_assert_size(PA_SU_VTX_CNTL, sizeof(uint32_t));

-union PA_SC_MPASS_PS_CNTL {
+union alignas(uint32_t) PA_SC_MPASS_PS_CNTL {
  struct {
    uint32_t mpass_pix_vec_per_pass : 20;  // +0
    uint32_t : 11;                         // +20
@ -288,9 +385,10 @@ union PA_SC_MPASS_PS_CNTL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_SC_MPASS_PS_CNTL;
 };
+static_assert_size(PA_SC_MPASS_PS_CNTL, sizeof(uint32_t));

 // Scanline converter viz query, used by D3D for gpu side conditional rendering
-union PA_SC_VIZ_QUERY {
+union alignas(uint32_t) PA_SC_VIZ_QUERY {
  struct {
    // the visibility of draws should be evaluated
    uint32_t viz_query_ena : 1;  // +0
@ -303,9 +401,10 @@ union PA_SC_VIZ_QUERY {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_SC_VIZ_QUERY;
 };
+static_assert_size(PA_SC_VIZ_QUERY, sizeof(uint32_t));

 // Clipper clip control
-union PA_CL_CLIP_CNTL {
+union alignas(uint32_t) PA_CL_CLIP_CNTL {
  struct {
    uint32_t ucp_ena_0 : 1;               // +0
    uint32_t ucp_ena_1 : 1;               // +1
@ -328,9 +427,10 @@ union PA_CL_CLIP_CNTL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_CL_CLIP_CNTL;
 };
+static_assert_size(PA_CL_CLIP_CNTL, sizeof(uint32_t));

 // Viewport transform engine control
-union PA_CL_VTE_CNTL {
+union alignas(uint32_t) PA_CL_VTE_CNTL {
  struct {
    uint32_t vport_x_scale_ena : 1;   // +0
    uint32_t vport_x_offset_ena : 1;  // +1
@ -347,8 +447,31 @@ union PA_CL_VTE_CNTL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_CL_VTE_CNTL;
 };
+static_assert_size(PA_CL_VTE_CNTL, sizeof(uint32_t));

-union PA_SC_WINDOW_OFFSET {
+union alignas(uint32_t) PA_SC_SCREEN_SCISSOR_TL {
+  struct {
+    int32_t tl_x : 15;  // +0 (signed)
+    uint32_t : 1;       // +15
+    int32_t tl_y : 15;  // +16 (signed)
+  };
+  uint32_t value;
+  static constexpr Register register_index = XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL;
+};
+static_assert_size(PA_SC_SCREEN_SCISSOR_TL, sizeof(uint32_t));
+
+union alignas(uint32_t) PA_SC_SCREEN_SCISSOR_BR {
+  struct {
+    int32_t br_x : 15;  // +0 (signed)
+    uint32_t : 1;       // +15
+    int32_t br_y : 15;  // +16 (signed)
+  };
+  uint32_t value;
+  static constexpr Register register_index = XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR;
+};
+static_assert_size(PA_SC_SCREEN_SCISSOR_BR, sizeof(uint32_t));
+
+union alignas(uint32_t) PA_SC_WINDOW_OFFSET {
  struct {
    int32_t window_x_offset : 15;  // +0
    uint32_t : 1;                  // +15
@ -357,8 +480,9 @@ union PA_SC_WINDOW_OFFSET {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_SC_WINDOW_OFFSET;
 };
+static_assert_size(PA_SC_WINDOW_OFFSET, sizeof(uint32_t));

-union PA_SC_WINDOW_SCISSOR_TL {
+union alignas(uint32_t) PA_SC_WINDOW_SCISSOR_TL {
  struct {
    uint32_t tl_x : 14;                  // +0
    uint32_t : 2;                        // +14
@ -369,8 +493,9 @@ union PA_SC_WINDOW_SCISSOR_TL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL;
 };
+static_assert_size(PA_SC_WINDOW_SCISSOR_TL, sizeof(uint32_t));

-union PA_SC_WINDOW_SCISSOR_BR {
+union alignas(uint32_t) PA_SC_WINDOW_SCISSOR_BR {
  struct {
    uint32_t br_x : 14;  // +0
    uint32_t : 2;        // +14
@ -379,6 +504,7 @@ union PA_SC_WINDOW_SCISSOR_BR {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR;
 };
+static_assert_size(PA_SC_WINDOW_SCISSOR_BR, sizeof(uint32_t));

 /*******************************************************************************
  ___ ___
@ -388,15 +514,16 @@ union PA_SC_WINDOW_SCISSOR_BR {

 *******************************************************************************/

-union RB_MODECONTROL {
+union alignas(uint32_t) RB_MODECONTROL {
  struct {
    xenos::ModeControl edram_mode : 3;  // +0
  };
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_RB_MODECONTROL;
 };
+static_assert_size(RB_MODECONTROL, sizeof(uint32_t));

-union RB_SURFACE_INFO {
+union alignas(uint32_t) RB_SURFACE_INFO {
  struct {
    uint32_t surface_pitch : 14;          // +0 in pixels.
    uint32_t : 2;                         // +14
@ -406,8 +533,9 @@ union RB_SURFACE_INFO {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_RB_SURFACE_INFO;
 };
+static_assert_size(RB_SURFACE_INFO, sizeof(uint32_t));

-union RB_COLORCONTROL {
+union alignas(uint32_t) RB_COLORCONTROL {
  struct {
    xenos::CompareFunction alpha_func : 3;  // +0
    uint32_t alpha_test_enable : 1;         // +3
@ -455,8 +583,9 @@ union RB_COLORCONTROL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_RB_COLORCONTROL;
 };
+static_assert_size(RB_COLORCONTROL, sizeof(uint32_t));

-union RB_COLOR_INFO {
+union alignas(uint32_t) RB_COLOR_INFO {
  struct {
    uint32_t color_base : 12;                         // +0 in tiles.
    uint32_t : 4;                                     // +12
@ -468,8 +597,9 @@ union RB_COLOR_INFO {
  // RB_COLOR[1-3]_INFO also use this format.
  static const Register rt_register_indices[4];
 };
+static_assert_size(RB_COLOR_INFO, sizeof(uint32_t));

-union RB_COLOR_MASK {
+union alignas(uint32_t) RB_COLOR_MASK {
  struct {
    uint32_t write_red0 : 1;    // +0
    uint32_t write_green0 : 1;  // +1
@ -491,8 +621,9 @@ union RB_COLOR_MASK {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_RB_COLOR_MASK;
 };
+static_assert_size(RB_COLOR_MASK, sizeof(uint32_t));

-union RB_BLENDCONTROL {
+union alignas(uint32_t) RB_BLENDCONTROL {
  struct {
    xenos::BlendFactor color_srcblend : 5;   // +0
    xenos::BlendOp color_comb_fcn : 3;       // +5
@ -508,8 +639,9 @@ union RB_BLENDCONTROL {
  static constexpr Register register_index = XE_GPU_REG_RB_BLENDCONTROL0;
  static const Register rt_register_indices[4];
 };
+static_assert_size(RB_BLENDCONTROL, sizeof(uint32_t));

-union RB_DEPTHCONTROL {
+union alignas(uint32_t) RB_DEPTHCONTROL {
  struct {
    uint32_t stencil_enable : 1;  // +0
    uint32_t z_enable : 1;        // +1
@ -530,8 +662,9 @@ union RB_DEPTHCONTROL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_RB_DEPTHCONTROL;
 };
+static_assert_size(RB_DEPTHCONTROL, sizeof(uint32_t));

-union RB_STENCILREFMASK {
+union alignas(uint32_t) RB_STENCILREFMASK {
  struct {
    uint32_t stencilref : 8;        // +0
    uint32_t stencilmask : 8;       // +8
@ -541,8 +674,9 @@ union RB_STENCILREFMASK {
  static constexpr Register register_index = XE_GPU_REG_RB_STENCILREFMASK;
  // RB_STENCILREFMASK_BF also uses this format.
 };
+static_assert_size(RB_STENCILREFMASK, sizeof(uint32_t));

-union RB_DEPTH_INFO {
+union alignas(uint32_t) RB_DEPTH_INFO {
  struct {
    uint32_t depth_base : 12;                         // +0 in tiles.
    uint32_t : 4;                                     // +12
@ -551,10 +685,11 @@ union RB_DEPTH_INFO {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_RB_DEPTH_INFO;
 };
+static_assert_size(RB_DEPTH_INFO, sizeof(uint32_t));

 // Copy registers are very different than on Adreno.

-union RB_COPY_CONTROL {
+union alignas(uint32_t) RB_COPY_CONTROL {
  struct {
    uint32_t copy_src_select : 3;                    // +0 Depth is 4.
    uint32_t : 1;                                    // +3
@ -568,8 +703,9 @@ union RB_COPY_CONTROL {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_RB_COPY_CONTROL;
 };
+static_assert_size(RB_COPY_CONTROL, sizeof(uint32_t));

-union RB_COPY_DEST_INFO {
+union alignas(uint32_t) RB_COPY_DEST_INFO {
  struct {
    xenos::Endian128 copy_dest_endian : 3;    // +0
    uint32_t copy_dest_array : 1;             // +3
@ -583,8 +719,9 @@ union RB_COPY_DEST_INFO {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_RB_COPY_DEST_INFO;
 };
+static_assert_size(RB_COPY_DEST_INFO, sizeof(uint32_t));

-union RB_COPY_DEST_PITCH {
+union alignas(uint32_t) RB_COPY_DEST_PITCH {
  struct {
    uint32_t copy_dest_pitch : 14;   // +0
    uint32_t : 2;                    // +14
@ -593,6 +730,7 @@ union RB_COPY_DEST_PITCH {
  uint32_t value;
  static constexpr Register register_index = XE_GPU_REG_RB_COPY_DEST_PITCH;
 };
+static_assert_size(RB_COPY_DEST_PITCH, sizeof(uint32_t));

 }  // namespace reg

--- a/src/xenia/gpu/render_target_cache.cc
+++ b/src/xenia/gpu/render_target_cache.cc
--- a/Show More
+++ b/Show More