Implement delayed depth readbacks for Vulkan only

This commit is contained in:
Henrik Rydgård 2023-02-05 16:59:23 +01:00
parent d1e0061dcd
commit b3e6b81e43
8 changed files with 136 additions and 55 deletions

View File

@ -4,6 +4,13 @@
#include "Common/Log.h"
#include "Common/StringUtils.h"
void CachedReadback::Destroy(VulkanContext *vulkan) {
if (buffer) {
vulkan->Delete().QueueDeleteBufferAllocation(buffer, allocation);
}
bufferSize = 0;
}
void FrameData::Init(VulkanContext *vulkan, int index) {
this->index = index;
VkDevice device = vulkan->GetDevice();
@ -48,6 +55,12 @@ void FrameData::Destroy(VulkanContext *vulkan) {
vkDestroyCommandPool(device, cmdPoolMain, nullptr);
vkDestroyFence(device, fence, nullptr);
vkDestroyQueryPool(device, profile.queryPool, nullptr);
readbacks_.IterateMut([=](const ReadbackKey &key, CachedReadback *value) {
value->Destroy(vulkan);
delete value;
});
readbacks_.Clear();
}
void FrameData::AcquireNextImage(VulkanContext *vulkan, FrameDataShared &shared) {

View File

@ -6,6 +6,7 @@
#include <condition_variable>
#include "Common/GPU/Vulkan/VulkanContext.h"
#include "Common/Data/Collections/Hashmaps.h"
enum {
MAX_TIMESTAMP_QUERIES = 128,
@ -25,6 +26,23 @@ struct QueueProfileContext {
double cpuEndTime;
};
class VKRFramebuffer;
struct ReadbackKey {
const VKRFramebuffer *framebuf;
int width;
int height;
};
struct CachedReadback {
VkBuffer buffer;
VmaAllocation allocation;
VkDeviceSize bufferSize;
bool isCoherent;
void Destroy(VulkanContext *vulkan);
};
struct FrameDataShared {
// Permanent objects
VkSemaphore acquireSemaphore = VK_NULL_HANDLE;
@ -77,6 +95,11 @@ struct FrameData {
QueueProfileContext profile;
bool profilingEnabled_ = false;
// Async readback cache.
DenseHashMap<ReadbackKey, CachedReadback*, nullptr> readbacks_;
FrameData() : readbacks_(8) {}
void Init(VulkanContext *vulkan, int index);
void Destroy(VulkanContext *vulkan);
@ -91,5 +114,5 @@ struct FrameData {
private:
// Metadata for logging etc
int index;
int index = -1;
};

View File

@ -69,7 +69,8 @@ void VulkanQueueRunner::CreateDeviceObjects() {
void VulkanQueueRunner::DestroyDeviceObjects() {
INFO_LOG(G3D, "VulkanQueueRunner::DestroyDeviceObjects");
DestroyReadbackBuffer();
syncReadback_.Destroy(vulkan_);
renderPasses_.IterateMut([&](const RPKey &rpkey, VKRRenderPass *rp) {
_assert_(rp);
@ -419,7 +420,7 @@ void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, FrameData &frame
PerformBlit(step, cmd);
break;
case VKRStepType::READBACK:
PerformReadback(step, cmd);
PerformReadback(step, cmd, frameData);
break;
case VKRStepType::READBACK_IMAGE:
PerformReadbackImage(step, cmd);
@ -1944,52 +1945,35 @@ void VulkanQueueRunner::SetupTransferDstWriteAfterWrite(VKRImage &img, VkImageAs
);
}
void VulkanQueueRunner::ResizeReadbackBuffer(VkDeviceSize requiredSize) {
if (readbackBuffer_ && requiredSize <= readbackBufferSize_) {
void VulkanQueueRunner::ResizeReadbackBuffer(CachedReadback *readback, VkDeviceSize requiredSize) {
if (readback->buffer && requiredSize <= readback->bufferSize) {
return;
}
if (readbackBuffer_) {
vulkan_->Delete().QueueDeleteBufferAllocation(readbackBuffer_, readbackAllocation_);
if (readback->buffer) {
vulkan_->Delete().QueueDeleteBufferAllocation(readback->buffer, readback->allocation);
}
readbackBufferSize_ = requiredSize;
readback->bufferSize = requiredSize;
VkDevice device = vulkan_->GetDevice();
VkBufferCreateInfo buf{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
buf.size = readbackBufferSize_;
buf.size = readback->bufferSize;
buf.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;
VmaAllocationCreateInfo allocCreateInfo{};
allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
VmaAllocationInfo allocInfo{};
VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &buf, &allocCreateInfo, &readbackBuffer_, &readbackAllocation_, &allocInfo);
VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &buf, &allocCreateInfo, &readback->buffer, &readback->allocation, &allocInfo);
_assert_(res == VK_SUCCESS);
const VkMemoryType &memoryType = vulkan_->GetMemoryProperties().memoryTypes[allocInfo.memoryType];
readbackBufferIsCoherent_ = (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
readback->isCoherent = (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
}
void VulkanQueueRunner::DestroyReadbackBuffer() {
if (readbackBuffer_) {
vulkan_->Delete().QueueDeleteBufferAllocation(readbackBuffer_, readbackAllocation_);
}
readbackBufferSize_ = 0;
}
void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd) {
ResizeReadbackBuffer(sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height);
VkBufferImageCopy region{};
region.imageOffset = { step.readback.srcRect.offset.x, step.readback.srcRect.offset.y, 0 };
region.imageExtent = { step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, 1 };
region.imageSubresource.aspectMask = step.readback.aspectMask;
region.imageSubresource.layerCount = 1;
region.bufferOffset = 0;
region.bufferRowLength = step.readback.srcRect.extent.width;
region.bufferImageHeight = step.readback.srcRect.extent.height;
void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd, FrameData &frameData) {
VkImage image;
VkImageLayout copyLayout;
// Special case for backbuffer readbacks.
@ -2023,7 +2007,40 @@ void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd
copyLayout = srcImage->layout;
}
vkCmdCopyImageToBuffer(cmd, image, copyLayout, readbackBuffer_, 1, &region);
// TODO: Handle different readback formats!
u32 readbackSizeInBytes = sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height;
CachedReadback *cached = nullptr;
if (step.readback.delayed) {
ReadbackKey key;
key.framebuf = step.readback.src;
key.width = step.readback.srcRect.extent.width;
key.height = step.readback.srcRect.extent.height;
// See if there's already a buffer we can reuse
cached = frameData.readbacks_.Get(key);
if (!cached) {
cached = new CachedReadback();
cached->bufferSize = 0;
frameData.readbacks_.Insert(key, cached);
}
} else {
cached = &syncReadback_;
}
ResizeReadbackBuffer(cached, readbackSizeInBytes);
VkBufferImageCopy region{};
region.imageOffset = { step.readback.srcRect.offset.x, step.readback.srcRect.offset.y, 0 };
region.imageExtent = { step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, 1 };
region.imageSubresource.aspectMask = step.readback.aspectMask;
region.imageSubresource.layerCount = 1;
region.bufferOffset = 0;
region.bufferRowLength = step.readback.srcRect.extent.width;
region.bufferImageHeight = step.readback.srcRect.extent.height;
vkCmdCopyImageToBuffer(cmd, image, copyLayout, cached->buffer, 1, &region);
// NOTE: Can't read the buffer using the CPU here - need to sync first.
@ -2050,7 +2067,7 @@ void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffe
SetupTransitionToTransferSrc(srcImage, VK_IMAGE_ASPECT_COLOR_BIT, &recordBarrier_);
recordBarrier_.Flush(cmd);
ResizeReadbackBuffer(sizeof(uint32_t) * step.readback_image.srcRect.extent.width * step.readback_image.srcRect.extent.height);
ResizeReadbackBuffer(&syncReadback_, sizeof(uint32_t) * step.readback_image.srcRect.extent.width * step.readback_image.srcRect.extent.height);
VkBufferImageCopy region{};
region.imageOffset = { step.readback_image.srcRect.offset.x, step.readback_image.srcRect.offset.y, 0 };
@ -2061,7 +2078,7 @@ void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffe
region.bufferOffset = 0;
region.bufferRowLength = step.readback_image.srcRect.extent.width;
region.bufferImageHeight = step.readback_image.srcRect.extent.height;
vkCmdCopyImageToBuffer(cmd, step.readback_image.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, readbackBuffer_, 1, &region);
vkCmdCopyImageToBuffer(cmd, step.readback_image.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, syncReadback_.buffer, 1, &region);
// Now transfer it back to a texture.
TransitionImageLayout2(cmd, step.readback_image.image, 0, 1, 1, // I don't think we have any multilayer cases for regular textures. Above in PerformReadback, though..
@ -2074,22 +2091,39 @@ void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffe
// Doing that will also act like a heavyweight barrier ensuring that device writes are visible on the host.
}
void VulkanQueueRunner::CopyReadbackBuffer(VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
if (!readbackBuffer_)
return; // Something has gone really wrong.
bool VulkanQueueRunner::CopyReadbackBuffer(FrameData &frameData, VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
CachedReadback *readback = &syncReadback_;
// Look up in readback cache.
if (src) {
ReadbackKey key;
key.framebuf = src;
key.width = width;
key.height = height;
CachedReadback *cached = frameData.readbacks_.Get(key);
if (cached) {
readback = cached;
} else {
// Didn't have a cached image ready yet
return false;
}
}
if (!readback->buffer)
return false; // Didn't find anything in cache, or something has gone really wrong.
// Read back to the requested address in ram from buffer.
void *mappedData;
const size_t srcPixelSize = DataFormatSizeInBytes(srcFormat);
VkResult res = vmaMapMemory(vulkan_->Allocator(), readbackAllocation_, &mappedData);
VkResult res = vmaMapMemory(vulkan_->Allocator(), readback->allocation, &mappedData);
if (res != VK_SUCCESS) {
ERROR_LOG(G3D, "CopyReadbackBuffer: vkMapMemory failed! result=%d", (int)res);
return;
return false;
}
if (!readbackBufferIsCoherent_) {
vmaInvalidateAllocation(vulkan_->Allocator(), readbackAllocation_, 0, width * height * srcPixelSize);
if (!readback->isCoherent) {
vmaInvalidateAllocation(vulkan_->Allocator(), readback->allocation, 0, width * height * srcPixelSize);
}
// TODO: Perform these conversions in a compute shader on the GPU.
@ -2116,5 +2150,6 @@ void VulkanQueueRunner::CopyReadbackBuffer(VKRFramebuffer *src, int width, int h
_assert_msg_(false, "CopyReadbackBuffer: Unknown src format %d", (int)srcFormat);
}
vmaUnmapMemory(vulkan_->Allocator(), readbackAllocation_);
vmaUnmapMemory(vulkan_->Allocator(), readback->allocation);
return true;
}

View File

@ -199,6 +199,7 @@ struct VKRStep {
int aspectMask;
VKRFramebuffer *src;
VkRect2D srcRect;
bool delayed;
} readback;
struct {
VkImage image;
@ -253,7 +254,7 @@ public:
}
// src == 0 means to copy from the sync readback buffer.
void CopyReadbackBuffer(VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels);
bool CopyReadbackBuffer(FrameData &frameData, VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels);
VKRRenderPass *GetRenderPass(const RPKey &key);
@ -289,7 +290,7 @@ private:
void PerformRenderPass(const VKRStep &pass, VkCommandBuffer cmd);
void PerformCopy(const VKRStep &pass, VkCommandBuffer cmd);
void PerformBlit(const VKRStep &pass, VkCommandBuffer cmd);
void PerformReadback(const VKRStep &pass, VkCommandBuffer cmd);
void PerformReadback(const VKRStep &pass, VkCommandBuffer cmd, FrameData &frameData);
void PerformReadbackImage(const VKRStep &pass, VkCommandBuffer cmd);
void LogRenderPass(const VKRStep &pass, bool verbose);
@ -298,8 +299,7 @@ private:
void LogReadback(const VKRStep &pass);
void LogReadbackImage(const VKRStep &pass);
void ResizeReadbackBuffer(VkDeviceSize requiredSize);
void DestroyReadbackBuffer();
void ResizeReadbackBuffer(CachedReadback *readback, VkDeviceSize requiredSize);
void ApplyMGSHack(std::vector<VKRStep *> &steps);
void ApplySonicHack(std::vector<VKRStep *> &steps);
@ -325,10 +325,7 @@ private:
// Readback buffer. Currently we only support synchronous readback, so we only really need one.
// We size it generously.
VmaAllocation readbackAllocation_ = VK_NULL_HANDLE;
VkBuffer readbackBuffer_ = VK_NULL_HANDLE;
VkDeviceSize readbackBufferSize_ = 0;
bool readbackBufferIsCoherent_ = false;
CachedReadback syncReadback_{};
// TODO: Enable based on compat.ini.
uint32_t hacksEnabled_ = 0;

View File

@ -929,11 +929,14 @@ bool VulkanRenderManager::CopyFramebufferToMemory(VKRFramebuffer *src, VkImageAs
step->readback.src = src;
step->readback.srcRect.offset = { x, y };
step->readback.srcRect.extent = { (uint32_t)w, (uint32_t)h };
step->readback.delayed = mode == Draw::ReadbackMode::OLD_DATA_OK;
step->dependencies.insert(src);
step->tag = tag;
steps_.push_back(step);
FlushSync();
if (mode == Draw::ReadbackMode::BLOCK) {
FlushSync();
}
Draw::DataFormat srcFormat = Draw::DataFormat::UNDEFINED;
if (aspectBits & VK_IMAGE_ASPECT_COLOR_BIT) {
@ -972,7 +975,8 @@ bool VulkanRenderManager::CopyFramebufferToMemory(VKRFramebuffer *src, VkImageAs
}
// Need to call this after FlushSync so the pixels are guaranteed to be ready in CPU-accessible VRAM.
queueRunner_.CopyReadbackBuffer(src, w, h, srcFormat, destFormat, pixelStride, pixels);
queueRunner_.CopyReadbackBuffer(frameData_[vulkan_->GetCurFrame()],
mode == Draw::ReadbackMode::OLD_DATA_OK ? src : nullptr, w, h, srcFormat, destFormat, pixelStride, pixels);
return true;
}
@ -992,7 +996,7 @@ void VulkanRenderManager::CopyImageToMemorySync(VkImage image, int mipLevel, int
FlushSync();
// Need to call this after FlushSync so the pixels are guaranteed to be ready in CPU-accessible VRAM.
queueRunner_.CopyReadbackBuffer(nullptr, w, h, destFormat, destFormat, pixelStride, pixels);
queueRunner_.CopyReadbackBuffer(frameData_[vulkan_->GetCurFrame()], nullptr, w, h, destFormat, destFormat, pixelStride, pixels);
}
static void RemoveDrawCommands(std::vector<VkRenderData> *cmds) {

View File

@ -2631,7 +2631,9 @@ bool FramebufferManagerCommon::GetFramebuffer(u32 fb_address, int fb_stride, GEB
bool flipY = (GetGPUBackend() == GPUBackend::OPENGL && !useBufferedRendering_) ? true : false;
buffer.Allocate(w, h, GE_FORMAT_8888, flipY);
bool retval = draw_->CopyFramebufferToMemory(bound, Draw::FB_COLOR_BIT, 0, 0, w, h, Draw::DataFormat::R8G8B8A8_UNORM, buffer.GetData(), w, Draw::ReadbackMode::BLOCK, "GetFramebuffer");
gpuStats.numReadbacks++;
// Don't need to increment gpu stats for readback count here, this is a debugger-only function.
// After a readback we'll have flushed and started over, need to dirty a bunch of things to be safe.
gstate_c.Dirty(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS);
// We may have blitted to a temp FBO.
@ -2792,7 +2794,11 @@ void FramebufferManagerCommon::ReadbackFramebuffer(VirtualFramebuffer *vfb, int
size_t len = snprintf(tag, sizeof(tag), "FramebufferPack/%08x_%08x_%dx%d_%s", vfb->fb_address, vfb->z_address, w, h, GeBufferFormatToString(vfb->fb_format));
NotifyMemInfo(MemBlockFlags::WRITE, fb_address + dstByteOffset, dstSize, tag, len);
gpuStats.numReadbacks++;
if (mode == Draw::ReadbackMode::BLOCK) {
gpuStats.numBlockingReadbacks++;
} else {
gpuStats.numReadbacks++;
}
}
bool FramebufferManagerCommon::ReadbackStencilbuffer(Draw::Framebuffer *fbo, int x, int y, int w, int h, uint8_t *pixels, int pixelsStride, Draw::ReadbackMode mode) {

View File

@ -88,6 +88,7 @@ struct GPUStatistics {
numFlushes = 0;
numTexturesDecoded = 0;
numFramebufferEvaluations = 0;
numBlockingReadbacks = 0;
numReadbacks = 0;
numUploads = 0;
numDepal = 0;
@ -118,6 +119,7 @@ struct GPUStatistics {
int numShaderSwitches;
int numTexturesDecoded;
int numFramebufferEvaluations;
int numBlockingReadbacks;
int numReadbacks;
int numUploads;
int numDepal;

View File

@ -3467,7 +3467,7 @@ size_t GPUCommon::FormatGPUStatsCommon(char *buffer, size_t size) {
"Vertices: %d cached: %d uncached: %d\n"
"FBOs active: %d (evaluations: %d)\n"
"Textures: %d, dec: %d, invalidated: %d, hashed: %d kB\n"
"readbacks %d, uploads %d, depal %d\n"
"readbacks %d (%d non-block), uploads %d, depal %d\n"
"Copies: depth %d, color %d, reint %d, blend %d, selftex %d\n"
"GPU cycles executed: %d (%f per vertex)\n",
gpuStats.msProcessingDisplayLists * 1000.0f,
@ -3485,6 +3485,7 @@ size_t GPUCommon::FormatGPUStatsCommon(char *buffer, size_t size) {
gpuStats.numTexturesDecoded,
gpuStats.numTextureInvalidations,
gpuStats.numTextureDataBytesHashed / 1024,
gpuStats.numBlockingReadbacks,
gpuStats.numReadbacks,
gpuStats.numUploads,
gpuStats.numDepal,