Merge pull request #10393 from hrydgard/compute-upload

Vulkan: Texture upload through compute, experimental texture scaling too
Henrik Rydgård 2019-10-08 09:11:25 +02:00 committed by GitHub
commit 0b17dd04e6
12 changed files with 864 additions and 35 deletions

View File

@ -121,9 +121,10 @@ bool VulkanTexture::CreateDirect(VkCommandBuffer cmd, VulkanDeviceAllocator *all
if (initialLayout != VK_IMAGE_LAYOUT_UNDEFINED && initialLayout != VK_IMAGE_LAYOUT_PREINITIALIZED) {
switch (initialLayout) {
case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL:
case VK_IMAGE_LAYOUT_GENERAL:
TransitionImageLayout2(cmd, image_, 0, numMips, VK_IMAGE_ASPECT_COLOR_BIT,
VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_IMAGE_LAYOUT_UNDEFINED, initialLayout,
VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
0, VK_ACCESS_TRANSFER_WRITE_BIT);
break;
default:
@ -208,10 +209,10 @@ void VulkanTexture::GenerateMip(VkCommandBuffer cmd, int mip) {
VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
}
void VulkanTexture::EndCreate(VkCommandBuffer cmd, bool vertexTexture) {
void VulkanTexture::EndCreate(VkCommandBuffer cmd, bool vertexTexture, VkImageLayout layout) {
TransitionImageLayout2(cmd, image_, 0, numMips_,
VK_IMAGE_ASPECT_COLOR_BIT,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
layout, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
VK_PIPELINE_STAGE_TRANSFER_BIT, vertexTexture ? VK_PIPELINE_STAGE_VERTEX_SHADER_BIT : VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
}
@ -222,6 +223,26 @@ void VulkanTexture::Touch() {
}
}
VkImageView VulkanTexture::CreateViewForMip(int mip) {
VkImageViewCreateInfo view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
view_info.image = image_;
view_info.viewType = VK_IMAGE_VIEW_TYPE_2D;
view_info.format = format_;
view_info.components.r = VK_COMPONENT_SWIZZLE_R;
view_info.components.g = VK_COMPONENT_SWIZZLE_G;
view_info.components.b = VK_COMPONENT_SWIZZLE_B;
view_info.components.a = VK_COMPONENT_SWIZZLE_A;
view_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
view_info.subresourceRange.baseMipLevel = mip;
view_info.subresourceRange.levelCount = 1;
view_info.subresourceRange.baseArrayLayer = 0;
view_info.subresourceRange.layerCount = 1;
VkImageView view;
VkResult res = vkCreateImageView(vulkan_->GetDevice(), &view_info, NULL, &view);
assert(res == VK_SUCCESS);
return view;
}
void VulkanTexture::Destroy() {
if (view_ != VK_NULL_HANDLE) {
vulkan_->Delete().QueueDeleteImageView(view_);

View File

@ -21,7 +21,11 @@ public:
bool CreateDirect(VkCommandBuffer cmd, VulkanDeviceAllocator *allocator, int w, int h, int numMips, VkFormat format, VkImageLayout initialLayout, VkImageUsageFlags usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, const VkComponentMapping *mapping = nullptr);
void UploadMip(VkCommandBuffer cmd, int mip, int mipWidth, int mipHeight, VkBuffer buffer, uint32_t offset, size_t rowLength); // rowLength is in pixels
void GenerateMip(VkCommandBuffer cmd, int mip);
void EndCreate(VkCommandBuffer cmd, bool vertexTexture = false);
void EndCreate(VkCommandBuffer cmd, bool vertexTexture = false, VkImageLayout layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
// When loading mips from compute shaders, you need to pass VK_IMAGE_LAYOUT_GENERAL to the above function.
// In addition, ignore UploadMip and GenerateMip, and instead use CreateViewForMip. Make sure to delete the returned views once they've been used.
VkImageView CreateViewForMip(int mip);
void Destroy();
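
For reference, a minimal sketch of the compute-upload lifecycle these declarations enable. It is illustrative only (the helper name and the single-mip simplification are made up here); the real flow, including the dispatch itself, is in TextureCacheVulkan::BuildTexture further down in this commit.

VulkanTexture *CreateComputeWrittenTexture(VulkanContext *vulkan, VkCommandBuffer cmd,
		VulkanDeviceAllocator *allocator, int w, int h) {
	VulkanTexture *tex = new VulkanTexture(vulkan);
	// Compute writes require STORAGE usage and the GENERAL layout.
	tex->CreateDirect(cmd, allocator, w, h, 1, VK_FORMAT_R8G8B8A8_UNORM,
		VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT);
	VkImageView view = tex->CreateViewForMip(0);
	// ... bind a compute pipeline and a descriptor set referencing 'view', then dispatch ...
	vulkan->Delete().QueueDeleteImageView(view);  // mip views are single-use
	// Pass GENERAL so the transition to SHADER_READ_ONLY starts from the layout compute used.
	tex->EndCreate(cmd, false, VK_IMAGE_LAYOUT_GENERAL);
	return tex;
}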

View File

@ -23,8 +23,8 @@
#include "base/timeutil.h"
#include "math/math_util.h"
VulkanPushBuffer::VulkanPushBuffer(VulkanContext *vulkan, size_t size, VkBufferUsageFlags usage)
: vulkan_(vulkan), size_(size), usage_(usage) {
VulkanPushBuffer::VulkanPushBuffer(VulkanContext *vulkan, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags memoryPropertyMask)
: vulkan_(vulkan), memoryPropertyMask_(memoryPropertyMask), size_(size), usage_(usage) {
bool res = AddBuffer();
assert(res);
}
@ -58,7 +58,7 @@ bool VulkanPushBuffer::AddBuffer() {
// Okay, that's the buffer. Now let's allocate some memory for it.
VkMemoryAllocateInfo alloc{ VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
alloc.allocationSize = reqs.size;
vulkan_->MemoryTypeFromProperties(reqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, &alloc.memoryTypeIndex);
vulkan_->MemoryTypeFromProperties(reqs.memoryTypeBits, memoryPropertyMask_, &alloc.memoryTypeIndex);
res = vkAllocateMemory(device, &alloc, nullptr, &info.deviceMemory);
if (VK_SUCCESS != res) {
@ -89,7 +89,8 @@ void VulkanPushBuffer::Destroy(VulkanContext *vulkan) {
void VulkanPushBuffer::NextBuffer(size_t minSize) {
// First, unmap the current memory.
Unmap();
if (memoryPropertyMask_ & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
Unmap();
buf_++;
if (buf_ >= buffers_.size() || minSize > size_) {
@ -108,7 +109,8 @@ void VulkanPushBuffer::NextBuffer(size_t minSize) {
// Now, move to the next buffer and map it.
offset_ = 0;
Map();
if (memoryPropertyMask_ & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
Map();
}
void VulkanPushBuffer::Defragment(VulkanContext *vulkan) {
@ -142,14 +144,15 @@ void VulkanPushBuffer::Map() {
void VulkanPushBuffer::Unmap() {
_dbg_assert_(G3D, writePtr_ != 0);
/*
// Should not need this since we use coherent memory.
VkMappedMemoryRange range{ VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE };
range.offset = 0;
range.size = offset_;
range.memory = buffers_[buf_].deviceMemory;
vkFlushMappedMemoryRanges(device_, 1, &range);
*/
if ((memoryPropertyMask_ & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) == 0) {
VkMappedMemoryRange range{ VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE };
range.offset = 0;
range.size = offset_;
range.memory = buffers_[buf_].deviceMemory;
vkFlushMappedMemoryRanges(vulkan_->GetDevice(), 1, &range);
}
vkUnmapMemory(vulkan_->GetDevice(), buffers_[buf_].deviceMemory);
writePtr_ = nullptr;
}

View File

@ -22,7 +22,10 @@ class VulkanPushBuffer {
};
public:
VulkanPushBuffer(VulkanContext *vulkan, size_t size, VkBufferUsageFlags usage);
// NOTE: If you create a push buffer with only VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
// you can't use any of the Push functions, since the memory won't be mappable from the CPU.
// In that case, use Allocate() only, and pass the returned offset and VkBuffer to Vulkan APIs (see the sketch below).
VulkanPushBuffer(VulkanContext *vulkan, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags memoryPropertyMask = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
~VulkanPushBuffer();
void Destroy(VulkanContext *vulkan);
@ -35,15 +38,18 @@ public:
offset_ = 0;
// Note: we must defrag because some buffers may be smaller than size_.
Defragment(vulkan);
Map();
if (memoryPropertyMask_ & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
Map();
}
void BeginNoReset() {
Map();
if (memoryPropertyMask_ & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
Map();
}
void End() {
Unmap();
if (memoryPropertyMask_ & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
Unmap();
}
void Map();
@ -109,6 +115,8 @@ private:
void Defragment(VulkanContext *vulkan);
VulkanContext *vulkan_;
VkMemoryPropertyFlags memoryPropertyMask_;
std::vector<BufInfo> buffers_;
size_t buf_ = 0;
size_t offset_ = 0;
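
As a concrete illustration of the Allocate()-only rule from the NOTE above, a hedged sketch follows (the 1 MB size and the exact usage flags are just examples; this commit's pushLocal buffer in DrawEngineVulkan is the real instance):

// A GPU-local push buffer can only hand out (VkBuffer, offset) pairs;
// there is no CPU-visible pointer to memcpy through.
VulkanPushBuffer *local = new VulkanPushBuffer(vulkan, 1024 * 1024,
	VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
	VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

VkBuffer buf;
uint32_t offset = (uint32_t)local->Allocate(1024, &buf);
// Pass buf/offset to Vulkan directly: as a storage-buffer descriptor, a
// vkCmdCopyBufferToImage source, etc. Begin()/End() skip Map()/Unmap()
// automatically since the memory isn't host-visible.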

View File

@ -751,6 +751,7 @@ static ConfigSetting graphicsSettings[] = {
ReportedConfigSetting("TexScalingLevel", &g_Config.iTexScalingLevel, 1, true, true),
ReportedConfigSetting("TexScalingType", &g_Config.iTexScalingType, 0, true, true),
ReportedConfigSetting("TexDeposterize", &g_Config.bTexDeposterize, false, true, true),
ReportedConfigSetting("TexHardwareScaling", &g_Config.bTexHardwareScaling, false, true, true),
ConfigSetting("VSyncInterval", &g_Config.bVSync, false, true, true),
ReportedConfigSetting("BloomHack", &g_Config.iBloomHack, 0, true, true),

View File

@ -170,6 +170,7 @@ public:
int iTexScalingLevel; // 0 = auto, 1 = off, 2 = 2x, ..., 5 = 5x
int iTexScalingType; // 0 = xBRZ, 1 = Hybrid
bool bTexDeposterize;
bool bTexHardwareScaling;
int iFpsLimit1;
int iFpsLimit2;
int iMaxRecent;

View File

@ -153,6 +153,8 @@ void DrawEngineVulkan::InitDeviceObjects() {
frame_[i].pushUBO = new VulkanPushBuffer(vulkan_, 8 * 1024 * 1024, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT);
frame_[i].pushVertex = new VulkanPushBuffer(vulkan_, 2 * 1024 * 1024, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
frame_[i].pushIndex = new VulkanPushBuffer(vulkan_, 1 * 1024 * 1024, VK_BUFFER_USAGE_INDEX_BUFFER_BIT);
frame_[i].pushLocal = new VulkanPushBuffer(vulkan_, 1 * 1024 * 1024, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
}
VkPipelineLayoutCreateInfo pl{ VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
@ -209,6 +211,11 @@ void DrawEngineVulkan::FrameData::Destroy(VulkanContext *vulkan) {
delete pushIndex;
pushIndex = nullptr;
}
if (pushLocal) {
pushLocal->Destroy(vulkan);
delete pushLocal;
pushLocal = nullptr;
}
}
void DrawEngineVulkan::DestroyDeviceObjects() {
@ -264,10 +271,12 @@ void DrawEngineVulkan::BeginFrame() {
frame->pushUBO->Reset();
frame->pushVertex->Reset();
frame->pushIndex->Reset();
frame->pushLocal->Reset();
frame->pushUBO->Begin(vulkan_);
frame->pushVertex->Begin(vulkan_);
frame->pushIndex->Begin(vulkan_);
frame->pushLocal->Begin(vulkan_);
// TODO: How can we make this nicer...
tessDataTransferVulkan->SetPushBuffer(frame->pushUBO);
@ -324,6 +333,7 @@ void DrawEngineVulkan::EndFrame() {
frame->pushUBO->End();
frame->pushVertex->End();
frame->pushIndex->End();
frame->pushLocal->End();
vertexCache_->End();
}

View File

@ -188,6 +188,11 @@ public:
return frame_[vulkan_->GetCurFrame()].pushUBO;
}
// Only use Allocate on this one.
VulkanPushBuffer *GetPushBufferLocal() {
return frame_[vulkan_->GetCurFrame()].pushLocal;
}
const DrawEngineVulkanStats &GetStats() const {
return stats_;
}
@ -257,6 +262,10 @@ private:
VulkanPushBuffer *pushUBO = nullptr;
VulkanPushBuffer *pushVertex = nullptr;
VulkanPushBuffer *pushIndex = nullptr;
// Special push buffer in GPU local memory, for texture data conversion and similar tasks.
VulkanPushBuffer *pushLocal = nullptr;
// We do rolling allocation and reset instead of caching across frames; we might add caching later.
DenseHashMap<DescriptorSetKey, VkDescriptorSet, (VkDescriptorSet)VK_NULL_HANDLE> descSets;

View File

@ -67,6 +67,450 @@ static const VkComponentMapping VULKAN_1555_SWIZZLE = { VK_COMPONENT_SWIZZLE_B,
static const VkComponentMapping VULKAN_565_SWIZZLE = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A };
static const VkComponentMapping VULKAN_8888_SWIZZLE = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A };
// 4xBRZ shader - Copyright (C) 2014-2016 DeSmuME team (GPL2+)
// Hyllian's xBR-vertex code and texel mapping
// Copyright (C) 2011/2016 Hyllian - sergiogdb@gmail.com
// TODO: Handles alpha badly for PSP.
const char *shader4xbrz = R"(
vec4 premultiply_alpha(vec4 c) { float a = clamp(c.a, 0.0, 1.0); return vec4(c.rgb * a, a); }
vec4 postdivide_alpha(vec4 c) { return c.a < 0.001? vec4(0.0,0.0,0.0,0.0) : vec4(c.rgb / c.a, c.a); }
#define BLEND_ALPHA 1
#define BLEND_NONE 0
#define BLEND_NORMAL 1
#define BLEND_DOMINANT 2
#define LUMINANCE_WEIGHT 1.0
#define EQUAL_COLOR_TOLERANCE 30.0/255.0
#define STEEP_DIRECTION_THRESHOLD 2.2
#define DOMINANT_DIRECTION_THRESHOLD 3.6
float reduce(vec4 color) {
return dot(color.rgb, vec3(65536.0, 256.0, 1.0));
}
float DistYCbCr(vec4 pixA, vec4 pixB) {
const vec3 w = vec3(0.2627, 0.6780, 0.0593);
const float scaleB = 0.5 / (1.0 - w.b);
const float scaleR = 0.5 / (1.0 - w.r);
vec4 diff = pixA - pixB;
float Y = dot(diff.rgb, w);
float Cb = scaleB * (diff.b - Y);
float Cr = scaleR * (diff.r - Y);
return sqrt( ((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr) + (diff.a * diff.a));
}
bool IsPixEqual(const vec4 pixA, const vec4 pixB) {
return (DistYCbCr(pixA, pixB) < EQUAL_COLOR_TOLERANCE);
}
bool IsBlendingNeeded(const ivec4 blend) {
ivec4 diff = blend - ivec4(BLEND_NONE);
return diff.x != 0 || diff.y != 0 || diff.z != 0 || diff.w != 0;
}
vec4 applyScalingf(uvec2 origxy, uvec2 xy) {
float dx = 1.0 / params.width;
float dy = 1.0 / params.height;
// A1 B1 C1
// A0 A B C C4
// D0 D E F F4
// G0 G H I I4
// G5 H5 I5
uvec4 t1 = uvec4(origxy.x - 1, origxy.x, origxy.x + 1, origxy.y - 2); // A1 B1 C1
uvec4 t2 = uvec4(origxy.x - 1, origxy.x, origxy.x + 1, origxy.y - 1); // A B C
uvec4 t3 = uvec4(origxy.x - 1, origxy.x, origxy.x + 1, origxy.y + 0); // D E F
uvec4 t4 = uvec4(origxy.x - 1, origxy.x, origxy.x + 1, origxy.y + 1); // G H I
uvec4 t5 = uvec4(origxy.x - 1, origxy.x, origxy.x + 1, origxy.y + 2); // G5 H5 I5
uvec4 t6 = uvec4(origxy.x - 2, origxy.y - 1, origxy.y, origxy.y + 1); // A0 D0 G0
uvec4 t7 = uvec4(origxy.x + 2, origxy.y - 1, origxy.y, origxy.y + 1); // C4 F4 I4
vec2 f = fract(vec2(float(xy.x) / float(params.scale), float(xy.y) / float(params.scale)));
//---------------------------------------
// Input Pixel Mapping: |21|22|23|
// 19|06|07|08|09
// 18|05|00|01|10
// 17|04|03|02|11
// |15|14|13|
vec4 src[25];
src[21] = premultiply_alpha(readColorf(t1.xw));
src[22] = premultiply_alpha(readColorf(t1.yw));
src[23] = premultiply_alpha(readColorf(t1.zw));
src[ 6] = premultiply_alpha(readColorf(t2.xw));
src[ 7] = premultiply_alpha(readColorf(t2.yw));
src[ 8] = premultiply_alpha(readColorf(t2.zw));
src[ 5] = premultiply_alpha(readColorf(t3.xw));
src[ 0] = premultiply_alpha(readColorf(t3.yw));
src[ 1] = premultiply_alpha(readColorf(t3.zw));
src[ 4] = premultiply_alpha(readColorf(t4.xw));
src[ 3] = premultiply_alpha(readColorf(t4.yw));
src[ 2] = premultiply_alpha(readColorf(t4.zw));
src[15] = premultiply_alpha(readColorf(t5.xw));
src[14] = premultiply_alpha(readColorf(t5.yw));
src[13] = premultiply_alpha(readColorf(t5.zw));
src[19] = premultiply_alpha(readColorf(t6.xy));
src[18] = premultiply_alpha(readColorf(t6.xz));
src[17] = premultiply_alpha(readColorf(t6.xw));
src[ 9] = premultiply_alpha(readColorf(t7.xy));
src[10] = premultiply_alpha(readColorf(t7.xz));
src[11] = premultiply_alpha(readColorf(t7.xw));
float v[9];
v[0] = reduce(src[0]);
v[1] = reduce(src[1]);
v[2] = reduce(src[2]);
v[3] = reduce(src[3]);
v[4] = reduce(src[4]);
v[5] = reduce(src[5]);
v[6] = reduce(src[6]);
v[7] = reduce(src[7]);
v[8] = reduce(src[8]);
ivec4 blendResult = ivec4(BLEND_NONE);
// Preprocess corners
// Pixel Tap Mapping: --|--|--|--|--
// --|--|07|08|--
// --|05|00|01|10
// --|04|03|02|11
// --|--|14|13|--
// Corner (1, 1)
if ( ((v[0] == v[1] && v[3] == v[2]) || (v[0] == v[3] && v[1] == v[2])) == false) {
float dist_03_01 = DistYCbCr(src[ 4], src[ 0]) + DistYCbCr(src[ 0], src[ 8]) + DistYCbCr(src[14], src[ 2]) + DistYCbCr(src[ 2], src[10]) + (4.0 * DistYCbCr(src[ 3], src[ 1]));
float dist_00_02 = DistYCbCr(src[ 5], src[ 3]) + DistYCbCr(src[ 3], src[13]) + DistYCbCr(src[ 7], src[ 1]) + DistYCbCr(src[ 1], src[11]) + (4.0 * DistYCbCr(src[ 0], src[ 2]));
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_03_01) < dist_00_02;
blendResult[2] = ((dist_03_01 < dist_00_02) && (v[0] != v[1]) && (v[0] != v[3])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
}
// Pixel Tap Mapping: --|--|--|--|--
// --|06|07|--|--
// 18|05|00|01|--
// 17|04|03|02|--
// --|15|14|--|--
// Corner (0, 1)
if ( ((v[5] == v[0] && v[4] == v[3]) || (v[5] == v[4] && v[0] == v[3])) == false) {
float dist_04_00 = DistYCbCr(src[17], src[ 5]) + DistYCbCr(src[ 5], src[ 7]) + DistYCbCr(src[15], src[ 3]) + DistYCbCr(src[ 3], src[ 1]) + (4.0 * DistYCbCr(src[ 4], src[ 0]));
float dist_05_03 = DistYCbCr(src[18], src[ 4]) + DistYCbCr(src[ 4], src[14]) + DistYCbCr(src[ 6], src[ 0]) + DistYCbCr(src[ 0], src[ 2]) + (4.0 * DistYCbCr(src[ 5], src[ 3]));
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_05_03) < dist_04_00;
blendResult[3] = ((dist_04_00 > dist_05_03) && (v[0] != v[5]) && (v[0] != v[3])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
}
// Pixel Tap Mapping: --|--|22|23|--
// --|06|07|08|09
// --|05|00|01|10
// --|--|03|02|--
// --|--|--|--|--
// Corner (1, 0)
if ( ((v[7] == v[8] && v[0] == v[1]) || (v[7] == v[0] && v[8] == v[1])) == false) {
float dist_00_08 = DistYCbCr(src[ 5], src[ 7]) + DistYCbCr(src[ 7], src[23]) + DistYCbCr(src[ 3], src[ 1]) + DistYCbCr(src[ 1], src[ 9]) + (4.0 * DistYCbCr(src[ 0], src[ 8]));
float dist_07_01 = DistYCbCr(src[ 6], src[ 0]) + DistYCbCr(src[ 0], src[ 2]) + DistYCbCr(src[22], src[ 8]) + DistYCbCr(src[ 8], src[10]) + (4.0 * DistYCbCr(src[ 7], src[ 1]));
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_07_01) < dist_00_08;
blendResult[1] = ((dist_00_08 > dist_07_01) && (v[0] != v[7]) && (v[0] != v[1])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
}
// Pixel Tap Mapping: --|21|22|--|--
// 19|06|07|08|--
// 18|05|00|01|--
// --|04|03|--|--
// --|--|--|--|--
// Corner (0, 0)
if ( ((v[6] == v[7] && v[5] == v[0]) || (v[6] == v[5] && v[7] == v[0])) == false) {
float dist_05_07 = DistYCbCr(src[18], src[ 6]) + DistYCbCr(src[ 6], src[22]) + DistYCbCr(src[ 4], src[ 0]) + DistYCbCr(src[ 0], src[ 8]) + (4.0 * DistYCbCr(src[ 5], src[ 7]));
float dist_06_00 = DistYCbCr(src[19], src[ 5]) + DistYCbCr(src[ 5], src[ 3]) + DistYCbCr(src[21], src[ 7]) + DistYCbCr(src[ 7], src[ 1]) + (4.0 * DistYCbCr(src[ 6], src[ 0]));
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_05_07) < dist_06_00;
blendResult[0] = ((dist_05_07 < dist_06_00) && (v[0] != v[5]) && (v[0] != v[7])) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
}
vec4 dst[16];
dst[ 0] = src[0];
dst[ 1] = src[0];
dst[ 2] = src[0];
dst[ 3] = src[0];
dst[ 4] = src[0];
dst[ 5] = src[0];
dst[ 6] = src[0];
dst[ 7] = src[0];
dst[ 8] = src[0];
dst[ 9] = src[0];
dst[10] = src[0];
dst[11] = src[0];
dst[12] = src[0];
dst[13] = src[0];
dst[14] = src[0];
dst[15] = src[0];
// Scale pixel
if (IsBlendingNeeded(blendResult) == true) {
float dist_01_04 = DistYCbCr(src[1], src[4]);
float dist_03_08 = DistYCbCr(src[3], src[8]);
bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[4]) && (v[5] != v[4]);
bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[8]) && (v[7] != v[8]);
bool needBlend = (blendResult[2] != BLEND_NONE);
bool doLineBlend = ( blendResult[2] >= BLEND_DOMINANT ||
((blendResult[1] != BLEND_NONE && !IsPixEqual(src[0], src[4])) ||
(blendResult[3] != BLEND_NONE && !IsPixEqual(src[0], src[8])) ||
(IsPixEqual(src[4], src[3]) && IsPixEqual(src[3], src[2]) && IsPixEqual(src[2], src[1]) && IsPixEqual(src[1], src[8]) && IsPixEqual(src[0], src[2]) == false) ) == false );
vec4 blendPix = ( DistYCbCr(src[0], src[1]) <= DistYCbCr(src[0], src[3]) ) ? src[1] : src[3];
dst[ 2] = mix(dst[ 2], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
dst[ 9] = mix(dst[ 9], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
dst[10] = mix(dst[10], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
dst[11] = mix(dst[11], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
dst[12] = mix(dst[12], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
dst[13] = mix(dst[13], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
dst[14] = mix(dst[14], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
dst[15] = mix(dst[15], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
dist_01_04 = DistYCbCr(src[7], src[2]);
dist_03_08 = DistYCbCr(src[1], src[6]);
haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[2]) && (v[3] != v[2]);
haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[6]) && (v[5] != v[6]);
needBlend = (blendResult[1] != BLEND_NONE);
doLineBlend = ( blendResult[1] >= BLEND_DOMINANT ||
!((blendResult[0] != BLEND_NONE && !IsPixEqual(src[0], src[2])) ||
(blendResult[2] != BLEND_NONE && !IsPixEqual(src[0], src[6])) ||
(IsPixEqual(src[2], src[1]) && IsPixEqual(src[1], src[8]) && IsPixEqual(src[8], src[7]) && IsPixEqual(src[7], src[6]) && !IsPixEqual(src[0], src[8])) ) );
blendPix = ( DistYCbCr(src[0], src[7]) <= DistYCbCr(src[0], src[1]) ) ? src[7] : src[1];
dst[ 1] = mix(dst[ 1], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
dst[ 6] = mix(dst[ 6], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
dst[ 7] = mix(dst[ 7], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
dst[ 8] = mix(dst[ 8], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
dst[ 9] = mix(dst[ 9], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
dst[10] = mix(dst[10], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
dst[11] = mix(dst[11], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
dst[12] = mix(dst[12], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
dist_01_04 = DistYCbCr(src[5], src[8]);
dist_03_08 = DistYCbCr(src[7], src[4]);
haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[8]) && (v[1] != v[8]);
haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[4]) && (v[3] != v[4]);
needBlend = (blendResult[0] != BLEND_NONE);
doLineBlend = ( blendResult[0] >= BLEND_DOMINANT ||
!((blendResult[3] != BLEND_NONE && !IsPixEqual(src[0], src[8])) ||
(blendResult[1] != BLEND_NONE && !IsPixEqual(src[0], src[4])) ||
(IsPixEqual(src[8], src[7]) && IsPixEqual(src[7], src[6]) && IsPixEqual(src[6], src[5]) && IsPixEqual(src[5], src[4]) && !IsPixEqual(src[0], src[6])) ) );
blendPix = ( DistYCbCr(src[0], src[5]) <= DistYCbCr(src[0], src[7]) ) ? src[5] : src[7];
dst[ 0] = mix(dst[ 0], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
dst[15] = mix(dst[15], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
dst[ 4] = mix(dst[ 4], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
dst[ 5] = mix(dst[ 5], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
dst[ 6] = mix(dst[ 6], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
dst[ 7] = mix(dst[ 7], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
dst[ 8] = mix(dst[ 8], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
dst[ 9] = mix(dst[ 9], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
dist_01_04 = DistYCbCr(src[3], src[6]);
dist_03_08 = DistYCbCr(src[5], src[2]);
haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_01_04 <= dist_03_08) && (v[0] != v[6]) && (v[7] != v[6]);
haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_03_08 <= dist_01_04) && (v[0] != v[2]) && (v[1] != v[2]);
needBlend = (blendResult[3] != BLEND_NONE);
doLineBlend = ( blendResult[3] >= BLEND_DOMINANT ||
!((blendResult[2] != BLEND_NONE && !IsPixEqual(src[0], src[6])) ||
(blendResult[0] != BLEND_NONE && !IsPixEqual(src[0], src[2])) ||
(IsPixEqual(src[6], src[5]) && IsPixEqual(src[5], src[4]) && IsPixEqual(src[4], src[3]) && IsPixEqual(src[3], src[2]) && !IsPixEqual(src[0], src[4])) ) );
blendPix = ( DistYCbCr(src[0], src[3]) <= DistYCbCr(src[0], src[5]) ) ? src[3] : src[5];
dst[ 3] = mix(dst[ 3], blendPix, (needBlend && doLineBlend) ? ((haveShallowLine) ? ((haveSteepLine) ? 1.0/3.0 : 0.25) : ((haveSteepLine) ? 0.25 : 0.00)) : 0.00);
dst[12] = mix(dst[12], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.25 : 0.00);
dst[13] = mix(dst[13], blendPix, (needBlend && doLineBlend && haveSteepLine) ? 0.75 : 0.00);
dst[14] = mix(dst[14], blendPix, (needBlend) ? ((doLineBlend) ? ((haveSteepLine) ? 1.00 : ((haveShallowLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
dst[15] = mix(dst[15], blendPix, (needBlend) ? ((doLineBlend) ? 1.00 : 0.6848532563) : 0.00);
dst[ 4] = mix(dst[ 4], blendPix, (needBlend) ? ((doLineBlend) ? ((haveShallowLine) ? 1.00 : ((haveSteepLine) ? 0.75 : 0.50)) : 0.08677704501) : 0.00);
dst[ 5] = mix(dst[ 5], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.75 : 0.00);
dst[ 6] = mix(dst[ 6], blendPix, (needBlend && doLineBlend && haveShallowLine) ? 0.25 : 0.00);
}
// select output pixel
vec4 res = mix(mix(mix(mix(dst[ 6], dst[ 7], step(0.25, f.x)),
mix(dst[ 8], dst[ 9], step(0.75, f.x)),
step(0.50, f.x)),
mix(mix(dst[ 5], dst[ 0], step(0.25, f.x)),
mix(dst[ 1], dst[10], step(0.75, f.x)),
step(0.50, f.x)),
step(0.25, f.y)),
mix(mix(mix(dst[ 4], dst[ 3], step(0.25, f.x)),
mix(dst[ 2], dst[11], step(0.75, f.x)),
step(0.50, f.x)),
mix(mix(dst[15], dst[14], step(0.25, f.x)),
mix(dst[13], dst[12], step(0.75, f.x)),
step(0.50, f.x)),
step(0.75, f.y)),
step(0.50, f.y));
return postdivide_alpha(res);
}
uint applyScalingu(uvec2 origxy, uvec2 xy) {
return packUnorm4x8(applyScalingf(origxy, xy));
}
)";
const char *copyShader = R"(
#version 450
#extension GL_ARB_separate_shader_objects : enable
// No idea what's optimal here...
#define WORKGROUP_SIZE 16
layout (local_size_x = WORKGROUP_SIZE, local_size_y = WORKGROUP_SIZE, local_size_z = 1) in;
layout(std430, binding = 1) buffer Buf1 {
uint data[];
} buf1;
layout(std430, binding = 2) buffer Buf2 {
uint data[];
} buf2;
layout(push_constant) uniform Params {
int width;
int height;
int scale;
int fmt;
} params;
uint readColoru(uvec2 p) {
// Note that if the pixels are packed, we can do multiple stores
// and only launch this compute shader for every N pixels,
// by slicing the width in half and multiplying x by 2, for example.
if (params.fmt == 0) {
return buf1.data[p.y * params.width + p.x];
} else {
uint offset = p.y * params.width + p.x;
uint data = buf1.data[offset / 2];
if ((offset & 1) != 0) {
data = data >> 16;
}
if (params.fmt == 6) {
uint r = ((data << 3) & 0xF8) | ((data >> 2) & 0x07);
uint g = ((data >> 3) & 0xFC) | ((data >> 9) & 0x03);
uint b = ((data >> 8) & 0xF8) | ((data >> 13) & 0x07);
return 0xFF000000 | (b << 16) | (g << 8) | r;
} else if (params.fmt == 5) {
uint r = ((data << 3) & 0xF8) | ((data >> 2) & 0x07);
uint g = ((data >> 2) & 0xF8) | ((data >> 7) & 0x07);
uint b = ((data >> 7) & 0xF8) | ((data >> 12) & 0x07);
uint a = ((data >> 15) & 0x01) == 0 ? 0x00 : 0xFF;
return (a << 24) | (b << 16) | (g << 8) | r;
} else if (params.fmt == 4) {
uint r = (data & 0x0F) | ((data << 4) & 0xF0);
uint g = (data & 0xF0) | ((data >> 4) & 0x0F);
uint b = ((data >> 8) & 0x0F) | ((data >> 4) & 0xF0);
uint a = ((data >> 12) & 0x0F) | ((data >> 8) & 0xF0);
return (a << 24) | (b << 16) | (g << 8) | r;
}
}
}
vec4 readColorf(uvec2 p) {
return unpackUnorm4x8(readColoru(p));
}
%s
void main() {
uvec2 xy = gl_GlobalInvocationID.xy;
// Kill off any out-of-image threads to avoid stray writes.
// Should only happen on the tiniest mipmaps as PSP textures are power-of-2,
// and we use a 16x16 workgroup size.
if (xy.x >= params.width || xy.y >= params.height)
return;
uvec2 origxy = xy / params.scale;
if (params.scale == 1) {
buf2.data[xy.y * params.width + xy.x] = readColoru(origxy);
} else {
buf2.data[xy.y * params.width + xy.x] = applyScalingu(origxy, xy);
}
}
)";
const char *uploadShader = R"(
#version 450
#extension GL_ARB_separate_shader_objects : enable
// No idea what's optimal here...
#define WORKGROUP_SIZE 16
layout (local_size_x = WORKGROUP_SIZE, local_size_y = WORKGROUP_SIZE, local_size_z = 1) in;
uniform layout(binding = 0, rgba8) writeonly image2D img;
layout(std430, binding = 1) buffer Buf {
uint data[];
} buf;
layout(push_constant) uniform Params {
int width;
int height;
int scale;
int fmt;
} params;
uint readColoru(uvec2 p) {
// Note that if the pixels are packed, we can do multiple stores
// and only launch this compute shader for every N pixels,
// by slicing the width in half and multiplying x by 2, for example.
if (params.fmt == 0) {
return buf.data[p.y * params.width + p.x];
} else {
uint offset = p.y * params.width + p.x;
uint data = buf.data[offset / 2];
if ((offset & 1) != 0) {
data = data >> 16;
}
if (params.fmt == 6) {
uint r = ((data << 3) & 0xF8) | ((data >> 2) & 0x07);
uint g = ((data >> 3) & 0xFC) | ((data >> 9) & 0x03);
uint b = ((data >> 8) & 0xF8) | ((data >> 13) & 0x07);
return 0xFF000000 | (b << 16) | (g << 8) | r;
} else if (params.fmt == 5) {
uint r = ((data << 3) & 0xF8) | ((data >> 2) & 0x07);
uint g = ((data >> 2) & 0xF8) | ((data >> 7) & 0x07);
uint b = ((data >> 7) & 0xF8) | ((data >> 12) & 0x07);
uint a = ((data >> 15) & 0x01) == 0 ? 0x00 : 0xFF;
return (a << 24) | (b << 16) | (g << 8) | r;
} else if (params.fmt == 4) {
uint r = (data & 0x0F) | ((data << 4) & 0xF0);
uint g = (data & 0xF0) | ((data >> 4) & 0x0F);
uint b = ((data >> 8) & 0x0F) | ((data >> 4) & 0xF0);
uint a = ((data >> 12) & 0x0F) | ((data >> 8) & 0xF0);
return (a << 24) | (b << 16) | (g << 8) | r;
}
}
}
vec4 readColorf(uvec2 p) {
// Unpack the color (we could look it up in a CLUT here if we wanted...)
// It's a bit silly that we need to unpack to float and then have imageStore repack,
// but the alternative is to store to a buffer, and then launch a vkCmdCopyBufferToImage instead.
return unpackUnorm4x8(readColoru(p));
}
%s
void main() {
uvec2 xy = gl_GlobalInvocationID.xy;
// Kill off any out-of-image threads to avoid stray writes.
// Should only happen on the tiniest mipmaps as PSP textures are power-of-2,
// and we use a 16x16 workgroup size.
if (xy.x >= params.width || xy.y >= params.height)
return;
uvec2 origxy = xy / params.scale;
if (params.scale == 1) {
imageStore(img, ivec2(xy.x, xy.y), readColorf(origxy));
} else {
imageStore(img, ivec2(xy.x, xy.y), applyScalingf(origxy, xy));
}
}
)";
SamplerCache::~SamplerCache() {
DeviceLost();
}
@ -141,7 +585,8 @@ std::vector<std::string> SamplerCache::DebugGetSamplerIDs() const {
TextureCacheVulkan::TextureCacheVulkan(Draw::DrawContext *draw, VulkanContext *vulkan)
: TextureCacheCommon(draw),
vulkan_(vulkan),
samplerCache_(vulkan) {
samplerCache_(vulkan),
computeShaderManager_(vulkan) {
timesInvalidatedAllThisFrame_ = 0;
DeviceRestore(vulkan, draw);
SetupTextureDecoder();
@ -180,6 +625,13 @@ void TextureCacheVulkan::DeviceLost() {
if (samplerNearest_)
vulkan_->Delete().QueueDeleteSampler(samplerNearest_);
if (uploadCS_ != VK_NULL_HANDLE)
vulkan_->Delete().QueueDeleteShaderModule(uploadCS_);
if (copyCS_ != VK_NULL_HANDLE)
vulkan_->Delete().QueueDeleteShaderModule(copyCS_);
computeShaderManager_.DeviceLost();
nextTexture_ = nullptr;
}
@ -200,6 +652,19 @@ void TextureCacheVulkan::DeviceRestore(VulkanContext *vulkan, Draw::DrawContext
samp.minFilter = VK_FILTER_NEAREST;
samp.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
vkCreateSampler(vulkan_->GetDevice(), &samp, nullptr, &samplerNearest_);
std::string error;
std::string fullUploadShader = StringFromFormat(uploadShader, shader4xbrz);
std::string fullCopyShader = StringFromFormat(copyShader, shader4xbrz);
if (g_Config.bTexHardwareScaling) {
uploadCS_ = CompileShaderModule(vulkan_, VK_SHADER_STAGE_COMPUTE_BIT, fullUploadShader.c_str(), &error);
_dbg_assert_msg_(G3D, uploadCS_ != VK_NULL_HANDLE, "failed to compile upload shader");
copyCS_ = CompileShaderModule(vulkan_, VK_SHADER_STAGE_COMPUTE_BIT, fullCopyShader.c_str(), &error);
_dbg_assert_msg_(G3D, copyCS_ != VK_NULL_HANDLE, "failed to compile copy shader");
}
computeShaderManager_.DeviceRestore(vulkan);
}
void TextureCacheVulkan::ReleaseTexture(TexCacheEntry *entry, bool delete_them) {
@ -272,10 +737,12 @@ void TextureCacheVulkan::StartFrame() {
}
allocator_->Begin();
computeShaderManager_.BeginFrame();
}
void TextureCacheVulkan::EndFrame() {
allocator_->End();
computeShaderManager_.EndFrame();
if (texelsScaledThisFrame_) {
// INFO_LOG(G3D, "Scaled %i texels", texelsScaledThisFrame_);
@ -570,14 +1037,14 @@ void TextureCacheVulkan::BuildTexture(TexCacheEntry *const entry) {
// Don't scale the PPGe texture.
if (entry->addr > 0x05000000 && entry->addr < PSP_GetKernelMemoryEnd())
scaleFactor = 1;
if ((entry->status & TexCacheEntry::STATUS_CHANGE_FREQUENT) != 0 && scaleFactor != 1) {
if ((entry->status & TexCacheEntry::STATUS_CHANGE_FREQUENT) != 0 && scaleFactor != 1 && !g_Config.bTexHardwareScaling) {
// Remember for later that we /wanted/ to scale this texture.
entry->status |= TexCacheEntry::STATUS_TO_SCALE;
scaleFactor = 1;
}
if (scaleFactor != 1) {
if (texelsScaledThisFrame_ >= TEXCACHE_MAX_TEXELS_SCALED) {
if (texelsScaledThisFrame_ >= TEXCACHE_MAX_TEXELS_SCALED && !g_Config.bTexHardwareScaling) {
entry->status |= TexCacheEntry::STATUS_TO_SCALE;
scaleFactor = 1;
} else {
@ -597,6 +1064,9 @@ void TextureCacheVulkan::BuildTexture(TexCacheEntry *const entry) {
actualFmt = ToVulkanFormat(replaced.Format(0));
}
bool computeUpload = false;
bool computeCopy = false;
{
delete entry->vkTex;
entry->vkTex = new VulkanTexture(vulkan_);
@ -621,11 +1091,29 @@ void TextureCacheVulkan::BuildTexture(TexCacheEntry *const entry) {
break;
}
VkImageLayout imageLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
VkImageUsageFlags usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
// If we want to use the GE debugger, we should add VK_IMAGE_USAGE_TRANSFER_SRC_BIT too...
// Compute experiment
if (actualFmt == VULKAN_8888_FORMAT && scaleFactor > 1 && g_Config.bTexHardwareScaling) {
// Enable the experiment you want.
if (uploadCS_ != VK_NULL_HANDLE)
computeUpload = true;
else if (copyCS_ != VK_NULL_HANDLE)
computeCopy = true;
}
if (computeUpload) {
usage |= VK_IMAGE_USAGE_STORAGE_BIT;
imageLayout = VK_IMAGE_LAYOUT_GENERAL;
}
char texName[128]{};
snprintf(texName, sizeof(texName), "Texture%08x", entry->addr);
image->SetTag(texName);
bool allocSuccess = image->CreateDirect(cmdInit, allocator_, w * scaleFactor, h * scaleFactor, maxLevel + 1, actualFmt, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, mapping);
bool allocSuccess = image->CreateDirect(cmdInit, allocator_, w * scaleFactor, h * scaleFactor, maxLevel + 1, actualFmt, imageLayout, usage, mapping);
if (!allocSuccess && !lowMemoryMode_) {
WARN_LOG_REPORT(G3D, "Texture cache ran out of GPU memory; switching to low memory mode");
lowMemoryMode_ = true;
@ -677,6 +1165,9 @@ void TextureCacheVulkan::BuildTexture(TexCacheEntry *const entry) {
if (replaced.Valid()) {
replaced.GetSize(i, mipWidth, mipHeight);
}
int srcBpp = dstFmt == VULKAN_8888_FORMAT ? 4 : 2;
int srcStride = mipWidth * srcBpp;
int srcSize = srcStride * mipHeight;
int bpp = actualFmt == VULKAN_8888_FORMAT ? 4 : 2;
int stride = (mipWidth * bpp + 15) & ~15;
int size = stride * mipHeight;
@ -684,22 +1175,85 @@ void TextureCacheVulkan::BuildTexture(TexCacheEntry *const entry) {
VkBuffer texBuf;
// nvidia returns 1 but that can't be healthy... let's align by 16 as a minimum.
int pushAlignment = std::max(16, (int)vulkan_->GetPhysicalDeviceProperties().properties.limits.optimalBufferCopyOffsetAlignment);
void *data = drawEngine_->GetPushBufferForTextureData()->PushAligned(size, &bufferOffset, &texBuf, pushAlignment);
void *data;
bool dataScaled = true;
if (replaced.Valid()) {
data = drawEngine_->GetPushBufferForTextureData()->PushAligned(size, &bufferOffset, &texBuf, pushAlignment);
replaced.Load(i, data, stride);
entry->vkTex->UploadMip(cmdInit, i, mipWidth, mipHeight, texBuf, bufferOffset, stride / bpp);
} else {
auto dispatchCompute = [&](VkDescriptorSet descSet) {
struct Params { int x; int y; int s; int fmt; } params{ mipWidth, mipHeight, scaleFactor, 0 };
if (dstFmt == VULKAN_4444_FORMAT) {
params.fmt = 4;
} else if (dstFmt == VULKAN_1555_FORMAT) {
params.fmt = 5;
} else if (dstFmt == VULKAN_565_FORMAT) {
params.fmt = 6;
}
vkCmdBindDescriptorSets(cmdInit, VK_PIPELINE_BIND_POINT_COMPUTE, computeShaderManager_.GetPipelineLayout(), 0, 1, &descSet, 0, nullptr);
vkCmdPushConstants(cmdInit, computeShaderManager_.GetPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(params), &params);
vkCmdDispatch(cmdInit, (mipWidth + 15) / 16, (mipHeight + 15) / 16, 1);
};
if (fakeMipmap) {
data = drawEngine_->GetPushBufferForTextureData()->PushAligned(size, &bufferOffset, &texBuf, pushAlignment);
LoadTextureLevel(*entry, (uint8_t *)data, stride, level, scaleFactor, dstFmt);
entry->vkTex->UploadMip(cmdInit, 0, mipWidth, mipHeight, texBuf, bufferOffset, stride / bpp);
break;
} else {
LoadTextureLevel(*entry, (uint8_t *)data, stride, i, scaleFactor, dstFmt);
if (computeUpload) {
data = drawEngine_->GetPushBufferForTextureData()->PushAligned(srcSize, &bufferOffset, &texBuf, pushAlignment);
dataScaled = false;
LoadTextureLevel(*entry, (uint8_t *)data, srcStride, i, 1, dstFmt);
// This format can be used with storage images.
VkImageView view = entry->vkTex->CreateViewForMip(i);
VkDescriptorSet descSet = computeShaderManager_.GetDescriptorSet(view, texBuf, bufferOffset, srcSize);
vkCmdBindPipeline(cmdInit, VK_PIPELINE_BIND_POINT_COMPUTE, computeShaderManager_.GetPipeline(uploadCS_));
dispatchCompute(descSet);
vulkan_->Delete().QueueDeleteImageView(view);
} else if (computeCopy) {
data = drawEngine_->GetPushBufferForTextureData()->PushAligned(srcSize, &bufferOffset, &texBuf, pushAlignment);
dataScaled = false;
LoadTextureLevel(*entry, (uint8_t *)data, srcStride, i, 1, dstFmt);
// Simple test of using a "copy shader" before the upload. This one could unswizzle or whatever,
// and it will work for any texture format, including 16-bit, as long as the shader packs its
// output into 32-bit units, which is the smallest possible write.
VkBuffer localBuf;
uint32_t localOffset;
uint32_t localSize = size;
localOffset = (uint32_t)drawEngine_->GetPushBufferLocal()->Allocate(localSize, &localBuf);
VkDescriptorSet descSet = computeShaderManager_.GetDescriptorSet(VK_NULL_HANDLE, texBuf, bufferOffset, srcSize, localBuf, localOffset, localSize);
vkCmdBindPipeline(cmdInit, VK_PIPELINE_BIND_POINT_COMPUTE, computeShaderManager_.GetPipeline(copyCS_));
dispatchCompute(descSet);
// After the compute, before the copy, we need a memory barrier.
VkBufferMemoryBarrier barrier{ VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER };
barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
barrier.buffer = localBuf;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.offset = localOffset;
barrier.size = localSize;
vkCmdPipelineBarrier(cmdInit, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
0, 0, nullptr, 1, &barrier, 0, nullptr);
entry->vkTex->UploadMip(cmdInit, i, mipWidth, mipHeight, localBuf, localOffset, stride / bpp);
} else {
data = drawEngine_->GetPushBufferForTextureData()->PushAligned(size, &bufferOffset, &texBuf, pushAlignment);
LoadTextureLevel(*entry, (uint8_t *)data, stride, i, scaleFactor, dstFmt);
entry->vkTex->UploadMip(cmdInit, i, mipWidth, mipHeight, texBuf, bufferOffset, stride / bpp);
}
}
if (replacer_.Enabled()) {
replacer_.NotifyTextureDecoded(replacedInfo, data, stride, i, mipWidth, mipHeight);
// When hardware texture scaling is enabled, this saves the original.
int w = dataScaled ? mipWidth : mipWidth / scaleFactor;
int h = dataScaled ? mipHeight : mipHeight / scaleFactor;
replacer_.NotifyTextureDecoded(replacedInfo, data, stride, i, w, h);
}
}
entry->vkTex->UploadMip(cmdInit, i, mipWidth, mipHeight, texBuf, bufferOffset, stride / bpp);
}
if (maxLevel == 0) {
@ -710,7 +1264,7 @@ void TextureCacheVulkan::BuildTexture(TexCacheEntry *const entry) {
if (replaced.Valid()) {
entry->SetAlphaStatus(TexCacheEntry::TexStatus(replaced.AlphaStatus()));
}
entry->vkTex->EndCreate(cmdInit);
entry->vkTex->EndCreate(cmdInit, false, computeUpload ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
}
gstate_c.SetTextureFullAlpha(entry->GetAlphaStatus() == TexCacheEntry::STATUS_ALPHA_FULL);

View File

@ -25,6 +25,7 @@
#include "Common/Vulkan/VulkanContext.h"
#include "GPU/Vulkan/TextureScalerVulkan.h"
#include "GPU/Common/TextureCacheCommon.h"
#include "GPU/Vulkan/VulkanUtil.h"
struct VirtualFramebuffer;
class FramebufferManagerVulkan;
@ -126,6 +127,8 @@ private:
VulkanDeviceAllocator *allocator_ = nullptr;
VulkanPushBuffer *push_ = nullptr;
VulkanComputeShaderManager computeShaderManager_;
SamplerCache samplerCache_;
TextureScalerVulkan scaler;
@ -142,6 +145,9 @@ private:
DrawEngineVulkan *drawEngine_;
Vulkan2D *vulkan2D_;
VkShaderModule uploadCS_ = VK_NULL_HANDLE;
VkShaderModule copyCS_ = VK_NULL_HANDLE;
// Bound state to emulate an API similar to the others
VkImageView imageView_ = VK_NULL_HANDLE;
VkSampler curSampler_ = VK_NULL_HANDLE;

View File

@ -16,6 +16,7 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "base/basictypes.h"
#include "base/stringutil.h"
#include "Common/Log.h"
#include "Common/Vulkan/VulkanContext.h"
#include "GPU/Vulkan/VulkanUtil.h"
@ -51,17 +52,14 @@ void Vulkan2D::DestroyDeviceObjects() {
VkDevice device = vulkan_->GetDevice();
if (descriptorSetLayout_ != VK_NULL_HANDLE) {
vulkan_->Delete().QueueDeleteDescriptorSetLayout(descriptorSetLayout_);
descriptorSetLayout_ = VK_NULL_HANDLE;
}
if (pipelineLayout_ != VK_NULL_HANDLE) {
vulkan_->Delete().QueueDeletePipelineLayout(pipelineLayout_);
pipelineLayout_ = VK_NULL_HANDLE;
}
// pipelineBasicTex_ and pipelineBasicTex_ come from vulkan2D_.
if (pipelineCache_ != VK_NULL_HANDLE) {
vulkan_->Delete().QueueDeletePipelineCache(pipelineCache_);
pipelineCache_ = VK_NULL_HANDLE;
}
}
@ -388,7 +386,7 @@ VkShaderModule CompileShaderModule(VulkanContext *vulkan, VkShaderStageFlagBits
ERROR_LOG(G3D, "Error in shader compilation!");
}
ERROR_LOG(G3D, "Messages: %s", error->c_str());
ERROR_LOG(G3D, "Shader source:\n%s", code);
ERROR_LOG(G3D, "Shader source:\n%s", LineNumberString(code).c_str());
OutputDebugStringUTF8("Messages:\n");
OutputDebugStringUTF8(error->c_str());
return VK_NULL_HANDLE;
@ -401,3 +399,169 @@ VkShaderModule CompileShaderModule(VulkanContext *vulkan, VkShaderStageFlagBits
}
}
}
VulkanComputeShaderManager::VulkanComputeShaderManager(VulkanContext *vulkan) : vulkan_(vulkan), pipelines_(8) {
}
VulkanComputeShaderManager::~VulkanComputeShaderManager() {}
void VulkanComputeShaderManager::InitDeviceObjects() {
VkPipelineCacheCreateInfo pc{ VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO };
VkResult res = vkCreatePipelineCache(vulkan_->GetDevice(), &pc, nullptr, &pipelineCache_);
assert(VK_SUCCESS == res);
VkDescriptorSetLayoutBinding bindings[3] = {};
bindings[0].descriptorCount = 1;
bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
bindings[0].binding = 0;
bindings[1].descriptorCount = 1;
bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
bindings[1].binding = 1;
bindings[2].descriptorCount = 1;
bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[2].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
bindings[2].binding = 2;
VkDevice device = vulkan_->GetDevice();
VkDescriptorSetLayoutCreateInfo dsl = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
dsl.bindingCount = ARRAY_SIZE(bindings);
dsl.pBindings = bindings;
res = vkCreateDescriptorSetLayout(device, &dsl, nullptr, &descriptorSetLayout_);
assert(VK_SUCCESS == res);
VkDescriptorPoolSize dpTypes[2];
dpTypes[0].descriptorCount = 8192;
dpTypes[0].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
dpTypes[1].descriptorCount = 4096;
dpTypes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
VkDescriptorPoolCreateInfo dp = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
dp.flags = 0; // Don't want to mess around with individually freeing these, let's go fixed each frame and zap the whole array. Might try the dynamic approach later.
dp.maxSets = 4096; // GTA can end up creating more than 1000 textures in the first frame!
dp.pPoolSizes = dpTypes;
dp.poolSizeCount = ARRAY_SIZE(dpTypes);
for (int i = 0; i < ARRAY_SIZE(frameData_); i++) {
VkResult res = vkCreateDescriptorPool(vulkan_->GetDevice(), &dp, nullptr, &frameData_[i].descPool);
assert(VK_SUCCESS == res);
}
VkPushConstantRange push = {};
push.offset = 0;
push.size = 16;
push.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
VkPipelineLayoutCreateInfo pl = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
pl.pPushConstantRanges = &push;
pl.pushConstantRangeCount = 1;
pl.setLayoutCount = 1;
pl.pSetLayouts = &descriptorSetLayout_;
pl.flags = 0;
res = vkCreatePipelineLayout(device, &pl, nullptr, &pipelineLayout_);
assert(VK_SUCCESS == res);
}
void VulkanComputeShaderManager::DestroyDeviceObjects() {
for (int i = 0; i < ARRAY_SIZE(frameData_); i++) {
vulkan_->Delete().QueueDeleteDescriptorPool(frameData_[i].descPool);
}
if (descriptorSetLayout_) {
vulkan_->Delete().QueueDeleteDescriptorSetLayout(descriptorSetLayout_);
}
pipelines_.Iterate([&](const PipelineKey &key, VkPipeline pipeline) {
vulkan_->Delete().QueueDeletePipeline(pipeline);
});
pipelines_.Clear();
if (pipelineLayout_) {
vulkan_->Delete().QueueDeletePipelineLayout(pipelineLayout_);
}
if (pipelineCache_ != VK_NULL_HANDLE) {
vulkan_->Delete().QueueDeletePipelineCache(pipelineCache_);
}
}
VkDescriptorSet VulkanComputeShaderManager::GetDescriptorSet(VkImageView image, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize range, VkBuffer buffer2, VkDeviceSize offset2, VkDeviceSize range2) {
int curFrame = vulkan_->GetCurFrame();
FrameData &frameData = frameData_[curFrame];
frameData_[curFrame].numDescriptors++;
VkDescriptorSet desc;
VkDescriptorSetAllocateInfo descAlloc = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
descAlloc.pSetLayouts = &descriptorSetLayout_;
descAlloc.descriptorPool = frameData.descPool;
descAlloc.descriptorSetCount = 1;
VkResult result = vkAllocateDescriptorSets(vulkan_->GetDevice(), &descAlloc, &desc);
assert(result == VK_SUCCESS);
VkWriteDescriptorSet writes[3]{};  // image plus up to two buffers
int n = 0;
VkDescriptorImageInfo imageInfo = {};
VkDescriptorBufferInfo bufferInfo[2] = {};
if (image) {
imageInfo.imageLayout = VK_IMAGE_LAYOUT_GENERAL;
imageInfo.imageView = image;
imageInfo.sampler = VK_NULL_HANDLE;
writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
writes[n].dstBinding = 0;
writes[n].pImageInfo = &imageInfo;
writes[n].descriptorCount = 1;
writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
writes[n].dstSet = desc;
n++;
}
bufferInfo[0].buffer = buffer;
bufferInfo[0].offset = offset;
bufferInfo[0].range = range;
writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
writes[n].dstBinding = 1;
writes[n].pBufferInfo = &bufferInfo[0];
writes[n].descriptorCount = 1;
writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
writes[n].dstSet = desc;
n++;
if (buffer2) {
bufferInfo[1].buffer = buffer2;
bufferInfo[1].offset = offset2;
bufferInfo[1].range = range2;
writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
writes[n].dstBinding = 2;
writes[n].pBufferInfo = &bufferInfo[1];
writes[n].descriptorCount = 1;
writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
writes[n].dstSet = desc;
n++;
}
vkUpdateDescriptorSets(vulkan_->GetDevice(), n, writes, 0, nullptr);
return desc;
}
VkPipeline VulkanComputeShaderManager::GetPipeline(VkShaderModule cs) {
PipelineKey key{ cs };
VkPipeline pipeline = pipelines_.Get(key);
if (pipeline)
return pipeline;
VkComputePipelineCreateInfo pci{ VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO };
pci.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
pci.stage.module = cs;
pci.stage.pName = "main";
pci.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
pci.layout = pipelineLayout_;
pci.flags = 0;
vkCreateComputePipelines(vulkan_->GetDevice(), pipelineCache_, 1, &pci, nullptr, &pipeline);
pipelines_.Insert(key, pipeline);
return pipeline;
}
void VulkanComputeShaderManager::BeginFrame() {
int curFrame = vulkan_->GetCurFrame();
FrameData &frame = frameData_[curFrame];
frameData_[curFrame].numDescriptors = 0;
vkResetDescriptorPool(vulkan_->GetDevice(), frame.descPool, 0);
}
void VulkanComputeShaderManager::EndFrame() {
}

View File

@ -20,6 +20,7 @@
#include <tuple>
#include <map>
#include "Common/Hashmaps.h"
#include "Common/Vulkan/VulkanContext.h"
#include "Common/Vulkan/VulkanLoader.h"
#include "Common/Vulkan/VulkanImage.h"
@ -124,5 +125,52 @@ private:
std::vector<VkPipeline> keptPipelines_;
};
// Manager for compute shaders that upload or convert texture data. The layout has three bindings: a storage image to write to and two storage buffers (a source, plus an optional destination for the copy path).
class VulkanComputeShaderManager {
public:
VulkanComputeShaderManager(VulkanContext *vulkan);
~VulkanComputeShaderManager();
void DeviceLost() {
DestroyDeviceObjects();
}
void DeviceRestore(VulkanContext *vulkan) {
vulkan_ = vulkan;
InitDeviceObjects();
}
// Note: This doesn't cache. The descriptor is for immediate use only.
VkDescriptorSet GetDescriptorSet(VkImageView image, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize range, VkBuffer buffer2 = VK_NULL_HANDLE, VkDeviceSize offset2 = 0, VkDeviceSize range2 = 0);
// This of course caches though.
VkPipeline GetPipeline(VkShaderModule cs);
VkPipelineLayout GetPipelineLayout() const { return pipelineLayout_; }
void BeginFrame();
void EndFrame();
private:
void InitDeviceObjects();
void DestroyDeviceObjects();
VulkanContext *vulkan_ = nullptr;
VkDescriptorSetLayout descriptorSetLayout_ = VK_NULL_HANDLE;
VkPipelineLayout pipelineLayout_ = VK_NULL_HANDLE;
VkPipelineCache pipelineCache_ = VK_NULL_HANDLE;
struct FrameData {
VkDescriptorPool descPool;
int numDescriptors;
};
FrameData frameData_[VulkanContext::MAX_INFLIGHT_FRAMES];
struct PipelineKey {
VkShaderModule module;
};
DenseHashMap<PipelineKey, VkPipeline, (VkPipeline)VK_NULL_HANDLE> pipelines_;
};
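
Typical per-dispatch usage, mirroring the dispatchCompute lambda in TextureCacheVulkan::BuildTexture above; a sketch only (the wrapper function and its parameter list are invented for illustration):

void DispatchComputeUpload(VkCommandBuffer cmd, VulkanComputeShaderManager &mgr,
		VkShaderModule cs, VkImageView view, VkBuffer buf, VkDeviceSize offset,
		VkDeviceSize range, int width, int height, int scale, int fmt) {
	// Descriptors are throwaway (the pool is reset each frame); pipelines are cached per module.
	VkDescriptorSet descSet = mgr.GetDescriptorSet(view, buf, offset, range);
	vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, mgr.GetPipeline(cs));
	vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, mgr.GetPipelineLayout(),
		0, 1, &descSet, 0, nullptr);
	// Must match the 16-byte push constant range and the shaders' Params block.
	struct { int width, height, scale, fmt; } params{ width, height, scale, fmt };
	vkCmdPushConstants(cmd, mgr.GetPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
		0, sizeof(params), &params);
	vkCmdDispatch(cmd, (width + 15) / 16, (height + 15) / 16, 1);  // 16x16 workgroups
}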
VkShaderModule CompileShaderModule(VulkanContext *vulkan, VkShaderStageFlagBits stage, const char *code, std::string *error);