[jak2] speed up the sky texture animation (#2829)

This saved about 1.6 ms per frame in the city for me (~1.3 ms saved from
not doing the sky twice, ~0.3 ms saved by using format lookup tables).

The big texture animator is about 1.0 ms.

![image](https://github.com/open-goal/jak-project/assets/48171810/c7bc7743-308c-4425-ad14-118e2d483fad)
This commit is contained in:
water111 2023-07-15 11:06:32 -04:00 committed by GitHub
parent 6f244b11ef
commit d80b1b8119
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 180 additions and 50 deletions

View File

@ -59,7 +59,8 @@ inline u32 psmt8_addr(u32 x, u32 y, u32 width) {
// column is 16, 4
// first determine the page
u32 pages_per_row = width / 128;
// Note: not actually sure what the GS does here...
u32 pages_per_row = std::max(1u, width / 128);
u32 page_col = x / 128;
u32 page_row = y / 64;
u32 page_x = x % 128;

View File

@ -818,6 +818,11 @@ void OpenGLRenderer::draw_renderer_selection_window() {
ImGui::Checkbox("Occlusion Cull", &m_render_state.use_occlusion_culling);
ImGui::Checkbox("Blackout Loads", &m_enable_fast_blackout_loads);
if (m_texture_animator && ImGui::TreeNode("Texture Animator")) {
m_texture_animator->draw_debug_window();
ImGui::TreePop();
}
for (size_t i = 0; i < m_bucket_renderers.size(); i++) {
auto renderer = m_bucket_renderers[i].get();
if (renderer && !renderer->empty()) {

View File

@ -7,6 +7,8 @@
#include "game/graphics/texture/TexturePool.h"
#include "third-party/imgui/imgui.h"
//#define dprintf(...) printf(__VA_ARGS__)
//#define dfmt(...) fmt::print(__VA_ARGS__)
#define dprintf(...)
@ -193,7 +195,11 @@ GLuint ClutBlender::run(const float* weights) {
}
TextureAnimator::TextureAnimator(ShaderLibrary& shaders, const tfrag3::Level* common_level)
: m_common_level(common_level) {
: m_common_level(common_level),
m_psm32_to_psm8_8_8(8, 8, 8, 64),
m_psm32_to_psm8_16_16(16, 16, 16, 64),
m_psm32_to_psm8_32_32(32, 32, 16, 64),
m_psm32_to_psm8_64_64(64, 64, 64, 64) {
glGenVertexArrays(1, &m_vao);
glGenBuffers(1, &m_vertex_buffer);
glBindVertexArray(m_vao);
@ -297,6 +303,10 @@ TextureAnimator::TextureAnimator(ShaderLibrary& shaders, const tfrag3::Level* co
"-start", "-end", {});
}
// Draws the texture animator's debug controls with ImGui. Currently this is a single
// "fast-scrambler" checkbox that edits m_debug.use_fast_scrambler in place.
void TextureAnimator::draw_debug_window() {
  bool* const fast_scrambler_toggle = &m_debug.use_fast_scrambler;
  ImGui::Checkbox("fast-scrambler", fast_scrambler_toggle);
}
int TextureAnimator::create_clut_blender_group(const std::vector<std::string>& textures,
const std::string& suffix0,
const std::string& suffix1,
@ -1131,30 +1141,75 @@ GLuint TextureAnimator::make_or_get_gpu_texture_for_current_shader(TexturePool&
switch (m_current_shader.tex0.psm()) {
// reading as a different format, needs scrambler.
case GsTex0::PSM::PSMT8: {
auto p = scoped_prof("scrambler");
int w = 1 << m_current_shader.tex0.tw();
int h = 1 << m_current_shader.tex0.th();
ASSERT(w == vram_entry->tex_width * 2);
ASSERT(h == vram_entry->tex_height * 2);
ASSERT(m_current_shader.tex0.tbw() == 1);
std::vector<u32> rgba_data(w * h);
Timer timer;
m_converter.upload_width(vram_entry->data.data(), m_current_shader.tex0.tbp0(),
vram_entry->tex_width, vram_entry->tex_height);
// also needs clut lookup
load_clut_to_converter();
{
std::vector<u32> rgba_data(w * h);
m_converter.download_rgba8888(
(u8*)rgba_data.data(), m_current_shader.tex0.tbp0(), m_current_shader.tex0.tbw(), w,
h, (int)m_current_shader.tex0.psm(), (int)m_current_shader.tex0.cpsm(),
m_current_shader.tex0.cbp(), rgba_data.size() * 4);
// file_util::write_rgba_png("out.png", rgba_data.data(), 1 <<
// m_current_shader.tex0.tw(),
// 1 << m_current_shader.tex0.th());
dprintf("processing %d x %d took %.3f ms\n", w, h, timer.getMs());
return make_temp_gpu_texture(rgba_data.data(), w, h);
const auto& clut_lookup = m_textures.find(m_current_shader.tex0.cbp());
if (clut_lookup == m_textures.end()) {
printf("set shader referenced an unknown clut texture in %d\n",
m_current_shader.tex0.cbp());
ASSERT_NOT_REACHED();
}
switch (clut_lookup->second.kind) {
case VramEntry::Kind::CLUT16_16_IN_PSM32:
break;
default:
printf("unhandled clut source kind: %d\n", (int)clut_lookup->second.kind);
ASSERT_NOT_REACHED();
}
const u32* clut_u32s = (const u32*)clut_lookup->second.data.data();
if (w == 8 && h == 8 && m_debug.use_fast_scrambler) {
ASSERT_NOT_REACHED();
} else if (w == 16 && h == 16) {
for (int i = 0; i < 16 * 16; i++) {
memcpy(&rgba_data[m_psm32_to_psm8_8_8.destinations_per_byte[i]],
&clut_u32s[m_clut_table.addrs[vram_entry->data[i]]], 4);
}
} else if (w == 32 && h == 32 && m_debug.use_fast_scrambler) {
for (int i = 0; i < 32 * 32; i++) {
rgba_data[m_psm32_to_psm8_16_16.destinations_per_byte[i]] =
clut_u32s[m_clut_table.addrs[vram_entry->data[i]]];
}
} else if (w == 64 && h == 64 && m_debug.use_fast_scrambler) {
for (int i = 0; i < 64 * 64; i++) {
rgba_data[m_psm32_to_psm8_32_32.destinations_per_byte[i]] =
clut_u32s[m_clut_table.addrs[vram_entry->data[i]]];
}
} else if (w == 128 && h == 128 && m_debug.use_fast_scrambler) {
for (int i = 0; i < 128 * 128; i++) {
rgba_data[m_psm32_to_psm8_64_64.destinations_per_byte[i]] =
clut_u32s[m_clut_table.addrs[vram_entry->data[i]]];
}
} else {
Timer timer;
m_converter.upload_width(vram_entry->data.data(), m_current_shader.tex0.tbp0(),
vram_entry->tex_width, vram_entry->tex_height);
// also needs clut lookup
load_clut_to_converter();
{
m_converter.download_rgba8888(
(u8*)rgba_data.data(), m_current_shader.tex0.tbp0(), m_current_shader.tex0.tbw(),
w, h, (int)m_current_shader.tex0.psm(), (int)m_current_shader.tex0.cpsm(),
m_current_shader.tex0.cbp(), rgba_data.size() * 4);
// file_util::write_rgba_png("out.png", rgba_data.data(), 1 <<
// m_current_shader.tex0.tw(),
// 1 << m_current_shader.tex0.th());
printf("Scrambler took the slow path %d x %d took %.3f ms\n", w, h, timer.getMs());
}
}
auto ret = make_temp_gpu_texture(rgba_data.data(), w, h);
// debug_save_opengl_texture(fmt::format("tex_{}.png", w), ret);
return ret;
ASSERT_NOT_REACHED();
} break;
default:
@ -1165,26 +1220,6 @@ GLuint TextureAnimator::make_or_get_gpu_texture_for_current_shader(TexturePool&
case VramEntry::Kind::CLUT16_16_IN_PSM32:
ASSERT_NOT_REACHED();
/*
case VramEntry::Kind::GENERIC_PSMT8: {
fmt::print("drawing: {}\n", (int)m_current_shader.tex0.psm());
ASSERT(m_current_shader.tex0.psm() == GsTex0::PSM::PSMT8);
ASSERT(m_current_shader.tex0.cpsm() == 0); // psm32.
int tw = 1 << m_current_shader.tex0.tw();
int th = 1 << m_current_shader.tex0.th();
ASSERT(tw == vram_entry->tex_width);
ASSERT(th == vram_entry->tex_height);
std::vector<u32> rgba_data(tw * th);
const u32* clut = get_current_clut_16_16_psm32();
for (int r = 0; r < th; r++) {
for (int c = 0; c < tw; c++) {
rgba_data[c + r * tw] = clut[vram_entry->data[c + r * tw]];
}
}
return make_temp_gpu_texture(rgba_data.data(), tw, th);
}
*/
break;
default:
ASSERT_NOT_REACHED();

View File

@ -9,6 +9,7 @@
#include "common/dma/dma_chain_read.h"
#include "common/dma/gs.h"
#include "common/math/Vector.h"
#include "common/texture/texture_conversion.h"
#include "game/graphics/opengl_renderer/Shader.h"
#include "game/graphics/opengl_renderer/opengl_utils.h"
@ -80,6 +81,79 @@ class ClutBlender {
std::vector<u32> m_temp_rgba;
};
// Precomputed table for reading data that was written as a w x h 32-bit texture (addressed with
// psmct32_addr) back as a (2*w) x (2*h) 8-bit texture (addressed with psmt8_addr).
//
// After construction, destinations_per_byte has 4 * w * h entries. Entry
//   channel + 4 * x + 4 * w * y
// (i.e. byte `channel` of input texel (x, y), row-major) holds the row-major index
//   out_x + out_y * (2 * w)
// of the 8-bit output texel that lands on the same simulated VRAM address.
//
// NOTE(review): both address helpers are assumed to return byte addresses that fall inside
// [0, 4 * w * h) for the sizes used; anything outside makes vector::at throw.
struct Psm32ToPsm8Scrambler {
  Psm32ToPsm8Scrambler(int w, int h, int write_tex_width, int read_tex_width) {
    // What is known about a single byte of the simulated VRAM region.
    // -1 means "nobody touched this byte".
    struct VramByte {
      int src_x = -1, src_y = -1, src_channel = -1;  // 32-bit texel/channel that writes here
      int dst_x = -1, dst_y = -1;                    // 8-bit texel that reads from here
    };

    const int num_bytes = w * h * 4;
    std::vector<VramByte> vram(num_bytes);

    // Pass 1: record, for every byte of every input texel, where the 32-bit write puts it.
    for (int py = 0; py < h; ++py) {
      for (int px = 0; px < w; ++px) {
        const int texel_base = psmct32_addr(px, py, write_tex_width);
        for (int channel = 0; channel < 4; ++channel) {
          VramByte& slot = vram.at(texel_base + channel);
          slot.src_x = px;
          slot.src_y = py;
          slot.src_channel = channel;
        }
      }
    }

    // Pass 2: record which 8-bit output texel reads each byte. The output is twice as wide
    // and twice as tall as the input.
    const int out_w = w * 2;
    const int out_h = h * 2;
    for (int py = 0; py < out_h; ++py) {
      for (int px = 0; px < out_w; ++px) {
        const int texel_addr = psmt8_addr(px, py, read_tex_width);
        VramByte& slot = vram.at(texel_addr);
        slot.dst_x = px;
        slot.dst_y = py;
      }
    }

    // Pass 3: join the two views through the shared VRAM address.
    destinations_per_byte.resize(num_bytes);
    for (const VramByte& slot : vram) {
      if (slot.src_channel < 0) {
        continue;  // no input byte was written to this address
      }
      const int input_byte_index = slot.src_channel + slot.src_x * 4 + slot.src_y * 4 * w;
      const int output_texel_index = slot.dst_x + slot.dst_y * out_w;
      destinations_per_byte.at(input_byte_index) = output_texel_index;
    }
  }

  // Input byte index -> output (8-bit) texel index; see the comment on the struct.
  std::vector<int> destinations_per_byte;
};
struct ClutReader {
std::array<int, 256> addrs;
ClutReader() {
for (int i = 0; i < 256; i++) {
u32 clut_chunk = i / 16;
u32 off_in_chunk = i % 16;
u8 clx = 0, cly = 0;
if (clut_chunk & 1) {
clx = 8;
}
cly = (clut_chunk >> 1) * 2;
if (off_in_chunk >= 8) {
off_in_chunk -= 8;
cly++;
}
clx += off_in_chunk;
// the x, y CLUT value is looked up in PSMCT32 mode
u32 clut_addr = clx + cly * 16;
ASSERT(clut_addr < 256);
addrs[i] = clut_addr;
}
}
};
class TexturePool;
class TextureAnimator {
@ -88,6 +162,7 @@ class TextureAnimator {
~TextureAnimator();
void handle_texture_anim_data(DmaFollower& dma, const u8* ee_mem, TexturePool* texture_pool);
GLuint get_by_slot(int idx);
void draw_debug_window();
const std::vector<GLuint>* slots() { return &m_output_slots; }
private:
@ -170,6 +245,10 @@ class TextureAnimator {
GLuint tcc;
} m_uniforms;
// Runtime debug toggles, editable from draw_debug_window().
struct {
// When true (the default), use the precomputed scrambler tables where a branch checks this
// flag; when false, those branches fall back to the m_converter path.
bool use_fast_scrambler = true;
} m_debug;
GLuint m_shader_id;
GLuint m_dummy_texture;
@ -201,15 +280,7 @@ class TextureAnimator {
const std::optional<std::string>& dgo);
void run_clut_blender_group(DmaTransfer& tf, int idx);
// std::vector<ClutBlender> m_darkjak_blenders;
// std::vector<int> m_darkjak_output_slots;
//
// std::vector<ClutBlender> m_jakb_prison_blenders;
// std::vector<int> m_jakb_prison_output_slots;
//
// std::vector<ClutBlender> m_jakb_oracle_blenders;
// std::vector<int> m_jakb_oracle_slots;
//
// std::vector<ClutBlender> m_jakb_nest_blenders;
// std::vector<int> m_jakb_nest_slots;
// Precomputed PSMCT32 -> PSMT8 scramble tables. The _W_H suffix is the (w, h) passed to each
// Psm32ToPsm8Scrambler constructor, i.e. the size of the 32-bit source; the 8-bit result is
// twice as wide and twice as tall.
Psm32ToPsm8Scrambler m_psm32_to_psm8_8_8, m_psm32_to_psm8_16_16, m_psm32_to_psm8_32_32,
m_psm32_to_psm8_64_64;
// Palette index -> CLUT entry index table, built once in the ClutReader constructor.
ClutReader m_clut_table;
};

View File

@ -2745,3 +2745,14 @@ This commonly includes things such as:
:size-assert #x80
:flag-assert #xf00000080
)
;; Debug helper macro: expands to a dotimes loop that calls birth-pickup-at-point `count` times
;; (keyword argument, default 5) with (pickup-type gem).
;; Each pickup is placed at (target-pos 0) plus a static offset of (meters 2.0) on the y axis.
;; Note that the expansion binds a loop variable named `i`.
;; NOTE(review): the remaining arguments (1.0, #t, *entity-pool*, and a #f fact-info) are passed
;; through as-is; see birth-pickup-at-point for their meaning.
(defmacro spawn-gem-near-target! (&key (count 5))
`(dotimes (i ,count)
(birth-pickup-at-point
(vector+! (new 'stack 'vector) (target-pos 0) (new 'static 'vector :y (meters 2.0)))
(pickup-type gem)
1.0
#t
*entity-pool*
(the fact-info #f))))

View File

@ -583,6 +583,13 @@
(cond
((or (= anim-array *sky-texture-anim-array*)
)
(when (= bucket (bucket-id tex-lcom-sky-post))
;; skip. I believe this is only used to generate the envmap texture for the ocean.
;; it generates the exact same thing, so if we want this on PC one day, we can just
;; steal it from the beginning of the frame.
(return #f)
)
;; for sky, we basically emulate the full thing
;; (format *stdcon* "doing sky to bucket ~d~%" bucket)
)