diff --git a/common/dma/dma_copy.cpp b/common/dma/dma_copy.cpp index e1049b3ec..898d98c5f 100644 --- a/common/dma/dma_copy.cpp +++ b/common/dma/dma_copy.cpp @@ -74,6 +74,15 @@ FixedChunkDmaCopier::FixedChunkDmaCopier(u32 main_memory_size) m_chunk_mask.resize(m_chunk_count); } +void FixedChunkDmaCopier::set_input_data(const void* memory, u32 offset, bool run_copy) { + if (run_copy) { + run(memory, offset, false); + } else { + m_input_offset = offset; + m_input_data = memory; + } +} + const DmaData& FixedChunkDmaCopier::run(const void* memory, u32 offset, bool verify) { Timer timer; m_input_offset = offset; diff --git a/common/dma/dma_copy.h b/common/dma/dma_copy.h index 781ff3e72..db1053c9a 100644 --- a/common/dma/dma_copy.h +++ b/common/dma/dma_copy.h @@ -22,6 +22,8 @@ class FixedChunkDmaCopier { static constexpr u32 chunk_size = 0x20000; // 128 kB, gives use 1024 chunks for a 128 MB RAM. FixedChunkDmaCopier(u32 main_memory_size); + void set_input_data(const void* memory, u32 offset, bool run); + const DmaData& run(const void* memory, u32 offset, bool verify = false); void serialize_last_result(Serializer& serializer); diff --git a/common/dma/gs.cpp b/common/dma/gs.cpp index a320caa32..55beacfc9 100644 --- a/common/dma/gs.cpp +++ b/common/dma/gs.cpp @@ -389,6 +389,21 @@ std::string DrawMode::to_string() const { case AlphaBlend::DISABLED: result += "disabled\n"; break; + case AlphaBlend::SRC_DST_FIX_DST: + result += "src, dst, fix, dst\n"; + break; + case AlphaBlend::SRC_0_DST_DST: + result += "src, 0, dst, dst\n"; + break; + case AlphaBlend::SRC_SRC_SRC_SRC: + result += "src, src, src, src\n"; + break; + case AlphaBlend::ZERO_SRC_SRC_DST: + result += "0, src, src, dst\n"; + break; + case AlphaBlend::SRC_0_FIX_DST: + result += "src, 0, fix, dst\n"; + break; default: ASSERT(false); } @@ -409,7 +424,8 @@ std::string DrawMode::to_string() const { result += "never\n"; break; default: - ASSERT(false); + result += "invalid!\n"; + break; } result += fmt::format(" zte: {}\n", get_zt_enable()); result += fmt::format(" abe: {}\n", get_ab_enable()); @@ -430,5 +446,6 @@ std::string DrawMode::to_string() const { default: ASSERT(false); } + result += fmt::format(" fog: {}\n decal: {}\n", get_fog_enable(), get_decal()); return result; } \ No newline at end of file diff --git a/common/dma/gs.h b/common/dma/gs.h index 73c380532..08a0375b6 100644 --- a/common/dma/gs.h +++ b/common/dma/gs.h @@ -365,6 +365,8 @@ class DrawMode { SRC_0_FIX_DST = 3, // fix = 128 SRC_DST_FIX_DST = 4, // fix = 64 ZERO_SRC_SRC_DST = 5, + SRC_SRC_SRC_SRC = 6, + SRC_0_DST_DST = 7 }; enum class AlphaTest { @@ -483,8 +485,27 @@ class DrawMode { bool get_decal() const { return !(m_val & (1 << 28)); } void enable_decal() { m_val = m_val & (~(1 << 28)); } void disable_decal() { m_val = m_val | (1 << 28); } + void set_decal(bool en) { + if (en) { + enable_decal(); + } else { + disable_decal(); + } + } + + bool get_fog_enable() const { return m_val & (1 << 29); } + void enable_fog() { m_val = m_val | (1 << 29); } + void disable_fog() { m_val = m_val & (~(1 << 29)); } + void set_fog(bool en) { + if (en) { + enable_fog(); + } else { + disable_fog(); + } + } u32& as_int() { return m_val; } + const u32& as_int() const { return m_val; } bool operator==(const DrawMode& other) const { return m_val == other.m_val; } bool operator!=(const DrawMode& other) const { return m_val != other.m_val; } @@ -508,5 +529,6 @@ class DrawMode { // 23 t clamp // 24 - 27 alpha blend // 28 !decal + // 29 fge u32 m_val = UINT32_MAX; }; diff --git a/game/CMakeLists.txt b/game/CMakeLists.txt index 039af887d..491c68d58 100644 --- a/game/CMakeLists.txt +++ b/game/CMakeLists.txt @@ -77,6 +77,7 @@ set(RUNTIME_SOURCE graphics/opengl_renderer/BucketRenderer.cpp graphics/opengl_renderer/debug_gui.cpp graphics/opengl_renderer/DirectRenderer.cpp + graphics/opengl_renderer/DirectRenderer2.cpp graphics/opengl_renderer/EyeRenderer.cpp graphics/opengl_renderer/GenericProgram.cpp graphics/opengl_renderer/GenericRenderer.cpp diff --git a/game/graphics/opengl_renderer/BucketRenderer.h b/game/graphics/opengl_renderer/BucketRenderer.h index f3d800b95..3ab4ae537 100644 --- a/game/graphics/opengl_renderer/BucketRenderer.h +++ b/game/graphics/opengl_renderer/BucketRenderer.h @@ -92,6 +92,7 @@ struct SharedRenderState { bool render_debug = false; bool enable_merc_xgkick = true; bool enable_generic_xgkick = true; + bool use_direct2 = true; math::Vector fog_color; float fog_intensity = 1.f; @@ -116,6 +117,7 @@ class BucketRenderer { virtual bool empty() const { return false; } virtual void draw_debug_window() = 0; virtual void serialize(Serializer&) {} + virtual void init_shaders(ShaderLibrary&) {} protected: std::string m_name; diff --git a/game/graphics/opengl_renderer/DirectRenderer2.cpp b/game/graphics/opengl_renderer/DirectRenderer2.cpp new file mode 100644 index 000000000..130586128 --- /dev/null +++ b/game/graphics/opengl_renderer/DirectRenderer2.cpp @@ -0,0 +1,757 @@ +#include "DirectRenderer2.h" +#include "third-party/imgui/imgui.h" +#include "common/log/log.h" +#include + +DirectRenderer2::DirectRenderer2(u32 max_verts, + u32 max_inds, + u32 max_draws, + const std::string& name) + : m_name(name) { + // allocate buffers + m_vertices.vertices.resize(max_verts); + m_vertices.indices.resize(max_inds); + m_draw_buffer.resize(max_draws); + + // create OpenGL objects + glGenBuffers(1, &m_ogl.vertex_buffer); + glGenBuffers(1, &m_ogl.index_buffer); + glGenVertexArrays(1, &m_ogl.vao); + + // set up the vertex array + glBindVertexArray(m_ogl.vao); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, m_ogl.index_buffer); + glBufferData(GL_ELEMENT_ARRAY_BUFFER, max_inds * sizeof(u32), nullptr, GL_STREAM_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, m_ogl.vertex_buffer); + glBufferData(GL_ARRAY_BUFFER, max_verts * sizeof(Vertex), nullptr, GL_STREAM_DRAW); + + // xyz + glEnableVertexAttribArray(0); + glVertexAttribPointer(0, // location 0 in the shader + 3, // 3 floats per vert + GL_FLOAT, // floats + GL_TRUE, // normalized, ignored, + sizeof(Vertex), // + (void*)offsetof(Vertex, xyz) // offset in array + ); + + // rgba + glEnableVertexAttribArray(1); + glVertexAttribPointer(1, // location 1 in the shader + 4, // 4 color components + GL_UNSIGNED_BYTE, // u8 + GL_TRUE, // normalized (255 becomes 1) + sizeof(Vertex), // + (void*)offsetof(Vertex, rgba) // + ); + + // stq + glEnableVertexAttribArray(2); + glVertexAttribPointer(2, // location 2 in the shader + 3, // 3 floats per vert + GL_FLOAT, // floats + GL_FALSE, // normalized, ignored + sizeof(Vertex), // + (void*)offsetof(Vertex, stq) // offset in array + ); + + // byte data + glEnableVertexAttribArray(3); + glVertexAttribIPointer(3, // location 0 in the shader + 4, // 3 floats per vert + GL_UNSIGNED_BYTE, // u8's + sizeof(Vertex), // + (void*)offsetof(Vertex, tex_unit) // offset in array + ); + + glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindVertexArray(0); +} + +DirectRenderer2::~DirectRenderer2() { + glDeleteBuffers(1, &m_ogl.vertex_buffer); + glDeleteBuffers(1, &m_ogl.index_buffer); + glDeleteVertexArrays(1, &m_ogl.vao); +} + +void DirectRenderer2::init_shaders(ShaderLibrary& shaders) { + shaders[ShaderId::DIRECT2].activate(); + m_ogl.alpha_reject = glGetUniformLocation(shaders[ShaderId::DIRECT2].id(), "alpha_reject"); + m_ogl.color_mult = glGetUniformLocation(shaders[ShaderId::DIRECT2].id(), "color_mult"); + m_ogl.fog_color = glGetUniformLocation(shaders[ShaderId::DIRECT2].id(), "fog_color"); +} + +void DirectRenderer2::reset_buffers() { + m_next_free_draw = 0; + m_vertices.next_index = 0; + m_vertices.next_vertex = 0; + m_state.next_vertex_starts_strip = true; + m_state.strip_warmup = 0; + m_current_state_has_open_draw = false; +} + +void DirectRenderer2::reset_state() { + m_state = {}; + m_stats = {}; + if (m_next_free_draw || m_vertices.next_vertex || m_vertices.next_index) { + fmt::print("[{}] Call to reset_state while there was pending draw data!\n", m_name); + } + reset_buffers(); +} + +std::string DirectRenderer2::Vertex::print() const { + return fmt::format("{} {} {}\n", xyz.to_string_aligned(), stq.to_string_aligned(), rgba[0]); +} + +std::string DirectRenderer2::Draw::to_string() const { + std::string result; + result += mode.to_string(); + result += fmt::format("TBP: 0x{:x}\n", tbp); + result += fmt::format("fix: 0x{:x}\n", fix); + return result; +} + +std::string DirectRenderer2::Draw::to_single_line_string() const { + return fmt::format("mode 0x{:8x} tbp 0x{:4x} fix 0x{:2x}\n", mode.as_int(), tbp, fix); +} + +void DirectRenderer2::flush_pending(SharedRenderState* render_state, ScopedProfilerNode& prof) { + // skip, if we're empty. + if (m_next_free_draw == 0) { + reset_buffers(); + return; + } + + // first, upload: + Timer upload_timer; + glBindVertexArray(m_ogl.vao); + glBindBuffer(GL_ARRAY_BUFFER, m_ogl.vertex_buffer); + glBufferData(GL_ARRAY_BUFFER, m_vertices.next_vertex * sizeof(Vertex), m_vertices.vertices.data(), + GL_STREAM_DRAW); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, m_ogl.index_buffer); + glBufferData(GL_ELEMENT_ARRAY_BUFFER, m_vertices.next_index * sizeof(u32), + m_vertices.indices.data(), GL_STREAM_DRAW); + m_stats.upload_wait += upload_timer.getSeconds(); + m_stats.num_uploads++; + m_stats.upload_bytes += + (m_vertices.next_vertex * sizeof(Vertex)) + (m_vertices.next_index * sizeof(u32)); + + // initial OpenGL setup + glEnable(GL_PRIMITIVE_RESTART); + glPrimitiveRestartIndex(UINT32_MAX); + render_state->shaders[ShaderId::DIRECT2].activate(); + + // draw call loop + // draw_call_loop_simple(render_state, prof); + draw_call_loop_grouped(render_state, prof); + + // done! reset. + glBindVertexArray(0); + + reset_buffers(); +} + +void DirectRenderer2::draw_call_loop_simple(SharedRenderState* render_state, + ScopedProfilerNode& prof) { + fmt::print("------------------------\n"); + for (u32 draw_idx = 0; draw_idx < m_next_free_draw; draw_idx++) { + const auto& draw = m_draw_buffer[draw_idx]; + fmt::print("{}", draw.to_single_line_string()); + setup_opengl_for_draw_mode(draw, render_state); + setup_opengl_tex(0, draw.tbp, draw.mode.get_filt_enable(), draw.mode.get_clamp_s_enable(), + draw.mode.get_clamp_t_enable(), render_state); + void* offset = (void*)(draw.start_index * sizeof(u32)); + int end_idx; + if (draw_idx == m_next_free_draw - 1) { + end_idx = m_vertices.next_index; + } else { + end_idx = m_draw_buffer[draw_idx + 1].start_index; + } + glDrawElements(GL_TRIANGLES, end_idx - draw.start_index, GL_UNSIGNED_INT, (void*)offset); + prof.add_draw_call(); + prof.add_tri((end_idx - draw.start_index) / 3); + } +} + +void DirectRenderer2::draw_call_loop_grouped(SharedRenderState* render_state, + ScopedProfilerNode& prof) { + u32 draw_idx = 0; + while (draw_idx < m_next_free_draw) { + const auto& draw = m_draw_buffer[draw_idx]; + u32 end_of_draw_group = draw_idx; // this is inclusive + setup_opengl_for_draw_mode(draw, render_state); + setup_opengl_tex(draw.tex_unit, draw.tbp, draw.mode.get_filt_enable(), + draw.mode.get_clamp_s_enable(), draw.mode.get_clamp_t_enable(), render_state); + + for (u32 draw_to_consider = draw_idx + 1; draw_to_consider < draw_idx + TEX_UNITS; + draw_to_consider++) { + if (draw_to_consider >= m_next_free_draw) { + break; + } + const auto& next_draw = m_draw_buffer[draw_to_consider]; + if (next_draw.mode.as_int() != draw.mode.as_int()) { + break; + } + if (next_draw.fix != draw.fix) { + break; + } + m_stats.saved_draws++; + end_of_draw_group++; + setup_opengl_tex(next_draw.tex_unit, next_draw.tbp, next_draw.mode.get_filt_enable(), + next_draw.mode.get_clamp_s_enable(), next_draw.mode.get_clamp_t_enable(), + render_state); + } + + u32 end_idx; + if (end_of_draw_group == m_next_free_draw - 1) { + end_idx = m_vertices.next_index; + } else { + end_idx = m_draw_buffer[end_of_draw_group + 1].start_index; + } + void* offset = (void*)(draw.start_index * sizeof(u32)); + // fmt::print("drawing {:4d} with abe {} tex {} {}", end_idx - draw.start_index, + // (int)draw.mode.get_ab_enable(), end_of_draw_group - draw_idx, draw.to_single_line_string() ); + // fmt::print("{}\n", draw.mode.to_string()); + glDrawElements(GL_TRIANGLES, end_idx - draw.start_index, GL_UNSIGNED_INT, (void*)offset); + prof.add_draw_call(); + prof.add_tri((end_idx - draw.start_index) / 3); + draw_idx = end_of_draw_group + 1; + } +} + +void DirectRenderer2::setup_opengl_for_draw_mode(const Draw& draw, + SharedRenderState* render_state) { + // compute alpha_reject: + float alpha_reject = 0.f; + if (draw.mode.get_at_enable()) { + switch (draw.mode.get_alpha_test()) { + case DrawMode::AlphaTest::ALWAYS: + break; + case DrawMode::AlphaTest::GEQUAL: + alpha_reject = draw.mode.get_aref() / 128.f; + break; + case DrawMode::AlphaTest::NEVER: + break; + default: + fmt::print("unknown alpha test: {}\n", (int)draw.mode.get_alpha_test()); + ASSERT(false); + } + } + + // setup blending and color mult + float color_mult = 1.f; + if (!draw.mode.get_ab_enable()) { + glDisable(GL_BLEND); + } else { + glEnable(GL_BLEND); + glBlendColor(1, 1, 1, 1); + if (draw.mode.get_alpha_blend() == DrawMode::AlphaBlend::SRC_DST_SRC_DST) { + // (Cs - Cd) * As + Cd + // Cs * As + (1 - As) * Cd + // s, d + glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); + glBlendEquation(GL_FUNC_ADD); + } else if (draw.mode.get_alpha_blend() == DrawMode::AlphaBlend::SRC_0_SRC_DST) { + // (Cs - 0) * As + Cd + // Cs * As + (1) * Cd + // s, d + ASSERT(draw.fix == 0); + glBlendFunc(GL_SRC_ALPHA, GL_ONE); + glBlendEquation(GL_FUNC_ADD); + } else if (draw.mode.get_alpha_blend() == DrawMode::AlphaBlend::ZERO_SRC_SRC_DST) { + // (0 - Cs) * As + Cd + // Cd - Cs * As + // s, d + glBlendFunc(GL_SRC_ALPHA, GL_ONE); + glBlendEquation(GL_FUNC_REVERSE_SUBTRACT); + } else if (draw.mode.get_alpha_blend() == DrawMode::AlphaBlend::SRC_DST_FIX_DST) { + // (Cs - Cd) * fix + Cd + // Cs * fix + (1 - fx) * Cd + glBlendFunc(GL_CONSTANT_ALPHA, GL_ONE_MINUS_CONSTANT_ALPHA); + glBlendColor(0, 0, 0, draw.fix / 127.f); + glBlendEquation(GL_FUNC_ADD); + } else if (draw.mode.get_alpha_blend() == DrawMode::AlphaBlend::SRC_SRC_SRC_SRC) { + // this is very weird... + // Cs + glBlendFunc(GL_ONE, GL_ZERO); + glBlendEquation(GL_FUNC_ADD); + } else if (draw.mode.get_alpha_blend() == DrawMode::AlphaBlend::SRC_0_DST_DST) { + // (Cs - 0) * Ad + Cd + glBlendFunc(GL_DST_ALPHA, GL_ONE); + glBlendEquation(GL_FUNC_ADD); + color_mult = 0.5; + } else { + ASSERT(false); + } + } + + // setup ztest + if (draw.mode.get_zt_enable()) { + glEnable(GL_DEPTH_TEST); + switch (draw.mode.get_depth_test()) { + case GsTest::ZTest::NEVER: + glDepthFunc(GL_NEVER); + break; + case GsTest::ZTest::ALWAYS: + glDepthFunc(GL_ALWAYS); + break; + case GsTest::ZTest::GEQUAL: + glDepthFunc(GL_GEQUAL); + break; + case GsTest::ZTest::GREATER: + glDepthFunc(GL_GREATER); + break; + default: + ASSERT(false); + } + } else { + // you aren't supposed to turn off z test enable, the GS had some bugs + ASSERT(false); + } + + if (draw.mode.get_depth_write_enable()) { + glDepthMask(GL_TRUE); + } else { + glDepthMask(GL_FALSE); + } + + if (draw.tbp == UINT16_MAX) { + // not using a texture + ASSERT(false); + render_state->shaders[ShaderId::DIRECT_BASIC].activate(); + } else { + // yes using a texture + render_state->shaders[ShaderId::DIRECT2].activate(); + glUniform1f(m_ogl.alpha_reject, alpha_reject); + glUniform1f(m_ogl.color_mult, color_mult); + glUniform4f(m_ogl.fog_color, render_state->fog_color[0], render_state->fog_color[1], + render_state->fog_color[2], render_state->fog_intensity); + } +} + +void DirectRenderer2::setup_opengl_tex(u16 unit, + u16 tbp, + bool filter, + bool clamp_s, + bool clamp_t, + SharedRenderState* render_state) { + // look up the texture + TextureRecord* tex = nullptr; + u32 tbp_to_lookup = tbp & 0x7fff; + bool use_mt4hh = tbp & 0x8000; + + if (use_mt4hh) { + tex = render_state->texture_pool->lookup_mt4hh(tbp_to_lookup); + } else { + tex = render_state->texture_pool->lookup(tbp_to_lookup); + } + + if (!tex) { + // TODO Add back + if (tbp_to_lookup >= 8160 && tbp_to_lookup <= 8600) { + fmt::print("Failed to find texture at {}, using random (eye zone)\n", tbp_to_lookup); + + tex = render_state->texture_pool->get_random_texture(); + } else { + fmt::print("Failed to find texture at {}, using random\n", tbp_to_lookup); + tex = render_state->texture_pool->get_random_texture(); + } + } + + if (!tex->on_gpu) { + render_state->texture_pool->upload_to_gpu(tex); + } + + glActiveTexture(GL_TEXTURE0 + unit); + glBindTexture(GL_TEXTURE_2D, tex->gpu_texture); + if (clamp_s) { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + } else { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); + } + + if (clamp_t) { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + } else { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); + } + + if (filter) { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, + m_debug.disable_mip ? GL_LINEAR : GL_LINEAR_MIPMAP_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + } else { + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + } +} + +void DirectRenderer2::draw_debug_window() { + ImGui::Text("Uploads: %d", m_stats.num_uploads); + ImGui::Text("Upload time: %.3f ms", m_stats.upload_wait * 1000); + ImGui::Text("Upload size: %d bytes", m_stats.upload_bytes); + ImGui::Text("Flush due to full: %d times", m_stats.flush_due_to_full); +} + +void DirectRenderer2::render_gif_data(const u8* data, + SharedRenderState* render_state, + ScopedProfilerNode& prof) { + bool eop = false; + + u32 offset = 0; + while (!eop) { + GifTag tag(data + offset); + offset += 16; + + // unpack registers. + // faster to do it once outside of the nloop loop. + GifTag::RegisterDescriptor reg_desc[16]; + u32 nreg = tag.nreg(); + for (u32 i = 0; i < nreg; i++) { + reg_desc[i] = tag.reg(i); + } + + auto format = tag.flg(); + if (format == GifTag::Format::PACKED) { + if (tag.pre()) { + handle_prim(tag.prim()); + } + for (u32 loop = 0; loop < tag.nloop(); loop++) { + for (u32 reg = 0; reg < nreg; reg++) { + // fmt::print("{}\n", reg_descriptor_name(reg_desc[reg])); + switch (reg_desc[reg]) { + case GifTag::RegisterDescriptor::AD: + handle_ad(data + offset); + break; + case GifTag::RegisterDescriptor::ST: + handle_st_packed(data + offset); + break; + case GifTag::RegisterDescriptor::RGBAQ: + handle_rgbaq_packed(data + offset); + break; + case GifTag::RegisterDescriptor::XYZF2: + handle_xyzf2_packed(data + offset, render_state, prof); + break; + case GifTag::RegisterDescriptor::PRIM: + ASSERT(false); // handle_prim_packed(data + offset, render_state, prof); + break; + case GifTag::RegisterDescriptor::TEX0_1: + ASSERT(false); // handle_tex0_1_packed(data + offset); + break; + default: + fmt::print("Register {} is not supported in packed mode yet\n", + reg_descriptor_name(reg_desc[reg])); + ASSERT(false); + } + offset += 16; // PACKED = quadwords + } + } + } else if (format == GifTag::Format::REGLIST) { + for (u32 loop = 0; loop < tag.nloop(); loop++) { + for (u32 reg = 0; reg < nreg; reg++) { + u64 register_data; + memcpy(®ister_data, data + offset, 8); + // fmt::print("loop: {} reg: {} {}\n", loop, reg, reg_descriptor_name(reg_desc[reg])); + switch (reg_desc[reg]) { + case GifTag::RegisterDescriptor::PRIM: + ASSERT(false); // handle_prim(register_data, render_state, prof); + break; + case GifTag::RegisterDescriptor::RGBAQ: + ASSERT(false); // handle_rgbaq(register_data); + break; + case GifTag::RegisterDescriptor::XYZF2: + ASSERT(false); // handle_xyzf2(register_data, render_state, prof); + break; + default: + fmt::print("Register {} is not supported in reglist mode yet\n", + reg_descriptor_name(reg_desc[reg])); + ASSERT(false); + } + offset += 8; // PACKED = quadwords + } + } + } else { + ASSERT(false); // format not packed or reglist. + } + + eop = tag.eop(); + } +} + +void DirectRenderer2::handle_ad(const u8* data) { + u64 value; + GsRegisterAddress addr; + memcpy(&value, data, sizeof(u64)); + memcpy(&addr, data + 8, sizeof(GsRegisterAddress)); + + // fmt::print("{}\n", register_address_name(addr)); + switch (addr) { + case GsRegisterAddress::ZBUF_1: + handle_zbuf1(value); + break; + case GsRegisterAddress::TEST_1: + handle_test1(value); + break; + case GsRegisterAddress::ALPHA_1: + handle_alpha1(value); + break; + case GsRegisterAddress::PABE: + // ASSERT(false); // handle_pabe(value); + ASSERT(value == 0); + break; + case GsRegisterAddress::CLAMP_1: + handle_clamp1(value); + break; + case GsRegisterAddress::PRIM: + ASSERT(false); // handle_prim(value, render_state, prof); + break; + + case GsRegisterAddress::TEX1_1: + handle_tex1_1(value); + break; + case GsRegisterAddress::TEXA: { + GsTexa reg(value); + + // rgba16 isn't used so this doesn't matter? + // but they use sane defaults anyway + ASSERT(reg.ta0() == 0); + ASSERT(reg.ta1() == 0x80); // note: check rgba16_to_rgba32 if this changes. + + ASSERT(reg.aem() == false); + } break; + case GsRegisterAddress::TEXCLUT: + // TODO + // the only thing the direct renderer does with texture is font, which does no tricks with + // CLUT. The texture upload process will do all of the lookups with the default CLUT. + // So we'll just assume that the TEXCLUT is set properly and ignore this. + break; + case GsRegisterAddress::FOGCOL: + // TODO + break; + case GsRegisterAddress::TEX0_1: + handle_tex0_1(value); + break; + case GsRegisterAddress::MIPTBP1_1: + case GsRegisterAddress::MIPTBP2_1: + // TODO this has the address of different mip levels. + break; + case GsRegisterAddress::TEXFLUSH: + break; + default: + fmt::print("Address {} is not supported\n", register_address_name(addr)); + ASSERT(false); + } +} + +void DirectRenderer2::handle_test1(u64 val) { + GsTest reg(val); + ASSERT(!reg.date()); // datm doesn't matter + if (m_state.gs_test != reg) { + m_current_state_has_open_draw = false; + m_state.gs_test = reg; + m_state.as_mode.set_at(reg.alpha_test_enable()); + if (reg.alpha_test_enable()) { + switch (reg.alpha_test()) { + case GsTest::AlphaTest::NEVER: + m_state.as_mode.set_alpha_test(DrawMode::AlphaTest::NEVER); + break; + case GsTest::AlphaTest::ALWAYS: + m_state.as_mode.set_alpha_test(DrawMode::AlphaTest::ALWAYS); + break; + case GsTest::AlphaTest::GEQUAL: + m_state.as_mode.set_alpha_test(DrawMode::AlphaTest::GEQUAL); + break; + default: + ASSERT(false); + } + } + + m_state.as_mode.set_aref(reg.aref()); + m_state.as_mode.set_alpha_fail(reg.afail()); + m_state.as_mode.set_zt(reg.zte()); + m_state.as_mode.set_depth_test(reg.ztest()); + } +} + +void DirectRenderer2::handle_zbuf1(u64 val) { + GsZbuf x(val); + ASSERT(x.psm() == TextureFormat::PSMZ24); + ASSERT(x.zbp() == 448); + bool write = !x.zmsk(); + if (write != m_state.as_mode.get_depth_write_enable()) { + m_current_state_has_open_draw = false; + m_state.as_mode.set_depth_write_enable(write); + } +} + +void DirectRenderer2::handle_tex0_1(u64 val) { + GsTex0 reg(val); + if (m_state.gs_tex0 != reg) { + m_current_state_has_open_draw = false; + m_state.gs_tex0 = reg; + m_state.tbp = reg.tbp0(); + // tbw + if (reg.psm() == GsTex0::PSM::PSMT4HH) { + m_state.tbp |= 0x8000; + } + // tw/th + m_state.as_mode.set_tcc(reg.tcc()); + m_state.set_tcc_flag(reg.tcc()); + bool decal = reg.tfx() == GsTex0::TextureFunction::DECAL; + m_state.as_mode.set_decal(decal); + m_state.set_decal_flag(decal); + ASSERT(reg.tfx() == GsTex0::TextureFunction::DECAL || + reg.tfx() == GsTex0::TextureFunction::MODULATE); + } +} + +void DirectRenderer2::handle_tex1_1(u64 val) { + GsTex1 reg(val); + if (reg.mmag() != m_state.as_mode.get_filt_enable()) { + m_current_state_has_open_draw = false; + m_state.as_mode.set_filt_enable(reg.mmag()); + } +} + +void DirectRenderer2::handle_clamp1(u64 val) { + bool clamp_s = val & 0b001; + bool clamp_t = val & 0b100; + + if ((clamp_s != m_state.as_mode.get_clamp_s_enable()) || + (clamp_t != m_state.as_mode.get_clamp_t_enable())) { + m_current_state_has_open_draw = false; + m_state.as_mode.set_clamp_s_enable(clamp_s); + m_state.as_mode.set_clamp_t_enable(clamp_t); + } +} + +void DirectRenderer2::handle_prim(u64 val) { + m_state.next_vertex_starts_strip = true; + GsPrim reg(val); + if (reg != m_state.gs_prim) { + m_current_state_has_open_draw = false; + ASSERT(reg.kind() == GsPrim::Kind::TRI_STRIP); + ASSERT(reg.gouraud()); + if (!reg.tme()) { + ASSERT(false); // todo, might need this + } + m_state.as_mode.set_fog(reg.fge()); + m_state.set_fog_flag(reg.fge()); + m_state.as_mode.set_ab(reg.abe()); + ASSERT(!reg.aa1()); + ASSERT(!reg.fst()); + ASSERT(!reg.ctxt()); + ASSERT(!reg.fix()); + } +} + +void DirectRenderer2::handle_st_packed(const u8* data) { + memcpy(&m_state.s, data + 0, 4); + memcpy(&m_state.t, data + 4, 4); + memcpy(&m_state.Q, data + 8, 4); +} + +void DirectRenderer2::handle_rgbaq_packed(const u8* data) { + m_state.rgba[0] = data[0]; + m_state.rgba[1] = data[4]; + m_state.rgba[2] = data[8]; + m_state.rgba[3] = data[12]; +} + +void DirectRenderer2::handle_xyzf2_packed(const u8* data, + SharedRenderState* render_state, + ScopedProfilerNode& prof) { + if (m_vertices.close_to_full()) { + m_stats.flush_due_to_full++; + flush_pending(render_state, prof); + } + + u32 x, y; + memcpy(&x, data, 4); + memcpy(&y, data + 4, 4); + + u64 upper; + memcpy(&upper, data + 8, 8); + u32 z = (upper >> 4) & 0xffffff; + + u8 f = (upper >> 36); + bool adc = !(upper & (1ull << 47)); + + if (m_state.next_vertex_starts_strip) { + m_state.next_vertex_starts_strip = false; + m_state.strip_warmup = 0; + } + + // push the vertex + auto& vert = m_vertices.vertices[m_vertices.next_vertex++]; + m_state.strip_warmup++; + if (adc && m_state.strip_warmup >= 3) { + m_vertices.indices[m_vertices.next_index++] = m_vertices.next_vertex - 1; + m_vertices.indices[m_vertices.next_index++] = m_vertices.next_vertex - 2; + m_vertices.indices[m_vertices.next_index++] = m_vertices.next_vertex - 3; + } + + if (!m_current_state_has_open_draw) { + m_current_state_has_open_draw = true; + if (m_next_free_draw >= m_draw_buffer.size()) { + ASSERT(false); + } + // pick a texture unit to use + u8 tex_unit = 0; + if (m_next_free_draw > 0) { + tex_unit = (m_draw_buffer[m_next_free_draw - 1].tex_unit + 1) % TEX_UNITS; + } + auto& draw = m_draw_buffer[m_next_free_draw++]; + draw.mode = m_state.as_mode; + draw.start_index = m_vertices.next_index; + draw.tbp = m_state.tbp; + draw.fix = m_state.gs_alpha.fix(); + // associate this draw with this texture unit. + draw.tex_unit = tex_unit; + m_state.tex_unit = tex_unit; + } + + vert.xyz[0] = x; + vert.xyz[1] = y; + vert.xyz[2] = z; + vert.rgba = m_state.rgba; + vert.stq = math::Vector(m_state.s, m_state.t, m_state.Q); + vert.tex_unit = m_state.tex_unit; + vert.fog = f; + vert.flags = m_state.vertex_flags; +} + +void DirectRenderer2::handle_alpha1(u64 val) { + GsAlpha reg(val); + if (m_state.gs_alpha != reg) { + m_state.gs_alpha = reg; + m_current_state_has_open_draw = false; + auto a = reg.a_mode(); + auto b = reg.b_mode(); + auto c = reg.c_mode(); + auto d = reg.d_mode(); + if (a == GsAlpha::BlendMode::SOURCE && b == GsAlpha::BlendMode::DEST && + c == GsAlpha::BlendMode::SOURCE && d == GsAlpha::BlendMode::DEST) { + m_state.as_mode.set_alpha_blend(DrawMode::AlphaBlend::SRC_DST_SRC_DST); + } else if (a == GsAlpha::BlendMode::SOURCE && b == GsAlpha::BlendMode::ZERO_OR_FIXED && + c == GsAlpha::BlendMode::SOURCE && d == GsAlpha::BlendMode::DEST) { + m_state.as_mode.set_alpha_blend(DrawMode::AlphaBlend::SRC_0_SRC_DST); + } else if (a == GsAlpha::BlendMode::ZERO_OR_FIXED && b == GsAlpha::BlendMode::SOURCE && + c == GsAlpha::BlendMode::SOURCE && d == GsAlpha::BlendMode::DEST) { + m_state.as_mode.set_alpha_blend(DrawMode::AlphaBlend::ZERO_SRC_SRC_DST); + } else if (a == GsAlpha::BlendMode::SOURCE && b == GsAlpha::BlendMode::DEST && + c == GsAlpha::BlendMode::ZERO_OR_FIXED && d == GsAlpha::BlendMode::DEST) { + m_state.as_mode.set_alpha_blend(DrawMode::AlphaBlend::SRC_DST_FIX_DST); + } else if (a == GsAlpha::BlendMode::SOURCE && b == GsAlpha::BlendMode::SOURCE && + c == GsAlpha::BlendMode::SOURCE && d == GsAlpha::BlendMode::SOURCE) { + m_state.as_mode.set_alpha_blend(DrawMode::AlphaBlend::SRC_SRC_SRC_SRC); + } else if (a == GsAlpha::BlendMode::SOURCE && b == GsAlpha::BlendMode::ZERO_OR_FIXED && + c == GsAlpha::BlendMode::DEST && d == GsAlpha::BlendMode::DEST) { + m_state.as_mode.set_alpha_blend(DrawMode::AlphaBlend::SRC_0_DST_DST); + } else { + // unsupported blend: a 0 b 2 c 2 d 1 + // lg::error("unsupported blend: a {} b {} c {} d {}", (int)a, (int)b, (int)c, (int)d); + // ASSERT(false); + } + } +} \ No newline at end of file diff --git a/game/graphics/opengl_renderer/DirectRenderer2.h b/game/graphics/opengl_renderer/DirectRenderer2.h new file mode 100644 index 000000000..e9dc6c760 --- /dev/null +++ b/game/graphics/opengl_renderer/DirectRenderer2.h @@ -0,0 +1,139 @@ +#pragma once + +#include +#include "common/common_types.h" +#include "game/graphics/opengl_renderer/BucketRenderer.h" +#include "common/dma/gs.h" + +class DirectRenderer2 { + public: + DirectRenderer2(u32 max_verts, u32 max_inds, u32 max_draws, const std::string& name); + void init_shaders(ShaderLibrary& shaders); + void reset_state(); + void render_gif_data(const u8* data, SharedRenderState* render_state, ScopedProfilerNode& prof); + void flush_pending(SharedRenderState* render_state, ScopedProfilerNode& prof); + void draw_debug_window(); + ~DirectRenderer2(); + + private: + static constexpr u8 TEX_UNITS = 10; + void reset_buffers(); + + void draw_call_loop_simple(SharedRenderState* render_state, ScopedProfilerNode& prof); + void draw_call_loop_grouped(SharedRenderState* render_state, ScopedProfilerNode& prof); + + // the GsState is the state of all Gs Registers. + struct GsState { + DrawMode as_mode; + u16 tbp; + GsTest gs_test; + GsTex0 gs_tex0; + GsPrim gs_prim; + GsAlpha gs_alpha; + u8 tex_unit = 0; + + float s, t, Q; + math::Vector rgba; + bool next_vertex_starts_strip = true; + u32 strip_warmup = 0; + u8 vertex_flags = 0; + void set_tcc_flag(bool value) { vertex_flags ^= (-(u8)value ^ vertex_flags) & 1; } + void set_decal_flag(bool value) { vertex_flags ^= (-(u8)value ^ vertex_flags) & 2; } + void set_fog_flag(bool value) { vertex_flags ^= (-(u8)value ^ vertex_flags) & 4; } + } m_state; + + // if this is true, then drawing a vertex can just get pushed directly to the vertex buffer. + // if not, we need to set up a new draw + bool m_current_state_has_open_draw = false; + + struct Draw { + DrawMode mode; + u32 start_index = -1; + u16 tbp = UINT16_MAX; + u8 fix = 0; + u8 tex_unit = 0; + + std::string to_string() const; + std::string to_single_line_string() const; + }; + + std::vector m_draw_buffer; + u32 m_next_free_draw = 0; + + struct Vertex { + math::Vector xyz; + math::Vector rgba; + math::Vector stq; + u8 tex_unit; + u8 flags; + u8 fog; + u8 pad; + + std::string print() const; + }; + static_assert(sizeof(Vertex) == 32); + + struct VertexBuffer { + std::vector vertices; + std::vector indices; + u32 next_vertex = 0; + u32 next_index = 0; + + void push_reset() { indices[next_index++] = UINT32_MAX; } + + Vertex& push() { + indices[next_index++] = next_vertex; + return vertices[next_vertex++]; + } + + bool close_to_full() { + return (next_vertex + 40 > vertices.size()) || (next_index + 40 > indices.size()); + } + } m_vertices; + + struct { + GLuint vertex_buffer; + GLuint index_buffer; + GLuint vao; + GLuint alpha_reject, color_mult, fog_color; + } m_ogl; + + struct Stats { + u32 upload_bytes = 0; + u32 num_uploads = 0; + u32 flush_due_to_full = 0; + float upload_wait = 0; + u32 saved_draws = 0; + } m_stats; + + struct Debug { + bool disable_mip = true; + } m_debug; + + std::string m_name; + void setup_opengl_for_draw_mode(const Draw& draw, SharedRenderState* render_state); + void setup_opengl_tex(u16 unit, + u16 tbp, + bool filter, + bool clamp_s, + bool clamp_t, + SharedRenderState* render_state); + + // gif handlers + void handle_ad(const u8* data); + + void handle_test1(u64 val); + void handle_tex0_1(u64 val); + void handle_tex1_1(u64 val); + void handle_clamp1(u64 val); + void handle_prim(u64 val); + void handle_alpha1(u64 val); + void handle_zbuf1(u64 val); + + // packed + void handle_st_packed(const u8* data); + void handle_rgbaq_packed(const u8* data); + void handle_xyzf2_packed(const u8* data, + SharedRenderState* render_state, + ScopedProfilerNode& prof); +}; diff --git a/game/graphics/opengl_renderer/GenericRenderer.cpp b/game/graphics/opengl_renderer/GenericRenderer.cpp index 795ac4a06..5146d35c1 100644 --- a/game/graphics/opengl_renderer/GenericRenderer.cpp +++ b/game/graphics/opengl_renderer/GenericRenderer.cpp @@ -2,7 +2,13 @@ #include "third-party/imgui/imgui.h" GenericRenderer::GenericRenderer(const std::string& name, BucketId my_id) - : BucketRenderer(name, my_id), m_direct(name, my_id, 0x4000) {} + : BucketRenderer(name, my_id), + m_direct(name, my_id, 0x30000), + m_direct2(30000, 60000, 1000, name) {} + +void GenericRenderer::init_shaders(ShaderLibrary& shaders) { + m_direct2.init_shaders(shaders); +} void GenericRenderer::render(DmaFollower& dma, SharedRenderState* render_state, @@ -16,7 +22,7 @@ void GenericRenderer::render(DmaFollower& dma, // the default ALPHA doesn't seem to be right. I don't know what's supposed to set it here. // although this is definitely a hack, it doesn't seem to cause problems when the first thing to // draw is transparent. - m_direct.hack_disable_blend(); + // m_direct.hack_disable_blend(); // skip if disabled if (!m_enabled) { @@ -58,10 +64,18 @@ void GenericRenderer::render(DmaFollower& dma, ASSERT(false); } } else if (v0.kind == VifCode::Kind::FLUSHA && v1.kind == VifCode::Kind::DIRECT) { - m_direct.render_gif(data.data, data.size_bytes, render_state, prof); + if (render_state->use_direct2) { + m_direct2.render_gif_data(data.data, render_state, prof); + } else { + m_direct.render_gif(data.data, data.size_bytes, render_state, prof); + } ASSERT(v1.immediate == data.size_bytes / 16); } else if (v0.kind == VifCode::Kind::NOP && v1.kind == VifCode::Kind::DIRECT) { - m_direct.render_gif(data.data, data.size_bytes, render_state, prof); + if (render_state->use_direct2) { + m_direct2.render_gif_data(data.data, render_state, prof); + } else { + m_direct.render_gif(data.data, data.size_bytes, render_state, prof); + } ASSERT(v1.immediate == data.size_bytes / 16); } else if (v0.kind == VifCode::Kind::STCYCL && v1.kind == VifCode::Kind::UNPACK_V4_32) { vu.stcycl = v0.immediate; @@ -123,7 +137,11 @@ void GenericRenderer::render(DmaFollower& dma, } m_skipped_tags++; } - m_direct.flush_pending(render_state, prof); + if (render_state->use_direct2) { + m_direct2.flush_pending(render_state, prof); + } else { + m_direct.flush_pending(render_state, prof); + } } void GenericRenderer::handle_dma_stream(const u8* data, @@ -297,7 +315,11 @@ void GenericRenderer::mscal(int imm, SharedRenderState* render_state, ScopedProf void GenericRenderer::xgkick(u16 addr, SharedRenderState* render_state, ScopedProfilerNode& prof) { if (render_state->enable_generic_xgkick && m_xgkick_idx >= m_min_xgkick && m_xgkick_idx < m_max_xgkick) { - m_direct.render_gif(m_buffer.data + (16 * addr), UINT32_MAX, render_state, prof); + if (render_state->use_direct2) { + m_direct2.render_gif_data(m_buffer.data + (16 * addr), render_state, prof); + } else { + m_direct.render_gif(m_buffer.data + (16 * addr), UINT32_MAX, render_state, prof); + } } m_xgkick_idx++; } \ No newline at end of file diff --git a/game/graphics/opengl_renderer/GenericRenderer.h b/game/graphics/opengl_renderer/GenericRenderer.h index bca194307..aad169583 100644 --- a/game/graphics/opengl_renderer/GenericRenderer.h +++ b/game/graphics/opengl_renderer/GenericRenderer.h @@ -2,6 +2,7 @@ #include "game/graphics/opengl_renderer/BucketRenderer.h" #include "game/graphics/opengl_renderer/DirectRenderer.h" +#include "game/graphics/opengl_renderer/DirectRenderer2.h" #include "game/common/vu.h" class GenericRenderer : public BucketRenderer { @@ -9,6 +10,7 @@ class GenericRenderer : public BucketRenderer { GenericRenderer(const std::string& name, BucketId my_id); void render(DmaFollower& dma, SharedRenderState* render_state, ScopedProfilerNode& prof) override; void draw_debug_window() override; + void init_shaders(ShaderLibrary& shaders) override; private: u32 unpack32_4(const VifCodeUnpack& up, const u8* data, u32 imm); @@ -39,6 +41,7 @@ class GenericRenderer : public BucketRenderer { int m_skipped_tags = 0; DirectRenderer m_direct; + DirectRenderer2 m_direct2; std::string m_debug; struct Vu { diff --git a/game/graphics/opengl_renderer/MercRenderer.cpp b/game/graphics/opengl_renderer/MercRenderer.cpp index 87f941922..a2731796c 100644 --- a/game/graphics/opengl_renderer/MercRenderer.cpp +++ b/game/graphics/opengl_renderer/MercRenderer.cpp @@ -3,10 +3,16 @@ #include "third-party/imgui/imgui.h" MercRenderer::MercRenderer(const std::string& name, BucketId my_id) - : BucketRenderer(name, my_id), m_direct(fmt::format("{}-dir", name), my_id, 0x30000) { + : BucketRenderer(name, my_id), + m_direct(fmt::format("{}-dir", name), my_id, 0x30000), + m_direct2(20000, 40000, 1000, name) { memset(m_buffer.data, 0, sizeof(m_buffer.data)); } +void MercRenderer::init_shaders(ShaderLibrary& shaders) { + m_direct2.init_shaders(shaders); +} + void MercRenderer::render(DmaFollower& dma, SharedRenderState* render_state, ScopedProfilerNode& prof) { @@ -41,11 +47,15 @@ void MercRenderer::render(DmaFollower& dma, // if we reach here, there's stuff to draw handle_setup(dma, render_state, prof); + m_direct2.reset_state(); m_direct.reset_state(); + while (dma.current_tag_offset() != render_state->next_bucket) { handle_merc_chain(dma, render_state, prof); } ASSERT(dma.current_tag_offset() == render_state->next_bucket); + m_direct2.flush_pending(render_state, prof); + m_direct.flush_pending(render_state, prof); } @@ -336,10 +346,15 @@ void MercRenderer::draw_debug_window() { ImGui::Checkbox("Normal MSCAL enable", &m_enable_normal_mscals); ImGui::Checkbox("Prime MSCAL enable", &m_enable_prime_mscals); ImGui::Checkbox("Send to direct", &m_enable_send_to_direct); + m_direct2.draw_debug_window(); } void MercRenderer::xgkick(u16 addr, SharedRenderState* render_state, ScopedProfilerNode& prof) { if (m_enable_send_to_direct && render_state->enable_merc_xgkick) { - m_direct.render_gif(m_buffer.data + (16 * addr), UINT32_MAX, render_state, prof); + if (render_state->use_direct2) { + m_direct2.render_gif_data(m_buffer.data + (16 * addr), render_state, prof); + } else { + m_direct.render_gif(m_buffer.data + (16 * addr), UINT32_MAX, render_state, prof); + } } } diff --git a/game/graphics/opengl_renderer/MercRenderer.h b/game/graphics/opengl_renderer/MercRenderer.h index 36d7f7bf9..fe5e88dae 100644 --- a/game/graphics/opengl_renderer/MercRenderer.h +++ b/game/graphics/opengl_renderer/MercRenderer.h @@ -4,10 +4,13 @@ #include "common/math/Vector.h" #include "game/graphics/opengl_renderer/DirectRenderer.h" #include "game/common/vu.h" +#include "game/graphics/opengl_renderer/DirectRenderer2.h" class MercRenderer : public BucketRenderer { public: MercRenderer(const std::string& name, BucketId my_id); + void init_shaders(ShaderLibrary& shaders) override; + void render(DmaFollower& dma, SharedRenderState* render_state, ScopedProfilerNode& prof) override; void draw_debug_window() override; @@ -79,6 +82,7 @@ class MercRenderer : public BucketRenderer { u16 xitop(); DirectRenderer m_direct; + DirectRenderer2 m_direct2; struct { u32 row[4] = {0, 0, 0, 0}; diff --git a/game/graphics/opengl_renderer/OpenGLRenderer.cpp b/game/graphics/opengl_renderer/OpenGLRenderer.cpp index 6abd8918e..14a2cf03d 100644 --- a/game/graphics/opengl_renderer/OpenGLRenderer.cpp +++ b/game/graphics/opengl_renderer/OpenGLRenderer.cpp @@ -43,6 +43,7 @@ void GLAPIENTRY opengl_error_callback(GLenum source, } else if (severity == GL_DEBUG_SEVERITY_HIGH) { lg::error("[{}] OpenGL error 0x{:X} S{:X} T{:X}: {}", g_current_render, id, source, type, message); + ASSERT(false); } } @@ -219,6 +220,8 @@ void OpenGLRenderer::init_bucket_renderers() { if (!m_bucket_renderers[i]) { init_bucket_renderer(fmt::format("bucket{}", i), (BucketId)i); } + + m_bucket_renderers[i]->init_shaders(m_render_state.shaders); } } @@ -295,6 +298,7 @@ void OpenGLRenderer::draw_renderer_selection_window() { ImGui::Checkbox("Render Debug (slower)", &m_render_state.render_debug); ImGui::Checkbox("Merc XGKICK", &m_render_state.enable_merc_xgkick); ImGui::Checkbox("Generic XGKICK", &m_render_state.enable_generic_xgkick); + ImGui::Checkbox("Direct 2", &m_render_state.use_direct2); for (size_t i = 0; i < m_bucket_renderers.size(); i++) { auto renderer = m_bucket_renderers[i].get(); diff --git a/game/graphics/opengl_renderer/Shader.cpp b/game/graphics/opengl_renderer/Shader.cpp index e1951ab8c..ff852f2ca 100644 --- a/game/graphics/opengl_renderer/Shader.cpp +++ b/game/graphics/opengl_renderer/Shader.cpp @@ -75,4 +75,5 @@ ShaderLibrary::ShaderLibrary() { at(ShaderId::TFRAG3) = {"tfrag3"}; at(ShaderId::TFRAG3_NO_TEX) = {"tfrag3_no_tex"}; at(ShaderId::SPRITE3) = {"sprite3_3d"}; + at(ShaderId::DIRECT2) = {"direct2"}; } diff --git a/game/graphics/opengl_renderer/Shader.h b/game/graphics/opengl_renderer/Shader.h index edca2d7c9..3400248c7 100644 --- a/game/graphics/opengl_renderer/Shader.h +++ b/game/graphics/opengl_renderer/Shader.h @@ -32,6 +32,7 @@ enum class ShaderId { TFRAG3_NO_TEX = 7, SPRITE = 8, SPRITE3 = 9, + DIRECT2 = 10, MAX_SHADERS }; diff --git a/game/graphics/opengl_renderer/shaders/direct2.frag b/game/graphics/opengl_renderer/shaders/direct2.frag new file mode 100644 index 000000000..9428eff58 --- /dev/null +++ b/game/graphics/opengl_renderer/shaders/direct2.frag @@ -0,0 +1,75 @@ +#version 430 core + +out vec4 color; + +in vec4 fragment_color; +in vec3 tex_coord; +uniform float alpha_reject; +uniform float color_mult; +uniform vec4 fog_color; + +in flat uvec4 tex_info; +in float fog; + +layout (binding = 0) uniform sampler2D tex_T0; +layout (binding = 1) uniform sampler2D tex_T1; +layout (binding = 2) uniform sampler2D tex_T2; +layout (binding = 3) uniform sampler2D tex_T3; +layout (binding = 4) uniform sampler2D tex_T4; +layout (binding = 5) uniform sampler2D tex_T5; +layout (binding = 6) uniform sampler2D tex_T6; +layout (binding = 7) uniform sampler2D tex_T7; +layout (binding = 8) uniform sampler2D tex_T8; +layout (binding = 9) uniform sampler2D tex_T9; + + +vec4 sample_tex(vec2 coord, uint unit) { + switch (unit) { + case 0: return texture(tex_T0, coord); + case 1: return texture(tex_T1, coord); + case 2: return texture(tex_T2, coord); + case 3: return texture(tex_T3, coord); + case 4: return texture(tex_T4, coord); + case 5: return texture(tex_T5, coord); + case 6: return texture(tex_T6, coord); + case 7: return texture(tex_T7, coord); + case 8: return texture(tex_T8, coord); + case 9: return texture(tex_T9, coord); + default : return vec4(1.0, 0, 1.0, 1.0); + } +} + +void main() { + vec4 T0 = sample_tex(tex_coord.xy / tex_coord.z, tex_info.x); + // y is tcc + // z is decal + + if ((tex_info.y & 1u) == 0) { + if ((tex_info.y & 2u) == 0) { + // modulate + no tcc + color.xyz = fragment_color.xyz * T0.xyz; + color.w = fragment_color.w; + } else { + // decal + no tcc + color.xyz = T0.xyz * 0.5; + color.w = fragment_color.w; + } + } else { + if ((tex_info.y & 2u) == 0) { + // modulate + tcc + color = fragment_color * T0; + } else { + // decal + tcc + color.xyz = T0.xyz * 0.5; + color.w = T0.w; + } + } + color *= 2; + color.xyz *= color_mult; + if (color.a < alpha_reject) { + discard; + } + if ((tex_info.y & 4u) != 0) { + color.xyz = mix(color.xyz, fog_color.xyz / 255., clamp(fog_color.w * (1 - fog), 0, 1)); + } +} diff --git a/game/graphics/opengl_renderer/shaders/direct2.vert b/game/graphics/opengl_renderer/shaders/direct2.vert new file mode 100644 index 000000000..7895564c6 --- /dev/null +++ b/game/graphics/opengl_renderer/shaders/direct2.vert @@ -0,0 +1,26 @@ +#version 430 core + +layout (location = 0) in vec3 position_in; +layout (location = 1) in vec4 rgba_in; +layout (location = 2) in vec3 tex_coord_in; +layout (location = 3) in uvec4 byte_info; + + +out vec4 fragment_color; +out vec3 tex_coord; +out float fog; + +// putting all texture info stuff here so it's easier to copy-paste +out flat uvec2 tex_info; + +void main() { + gl_Position = vec4((position_in.x - 0x8000) / 0x1000, + -(position_in.y - 0x8000) / 0x800, + position_in.z / 0x800000 - 1., 1.0); + // scissoring area adjust + gl_Position.y *= 512.0/448.0; + fragment_color = vec4(rgba_in.x, rgba_in.y, rgba_in.z, rgba_in.w * 2.); + tex_coord = tex_coord_in; + tex_info = byte_info.xy; + fog = float(byte_info.z) / 255.; +} diff --git a/game/graphics/pipelines/opengl.cpp b/game/graphics/pipelines/opengl.cpp index ea4c7052b..0e148dcf0 100644 --- a/game/graphics/pipelines/opengl.cpp +++ b/game/graphics/pipelines/opengl.cpp @@ -31,6 +31,8 @@ namespace { +constexpr bool run_dma_copy = false; + struct GraphicsData { // vsync std::mutex sync_mutex; @@ -253,7 +255,6 @@ void render_game_frame(int width, int height, int lbox_width, int lbox_height) { g_gfx_data->debug_gui.want_save() = false; } - auto& chain = g_gfx_data->dma_copier.get_last_result(); g_gfx_data->frame_idx_of_input_data = g_gfx_data->frame_idx; RenderOptions options; options.window_height_px = height; @@ -268,10 +269,14 @@ void render_game_frame(int width, int height, int lbox_width, int lbox_height) { if (options.save_screenshot) { options.screenshot_path = make_output_file_name(g_gfx_data->debug_gui.screenshot_name()); } - g_gfx_data->ogl_renderer.render(DmaFollower(chain.data.data(), chain.start_offset), options); - // g_gfx_data->ogl_renderer.render(DmaFollower(g_gfx_data->dma_copier.get_last_input_data(), - // g_gfx_data->dma_copier.get_last_input_offset()), - // options); + if constexpr (run_dma_copy) { + auto& chain = g_gfx_data->dma_copier.get_last_result(); + g_gfx_data->ogl_renderer.render(DmaFollower(chain.data.data(), chain.start_offset), options); + } else { + g_gfx_data->ogl_renderer.render(DmaFollower(g_gfx_data->dma_copier.get_last_input_data(), + g_gfx_data->dma_copier.get_last_input_offset()), + options); + } } // before vsync, mark the chain as rendered. @@ -518,7 +523,7 @@ void gl_send_chain(const void* data, u32 offset) { // The renderers should just operate on DMA chains, so eliminating this step in the future may // be easy. - g_gfx_data->dma_copier.run(data, offset); + g_gfx_data->dma_copier.set_input_data(data, offset, run_dma_copy); g_gfx_data->has_data_to_render = true; g_gfx_data->dma_cv.notify_all(); diff --git a/game/mips2c/functions/generic_merc.cpp b/game/mips2c/functions/generic_merc.cpp index 74db89aab..761cd648e 100644 --- a/game/mips2c/functions/generic_merc.cpp +++ b/game/mips2c/functions/generic_merc.cpp @@ -1337,6 +1337,10 @@ void sq_buffer(Mask mask, const Vf& data, u32 qw) { } } +void sq_xyzw(const Vf& data, u32 qw) { + memcpy(vu0_data_mem + qw * 16, data.data, 16); +} + void lq_buffer(Mask mask, Vf& data, u32 qw) { ASSERT(qw * 16 < sizeof(vu0_data_mem)); for (int i = 0; i < 4; i++) { @@ -1346,6 +1350,10 @@ void lq_buffer(Mask mask, Vf& data, u32 qw) { } } +void lq_xyzw(Vf& data, u32 qw) { + memcpy(data.data, vu0_data_mem + qw * 16, 16); +} + void vcallms_280(ExecutionContext* c, u16* vis) { bool bc; // 0.003921569 | maxw.x vf17, vf00, vf00 :i @@ -1358,13 +1366,13 @@ void vcallms_280(ExecutionContext* c, u16* vis) { c->vfs[vf06].vf.minii(Mask::z, c->vf_src(vf00).vf, c->I); vis[vi08] = 0x8c; /* 140 */ // sqi.xyzw vf01, vi08 | minii.z vf07, vf00, I - c->vfs[vf07].vf.minii(Mask::z, c->vf_src(vf00).vf, c->I); sq_buffer(Mask::xyzw, c->vf_src(vf01).vf, vis[vi08]++); + c->vfs[vf07].vf.minii(Mask::z, c->vf_src(vf00).vf, c->I); sq_xyzw(c->vf_src(vf01).vf, vis[vi08]++); // sqi.xyzw vf02, vi08 | minix.w vf05, vf00, vf27 - c->vfs[vf05].vf.mini(Mask::w, c->vf_src(vf00).vf, c->vf_src(vf27).vf.x()); sq_buffer(Mask::xyzw, c->vf_src(vf02).vf, vis[vi08]++); + c->vfs[vf05].vf.mini(Mask::w, c->vf_src(vf00).vf, c->vf_src(vf27).vf.x()); sq_xyzw(c->vf_src(vf02).vf, vis[vi08]++); // sqi.xyzw vf03, vi08 | miniy.w vf06, vf00, vf27 - c->vfs[vf06].vf.mini(Mask::w, c->vf_src(vf00).vf, c->vf_src(vf27).vf.y()); sq_buffer(Mask::xyzw, c->vf_src(vf03).vf, vis[vi08]++); + c->vfs[vf06].vf.mini(Mask::w, c->vf_src(vf00).vf, c->vf_src(vf27).vf.y()); sq_xyzw(c->vf_src(vf03).vf, vis[vi08]++); // sqi.xyzw vf04, vi08 | miniz.w vf07, vf00, vf27 - c->vfs[vf07].vf.mini(Mask::w, c->vf_src(vf00).vf, c->vf_src(vf27).vf.z()); sq_buffer(Mask::xyzw, c->vf_src(vf04).vf, vis[vi08]++); + c->vfs[vf07].vf.mini(Mask::w, c->vf_src(vf00).vf, c->vf_src(vf27).vf.z()); sq_xyzw(c->vf_src(vf04).vf, vis[vi08]++); // BRANCH! // ibne vi00, vi13, L1 | nop @@ -1397,7 +1405,7 @@ void vcallms_280(ExecutionContext* c, u16* vis) { vis[vi01] = vis[vi03]; L3: // lqi.xyzw vf29, vi10 | nop - lq_buffer(Mask::xyzw, c->vfs[vf29].vf, vis[vi10]++); + lq_xyzw(c->vfs[vf29].vf, vis[vi10]++); // iadd vi02, vi08, vi11 | nop vis[vi02] = vis[vi08] + vis[vi11]; // iadd vi04, vi02, vi12 | nop @@ -1415,25 +1423,480 @@ void vcallms_280(ExecutionContext* c, u16* vis) { void vcallms_303(ExecutionContext* c, u16* vis) { // vf21.x coming into here is bad (should be < 1, I think) // sq.xyzw vf23, 3(vi14) | mulx.xyzw vf11, vf20, vf19 - c->vfs[vf11].vf.mul(Mask::xyzw, c->vf_src(vf20).vf, c->vf_src(vf19).vf.x()); sq_buffer(Mask::xyzw, c->vf_src(vf23).vf, vis[vi14] + 3); + c->vfs[vf11].vf.mul_xyzw(c->vf_src(vf20).vf, c->vf_src(vf19).vf.x()); sq_xyzw(c->vf_src(vf23).vf, vis[vi14] + 3); // sq.xyzw vf24, 4(vi14) | mulx.xyzw vf12, vf21, vf19 - c->vfs[vf12].vf.mul(Mask::xyzw, c->vf_src(vf21).vf, c->vf_src(vf19).vf.x()); sq_buffer(Mask::xyzw, c->vf_src(vf24).vf, vis[vi14] + 4); + c->vfs[vf12].vf.mul_xyzw(c->vf_src(vf21).vf, c->vf_src(vf19).vf.x()); sq_xyzw(c->vf_src(vf24).vf, vis[vi14] + 4); // sq.xyzw vf25, 5(vi14) | mulx.xyzw vf13, vf22, vf19 - c->vfs[vf13].vf.mul(Mask::xyzw, c->vf_src(vf22).vf, c->vf_src(vf19).vf.x()); sq_buffer(Mask::xyzw, c->vf_src(vf25).vf, vis[vi14] + 5); + c->vfs[vf13].vf.mul_xyzw(c->vf_src(vf22).vf, c->vf_src(vf19).vf.x()); sq_xyzw(c->vf_src(vf25).vf, vis[vi14] + 5); // sq.xyzw vf26, 6(vi14) | nop - sq_buffer(Mask::xyzw, c->vf_src(vf26).vf, vis[vi14] + 6); + sq_xyzw(c->vf_src(vf26).vf, vis[vi14] + 6); // sq.xyzw vf11, 0(vi14) | nop - sq_buffer(Mask::xyzw, c->vf_src(vf11).vf, vis[vi14]); + sq_xyzw(c->vf_src(vf11).vf, vis[vi14]); // sq.xyzw vf12, 1(vi14) | nop - sq_buffer(Mask::xyzw, c->vf_src(vf12).vf, vis[vi14] + 1); + sq_xyzw(c->vf_src(vf12).vf, vis[vi14] + 1); // sq.xyzw vf13, 2(vi14) | nop :e - sq_buffer(Mask::xyzw, c->vf_src(vf13).vf, vis[vi14] + 2); + sq_xyzw(c->vf_src(vf13).vf, vis[vi14] + 2); // nop | nop } +void vcallms_311_case_314_ref(ExecutionContext* c, u16* vis) { + bool bc; + // mtir vi11, vf01.x | maddz.xyzw vf11, vf26, vf10 + c->acc.vf.madd_xyzw(c->vfs[vf11].vf, c->vf_src(vf26).vf, c->vf_src(vf10).vf.z()); vis[vi11] = c->vf_src(vf01).vf.x_as_u16(); + // jr vi01 | mul.xyzw vf14, vf13, Q + c->vfs[vf14].vf.mul_xyzw(c->vf_src(vf13).vf, c->Q); + // sqi.xyzw vf04, vi08 | mulaw.xyzw ACC, vf20, vf08 + c->acc.vf.mula_xyzw(c->vf_src(vf20).vf, c->vf_src(vf08).vf.w()); sq_xyzw(c->vf_src(vf04).vf, vis[vi08]++); + // rsqrt Q, vf00.w, vf16.x | maddaw.xyzw ACC, vf21, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); + // lq.xyzw vf24, -124(vi11) | maddaw.xyzw ACC, vf22, vf10 + c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); lq_xyzw(c->vfs[vf24].vf, vis[vi11] + -124); + // lq.xyzw vf25, -123(vi11) | maddw.xyzw vf15, vf23, vf00 + c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); lq_xyzw(c->vfs[vf25].vf, vis[vi11] + -123); + // lq.xyzw vf26, -122(vi11) | mul.xyzw vf16, vf11, vf11 + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_xyzw(c->vfs[vf26].vf, vis[vi11] + -122); + // lq.xyzw vf20, -128(vi11) | add.xyzw vf08, vf01, vf05 + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); lq_xyzw(c->vfs[vf20].vf, vis[vi11] + -128); + // lq.xyzw vf21, -127(vi11) | add.xyzw vf09, vf02, vf06 + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); lq_xyzw(c->vfs[vf21].vf, vis[vi11] + -127); + // lq.xyzw vf22, -126(vi11) | add.xyzw vf10, vf03, vf07 + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_xyzw(c->vfs[vf22].vf, vis[vi11] + -126); + // BRANCH! + // ibne vi08, vi02, L4 | adday.xyzw vf16, vf16 + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); bc = (vis[vi08] != vis[vi02]); + // lq.xyzw vf23, -125(vi11) | maddz.xyzw vf16, vf17, vf16 + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_xyzw(c->vfs[vf23].vf, vis[vi11] + -125); + if (bc) { goto L4; } + + // ior vi01, vi03, vi00 | nop + vis[vi01] = vis[vi03]; + L4: + // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); +} + +void vcallms_311_case_314(ExecutionContext* c, u16* vis) { + // this is one of the most expensive operations. + + // 00 mtir vi11, vf01.x | maddz.xyzw vf11, vf26, vf10 + // 01 jr vi01 | mul.xyzw vf14, vf13, Q + // 02 sqi.xyzw vf04, vi08 | mulaw.xyzw ACC, vf20, vf08 + // 03 rsqrt Q, vf00.w, vf16.x | maddaw.xyzw ACC, vf21, vf09 + // 04 lq.xyzw vf24, -124(vi11) | maddaw.xyzw ACC, vf22, vf10 + // 05 lq.xyzw vf25, -123(vi11) | maddw.xyzw vf15, vf23, vf00 + // 06 lq.xyzw vf26, -122(vi11) | mul.xyzw vf16, vf11, vf11 + // 07 lq.xyzw vf20, -128(vi11) | add.xyzw vf08, vf01, vf05 + // 08 lq.xyzw vf21, -127(vi11) | add.xyzw vf09, vf02, vf06 + // 09 lq.xyzw vf22, -126(vi11) | add.xyzw vf10, vf03, vf07 + // 10 ibne vi08, vi02, L4 | adday.xyzw vf16, vf16 + // 11 lq.xyzw vf23, -125(vi11) | maddz.xyzw vf16, vf17, vf16 + // 12 ior vi01, vi03, vi00 | nop + // L4: + // 13 move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e + // 14 move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 + + // we've got: + // - mul by Q + // - calculate next Q + // - accumulated multiply part + // - lots of loading + // - adding part + + c->acc.vf.madd_xyzw(c->vfs[vf11].vf, c->vf_src(vf26).vf, c->vf_src(vf10).vf.z()); + vis[vi11] = c->vf_src(vf01).vf.x_as_u16(); + c->vfs[vf14].vf.mul_xyzw(c->vf_src(vf13).vf, c->Q); + sq_xyzw(c->vf_src(vf04).vf, vis[vi08]++); + c->Q = 1.f / std::sqrt(c->vf_src(vf16).vf.x()); + + + + // c->acc.vf.mula_xyzw(c->vf_src(vf20).vf, c->vf_src(vf08).vf.w()); + // c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); + // c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); + // c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); + + __m128 acc = _mm_mul_ps(_mm_load_ps(c->vf_src(vf20).vf.data), _mm_set1_ps(c->vf_src(vf08).vf.w())); + acc = _mm_add_ps(_mm_mul_ps(_mm_load_ps(c->vf_src(vf21).vf.data), _mm_set1_ps(c->vf_src(vf09).vf.w())), acc); + acc = _mm_add_ps(_mm_mul_ps(_mm_load_ps(c->vf_src(vf22).vf.data), _mm_set1_ps(c->vf_src(vf10).vf.w())), acc); + acc = _mm_add_ps(_mm_load_ps(c->vf_src(vf23).vf.data), acc); + _mm_store_ps(c->vfs[vf15].vf.data, acc); + + + // lq_xyzw(c->vfs[vf20].vf, vis[vi11] + -128); + // lq_xyzw(c->vfs[vf21].vf, vis[vi11] + -127); + // lq_xyzw(c->vfs[vf22].vf, vis[vi11] + -126); + // lq_xyzw(c->vfs[vf23].vf, vis[vi11] + -125); + // lq_xyzw(c->vfs[vf24].vf, vis[vi11] + -124); + // lq_xyzw(c->vfs[vf25].vf, vis[vi11] + -123); + // lq_xyzw(c->vfs[vf26].vf, vis[vi11] + -122); + memcpy(c->vfs[vf20].vf.data, vu0_data_mem + (vis[vi11] - 128) * 16, 7 * 16); + + + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); + + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); + + c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + + // end integer thing. + if (vis[vi08] == vis[vi02]) { + vis[vi01] = vis[vi03]; + } +} + +void vcallms_311_case_326(ExecutionContext* c, u16* vis) { + bool bc; + // mtir vi11, vf01.x | maddz.xyzw vf11, vf26, vf10 + c->acc.vf.madd_xyzw(c->vfs[vf11].vf, c->vf_src(vf26).vf, c->vf_src(vf10).vf.z()); vis[vi11] = c->vf_src(vf01).vf.x_as_u16(); + // jr vi01 | mul.xyzw vf14, vf13, Q + c->vfs[vf14].vf.mul_xyzw(c->vf_src(vf13).vf, c->Q); + // sqi.xyzw vf04, vi08 | mulaw.xyzw ACC, vf20, vf08 + c->acc.vf.mula_xyzw(c->vf_src(vf20).vf, c->vf_src(vf08).vf.w()); sq_xyzw(c->vf_src(vf04).vf, vis[vi08]++); +// rsqrt Q, vf00.w, vf16.x | maddaw.xyzw ACC, vf21, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); + // mtir vi12, vf01.y | maddaw.xyzw ACC, vf22, vf10 + c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); vis[vi12] = c->vf_src(vf01).vf.y_as_u16(); + // iand vi11, vi11, vi09 | maddw.xyzw vf15, vf23, vf00 + c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); vis[vi11] = vis[vi11] & vis[vi09]; + // lq.xyzw vf19, 4(vi11) | mul.xyzw vf16, vf11, vf11 + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 4); + // BRANCH! + // ibeq vi09, vi11, L7 | add.xyzw vf08, vf01, vf05 + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); bc = (vis[vi09] == vis[vi11]); + // iand vi12, vi12, vi09 | add.xyzw vf09, vf02, vf06 + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); vis[vi12] = vis[vi12] & vis[vi09]; + if (bc) { goto L7; } + + // nop | muly.xyzw vf18, vf18, vf17 + c->vfs[vf18].vf.mul_xyzw(c->vf_src(vf18).vf, c->vf_src(vf17).vf.y()); + // lq.xyzw vf24, 4(vi12) | add.xyzw vf10, vf03, vf07 + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_xyzw(c->vfs[vf24].vf, vis[vi12] + 4); + // lq.xyzw vf27, 5(vi11) | adday.xyzw vf16, vf16 + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 5); + // lq.xyzw vf25, 5(vi12) | maddz.xyzw vf16, vf17, vf16 + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_xyzw(c->vfs[vf25].vf, vis[vi12] + 5); + // lq.xyzw vf28, 6(vi11) | mulax.xyzw ACC, vf19, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 6); + // lq.xyzw vf26, 6(vi12) | maddy.xyzw vf24, vf24, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf24].vf, c->vf_src(vf24).vf, c->vf_src(vf18).vf.y()); lq_xyzw(c->vfs[vf26].vf, vis[vi12] + 6); + // lq.xyzw vf29, 0(vi11) | mulax.xyzw ACC, vf27, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf29].vf, vis[vi11]); + // lq.xyzw vf20, 0(vi12) | maddy.xyzw vf25, vf25, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf25].vf, c->vf_src(vf25).vf, c->vf_src(vf18).vf.y()); lq_xyzw(c->vfs[vf20].vf, vis[vi12]); + // lq.xyzw vf19, 1(vi11) | mulax.xyzw ACC, vf28, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 1); + // lq.xyzw vf21, 1(vi12) | maddy.xyzw vf26, vf26, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf26].vf, c->vf_src(vf26).vf, c->vf_src(vf18).vf.y()); lq_xyzw(c->vfs[vf21].vf, vis[vi12] + 1); + // lq.xyzw vf27, 2(vi11) | mulax.xyzw ACC, vf29, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf29).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 2); + // lq.xyzw vf22, 2(vi12) | maddy.xyzw vf20, vf20, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf20].vf, c->vf_src(vf20).vf, c->vf_src(vf18).vf.y()); lq_xyzw(c->vfs[vf22].vf, vis[vi12] + 2); + // lq.xyzw vf28, 3(vi11) | mulax.xyzw ACC, vf19, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 3); + // lq.xyzw vf23, 3(vi12) | maddy.xyzw vf21, vf21, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf21].vf, c->vf_src(vf21).vf, c->vf_src(vf18).vf.y()); lq_xyzw(c->vfs[vf23].vf, vis[vi12] + 3); + // nop | mulax.xyzw ACC, vf27, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); + // iaddiu vi01, vi00, 0x161 | maddy.xyzw vf22, vf22, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf22].vf, c->vf_src(vf22).vf, c->vf_src(vf18).vf.y()); vis[vi01] = 0x161; /* 353 */ + // BRANCH! + // ibne vi08, vi04, L5 | mulax.xyzw ACC, vf28, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); bc = (vis[vi08] != vis[vi04]); + // nop | maddy.xyzw vf23, vf23, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf23].vf, c->vf_src(vf23).vf, c->vf_src(vf18).vf.y()); + if (bc) { goto L5; } + + // ior vi01, vi05, vi00 | nop + vis[vi01] = vis[vi05]; + L5: + // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + return; + L7: + // nop | add.xyzw vf10, vf03, vf07 + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); + // BRANCH! + // ibne vi08, vi04, L8 | adday.xyzw vf16, vf16 + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); bc = (vis[vi08] != vis[vi04]); + // nop | maddz.xyzw vf16, vf17, vf16 + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); + if (bc) { goto L8; } + + // ior vi01, vi05, vi00 | nop + vis[vi01] = vis[vi05]; + L8: + // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + return; +} + +void vcallms_311_case_353(ExecutionContext* c, u16* vis) { + bool bc; + // mtir vi11, vf01.x | maddz.xyzw vf11, vf26, vf10 + c->acc.vf.madd_xyzw(c->vfs[vf11].vf, c->vf_src(vf26).vf, c->vf_src(vf10).vf.z()); vis[vi11] = c->vf_src(vf01).vf.x_as_u16(); + // jr vi01 | mul.xyzw vf14, vf13, Q + c->vfs[vf14].vf.mul_xyzw(c->vf_src(vf13).vf, c->Q); + // sqi.xyzw vf04, vi08 | mulaw.xyzw ACC, vf20, vf08 + c->acc.vf.mula_xyzw(c->vf_src(vf20).vf, c->vf_src(vf08).vf.w()); sq_xyzw(c->vf_src(vf04).vf, vis[vi08]++); + // rsqrt Q, vf00.w, vf16.x | maddaw.xyzw ACC, vf21, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); + // mtir vi12, vf01.y | maddaw.xyzw ACC, vf22, vf10 + c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); vis[vi12] = c->vf_src(vf01).vf.y_as_u16(); + // iand vi11, vi11, vi09 | maddw.xyzw vf15, vf23, vf00 + c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); vis[vi11] = vis[vi11] & vis[vi09]; + // lq.xyzw vf19, 4(vi11) | mul.xyzw vf16, vf11, vf11 + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 4); + // BRANCH! + // ibeq vi09, vi11, L7 | add.xyzw vf08, vf01, vf05 + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); bc = (vis[vi09] == vis[vi11]); + // iand vi12, vi12, vi09 | add.xyzw vf09, vf02, vf06 + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); vis[vi12] = vis[vi12] & vis[vi09]; + if (bc) { goto L7; } + + // lq.xyzw vf24, 4(vi12) | add.xyzw vf10, vf03, vf07 + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_xyzw(c->vfs[vf24].vf, vis[vi12] + 4); + // lq.xyzw vf27, 5(vi11) | adday.xyzw vf16, vf16 + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 5); + // lq.xyzw vf25, 5(vi12) | maddz.xyzw vf16, vf17, vf16 + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_xyzw(c->vfs[vf25].vf, vis[vi12] + 5); + // lq.xyzw vf28, 6(vi11) | mulaz.xyzw ACC, vf19, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 6); + // lq.xyzw vf26, 6(vi12) | maddw.xyzw vf24, vf24, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf24].vf, c->vf_src(vf24).vf, c->vf_src(vf18).vf.w()); lq_xyzw(c->vfs[vf26].vf, vis[vi12] + 6); + // lq.xyzw vf29, 0(vi11) | mulaz.xyzw ACC, vf27, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf29].vf, vis[vi11]); + // lq.xyzw vf20, 0(vi12) | maddw.xyzw vf25, vf25, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf25].vf, c->vf_src(vf25).vf, c->vf_src(vf18).vf.w()); lq_xyzw(c->vfs[vf20].vf, vis[vi12]); + // lq.xyzw vf19, 1(vi11) | mulaz.xyzw ACC, vf28, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 1); + // lq.xyzw vf21, 1(vi12) | maddw.xyzw vf26, vf26, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf26].vf, c->vf_src(vf26).vf, c->vf_src(vf18).vf.w()); lq_xyzw(c->vfs[vf21].vf, vis[vi12] + 1); + // lq.xyzw vf27, 2(vi11) | mulaz.xyzw ACC, vf29, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf29).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 2); + // lq.xyzw vf22, 2(vi12) | maddw.xyzw vf20, vf20, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf20].vf, c->vf_src(vf20).vf, c->vf_src(vf18).vf.w()); lq_xyzw(c->vfs[vf22].vf, vis[vi12] + 2); + // lq.xyzw vf28, 3(vi11) | mulaz.xyzw ACC, vf19, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 3); + // lq.xyzw vf23, 3(vi12) | maddw.xyzw vf21, vf21, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf21].vf, c->vf_src(vf21).vf, c->vf_src(vf18).vf.w()); lq_xyzw(c->vfs[vf23].vf, vis[vi12] + 3); + // lqi.xyzw vf29, vi10 | mulaz.xyzw ACC, vf27, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf29].vf, vis[vi10]++); + // iaddiu vi01, vi00, 0x146 | maddw.xyzw vf22, vf22, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf22].vf, c->vf_src(vf22).vf, c->vf_src(vf18).vf.w()); vis[vi01] = 0x146; /* 326 */ + // BRANCH! + // ibne vi08, vi04, L6 | mulaz.xyzw ACC, vf28, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.z()); bc = (vis[vi08] != vis[vi04]); + // nop | maddw.xyzw vf23, vf23, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf23].vf, c->vf_src(vf23).vf, c->vf_src(vf18).vf.w()); + if (bc) { goto L6; } + + // ior vi01, vi05, vi00 | nop + vis[vi01] = vis[vi05]; + L6: + // mtir vi13, vf29.w | itof0.xyzw vf18, vf29 + c->vfs[vf18].vf.itof0(Mask::xyzw, c->vf_src(vf29).vf); vis[vi13] = c->vf_src(vf29).vf.w_as_u16(); + // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + return; + L7: + // nop | add.xyzw vf10, vf03, vf07 + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); + // BRANCH! + // ibne vi08, vi04, L8 | adday.xyzw vf16, vf16 + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); bc = (vis[vi08] != vis[vi04]); + // nop | maddz.xyzw vf16, vf17, vf16 + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); + if (bc) { goto L8; } + + // ior vi01, vi05, vi00 | nop + vis[vi01] = vis[vi05]; + L8: + // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + return; +} + +void vcallms_311_case_386(ExecutionContext* c, u16* vis) { + bool bc; + // mtir vi11, vf01.x | maddz.xyzw vf11, vf26, vf10 + c->acc.vf.madd_xyzw(c->vfs[vf11].vf, c->vf_src(vf26).vf, c->vf_src(vf10).vf.z()); vis[vi11] = c->vf_src(vf01).vf.x_as_u16(); + // jr vi01 | mul.xyzw vf14, vf13, Q + c->vfs[vf14].vf.mul_xyzw(c->vf_src(vf13).vf, c->Q); + // sqi.xyzw vf04, vi08 | mulaw.xyzw ACC, vf20, vf08 + c->acc.vf.mula_xyzw(c->vf_src(vf20).vf, c->vf_src(vf08).vf.w()); sq_xyzw(c->vf_src(vf04).vf, vis[vi08]++); + JUMP_386: + // rsqrt Q, vf00.w, vf16.x | maddaw.xyzw ACC, vf21, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); + // mtir vi12, vf01.y | maddaw.xyzw ACC, vf22, vf10 + c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); vis[vi12] = c->vf_src(vf01).vf.y_as_u16(); + // iand vi11, vi11, vi09 | maddw.xyzw vf15, vf23, vf00 + c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); vis[vi11] = vis[vi11] & vis[vi09]; + // lq.xyzw vf19, 4(vi11) | mul.xyzw vf16, vf11, vf11 + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 4); + // BRANCH! + // ibeq vi09, vi11, L10 | add.xyzw vf08, vf01, vf05 + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); bc = (vis[vi09] == vis[vi11]); + // iand vi12, vi12, vi09 | add.xyzw vf09, vf02, vf06 + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); vis[vi12] = vis[vi12] & vis[vi09]; + if (bc) { goto L10; } + + // nop | muly.xyzw vf18, vf18, vf17 + c->vfs[vf18].vf.mul_xyzw(c->vf_src(vf18).vf, c->vf_src(vf17).vf.y()); + // lq.xyzw vf27, 4(vi12) | add.xyzw vf10, vf03, vf07 + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_xyzw(c->vfs[vf27].vf, vis[vi12] + 4); + // lq.xyzw vf24, 4(vi13) | adday.xyzw vf16, vf16 + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); lq_xyzw(c->vfs[vf24].vf, vis[vi13] + 4); + // lq.xyzw vf28, 5(vi11) | maddz.xyzw vf16, vf17, vf16 + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 5); + // lq.xyzw vf19, 5(vi12) | mulax.xyzw ACC, vf19, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf19].vf, vis[vi12] + 5); + // lq.xyzw vf25, 5(vi13) | madday.xyzw ACC, vf27, vf18 + c->acc.vf.madda_xyzw(c->vfs[vf27].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf25].vf, vis[vi13] + 5); + // lq.xyzw vf27, 6(vi11) | maddz.xyzw vf24, vf24, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf24].vf, c->vf_src(vf24).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 6); + // lq.xyzw vf28, 6(vi12) | mulax.xyzw ACC, vf28, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf28].vf, vis[vi12] + 6); + // lq.xyzw vf26, 6(vi13) | madday.xyzw ACC, vf19, vf18 + c->acc.vf.madda_xyzw(c->vfs[vf19].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf26].vf, vis[vi13] + 6); + // lq.xyzw vf19, 0(vi11) | maddz.xyzw vf25, vf25, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf25].vf, c->vf_src(vf25).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf19].vf, vis[vi11]); + // lq.xyzw vf27, 0(vi12) | mulax.xyzw ACC, vf27, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf27].vf, vis[vi12]); + // lq.xyzw vf20, 0(vi13) | madday.xyzw ACC, vf28, vf18 + c->acc.vf.madda_xyzw(c->vfs[vf28].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf20].vf, vis[vi13]); + // lq.xyzw vf28, 1(vi11) | maddz.xyzw vf26, vf26, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf26].vf, c->vf_src(vf26).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 1); + // lq.xyzw vf19, 1(vi12) | mulax.xyzw ACC, vf19, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf19].vf, vis[vi12] + 1); + // lq.xyzw vf21, 1(vi13) | madday.xyzw ACC, vf27, vf18 + c->acc.vf.madda_xyzw(c->vfs[vf27].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf21].vf, vis[vi13] + 1); + // lq.xyzw vf27, 2(vi11) | maddz.xyzw vf20, vf20, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf20].vf, c->vf_src(vf20).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 2); + // lq.xyzw vf28, 2(vi12) | mulax.xyzw ACC, vf28, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf28].vf, vis[vi12] + 2); + // lq.xyzw vf22, 2(vi13) | madday.xyzw ACC, vf19, vf18 + c->acc.vf.madda_xyzw(c->vfs[vf19].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf22].vf, vis[vi13] + 2); + // lq.xyzw vf19, 3(vi11) | maddz.xyzw vf21, vf21, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf21].vf, c->vf_src(vf21).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 3); + // lq.xyzw vf27, 3(vi12) | mulax.xyzw ACC, vf27, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf27].vf, vis[vi12] + 3); + // lq.xyzw vf23, 3(vi13) | madday.xyzw ACC, vf28, vf18 + c->acc.vf.madda_xyzw(c->vfs[vf28].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf23].vf, vis[vi13] + 3); + // lqi.xyzw vf29, vi10 | maddz.xyzw vf22, vf22, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf22].vf, c->vf_src(vf22).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf29].vf, vis[vi10]++); + // BRANCH! + // ibne vi08, vi06, L9 | mulax.xyzw ACC, vf19, vf18 + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); bc = (vis[vi08] != vis[vi06]); + // nop | madday.xyzw ACC, vf27, vf18 + c->acc.vf.madda_xyzw(c->vfs[vf27].vf, c->vfs[vf18].vf.y()); + if (bc) { goto L9; } + + // iaddiu vi01, vi00, 0x1ab | nop + vis[vi01] = 0x1ab; /* 427 */ + L9: + // nop | maddz.xyzw vf23, vf23, vf18 + c->acc.vf.madd_xyzw(c->vfs[vf23].vf, c->vf_src(vf23).vf, c->vf_src(vf18).vf.z()); + // mtir vi13, vf29.w | itof0.xyz vf18, vf29 + c->vfs[vf18].vf.itof0(Mask::xyz, c->vf_src(vf29).vf); vis[vi13] = c->vf_src(vf29).vf.w_as_u16(); + // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + return; + L10: + // nop | add.xyzw vf10, vf03, vf07 + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); + // BRANCH! + // ibne vi08, vi06, L11 | adday.xyzw vf16, vf16 + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); bc = (vis[vi08] != vis[vi06]); + // nop | maddz.xyzw vf16, vf17, vf16 + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); + if (bc) { goto L11; } + + // iaddiu vi01, vi00, 0x1ab | nop + vis[vi01] = 0x1ab; /* 427 */ + L11: + // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + return; +} + +void vcallms_311_case_427(ExecutionContext* c, u16* vis) { + // mtir vi11, vf01.x | maddz.xyzw vf11, vf26, vf10 + c->acc.vf.madd_xyzw(c->vfs[vf11].vf, c->vf_src(vf26).vf, c->vf_src(vf10).vf.z()); vis[vi11] = c->vf_src(vf01).vf.x_as_u16(); + // jr vi01 | mul.xyzw vf14, vf13, Q + c->vfs[vf14].vf.mul_xyzw(c->vf_src(vf13).vf, c->Q); + // sqi.xyzw vf04, vi08 | mulaw.xyzw ACC, vf20, vf08 + c->acc.vf.mula_xyzw(c->vf_src(vf20).vf, c->vf_src(vf08).vf.w()); sq_xyzw(c->vf_src(vf04).vf, vis[vi08]++); + c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); + // nop | maddaw.xyzw ACC, vf22, vf10 + c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); + // nop | maddw.xyzw vf15, vf23, vf00 + c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); + // nop | mul.xyzw vf16, vf11, vf11 + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); + // nop | add.xyzw vf08, vf01, vf05 + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); + // nop | add.xyzw vf09, vf02, vf06 + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); + // nop | add.xyzw vf10, vf03, vf07 + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); + // nop | adday.xyzw vf16, vf16 + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); + // nop | maddz.xyzw vf16, vf17, vf16 + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); + // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + return; +} + void vcallms_311(ExecutionContext* c, u16* vis) { + switch(vis[vi01]) { + case 314: + vcallms_311_case_314(c, vis); + break; + case 326: + vcallms_311_case_326(c, vis); + break; + case 353: + vcallms_311_case_353(c, vis); + break; + case 386: + vcallms_311_case_386(c, vis); + break; + case 427: + vcallms_311_case_427(c, vis); + break; + default: + fmt::print("BAD JUMP {}\n", vis[vi01]); + ASSERT(false); + } +} + +void vcallms_311_reference(ExecutionContext* c, u16* vis) { // mtir vi11, vf01.x | maddz.xyzw vf11, vf26, vf10 ;; 311 // jr vi01 | mul.xyzw vf14, vf13, Q @@ -1444,11 +1907,11 @@ void vcallms_311(ExecutionContext* c, u16* vis) { bool bc; // mtir vi11, vf01.x | maddz.xyzw vf11, vf26, vf10 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf11].vf, c->vf_src(vf26).vf, c->vf_src(vf10).vf.z()); vis[vi11] = c->vf_src(vf01).vf.x_as_u16(); + c->acc.vf.madd_xyzw(c->vfs[vf11].vf, c->vf_src(vf26).vf, c->vf_src(vf10).vf.z()); vis[vi11] = c->vf_src(vf01).vf.x_as_u16(); // jr vi01 | mul.xyzw vf14, vf13, Q - c->vfs[vf14].vf.mul(Mask::xyzw, c->vf_src(vf13).vf, c->Q); + c->vfs[vf14].vf.mul_xyzw(c->vf_src(vf13).vf, c->Q); // sqi.xyzw vf04, vi08 | mulaw.xyzw ACC, vf20, vf08 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf20).vf, c->vf_src(vf08).vf.w()); sq_buffer(Mask::xyzw, c->vf_src(vf04).vf, vis[vi08]++); + c->acc.vf.mula_xyzw(c->vf_src(vf20).vf, c->vf_src(vf08).vf.w()); sq_xyzw(c->vf_src(vf04).vf, vis[vi08]++); switch(vis[vi01]) { case 314: @@ -1468,158 +1931,148 @@ void vcallms_311(ExecutionContext* c, u16* vis) { JUMP_314: // rsqrt Q, vf00.w, vf16.x | maddaw.xyzw ACC, vf21, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); // lq.xyzw vf24, -124(vi11) | maddaw.xyzw ACC, vf22, vf10 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf22].vf, c->vfs[vf10].vf.w()); lq_buffer(Mask::xyzw, c->vfs[vf24].vf, vis[vi11] + -124); + c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); lq_xyzw(c->vfs[vf24].vf, vis[vi11] + -124); // lq.xyzw vf25, -123(vi11) | maddw.xyzw vf15, vf23, vf00 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); lq_buffer(Mask::xyzw, c->vfs[vf25].vf, vis[vi11] + -123); -// if (c->vfs[vf15].f[0] > 100000) { -// fmt::print("likely bad vertex: {}\n", c->vfs[vf15].vf.print()); -// fmt::print(" rhs vf08: {} * {}\n", c->vfs[vf08].vf.w(), c->vfs[vf20].vf.x()); -// fmt::print(" rhs vf09: {} * {}\n", c->vfs[vf09].vf.w(), c->vfs[vf21].vf.x()); -// fmt::print(" rhs vf10: {} * {}\n", c->vfs[vf10].vf.w(), c->vfs[vf22].vf.x()); -// fmt::print(" rhs vf00: 1 * {}\n", c->vfs[vf22].vf.x()); -// -// fmt::print(" vf16.x was {}\n", c->vf_src(vf16).vf.x()); -// } - + c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); lq_xyzw(c->vfs[vf25].vf, vis[vi11] + -123); // lq.xyzw vf26, -122(vi11) | mul.xyzw vf16, vf11, vf11 - c->vfs[vf16].vf.mul(Mask::xyzw, c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_buffer(Mask::xyzw, c->vfs[vf26].vf, vis[vi11] + -122); + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_xyzw(c->vfs[vf26].vf, vis[vi11] + -122); // lq.xyzw vf20, -128(vi11) | add.xyzw vf08, vf01, vf05 - c->vfs[vf08].vf.add(Mask::xyzw, c->vf_src(vf01).vf, c->vf_src(vf05).vf); lq_buffer(Mask::xyzw, c->vfs[vf20].vf, vis[vi11] + -128); + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); lq_xyzw(c->vfs[vf20].vf, vis[vi11] + -128); // lq.xyzw vf21, -127(vi11) | add.xyzw vf09, vf02, vf06 - c->vfs[vf09].vf.add(Mask::xyzw, c->vf_src(vf02).vf, c->vf_src(vf06).vf); lq_buffer(Mask::xyzw, c->vfs[vf21].vf, vis[vi11] + -127); + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); lq_xyzw(c->vfs[vf21].vf, vis[vi11] + -127); // lq.xyzw vf22, -126(vi11) | add.xyzw vf10, vf03, vf07 - c->vfs[vf10].vf.add(Mask::xyzw, c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_buffer(Mask::xyzw, c->vfs[vf22].vf, vis[vi11] + -126); + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_xyzw(c->vfs[vf22].vf, vis[vi11] + -126); // BRANCH! // ibne vi08, vi02, L4 | adday.xyzw vf16, vf16 c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); bc = (vis[vi08] != vis[vi02]); // lq.xyzw vf23, -125(vi11) | maddz.xyzw vf16, vf17, vf16 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf23].vf, vis[vi11] + -125); + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_xyzw(c->vfs[vf23].vf, vis[vi11] + -125); if (bc) { goto L4; } // ior vi01, vi03, vi00 | nop vis[vi01] = vis[vi03]; L4: // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); return; JUMP_326: // rsqrt Q, vf00.w, vf16.x | maddaw.xyzw ACC, vf21, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); // mtir vi12, vf01.y | maddaw.xyzw ACC, vf22, vf10 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf22].vf, c->vfs[vf10].vf.w()); vis[vi12] = c->vf_src(vf01).vf.y_as_u16(); + c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); vis[vi12] = c->vf_src(vf01).vf.y_as_u16(); // iand vi11, vi11, vi09 | maddw.xyzw vf15, vf23, vf00 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); vis[vi11] = vis[vi11] & vis[vi09]; + c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); vis[vi11] = vis[vi11] & vis[vi09]; // lq.xyzw vf19, 4(vi11) | mul.xyzw vf16, vf11, vf11 - c->vfs[vf16].vf.mul(Mask::xyzw, c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_buffer(Mask::xyzw, c->vfs[vf19].vf, vis[vi11] + 4); + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 4); // BRANCH! // ibeq vi09, vi11, L7 | add.xyzw vf08, vf01, vf05 - c->vfs[vf08].vf.add(Mask::xyzw, c->vf_src(vf01).vf, c->vf_src(vf05).vf); bc = (vis[vi09] == vis[vi11]); + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); bc = (vis[vi09] == vis[vi11]); // iand vi12, vi12, vi09 | add.xyzw vf09, vf02, vf06 - c->vfs[vf09].vf.add(Mask::xyzw, c->vf_src(vf02).vf, c->vf_src(vf06).vf); vis[vi12] = vis[vi12] & vis[vi09]; + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); vis[vi12] = vis[vi12] & vis[vi09]; if (bc) { goto L7; } // nop | muly.xyzw vf18, vf18, vf17 - c->vfs[vf18].vf.mul(Mask::xyzw, c->vf_src(vf18).vf, c->vf_src(vf17).vf.y()); + c->vfs[vf18].vf.mul_xyzw(c->vf_src(vf18).vf, c->vf_src(vf17).vf.y()); // lq.xyzw vf24, 4(vi12) | add.xyzw vf10, vf03, vf07 - c->vfs[vf10].vf.add(Mask::xyzw, c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_buffer(Mask::xyzw, c->vfs[vf24].vf, vis[vi12] + 4); + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_xyzw(c->vfs[vf24].vf, vis[vi12] + 4); // lq.xyzw vf27, 5(vi11) | adday.xyzw vf16, vf16 - c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf27].vf, vis[vi11] + 5); + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 5); // lq.xyzw vf25, 5(vi12) | maddz.xyzw vf16, vf17, vf16 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf25].vf, vis[vi12] + 5); + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_xyzw(c->vfs[vf25].vf, vis[vi12] + 5); // lq.xyzw vf28, 6(vi11) | mulax.xyzw ACC, vf19, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf28].vf, vis[vi11] + 6); + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 6); // lq.xyzw vf26, 6(vi12) | maddy.xyzw vf24, vf24, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf24].vf, c->vf_src(vf24).vf, c->vf_src(vf18).vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf26].vf, vis[vi12] + 6); + c->acc.vf.madd_xyzw(c->vfs[vf24].vf, c->vf_src(vf24).vf, c->vf_src(vf18).vf.y()); lq_xyzw(c->vfs[vf26].vf, vis[vi12] + 6); // lq.xyzw vf29, 0(vi11) | mulax.xyzw ACC, vf27, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf29].vf, vis[vi11]); + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf29].vf, vis[vi11]); // lq.xyzw vf20, 0(vi12) | maddy.xyzw vf25, vf25, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf25].vf, c->vf_src(vf25).vf, c->vf_src(vf18).vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf20].vf, vis[vi12]); + c->acc.vf.madd_xyzw(c->vfs[vf25].vf, c->vf_src(vf25).vf, c->vf_src(vf18).vf.y()); lq_xyzw(c->vfs[vf20].vf, vis[vi12]); // lq.xyzw vf19, 1(vi11) | mulax.xyzw ACC, vf28, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf19].vf, vis[vi11] + 1); + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 1); // lq.xyzw vf21, 1(vi12) | maddy.xyzw vf26, vf26, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf26].vf, c->vf_src(vf26).vf, c->vf_src(vf18).vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf21].vf, vis[vi12] + 1); + c->acc.vf.madd_xyzw(c->vfs[vf26].vf, c->vf_src(vf26).vf, c->vf_src(vf18).vf.y()); lq_xyzw(c->vfs[vf21].vf, vis[vi12] + 1); // lq.xyzw vf27, 2(vi11) | mulax.xyzw ACC, vf29, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf29).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf27].vf, vis[vi11] + 2); + c->acc.vf.mula_xyzw(c->vf_src(vf29).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 2); // lq.xyzw vf22, 2(vi12) | maddy.xyzw vf20, vf20, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf20].vf, c->vf_src(vf20).vf, c->vf_src(vf18).vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf22].vf, vis[vi12] + 2); + c->acc.vf.madd_xyzw(c->vfs[vf20].vf, c->vf_src(vf20).vf, c->vf_src(vf18).vf.y()); lq_xyzw(c->vfs[vf22].vf, vis[vi12] + 2); // lq.xyzw vf28, 3(vi11) | mulax.xyzw ACC, vf19, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf28].vf, vis[vi11] + 3); + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 3); // lq.xyzw vf23, 3(vi12) | maddy.xyzw vf21, vf21, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf21].vf, c->vf_src(vf21).vf, c->vf_src(vf18).vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf23].vf, vis[vi12] + 3); + c->acc.vf.madd_xyzw(c->vfs[vf21].vf, c->vf_src(vf21).vf, c->vf_src(vf18).vf.y()); lq_xyzw(c->vfs[vf23].vf, vis[vi12] + 3); // nop | mulax.xyzw ACC, vf27, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); // iaddiu vi01, vi00, 0x161 | maddy.xyzw vf22, vf22, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf22].vf, c->vf_src(vf22).vf, c->vf_src(vf18).vf.y()); vis[vi01] = 0x161; /* 353 */ + c->acc.vf.madd_xyzw(c->vfs[vf22].vf, c->vf_src(vf22).vf, c->vf_src(vf18).vf.y()); vis[vi01] = 0x161; /* 353 */ // BRANCH! // ibne vi08, vi04, L5 | mulax.xyzw ACC, vf28, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); bc = (vis[vi08] != vis[vi04]); + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); bc = (vis[vi08] != vis[vi04]); // nop | maddy.xyzw vf23, vf23, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf23].vf, c->vf_src(vf23).vf, c->vf_src(vf18).vf.y()); + c->acc.vf.madd_xyzw(c->vfs[vf23].vf, c->vf_src(vf23).vf, c->vf_src(vf18).vf.y()); if (bc) { goto L5; } // ior vi01, vi05, vi00 | nop vis[vi01] = vis[vi05]; L5: // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); return; JUMP_353: // rsqrt Q, vf00.w, vf16.x | maddaw.xyzw ACC, vf21, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); // mtir vi12, vf01.y | maddaw.xyzw ACC, vf22, vf10 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf22].vf, c->vfs[vf10].vf.w()); vis[vi12] = c->vf_src(vf01).vf.y_as_u16(); + c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); vis[vi12] = c->vf_src(vf01).vf.y_as_u16(); // iand vi11, vi11, vi09 | maddw.xyzw vf15, vf23, vf00 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); vis[vi11] = vis[vi11] & vis[vi09]; + c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); vis[vi11] = vis[vi11] & vis[vi09]; // lq.xyzw vf19, 4(vi11) | mul.xyzw vf16, vf11, vf11 - c->vfs[vf16].vf.mul(Mask::xyzw, c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_buffer(Mask::xyzw, c->vfs[vf19].vf, vis[vi11] + 4); + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 4); // BRANCH! // ibeq vi09, vi11, L7 | add.xyzw vf08, vf01, vf05 - c->vfs[vf08].vf.add(Mask::xyzw, c->vf_src(vf01).vf, c->vf_src(vf05).vf); bc = (vis[vi09] == vis[vi11]); + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); bc = (vis[vi09] == vis[vi11]); // iand vi12, vi12, vi09 | add.xyzw vf09, vf02, vf06 - c->vfs[vf09].vf.add(Mask::xyzw, c->vf_src(vf02).vf, c->vf_src(vf06).vf); vis[vi12] = vis[vi12] & vis[vi09]; + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); vis[vi12] = vis[vi12] & vis[vi09]; if (bc) { goto L7; } // lq.xyzw vf24, 4(vi12) | add.xyzw vf10, vf03, vf07 - c->vfs[vf10].vf.add(Mask::xyzw, c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_buffer(Mask::xyzw, c->vfs[vf24].vf, vis[vi12] + 4); + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_xyzw(c->vfs[vf24].vf, vis[vi12] + 4); // lq.xyzw vf27, 5(vi11) | adday.xyzw vf16, vf16 - c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf27].vf, vis[vi11] + 5); + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 5); // lq.xyzw vf25, 5(vi12) | maddz.xyzw vf16, vf17, vf16 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf25].vf, vis[vi12] + 5); + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_xyzw(c->vfs[vf25].vf, vis[vi12] + 5); // lq.xyzw vf28, 6(vi11) | mulaz.xyzw ACC, vf19, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf19).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf28].vf, vis[vi11] + 6); + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 6); // lq.xyzw vf26, 6(vi12) | maddw.xyzw vf24, vf24, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf24].vf, c->vf_src(vf24).vf, c->vf_src(vf18).vf.w()); lq_buffer(Mask::xyzw, c->vfs[vf26].vf, vis[vi12] + 6); + c->acc.vf.madd_xyzw(c->vfs[vf24].vf, c->vf_src(vf24).vf, c->vf_src(vf18).vf.w()); lq_xyzw(c->vfs[vf26].vf, vis[vi12] + 6); // lq.xyzw vf29, 0(vi11) | mulaz.xyzw ACC, vf27, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf27).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf29].vf, vis[vi11]); + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf29].vf, vis[vi11]); // lq.xyzw vf20, 0(vi12) | maddw.xyzw vf25, vf25, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf25].vf, c->vf_src(vf25).vf, c->vf_src(vf18).vf.w()); lq_buffer(Mask::xyzw, c->vfs[vf20].vf, vis[vi12]); + c->acc.vf.madd_xyzw(c->vfs[vf25].vf, c->vf_src(vf25).vf, c->vf_src(vf18).vf.w()); lq_xyzw(c->vfs[vf20].vf, vis[vi12]); // lq.xyzw vf19, 1(vi11) | mulaz.xyzw ACC, vf28, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf28).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf19].vf, vis[vi11] + 1); + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 1); // lq.xyzw vf21, 1(vi12) | maddw.xyzw vf26, vf26, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf26].vf, c->vf_src(vf26).vf, c->vf_src(vf18).vf.w()); lq_buffer(Mask::xyzw, c->vfs[vf21].vf, vis[vi12] + 1); + c->acc.vf.madd_xyzw(c->vfs[vf26].vf, c->vf_src(vf26).vf, c->vf_src(vf18).vf.w()); lq_xyzw(c->vfs[vf21].vf, vis[vi12] + 1); // lq.xyzw vf27, 2(vi11) | mulaz.xyzw ACC, vf29, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf29).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf27].vf, vis[vi11] + 2); + c->acc.vf.mula_xyzw(c->vf_src(vf29).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 2); // lq.xyzw vf22, 2(vi12) | maddw.xyzw vf20, vf20, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf20].vf, c->vf_src(vf20).vf, c->vf_src(vf18).vf.w()); lq_buffer(Mask::xyzw, c->vfs[vf22].vf, vis[vi12] + 2); + c->acc.vf.madd_xyzw(c->vfs[vf20].vf, c->vf_src(vf20).vf, c->vf_src(vf18).vf.w()); lq_xyzw(c->vfs[vf22].vf, vis[vi12] + 2); // lq.xyzw vf28, 3(vi11) | mulaz.xyzw ACC, vf19, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf19).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf28].vf, vis[vi11] + 3); + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 3); // lq.xyzw vf23, 3(vi12) | maddw.xyzw vf21, vf21, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf21].vf, c->vf_src(vf21).vf, c->vf_src(vf18).vf.w()); lq_buffer(Mask::xyzw, c->vfs[vf23].vf, vis[vi12] + 3); + c->acc.vf.madd_xyzw(c->vfs[vf21].vf, c->vf_src(vf21).vf, c->vf_src(vf18).vf.w()); lq_xyzw(c->vfs[vf23].vf, vis[vi12] + 3); // lqi.xyzw vf29, vi10 | mulaz.xyzw ACC, vf27, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf27).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf29].vf, vis[vi10]++); + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf29].vf, vis[vi10]++); // iaddiu vi01, vi00, 0x146 | maddw.xyzw vf22, vf22, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf22].vf, c->vf_src(vf22).vf, c->vf_src(vf18).vf.w()); vis[vi01] = 0x146; /* 326 */ + c->acc.vf.madd_xyzw(c->vfs[vf22].vf, c->vf_src(vf22).vf, c->vf_src(vf18).vf.w()); vis[vi01] = 0x146; /* 326 */ // BRANCH! // ibne vi08, vi04, L6 | mulaz.xyzw ACC, vf28, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf28).vf, c->vf_src(vf18).vf.z()); bc = (vis[vi08] != vis[vi04]); + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.z()); bc = (vis[vi08] != vis[vi04]); // nop | maddw.xyzw vf23, vf23, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf23].vf, c->vf_src(vf23).vf, c->vf_src(vf18).vf.w()); + c->acc.vf.madd_xyzw(c->vfs[vf23].vf, c->vf_src(vf23).vf, c->vf_src(vf18).vf.w()); if (bc) { goto L6; } // ior vi01, vi05, vi00 | nop @@ -1628,171 +2081,171 @@ void vcallms_311(ExecutionContext* c, u16* vis) { // mtir vi13, vf29.w | itof0.xyzw vf18, vf29 c->vfs[vf18].vf.itof0(Mask::xyzw, c->vf_src(vf29).vf); vis[vi13] = c->vf_src(vf29).vf.w_as_u16(); // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); return; L7: // nop | add.xyzw vf10, vf03, vf07 - c->vfs[vf10].vf.add(Mask::xyzw, c->vf_src(vf03).vf, c->vf_src(vf07).vf); + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); // BRANCH! // ibne vi08, vi04, L8 | adday.xyzw vf16, vf16 c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); bc = (vis[vi08] != vis[vi04]); // nop | maddz.xyzw vf16, vf17, vf16 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); if (bc) { goto L8; } // ior vi01, vi05, vi00 | nop vis[vi01] = vis[vi05]; L8: // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); return; JUMP_386: // rsqrt Q, vf00.w, vf16.x | maddaw.xyzw ACC, vf21, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); // mtir vi12, vf01.y | maddaw.xyzw ACC, vf22, vf10 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf22].vf, c->vfs[vf10].vf.w()); vis[vi12] = c->vf_src(vf01).vf.y_as_u16(); + c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); vis[vi12] = c->vf_src(vf01).vf.y_as_u16(); // iand vi11, vi11, vi09 | maddw.xyzw vf15, vf23, vf00 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); vis[vi11] = vis[vi11] & vis[vi09]; + c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); vis[vi11] = vis[vi11] & vis[vi09]; // lq.xyzw vf19, 4(vi11) | mul.xyzw vf16, vf11, vf11 - c->vfs[vf16].vf.mul(Mask::xyzw, c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_buffer(Mask::xyzw, c->vfs[vf19].vf, vis[vi11] + 4); + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 4); // BRANCH! // ibeq vi09, vi11, L10 | add.xyzw vf08, vf01, vf05 - c->vfs[vf08].vf.add(Mask::xyzw, c->vf_src(vf01).vf, c->vf_src(vf05).vf); bc = (vis[vi09] == vis[vi11]); + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); bc = (vis[vi09] == vis[vi11]); // iand vi12, vi12, vi09 | add.xyzw vf09, vf02, vf06 - c->vfs[vf09].vf.add(Mask::xyzw, c->vf_src(vf02).vf, c->vf_src(vf06).vf); vis[vi12] = vis[vi12] & vis[vi09]; + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); vis[vi12] = vis[vi12] & vis[vi09]; if (bc) { goto L10; } // nop | muly.xyzw vf18, vf18, vf17 - c->vfs[vf18].vf.mul(Mask::xyzw, c->vf_src(vf18).vf, c->vf_src(vf17).vf.y()); + c->vfs[vf18].vf.mul_xyzw(c->vf_src(vf18).vf, c->vf_src(vf17).vf.y()); // lq.xyzw vf27, 4(vi12) | add.xyzw vf10, vf03, vf07 - c->vfs[vf10].vf.add(Mask::xyzw, c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_buffer(Mask::xyzw, c->vfs[vf27].vf, vis[vi12] + 4); + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); lq_xyzw(c->vfs[vf27].vf, vis[vi12] + 4); // lq.xyzw vf24, 4(vi13) | adday.xyzw vf16, vf16 - c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf24].vf, vis[vi13] + 4); + c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); lq_xyzw(c->vfs[vf24].vf, vis[vi13] + 4); // lq.xyzw vf28, 5(vi11) | maddz.xyzw vf16, vf17, vf16 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf28].vf, vis[vi11] + 5); + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 5); // lq.xyzw vf19, 5(vi12) | mulax.xyzw ACC, vf19, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf19].vf, vis[vi12] + 5); + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf19].vf, vis[vi12] + 5); // lq.xyzw vf25, 5(vi13) | madday.xyzw ACC, vf27, vf18 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf27].vf, c->vfs[vf18].vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf25].vf, vis[vi13] + 5); + c->acc.vf.madda_xyzw(c->vfs[vf27].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf25].vf, vis[vi13] + 5); // lq.xyzw vf27, 6(vi11) | maddz.xyzw vf24, vf24, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf24].vf, c->vf_src(vf24).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf27].vf, vis[vi11] + 6); + c->acc.vf.madd_xyzw(c->vfs[vf24].vf, c->vf_src(vf24).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 6); // lq.xyzw vf28, 6(vi12) | mulax.xyzw ACC, vf28, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf28].vf, vis[vi12] + 6); + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf28].vf, vis[vi12] + 6); // lq.xyzw vf26, 6(vi13) | madday.xyzw ACC, vf19, vf18 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf19].vf, c->vfs[vf18].vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf26].vf, vis[vi13] + 6); + c->acc.vf.madda_xyzw(c->vfs[vf19].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf26].vf, vis[vi13] + 6); // lq.xyzw vf19, 0(vi11) | maddz.xyzw vf25, vf25, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf25].vf, c->vf_src(vf25).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf19].vf, vis[vi11]); + c->acc.vf.madd_xyzw(c->vfs[vf25].vf, c->vf_src(vf25).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf19].vf, vis[vi11]); // lq.xyzw vf27, 0(vi12) | mulax.xyzw ACC, vf27, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf27].vf, vis[vi12]); + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf27].vf, vis[vi12]); // lq.xyzw vf20, 0(vi13) | madday.xyzw ACC, vf28, vf18 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf28].vf, c->vfs[vf18].vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf20].vf, vis[vi13]); + c->acc.vf.madda_xyzw(c->vfs[vf28].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf20].vf, vis[vi13]); // lq.xyzw vf28, 1(vi11) | maddz.xyzw vf26, vf26, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf26].vf, c->vf_src(vf26).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf28].vf, vis[vi11] + 1); + c->acc.vf.madd_xyzw(c->vfs[vf26].vf, c->vf_src(vf26).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf28].vf, vis[vi11] + 1); // lq.xyzw vf19, 1(vi12) | mulax.xyzw ACC, vf19, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf19].vf, vis[vi12] + 1); + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf19].vf, vis[vi12] + 1); // lq.xyzw vf21, 1(vi13) | madday.xyzw ACC, vf27, vf18 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf27].vf, c->vfs[vf18].vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf21].vf, vis[vi13] + 1); + c->acc.vf.madda_xyzw(c->vfs[vf27].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf21].vf, vis[vi13] + 1); // lq.xyzw vf27, 2(vi11) | maddz.xyzw vf20, vf20, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf20].vf, c->vf_src(vf20).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf27].vf, vis[vi11] + 2); + c->acc.vf.madd_xyzw(c->vfs[vf20].vf, c->vf_src(vf20).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf27].vf, vis[vi11] + 2); // lq.xyzw vf28, 2(vi12) | mulax.xyzw ACC, vf28, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf28].vf, vis[vi12] + 2); + c->acc.vf.mula_xyzw(c->vf_src(vf28).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf28].vf, vis[vi12] + 2); // lq.xyzw vf22, 2(vi13) | madday.xyzw ACC, vf19, vf18 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf19].vf, c->vfs[vf18].vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf22].vf, vis[vi13] + 2); + c->acc.vf.madda_xyzw(c->vfs[vf19].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf22].vf, vis[vi13] + 2); // lq.xyzw vf19, 3(vi11) | maddz.xyzw vf21, vf21, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf21].vf, c->vf_src(vf21).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf19].vf, vis[vi11] + 3); + c->acc.vf.madd_xyzw(c->vfs[vf21].vf, c->vf_src(vf21).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf19].vf, vis[vi11] + 3); // lq.xyzw vf27, 3(vi12) | mulax.xyzw ACC, vf27, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); lq_buffer(Mask::xyzw, c->vfs[vf27].vf, vis[vi12] + 3); + c->acc.vf.mula_xyzw(c->vf_src(vf27).vf, c->vf_src(vf18).vf.x()); lq_xyzw(c->vfs[vf27].vf, vis[vi12] + 3); // lq.xyzw vf23, 3(vi13) | madday.xyzw ACC, vf28, vf18 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf28].vf, c->vfs[vf18].vf.y()); lq_buffer(Mask::xyzw, c->vfs[vf23].vf, vis[vi13] + 3); + c->acc.vf.madda_xyzw(c->vfs[vf28].vf, c->vfs[vf18].vf.y()); lq_xyzw(c->vfs[vf23].vf, vis[vi13] + 3); // lqi.xyzw vf29, vi10 | maddz.xyzw vf22, vf22, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf22].vf, c->vf_src(vf22).vf, c->vf_src(vf18).vf.z()); lq_buffer(Mask::xyzw, c->vfs[vf29].vf, vis[vi10]++); + c->acc.vf.madd_xyzw(c->vfs[vf22].vf, c->vf_src(vf22).vf, c->vf_src(vf18).vf.z()); lq_xyzw(c->vfs[vf29].vf, vis[vi10]++); // BRANCH! // ibne vi08, vi06, L9 | mulax.xyzw ACC, vf19, vf18 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); bc = (vis[vi08] != vis[vi06]); + c->acc.vf.mula_xyzw(c->vf_src(vf19).vf, c->vf_src(vf18).vf.x()); bc = (vis[vi08] != vis[vi06]); // nop | madday.xyzw ACC, vf27, vf18 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf27].vf, c->vfs[vf18].vf.y()); + c->acc.vf.madda_xyzw(c->vfs[vf27].vf, c->vfs[vf18].vf.y()); if (bc) { goto L9; } // iaddiu vi01, vi00, 0x1ab | nop vis[vi01] = 0x1ab; /* 427 */ L9: // nop | maddz.xyzw vf23, vf23, vf18 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf23].vf, c->vf_src(vf23).vf, c->vf_src(vf18).vf.z()); + c->acc.vf.madd_xyzw(c->vfs[vf23].vf, c->vf_src(vf23).vf, c->vf_src(vf18).vf.z()); // mtir vi13, vf29.w | itof0.xyz vf18, vf29 c->vfs[vf18].vf.itof0(Mask::xyz, c->vf_src(vf29).vf); vis[vi13] = c->vf_src(vf29).vf.w_as_u16(); // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); return; L10: // nop | add.xyzw vf10, vf03, vf07 - c->vfs[vf10].vf.add(Mask::xyzw, c->vf_src(vf03).vf, c->vf_src(vf07).vf); + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); // BRANCH! // ibne vi08, vi06, L11 | adday.xyzw vf16, vf16 c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); bc = (vis[vi08] != vis[vi06]); // nop | maddz.xyzw vf16, vf17, vf16 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); if (bc) { goto L11; } // iaddiu vi01, vi00, 0x1ab | nop vis[vi01] = 0x1ab; /* 427 */ L11: // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); return; JUMP_427: // rsqrt Q, vf00.w, vf16.x | maddaw.xyzw ACC, vf21, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf21].vf, c->vfs[vf09].vf.w()); c->Q = c->vf_src(vf00).vf.w() / std::sqrt(c->vf_src(vf16).vf.x()); // nop | maddaw.xyzw ACC, vf22, vf10 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf22].vf, c->vfs[vf10].vf.w()); + c->acc.vf.madda_xyzw(c->vfs[vf22].vf, c->vfs[vf10].vf.w()); // nop | maddw.xyzw vf15, vf23, vf00 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); + c->acc.vf.madd_xyzw(c->vfs[vf15].vf, c->vf_src(vf23).vf, c->vf_src(vf00).vf.w()); // nop | mul.xyzw vf16, vf11, vf11 - c->vfs[vf16].vf.mul(Mask::xyzw, c->vf_src(vf11).vf, c->vf_src(vf11).vf); + c->vfs[vf16].vf.mul_xyzw(c->vf_src(vf11).vf, c->vf_src(vf11).vf); // nop | add.xyzw vf08, vf01, vf05 - c->vfs[vf08].vf.add(Mask::xyzw, c->vf_src(vf01).vf, c->vf_src(vf05).vf); + c->vfs[vf08].vf.add_xyzw(c->vf_src(vf01).vf, c->vf_src(vf05).vf); // nop | add.xyzw vf09, vf02, vf06 - c->vfs[vf09].vf.add(Mask::xyzw, c->vf_src(vf02).vf, c->vf_src(vf06).vf); + c->vfs[vf09].vf.add_xyzw(c->vf_src(vf02).vf, c->vf_src(vf06).vf); // nop | add.xyzw vf10, vf03, vf07 - c->vfs[vf10].vf.add(Mask::xyzw, c->vf_src(vf03).vf, c->vf_src(vf07).vf); + c->vfs[vf10].vf.add_xyzw(c->vf_src(vf03).vf, c->vf_src(vf07).vf); // nop | adday.xyzw vf16, vf16 c->acc.vf.adda(Mask::xyzw, c->vfs[vf16].vf, c->vfs[vf16].vf.y()); // nop | maddz.xyzw vf16, vf17, vf16 - c->acc.vf.madd(Mask::xyzw, c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); + c->acc.vf.madd_xyzw(c->vfs[vf16].vf, c->vf_src(vf17).vf, c->vf_src(vf16).vf.z()); // move.xyzw vf13, vf12 | mulaz.xyzw ACC, vf24, vf08 :e - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); + c->acc.vf.mula_xyzw(c->vf_src(vf24).vf, c->vf_src(vf08).vf.z()); c->vfs[vf13].vf.move(Mask::xyzw, c->vf_src(vf12).vf); // move.xyzw vf12, vf11 | maddaz.xyzw ACC, vf25, vf09 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); + c->acc.vf.madda_xyzw(c->vfs[vf25].vf, c->vfs[vf09].vf.z()); c->vfs[vf12].vf.move(Mask::xyzw, c->vf_src(vf11).vf); return; // nop | mulaz.xyzw ACC, vf09, vf01 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf01).vf.z()); + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf01).vf.z()); // nop | maddax.xyzw ACC, vf10, vf01 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf01].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf01].vf.x()); // nop | maddy.xyzw vf01, vf11, vf01 u16 f1 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf01].vf, c->vf_src(vf11).vf, c->vf_src(vf01).vf.y()); // nop | mulaz.xyzw ACC, vf09, vf02 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf02).vf.z()); + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf02).vf.z()); // nop | maddax.xyzw ACC, vf10, vf02 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf02].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf02].vf.x()); // nop | maddy.xyzw vf02, vf11, vf02 u16 f2 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf02].vf, c->vf_src(vf11).vf, c->vf_src(vf02).vf.y()); // fmand vi01, vi09 | mulaz.xyzw ACC, vf09, vf03 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf03).vf.z()); vis[vi01] = f1 & vis[vi09]; + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf03).vf.z()); vis[vi01] = f1 & vis[vi09]; // nop | maddax.xyzw ACC, vf10, vf03 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf03].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf03].vf.x()); // nop | maddy.xyzw vf03, vf11, vf03 u16 f3 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf03].vf, c->vf_src(vf11).vf, c->vf_src(vf03).vf.y()); // fmand vi02, vi09 | mulaz.xyzw ACC, vf09, vf04 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf04).vf.z()); vis[vi02] = f2 & vis[vi09]; + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf04).vf.z()); vis[vi02] = f2 & vis[vi09]; // nop | maddax.xyzw ACC, vf10, vf04 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf04].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf04].vf.x()); // nop | maddy.xyzw vf04, vf11, vf04 u16 f4 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf04].vf, c->vf_src(vf11).vf, c->vf_src(vf04).vf.y()); // fmand vi03, vi09 | nop @@ -1806,27 +2259,27 @@ void vcallms_311(ExecutionContext* c, u16* vis) { return; // nop | mulaz.xyzw ACC, vf09, vf05 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf05).vf.z()); + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf05).vf.z()); // nop | maddax.xyzw ACC, vf10, vf05 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf05].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf05].vf.x()); // nop | maddy.xyzw vf05, vf11, vf05 u16 f5 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf05].vf, c->vf_src(vf11).vf, c->vf_src(vf05).vf.y()); // nop | mulaz.xyzw ACC, vf09, vf06 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf06).vf.z()); + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf06).vf.z()); // nop | maddax.xyzw ACC, vf10, vf06 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf06].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf06].vf.x()); // nop | maddy.xyzw vf06, vf11, vf06 u16 f6 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf06].vf, c->vf_src(vf11).vf, c->vf_src(vf06).vf.y()); // fmand vi05, vi09 | mulaz.xyzw ACC, vf09, vf07 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf07).vf.z()); vis[vi05] = f5 & vis[vi09]; + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf07).vf.z()); vis[vi05] = f5 & vis[vi09]; // nop | maddax.xyzw ACC, vf10, vf07 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf07].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf07].vf.x()); // nop | maddy.xyzw vf07, vf11, vf07 u16 f7 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf07].vf, c->vf_src(vf11).vf, c->vf_src(vf07).vf.y()); // fmand vi06, vi09 | mulaz.xyzw ACC, vf09, vf08 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf08).vf.z()); vis[vi06] = f6 & vis[vi09]; + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf08).vf.z()); vis[vi06] = f6 & vis[vi09]; // nop | maddax.xyzw ACC, vf10, vf08 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf08].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf08].vf.x()); // nop | maddy.xyzw vf08, vf11, vf08 u16 f8 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf08].vf, c->vf_src(vf11).vf, c->vf_src(vf08).vf.y()); // fmand vi07, vi09 | nop @@ -3066,27 +3519,27 @@ struct Cache { void vcallms_438(ExecutionContext* c, u16* vis) { // nop | mulaz.xyzw ACC, vf09, vf01 158 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf01).vf.z()); + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf01).vf.z()); // nop | maddax.xyzw ACC, vf10, vf01 159 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf01].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf01].vf.x()); // nop | maddy.xyzw vf01, vf11, vf01 160 u16 f1 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf01].vf, c->vf_src(vf11).vf, c->vf_src(vf01).vf.y()); // nop | mulaz.xyzw ACC, vf09, vf02 161 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf02).vf.z()); + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf02).vf.z()); // nop | maddax.xyzw ACC, vf10, vf02 162 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf02].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf02].vf.x()); // nop | maddy.xyzw vf02, vf11, vf02 163 u16 f2 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf02].vf, c->vf_src(vf11).vf, c->vf_src(vf02).vf.y()); // fmand vi01, vi09 | mulaz.xyzw ACC, vf09, vf03 164 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf03).vf.z()); vis[1] = vis[9] & f1; + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf03).vf.z()); vis[1] = vis[9] & f1; // nop | maddax.xyzw ACC, vf10, vf03 165 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf03].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf03].vf.x()); // nop | maddy.xyzw vf03, vf11, vf03 166 u16 f3 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf03].vf, c->vf_src(vf11).vf, c->vf_src(vf03).vf.y()); // fmand vi02, vi09 | mulaz.xyzw ACC, vf09, vf04 167 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf04).vf.z()); vis[2] = vis[9] & f2; + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf04).vf.z()); vis[2] = vis[9] & f2; // nop | maddax.xyzw ACC, vf10, vf04 168 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf04].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf04].vf.x()); // nop | maddy.xyzw vf04, vf11, vf04 169 c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf04].vf, c->vf_src(vf11).vf, c->vf_src(vf04).vf.y()); // fmand vi03, vi09 | nop 170 @@ -3104,27 +3557,27 @@ void vcallms_438(ExecutionContext* c, u16* vis) { void vcallms_454(ExecutionContext* c, u16* vis) { // nop | mulaz.xyzw ACC, vf09, vf05 174 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf05).vf.z()); + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf05).vf.z()); // nop | maddax.xyzw ACC, vf10, vf05 175 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf05].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf05].vf.x()); // nop | maddy.xyzw vf05, vf11, vf05 176 u16 f1 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf05].vf, c->vf_src(vf11).vf, c->vf_src(vf05).vf.y()); // nop | mulaz.xyzw ACC, vf09, vf06 177 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf06).vf.z()); + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf06).vf.z()); // nop | maddax.xyzw ACC, vf10, vf06 178 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf06].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf06].vf.x()); // nop | maddy.xyzw vf06, vf11, vf06 179 u16 f2 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf06].vf, c->vf_src(vf11).vf, c->vf_src(vf06).vf.y()); // fmand vi05, vi09 | mulaz.xyzw ACC, vf09, vf07 180 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf07).vf.z()); vis[5] = vis[9] & f1; + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf07).vf.z()); vis[5] = vis[9] & f1; // nop | maddax.xyzw ACC, vf10, vf07 181 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf07].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf07].vf.x()); // nop | maddy.xyzw vf07, vf11, vf07 182 u16 f3 = c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf07].vf, c->vf_src(vf11).vf, c->vf_src(vf07).vf.y()); // fmand vi06, vi09 | mulaz.xyzw ACC, vf09, vf08 183 - c->acc.vf.mula(Mask::xyzw, c->vf_src(vf09).vf, c->vf_src(vf08).vf.z()); vis[5] = vis[9] & f2; + c->acc.vf.mula_xyzw(c->vf_src(vf09).vf, c->vf_src(vf08).vf.z()); vis[5] = vis[9] & f2; // nop | maddax.xyzw ACC, vf10, vf08 184 - c->acc.vf.madda(Mask::xyzw, c->vfs[vf10].vf, c->vfs[vf08].vf.x()); + c->acc.vf.madda_xyzw(c->vfs[vf10].vf, c->vfs[vf08].vf.x()); // nop | maddy.xyzw vf08, vf11, vf08 185 u16 f4 =c->acc.vf.madd_flag(Mask::xyzw, c->vfs[vf08].vf, c->vf_src(vf11).vf, c->vf_src(vf08).vf.y()); // fmand vi07, vi09 | nop 186