mirror of
https://github.com/libretro/ppsspp.git
synced 2024-11-23 16:19:44 +00:00
SIMD-optimize some data conv routines used in uniform updates.
This commit is contained in:
parent
cbd107b03d
commit
91783a3281
@ -892,6 +892,8 @@ add_library(native STATIC
|
||||
ext/native/math/fast/fast_matrix.c
|
||||
ext/native/math/fast/fast_matrix_neon.S
|
||||
ext/native/math/fast/fast_matrix_sse.c
|
||||
ext/native/math/dataconv.cpp
|
||||
ext/native/math/dataconv.h
|
||||
ext/native/math/curves.cpp
|
||||
ext/native/math/curves.h
|
||||
ext/native/math/expression_parser.cpp
|
||||
|
@ -519,8 +519,7 @@ void CallSyscall(MIPSOpcode op)
|
||||
{
|
||||
PROFILE_THIS_SCOPE("syscall");
|
||||
double start = 0.0; // need to initialize to fix the race condition where coreCollectDebugStats is enabled in the middle of this func.
|
||||
if (coreCollectDebugStats)
|
||||
{
|
||||
if (coreCollectDebugStats) {
|
||||
time_update();
|
||||
start = time_now_d();
|
||||
}
|
||||
@ -544,8 +543,7 @@ void CallSyscall(MIPSOpcode op)
|
||||
ERROR_LOG_REPORT(HLE, "Unimplemented HLE function %s", info->name ? info->name : "(\?\?\?)");
|
||||
}
|
||||
|
||||
if (coreCollectDebugStats)
|
||||
{
|
||||
if (coreCollectDebugStats) {
|
||||
time_update();
|
||||
u32 callno = (op >> 6) & 0xFFFFF; //20 bits
|
||||
int funcnum = callno & 0xFFF;
|
||||
|
@ -94,6 +94,7 @@ enum : uint64_t {
|
||||
DIRTY_BONE_UNIFORMS = 0xFF000000ULL,
|
||||
|
||||
DIRTY_ALL_UNIFORMS = 0x1FFFFFFFFULL,
|
||||
DIRTY_ALL_LIGHTS = DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3,
|
||||
|
||||
// Other dirty elements that aren't uniforms!
|
||||
DIRTY_FRAMEBUF = 1ULL << 40,
|
||||
|
@ -219,7 +219,6 @@ void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms) {
|
||||
if (dirtyUniforms & DIRTY_MATEMISSIVE) {
|
||||
Uint8x3ToFloat4(ub->materialEmissive, gstate.materialemissive);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (dirtyUniforms & (DIRTY_LIGHT0 << i)) {
|
||||
if (gstate.isDirectionalLight(i)) {
|
||||
|
11
GPU/Math3D.h
11
GPU/Math3D.h
@ -822,8 +822,7 @@ typedef Math3D::Vec3Packed<float> Vec3Packedf;
|
||||
typedef Math3D::Vec4<float> Vec4f;
|
||||
|
||||
|
||||
inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
|
||||
{
|
||||
inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
|
||||
vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9];
|
||||
vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10];
|
||||
vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11];
|
||||
@ -895,6 +894,14 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
|
||||
m4x4[15] = 1.0f;
|
||||
}
|
||||
|
||||
// 0369
|
||||
// 147A
|
||||
// 258B
|
||||
// ->>-
|
||||
// 0123
|
||||
// 4567
|
||||
// 89AB
|
||||
// Don't see a way to SIMD that. Should be pretty fast anyway.
|
||||
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
|
||||
m4x4[0] = m4x3[0];
|
||||
m4x4[1] = m4x3[3];
|
||||
|
@ -670,8 +670,8 @@ void DrawEngineVulkan::DoFlush() {
|
||||
VulkanVertexShader *vshader = nullptr;
|
||||
VulkanFragmentShader *fshader = nullptr;
|
||||
|
||||
uint32_t ibOffset = 0;
|
||||
uint32_t vbOffset = 0;
|
||||
uint32_t ibOffset;
|
||||
uint32_t vbOffset;
|
||||
|
||||
if (useHWTransform) {
|
||||
// We don't detect clears in this path, so here we can switch framebuffers if necessary.
|
||||
@ -880,7 +880,7 @@ void DrawEngineVulkan::DoFlush() {
|
||||
return;
|
||||
}
|
||||
if (pipeline != lastPipeline_) {
|
||||
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline->pipeline); // TODO: Avoid if same as last draw.
|
||||
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline->pipeline);
|
||||
lastPipeline_ = pipeline;
|
||||
}
|
||||
ApplyDrawStateLate(cmd, false, 0);
|
||||
|
@ -482,8 +482,6 @@ void GPU_Vulkan::Execute_Prim(u32 op, u32 diff) {
|
||||
|
||||
// This also makes skipping drawing very effective.
|
||||
framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
|
||||
if (!draw_->GetNativeObject(Draw::NativeObject::CURRENT_RENDERPASS))
|
||||
Crash();
|
||||
|
||||
if (gstate_c.skipDrawReason & (SKIPDRAW_SKIPFRAME | SKIPDRAW_NON_DISPLAYED_FB)) {
|
||||
drawEngine_.SetupVertexDecoder(gstate.vertType); // Do we still need to do this?
|
||||
|
@ -51,6 +51,7 @@ LOCAL_SRC_FILES :=\
|
||||
input/input_state.cpp \
|
||||
math/fast/fast_math.c \
|
||||
math/fast/fast_matrix.c \
|
||||
math/dataconv.cpp \
|
||||
math/math_util.cpp \
|
||||
math/curves.cpp \
|
||||
math/expression_parser.cpp \
|
||||
|
3
ext/native/math/dataconv.cpp
Normal file
3
ext/native/math/dataconv.cpp
Normal file
@ -0,0 +1,3 @@
|
||||
#include "dataconv.h"
|
||||
|
||||
alignas(16) const float one_over_255_x4[4] = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, };
|
@ -3,26 +3,65 @@
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
#include "Common/Common.h"
|
||||
#include "ppsspp_config.h"
|
||||
|
||||
#ifdef _M_SSE
|
||||
#include <emmintrin.h>>
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
#if PPSSPP_PLATFORM(ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
extern const float one_over_255_x4[4];
|
||||
|
||||
// Utilities useful for filling in std140-layout uniform buffers, and similar.
|
||||
// NEON intrinsics: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491f/BABDCGGF.html
|
||||
|
||||
// LSBs in f[0], etc.
|
||||
// Could be SSE optimized.
|
||||
inline void Uint8x4ToFloat4(float f[4], uint32_t u) {
|
||||
#ifdef _M_SSE
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
__m128i value = _mm_set1_epi32(u);
|
||||
__m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, zero), zero);
|
||||
__m128 fvalues = _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_load_ps(one_over_255_x4));
|
||||
_mm_storeu_ps(f, fvalues);
|
||||
#elif PPSSPP_PLATFORM(ARM_NEON)
|
||||
const float32x4_t one_over = vdupq_n_f32(1.0f/255.0f);
|
||||
const uint8x8_t value = vld1_lane_u32(u);
|
||||
const uint16x8_t value16 = vmovl_s8(value);
|
||||
const uint32x4_t value32 = vmovl_s16(vget_low_s16(value16));
|
||||
const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), one_over);
|
||||
vst1q_u32((uint32_t *)dest, valueFloat);
|
||||
#else
|
||||
f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
|
||||
f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
|
||||
f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
|
||||
f[3] = ((u >> 24) & 0xFF) * (1.0f / 255.0f);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void Uint8x3ToFloat4(float f[4], uint32_t u) {
|
||||
inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
|
||||
#if defined(_M_SSE) || PPSSPP_PLATFORM(ARM_NEON)
|
||||
Uint8x4ToFloat4(f, (u & 0xFFFFFF) | (alpha << 24));
|
||||
#else
|
||||
f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
|
||||
f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
|
||||
f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
|
||||
f[3] = 0.0f;
|
||||
f[3] = alpha * (1.0f / 255.0f);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void Uint8x3ToFloat4(float f[4], uint32_t u) {
|
||||
#if defined(_M_SSE) || PPSSPP_PLATFORM(ARM_NEON)
|
||||
Uint8x4ToFloat4(f, u & 0xFFFFFF);
|
||||
#else
|
||||
f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
|
||||
f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
|
||||
f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
|
||||
f[3] = ((u >> 24) & 0xFF) * (1.0f / 255.0f);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void Uint8x3ToInt4(int i[4], uint32_t u) {
|
||||
@ -46,13 +85,6 @@ inline void Uint8x3ToFloat4_Alpha(float f[4], uint32_t u, float alpha) {
|
||||
f[3] = alpha;
|
||||
}
|
||||
|
||||
inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
|
||||
f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
|
||||
f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
|
||||
f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
|
||||
f[3] = alpha * (1.0f / 255.0f);
|
||||
}
|
||||
|
||||
inline void Uint8x1ToFloat4(float f[4], uint32_t u) {
|
||||
f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
|
||||
f[1] = 0.0f;
|
||||
@ -63,40 +95,54 @@ inline void Uint8x1ToFloat4(float f[4], uint32_t u) {
|
||||
// These are just for readability.
|
||||
|
||||
inline void CopyFloat2(float dest[2], const float src[2]) {
|
||||
memcpy(dest, src, sizeof(float) * 2);
|
||||
dest[0] = src[0];
|
||||
dest[1] = src[1];
|
||||
}
|
||||
|
||||
inline void CopyFloat3(float dest[3], const float src[3]) {
|
||||
memcpy(dest, src, sizeof(float) * 3);
|
||||
dest[0] = src[0];
|
||||
dest[1] = src[1];
|
||||
dest[2] = src[2];
|
||||
}
|
||||
|
||||
inline void CopyFloat1To4(float dest[4], const float src) {
|
||||
#ifdef _M_SSE
|
||||
_mm_storeu_ps(dest, _mm_set_ss(src));
|
||||
#else
|
||||
dest[0] = src;
|
||||
dest[1] = 0.0f;
|
||||
dest[2] = 0.0f;
|
||||
dest[3] = 0.0f;
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void CopyFloat2To4(float dest[4], const float src[2]) {
|
||||
memcpy(dest, src, sizeof(float) * 2);
|
||||
dest[0] = src[0];
|
||||
dest[1] = src[1];
|
||||
dest[2] = 0.0f;
|
||||
dest[3] = 0.0f;
|
||||
}
|
||||
|
||||
inline void CopyFloat3To4(float dest[4], const float src[3]) {
|
||||
memcpy(dest, src, sizeof(float) * 3);
|
||||
dest[0] = src[0];
|
||||
dest[1] = src[1];
|
||||
dest[2] = src[2];
|
||||
dest[3] = 0.0f;
|
||||
}
|
||||
|
||||
inline void CopyFloat4(float dest[4], const float src[4]) {
|
||||
memcpy(dest, src, sizeof(float) * 4);
|
||||
}
|
||||
|
||||
inline void CopyMatrix4x4(float dest[16], const float src[16]) {
|
||||
memcpy(dest, src, sizeof(float) * 16);
|
||||
}
|
||||
|
||||
inline void ExpandFloat24x3ToFloat4(float dest[4], uint32_t src[3]) {
|
||||
#ifdef _M_SSE
|
||||
__m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)src), 8);
|
||||
_mm_storeu_si128((__m128i *)dest, values);
|
||||
#elif PPSSPP_PLATFORM(ARM_NEON)
|
||||
const uint32x4_t values = vshlq_n_u32(vld1q_u32(&gstate.texscaleu), 8);
|
||||
vst1q_u32((uint32_t *)dest, values);
|
||||
#else
|
||||
uint32_t temp[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 };
|
||||
memcpy(dest, temp, sizeof(float) * 4);
|
||||
#endif
|
||||
}
|
||||
|
@ -696,6 +696,7 @@
|
||||
<ClCompile Include="gfx_es2\draw_text_android.cpp" />
|
||||
<ClCompile Include="gfx_es2\draw_text_qt.cpp" />
|
||||
<ClCompile Include="gfx_es2\draw_text_win.cpp" />
|
||||
<ClCompile Include="math\dataconv.cpp" />
|
||||
<ClCompile Include="thin3d\d3d11_loader.cpp" />
|
||||
<ClCompile Include="thin3d\thin3d_d3d11.cpp" />
|
||||
<ClCompile Include="util\text\wrap_text.cpp" />
|
||||
|
@ -784,6 +784,9 @@
|
||||
<ClCompile Include="gfx_es2\draw_text_android.cpp">
|
||||
<Filter>gfx</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="math\dataconv.cpp">
|
||||
<Filter>math</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Filter Include="gfx">
|
||||
|
Loading…
Reference in New Issue
Block a user