mirror of
https://github.com/hrydgard/ppsspp.git
synced 2024-11-22 21:09:52 +00:00
Merge pull request #17808 from hrydgard/frustum-cull-small-draws
Frustum-cull small draws (experiment)
This commit is contained in:
commit
27e47d9899
@ -737,6 +737,7 @@ add_library(Common STATIC
|
||||
Common/Input/InputState.cpp
|
||||
Common/Input/InputState.h
|
||||
Common/Math/fast/fast_matrix.c
|
||||
Common/Math/CrossSIMD.h
|
||||
Common/Math/curves.cpp
|
||||
Common/Math/curves.h
|
||||
Common/Math/expression_parser.cpp
|
||||
|
@ -484,6 +484,7 @@
|
||||
<ClInclude Include="Input\GestureDetector.h" />
|
||||
<ClInclude Include="Input\InputState.h" />
|
||||
<ClInclude Include="Input\KeyCodes.h" />
|
||||
<ClInclude Include="Math\CrossSIMD.h" />
|
||||
<ClInclude Include="Math\curves.h" />
|
||||
<ClInclude Include="Math\expression_parser.h" />
|
||||
<ClInclude Include="Math\fast\fast_matrix.h" />
|
||||
|
@ -518,6 +518,9 @@
|
||||
<ClInclude Include="GPU\Vulkan\VulkanDescSet.h">
|
||||
<Filter>GPU\Vulkan</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Math\CrossSIMD.h">
|
||||
<Filter>Math</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ABI.cpp" />
|
||||
|
58
Common/Math/CrossSIMD.h
Normal file
58
Common/Math/CrossSIMD.h
Normal file
@ -0,0 +1,58 @@
|
||||
// CrossSIMD
|
||||
//
|
||||
// Compatibility wrappers for SIMD dialects.
|
||||
//
|
||||
// In the long run, might do a more general single-source-SIMD wrapper here consisting
|
||||
// of defines that translate to either NEON or SSE. It would be possible to write quite a lot of
|
||||
// our various color conversion functions and so on in a pretty generic manner.
|
||||
|
||||
#include "ppsspp_config.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#if PPSSPP_ARCH(SSE2)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#if PPSSPP_ARCH(ARM_NEON)
|
||||
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
|
||||
#include <arm64_neon.h>
|
||||
#else
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Basic types
|
||||
|
||||
#if PPSSPP_ARCH(ARM64_NEON)
|
||||
|
||||
// No special ones here.
|
||||
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
|
||||
// Compatibility wrappers making ARM64 NEON code run on ARM32
|
||||
// With optimization on, these should compile down to the optimal code.
|
||||
|
||||
inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
|
||||
switch (lane & 3) {
|
||||
case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
|
||||
case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
|
||||
case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
|
||||
default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
|
||||
}
|
||||
}
|
||||
|
||||
inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
|
||||
switch (lane & 3) {
|
||||
case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
|
||||
case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
|
||||
case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
|
||||
default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
|
||||
}
|
||||
}
|
||||
|
||||
inline uint32x4_t vcgezq_f32(float32x4_t v) {
|
||||
return vcgeq_f32(v, vdupq_n_f32(0.0f));
|
||||
}
|
||||
|
||||
#endif
|
@ -620,7 +620,6 @@ CollapsibleHeader::CollapsibleHeader(bool *toggle, const std::string &text, Layo
|
||||
|
||||
void CollapsibleHeader::Draw(UIContext &dc) {
|
||||
Style style = dc.theme->itemStyle;
|
||||
style.background.color = 0;
|
||||
if (HasFocus()) style = dc.theme->itemFocusedStyle;
|
||||
if (down_) style = dc.theme->itemDownStyle;
|
||||
if (!IsEnabled()) style = dc.theme->itemDisabledStyle;
|
||||
|
@ -1893,7 +1893,7 @@ void PlayTimeTracker::Load(const Section *section) {
|
||||
|
||||
// Parse the string.
|
||||
PlayTime gameTime{};
|
||||
if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, &gameTime.lastTimePlayed)) {
|
||||
if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, (long long *)&gameTime.lastTimePlayed)) {
|
||||
tracker_[key] = gameTime;
|
||||
}
|
||||
}
|
||||
|
@ -21,6 +21,7 @@
|
||||
#include "Common/Data/Convert/ColorConv.h"
|
||||
#include "Common/Profiler/Profiler.h"
|
||||
#include "Common/LogReporting.h"
|
||||
#include "Common/Math/CrossSIMD.h"
|
||||
#include "Common/Math/lin/matrix4x4.h"
|
||||
#include "Core/Config.h"
|
||||
#include "GPU/Common/DrawEngineCommon.h"
|
||||
@ -197,15 +198,10 @@ void DrawEngineCommon::DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex
|
||||
|
||||
// Gated by DIRTY_CULL_PLANES
|
||||
void DrawEngineCommon::UpdatePlanes() {
|
||||
float world[16];
|
||||
float view[16];
|
||||
float worldview[16];
|
||||
float worldviewproj[16];
|
||||
ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
|
||||
float viewproj[16];
|
||||
ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
|
||||
// TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3?
|
||||
Matrix4ByMatrix4(worldview, world, view);
|
||||
Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
|
||||
Matrix4ByMatrix4(viewproj, view, gstate.projMatrix);
|
||||
|
||||
// Next, we need to apply viewport, scissor, region, and even offset - but only for X/Y.
|
||||
// Note that the PSP does not clip against the viewport.
|
||||
@ -214,6 +210,9 @@ void DrawEngineCommon::UpdatePlanes() {
|
||||
minOffset_ = baseOffset + Vec2f(std::max(gstate.getRegionRateX() - 0x100, gstate.getScissorX1()), std::max(gstate.getRegionRateY() - 0x100, gstate.getScissorY1())) - Vec2f(1.0f, 1.0f);
|
||||
maxOffset_ = baseOffset + Vec2f(std::min(gstate.getRegionX2(), gstate.getScissorX2()), std::min(gstate.getRegionY2(), gstate.getScissorY2())) + Vec2f(1.0f, 1.0f);
|
||||
|
||||
// Let's not handle these special cases in the fast culler.
|
||||
offsetOutsideEdge_ = maxOffset_.x >= 4096.0f || minOffset_.x < 1.0f || minOffset_.y < 1.0f || maxOffset_.y >= 4096.0f;
|
||||
|
||||
// Now let's apply the viewport to our scissor/region + offset range.
|
||||
Vec2f inverseViewportScale = Vec2f(1.0f / gstate.getViewportXScale(), 1.0f / gstate.getViewportYScale());
|
||||
Vec2f minViewport = (minOffset_ - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale;
|
||||
@ -232,14 +231,14 @@ void DrawEngineCommon::UpdatePlanes() {
|
||||
applyViewport.wy = -(maxViewport.y + minViewport.y) * viewportInvSize.y;
|
||||
|
||||
float mtx[16];
|
||||
Matrix4ByMatrix4(mtx, worldviewproj, applyViewport.m);
|
||||
|
||||
planes_[0].Set(mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]); // Right
|
||||
planes_[1].Set(mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]); // Left
|
||||
planes_[2].Set(mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]); // Bottom
|
||||
planes_[3].Set(mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]); // Top
|
||||
planes_[4].Set(mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near
|
||||
planes_[5].Set(mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far
|
||||
Matrix4ByMatrix4(mtx, viewproj, applyViewport.m);
|
||||
// I'm sure there's some fairly optimized way to set these.
|
||||
planes_.Set(0, mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]); // Right
|
||||
planes_.Set(1, mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]); // Left
|
||||
planes_.Set(2, mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]); // Bottom
|
||||
planes_.Set(3, mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]); // Top
|
||||
planes_.Set(4, mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near
|
||||
planes_.Set(5, mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far
|
||||
}
|
||||
|
||||
// This code has plenty of potential for optimization.
|
||||
@ -262,7 +261,6 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
|
||||
|
||||
SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
|
||||
float *verts = (float *)(decoded_ + 65536 * 18);
|
||||
int vertStride = 3;
|
||||
|
||||
// Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
|
||||
// Let's always say objects are within bounds.
|
||||
@ -338,17 +336,23 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
|
||||
}
|
||||
break;
|
||||
case GE_VTYPE_POS_FLOAT:
|
||||
// No need to copy in this case, we can just read directly from the source format with a stride.
|
||||
verts = (float *)((uint8_t *)vdata + offset);
|
||||
vertStride = stride / 4;
|
||||
// Previous code:
|
||||
// for (int i = 0; i < vertexCount; i++)
|
||||
// memcpy(&verts[i * 3], (const u8 *)vdata + stride * i + offset, sizeof(float) * 3);
|
||||
for (int i = 0; i < vertexCount; i++)
|
||||
memcpy(&verts[i * 3], (const u8 *)vdata + stride * i + offset, sizeof(float) * 3);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pretransform the verts in-place so we don't have to do it inside the loop.
|
||||
// We do this differently in the fast version below since we skip the max/minOffset checks there
|
||||
// making it easier to get the whole thing ready for SIMD.
|
||||
for (int i = 0; i < vertexCount; i++) {
|
||||
float worldpos[3];
|
||||
Vec3ByMatrix43(worldpos, &verts[i * 3], gstate.worldMatrix);
|
||||
memcpy(&verts[i * 3], worldpos, 12);
|
||||
}
|
||||
|
||||
// Note: near/far are not checked without clamp/clip enabled, so we skip those planes.
|
||||
int totalPlanes = gstate.isDepthClampEnabled() ? 6 : 4;
|
||||
for (int plane = 0; plane < totalPlanes; plane++) {
|
||||
@ -358,8 +362,8 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
|
||||
// Test against the frustum planes, and count.
|
||||
// TODO: We should test 4 vertices at a time using SIMD.
|
||||
// I guess could also test one vertex against 4 planes at a time, though a lot of waste at the common case of 6.
|
||||
const float *pos = verts + i * vertStride;
|
||||
float value = planes_[plane].Test(pos);
|
||||
const float *worldpos = verts + i * 3;
|
||||
float value = planes_.Test(plane, worldpos);
|
||||
if (value <= -FLT_EPSILON) // Not sure why we use exactly this value. Probably '< 0' would do.
|
||||
out++;
|
||||
else
|
||||
@ -388,6 +392,179 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
|
||||
return true;
|
||||
}
|
||||
|
||||
// NOTE: This doesn't handle through-mode, indexing, morph, or skinning.
|
||||
bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u32 vertType) {
|
||||
SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
|
||||
float *verts = (float *)(decoded_ + 65536 * 18);
|
||||
|
||||
// Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
|
||||
// Let's always say objects are within bounds.
|
||||
if (gstate_c.Use(GPU_USE_VIRTUAL_REALITY))
|
||||
return true;
|
||||
|
||||
// Due to world matrix updates per "thing", this isn't quite as effective as it could be if we did world transform
|
||||
// in here as well. Though, it still does cut down on a lot of updates in Tekken 6.
|
||||
if (gstate_c.IsDirty(DIRTY_CULL_PLANES)) {
|
||||
UpdatePlanes();
|
||||
gpuStats.numPlaneUpdates++;
|
||||
gstate_c.Clean(DIRTY_CULL_PLANES);
|
||||
}
|
||||
|
||||
// Also let's just bail if offsetOutsideEdge_ is set, instead of handling the cases.
|
||||
// NOTE: This is written to in UpdatePlanes so can't check it before.
|
||||
if (offsetOutsideEdge_)
|
||||
return true;
|
||||
|
||||
// Simple, most common case.
|
||||
VertexDecoder *dec = GetVertexDecoder(vertType);
|
||||
int stride = dec->VertexSize();
|
||||
int offset = dec->posoff;
|
||||
int vertStride = 3;
|
||||
|
||||
// TODO: Possibly do the plane tests directly against the source formats instead of converting.
|
||||
switch (vertType & GE_VTYPE_POS_MASK) {
|
||||
case GE_VTYPE_POS_8BIT:
|
||||
for (int i = 0; i < vertexCount; i++) {
|
||||
const s8 *data = (const s8 *)vdata + i * stride + offset;
|
||||
for (int j = 0; j < 3; j++) {
|
||||
verts[i * 3 + j] = data[j] * (1.0f / 128.0f);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case GE_VTYPE_POS_16BIT:
|
||||
{
|
||||
#if PPSSPP_ARCH(SSE2)
|
||||
__m128 scaleFactor = _mm_set1_ps(1.0f / 32768.0f);
|
||||
for (int i = 0; i < vertexCount; i++) {
|
||||
const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
|
||||
__m128i bits = _mm_castpd_si128(_mm_load_sd((const double *)data));
|
||||
// Sign extension. Hacky without SSE4.
|
||||
bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16);
|
||||
__m128 pos = _mm_mul_ps(_mm_cvtepi32_ps(bits), scaleFactor);
|
||||
_mm_storeu_ps(verts + i * 3, pos); // TODO: use stride 4 to avoid clashing writes?
|
||||
}
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
for (int i = 0; i < vertexCount; i++) {
|
||||
const s16 *dataPtr = ((const s16 *)((const s8 *)vdata + i * stride + offset));
|
||||
int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
|
||||
float32x4_t pos = vcvtq_n_f32_s32(data, 15); // >> 15 = division by 32768.0f
|
||||
vst1q_f32(verts + i * 3, pos);
|
||||
}
|
||||
#else
|
||||
for (int i = 0; i < vertexCount; i++) {
|
||||
const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
|
||||
for (int j = 0; j < 3; j++) {
|
||||
verts[i * 3 + j] = data[j] * (1.0f / 32768.0f);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
case GE_VTYPE_POS_FLOAT:
|
||||
// No need to copy in this case, we can just read directly from the source format with a stride.
|
||||
verts = (float *)((uint8_t *)vdata + offset);
|
||||
vertStride = stride / 4;
|
||||
break;
|
||||
}
|
||||
|
||||
// We only check the 4 sides. Near/far won't likely make a huge difference.
|
||||
// We test one vertex against 4 planes to get some SIMD. Vertices need to be transformed to world space
|
||||
// for testing, don't want to re-do that, so we have to use that "pivot" of the data.
|
||||
#if PPSSPP_ARCH(SSE2)
|
||||
const __m128 worldX = _mm_loadu_ps(gstate.worldMatrix);
|
||||
const __m128 worldY = _mm_loadu_ps(gstate.worldMatrix + 3);
|
||||
const __m128 worldZ = _mm_loadu_ps(gstate.worldMatrix + 6);
|
||||
const __m128 worldW = _mm_loadu_ps(gstate.worldMatrix + 9);
|
||||
const __m128 planeX = _mm_loadu_ps(planes_.x);
|
||||
const __m128 planeY = _mm_loadu_ps(planes_.y);
|
||||
const __m128 planeZ = _mm_loadu_ps(planes_.z);
|
||||
const __m128 planeW = _mm_loadu_ps(planes_.w);
|
||||
__m128 inside = _mm_set1_ps(0.0f);
|
||||
for (int i = 0; i < vertexCount; i++) {
|
||||
const float *pos = verts + i * vertStride;
|
||||
__m128 worldpos = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(worldX, _mm_set1_ps(pos[0])),
|
||||
_mm_mul_ps(worldY, _mm_set1_ps(pos[1]))
|
||||
),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(worldZ, _mm_set1_ps(pos[2])),
|
||||
worldW
|
||||
)
|
||||
);
|
||||
// OK, now we check it against the four planes.
|
||||
// This is really curiously similar to a matrix multiplication (well, it is one).
|
||||
__m128 posX = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
__m128 posY = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
__m128 posZ = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
__m128 planeDist = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(planeX, posX),
|
||||
_mm_mul_ps(planeY, posY)
|
||||
),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(planeZ, posZ),
|
||||
planeW
|
||||
)
|
||||
);
|
||||
inside = _mm_or_ps(inside, _mm_cmpge_ps(planeDist, _mm_setzero_ps()));
|
||||
}
|
||||
// 0xF means that we found at least one vertex inside every one of the planes.
|
||||
// We don't bother with counts, though it wouldn't be hard if we had a use for them.
|
||||
return _mm_movemask_ps(inside) == 0xF;
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
const float32x4_t worldX = vld1q_f32(gstate.worldMatrix);
|
||||
const float32x4_t worldY = vld1q_f32(gstate.worldMatrix + 3);
|
||||
const float32x4_t worldZ = vld1q_f32(gstate.worldMatrix + 6);
|
||||
const float32x4_t worldW = vld1q_f32(gstate.worldMatrix + 9);
|
||||
const float32x4_t planeX = vld1q_f32(planes_.x);
|
||||
const float32x4_t planeY = vld1q_f32(planes_.y);
|
||||
const float32x4_t planeZ = vld1q_f32(planes_.z);
|
||||
const float32x4_t planeW = vld1q_f32(planes_.w);
|
||||
uint32x4_t inside = vdupq_n_u32(0);
|
||||
for (int i = 0; i < vertexCount; i++) {
|
||||
const float *pos = verts + i * vertStride;
|
||||
float32x4_t objpos = vld1q_f32(pos);
|
||||
float32x4_t worldpos = vaddq_f32(
|
||||
vmlaq_laneq_f32(
|
||||
vmulq_laneq_f32(worldX, objpos, 0),
|
||||
worldY, objpos, 1),
|
||||
vmlaq_laneq_f32(worldW, worldZ, objpos, 2)
|
||||
);
|
||||
// OK, now we check it against the four planes.
|
||||
// This is really curiously similar to a matrix multiplication (well, it is one).
|
||||
float32x4_t planeDist = vaddq_f32(
|
||||
vmlaq_laneq_f32(
|
||||
vmulq_laneq_f32(planeX, worldpos, 0),
|
||||
planeY, worldpos, 1),
|
||||
vmlaq_laneq_f32(planeW, planeZ, worldpos, 2)
|
||||
);
|
||||
inside = vorrq_u32(inside, vcgezq_f32(planeDist));
|
||||
}
|
||||
uint64_t insideBits = vget_lane_u64(vreinterpret_u64_u16(vmovn_u32(inside)), 0);
|
||||
return ~insideBits == 0; // InsideBits all ones means that we found at least one vertex inside every one of the planes. We don't bother with counts, though it wouldn't be hard.
|
||||
#else
|
||||
int inside[4]{};
|
||||
for (int i = 0; i < vertexCount; i++) {
|
||||
const float *pos = verts + i * vertStride;
|
||||
float worldpos[3];
|
||||
Vec3ByMatrix43(worldpos, pos, gstate.worldMatrix);
|
||||
for (int plane = 0; plane < 4; plane++) {
|
||||
float value = planes_.Test(plane, worldpos);
|
||||
if (value >= 0.0f)
|
||||
inside[plane]++;
|
||||
}
|
||||
}
|
||||
|
||||
for (int plane = 0; plane < 4; plane++) {
|
||||
if (inside[plane] == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
// TODO: This probably is not the best interface.
|
||||
bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices) {
|
||||
// This is always for the current vertices.
|
||||
@ -670,6 +847,31 @@ int DrawEngineCommon::ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *
|
||||
return cmd - start;
|
||||
}
|
||||
|
||||
void DrawEngineCommon::SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) {
|
||||
if (!indexGen.PrimCompatible(prevPrim_, prim)) {
|
||||
DispatchFlush();
|
||||
}
|
||||
|
||||
// This isn't exactly right, if we flushed, since prims can straddle previous calls.
|
||||
// But it generally works for common usage.
|
||||
if (prim == GE_PRIM_KEEP_PREVIOUS) {
|
||||
// Has to be set to something, let's assume POINTS (0) if no previous.
|
||||
if (prevPrim_ == GE_PRIM_INVALID)
|
||||
prevPrim_ = GE_PRIM_POINTS;
|
||||
prim = prevPrim_;
|
||||
} else {
|
||||
prevPrim_ = prim;
|
||||
}
|
||||
|
||||
// If vtype has changed, setup the vertex decoder.
|
||||
if (vertTypeID != lastVType_ || !dec_) {
|
||||
dec_ = GetVertexDecoder(vertTypeID);
|
||||
lastVType_ = vertTypeID;
|
||||
}
|
||||
|
||||
*bytesRead = vertexCount * dec_->VertexSize();
|
||||
}
|
||||
|
||||
// vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
|
||||
bool DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead) {
|
||||
if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
|
||||
|
@ -69,11 +69,11 @@ public:
|
||||
virtual void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) = 0;
|
||||
};
|
||||
|
||||
// Culling plane.
|
||||
struct Plane {
|
||||
float x, y, z, w;
|
||||
void Set(float _x, float _y, float _z, float _w) { x = _x; y = _y; z = _z; w = _w; }
|
||||
float Test(const float f[3]) const { return x * f[0] + y * f[1] + z * f[2] + w; }
|
||||
// Culling plane, group of 8.
|
||||
struct alignas(16) Plane8 {
|
||||
float x[8], y[8], z[8], w[8];
|
||||
void Set(int i, float _x, float _y, float _z, float _w) { x[i] = _x; y[i] = _y; z[i] = _z; w[i] = _w; }
|
||||
float Test(int i, const float f[3]) const { return x[i] * f[0] + y[i] * f[1] + z[i] * f[2] + w[i]; }
|
||||
};
|
||||
|
||||
class DrawEngineCommon {
|
||||
@ -104,6 +104,10 @@ public:
|
||||
|
||||
bool TestBoundingBox(const void *control_points, const void *inds, int vertexCount, u32 vertType);
|
||||
|
||||
// This is a less accurate version of TestBoundingBox, but faster. Can have more false positives.
|
||||
// Doesn't support indexing.
|
||||
bool TestBoundingBoxFast(const void *control_points, int vertexCount, u32 vertType);
|
||||
|
||||
void FlushSkin() {
|
||||
bool applySkin = (lastVType_ & GE_VTYPE_WEIGHT_MASK) && decOptions_.applySkinInDecode;
|
||||
if (applySkin) {
|
||||
@ -113,6 +117,8 @@ public:
|
||||
|
||||
int ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *stall, u32 vertTypeID, bool clockwise, int *bytesRead, bool isTriangle);
|
||||
bool SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead);
|
||||
void SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead);
|
||||
|
||||
template<class Surface>
|
||||
void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope);
|
||||
void ClearSplineBezierWeights();
|
||||
@ -287,7 +293,8 @@ protected:
|
||||
TessellationDataTransfer *tessDataTransfer;
|
||||
|
||||
// Culling
|
||||
Plane planes_[6];
|
||||
Plane8 planes_;
|
||||
Vec2f minOffset_;
|
||||
Vec2f maxOffset_;
|
||||
bool offsetOutsideEdge_;
|
||||
};
|
||||
|
@ -76,6 +76,7 @@ struct GPUStatistics {
|
||||
void ResetFrame() {
|
||||
numDrawCalls = 0;
|
||||
numVertexDecodes = 0;
|
||||
numCulledDraws = 0;
|
||||
numDrawSyncs = 0;
|
||||
numListSyncs = 0;
|
||||
numVertsSubmitted = 0;
|
||||
@ -111,6 +112,7 @@ struct GPUStatistics {
|
||||
// Per frame statistics
|
||||
int numDrawCalls;
|
||||
int numVertexDecodes;
|
||||
int numCulledDraws;
|
||||
int numDrawSyncs;
|
||||
int numListSyncs;
|
||||
int numFlushes;
|
||||
|
@ -989,9 +989,45 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
|
||||
int cullMode = gstate.getCullMode();
|
||||
|
||||
uint32_t vertTypeID = GetVertTypeID(vertexType, gstate.getUVGenMode(), g_Config.bSoftwareSkinning);
|
||||
if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
|
||||
|
||||
#define MAX_CULL_CHECK_COUNT 6
|
||||
|
||||
// For now, turn off culling on platforms where we don't have SIMD bounding box tests, like RISC-V.
|
||||
#if PPSSPP_ARCH(ARM_NEON) || PPSSPP_ARCH(SSE2)
|
||||
|
||||
#define PASSES_CULLING ((vertexType & (GE_VTYPE_THROUGH_MASK | GE_VTYPE_MORPHCOUNT_MASK | GE_VTYPE_WEIGHT_MASK | GE_VTYPE_IDX_MASK)) || count > MAX_CULL_CHECK_COUNT)
|
||||
|
||||
#else
|
||||
|
||||
#define PASSES_CULLING true
|
||||
|
||||
#endif
|
||||
|
||||
// If certain conditions are true, do frustum culling.
|
||||
bool passCulling = PASSES_CULLING;
|
||||
if (!passCulling) {
|
||||
// Do software culling.
|
||||
if (drawEngineCommon_->TestBoundingBoxFast(verts, count, vertexType)) {
|
||||
passCulling = true;
|
||||
} else {
|
||||
gpuStats.numCulledDraws++;
|
||||
}
|
||||
}
|
||||
|
||||
// If the first one in a batch passes, let's assume the whole batch passes.
|
||||
// Cuts down on checking, while not losing that much efficiency.
|
||||
bool onePassed = false;
|
||||
if (passCulling) {
|
||||
if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
|
||||
canExtend = false;
|
||||
}
|
||||
onePassed = true;
|
||||
} else {
|
||||
// Still need to advance bytesRead.
|
||||
drawEngineCommon_->SkipPrim(prim, count, vertTypeID, &bytesRead);
|
||||
canExtend = false;
|
||||
}
|
||||
|
||||
// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
|
||||
// Some games rely on this, they don't bother reloading VADDR and IADDR.
|
||||
// The VADDR/IADDR registers are NOT updated.
|
||||
@ -1027,7 +1063,7 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
|
||||
bool clockwise = !gstate.isCullEnabled() || gstate.getCullMode() == cullMode;
|
||||
if (canExtend) {
|
||||
// Non-indexed draws can be cheaply merged if vertexAddr hasn't changed, that means the vertices
|
||||
// are consecutive in memory.
|
||||
// are consecutive in memory. We also ignore culling here.
|
||||
_dbg_assert_((vertexType & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_NONE);
|
||||
int commandsExecuted = drawEngineCommon_->ExtendNonIndexedPrim(src, stall, vertTypeID, clockwise, &bytesRead, isTriangle);
|
||||
if (!commandsExecuted) {
|
||||
@ -1047,7 +1083,25 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
|
||||
// We can extend again after submitting a normal draw.
|
||||
canExtend = isTriangle;
|
||||
}
|
||||
if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
|
||||
|
||||
bool passCulling = onePassed || PASSES_CULLING;
|
||||
if (!passCulling) {
|
||||
// Do software culling.
|
||||
if (drawEngineCommon_->TestBoundingBox(verts, inds, count, vertexType)) {
|
||||
passCulling = true;
|
||||
} else {
|
||||
gpuStats.numCulledDraws++;
|
||||
}
|
||||
}
|
||||
if (passCulling) {
|
||||
if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
|
||||
canExtend = false;
|
||||
}
|
||||
// As soon as one passes, assume we don't need to check the rest of this batch.
|
||||
onePassed = true;
|
||||
} else {
|
||||
// Still need to advance bytesRead.
|
||||
drawEngineCommon_->SkipPrim(newPrim, count, vertTypeID, &bytesRead);
|
||||
canExtend = false;
|
||||
}
|
||||
AdvanceVerts(vertexType, count, bytesRead);
|
||||
@ -1412,7 +1466,7 @@ void GPUCommonHW::Execute_WorldMtxNum(u32 op, u32 diff) {
|
||||
if (dst[i] != newVal) {
|
||||
Flush();
|
||||
dst[i] = newVal;
|
||||
gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES);
|
||||
gstate_c.Dirty(DIRTY_WORLDMATRIX);
|
||||
}
|
||||
if (++i >= end) {
|
||||
break;
|
||||
@ -1435,7 +1489,7 @@ void GPUCommonHW::Execute_WorldMtxData(u32 op, u32 diff) {
|
||||
if (num < 12 && newVal != ((const u32 *)gstate.worldMatrix)[num]) {
|
||||
Flush();
|
||||
((u32 *)gstate.worldMatrix)[num] = newVal;
|
||||
gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES);
|
||||
gstate_c.Dirty(DIRTY_WORLDMATRIX);
|
||||
}
|
||||
num++;
|
||||
gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
|
||||
@ -1691,7 +1745,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
|
||||
float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
|
||||
return snprintf(buffer, size,
|
||||
"DL processing time: %0.2f ms, %d drawsync, %d listsync\n"
|
||||
"Draw: %d (%d dec), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
|
||||
"Draw: %d (%d dec, %d culled), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
|
||||
"Vertices: %d drawn: %d\n"
|
||||
"FBOs active: %d (evaluations: %d)\n"
|
||||
"Textures: %d, dec: %d, invalidated: %d, hashed: %d kB\n"
|
||||
@ -1705,6 +1759,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
|
||||
gpuStats.numListSyncs,
|
||||
gpuStats.numDrawCalls,
|
||||
gpuStats.numVertexDecodes,
|
||||
gpuStats.numCulledDraws,
|
||||
gpuStats.numFlushes,
|
||||
gpuStats.numClears,
|
||||
gpuStats.numBBOXJumps,
|
||||
|
@ -105,6 +105,7 @@
|
||||
<ClInclude Include="..\..\Common\File\AndroidStorage.h" />
|
||||
<ClInclude Include="..\..\Common\GPU\GPUBackendCommon.h" />
|
||||
<ClInclude Include="..\..\Common\GPU\Vulkan\VulkanLoader.h" />
|
||||
<ClInclude Include="..\..\Common\Math\CrossSIMD.h" />
|
||||
<ClInclude Include="..\..\Common\Math\Statistics.h" />
|
||||
<ClInclude Include="..\..\Common\Net\HTTPNaettRequest.h" />
|
||||
<ClInclude Include="..\..\Common\Net\HTTPRequest.h" />
|
||||
|
@ -862,6 +862,9 @@
|
||||
<ClInclude Include="..\..\ext\naett\naett.h">
|
||||
<Filter>ext\naett</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\..\Common\Math\CrossSIMD.h">
|
||||
<Filter>Math</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="..\..\Common\Math\fast\fast_matrix_neon.S">
|
||||
|
@ -11,6 +11,7 @@
|
||||
#if defined(_M_IX86) || defined(__i386__) || defined (__EMSCRIPTEN__)
|
||||
#define PPSSPP_ARCH_X86 1
|
||||
#define PPSSPP_ARCH_32BIT 1
|
||||
#define PPSSPP_ARCH_SSE2 1
|
||||
//TODO: Remove this compat define
|
||||
#ifndef _M_IX86
|
||||
#define _M_IX86 600
|
||||
@ -19,6 +20,7 @@
|
||||
|
||||
#if (defined(_M_X64) || defined(__amd64__) || defined(__x86_64__)) && !defined(__EMSCRIPTEN__)
|
||||
#define PPSSPP_ARCH_AMD64 1
|
||||
#define PPSSPP_ARCH_SSE2 1
|
||||
#if defined(__ILP32__)
|
||||
#define PPSSPP_ARCH_32BIT 1
|
||||
#else
|
||||
|
Loading…
Reference in New Issue
Block a user