Merge pull request #17808 from hrydgard/frustum-cull-small-draws

Frustum-cull small draws (experiment)
This commit is contained in:
Henrik Rydgård 2023-12-09 17:23:56 +01:00 committed by GitHub
commit 27e47d9899
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 371 additions and 37 deletions

View File

@ -737,6 +737,7 @@ add_library(Common STATIC
Common/Input/InputState.cpp
Common/Input/InputState.h
Common/Math/fast/fast_matrix.c
Common/Math/CrossSIMD.h
Common/Math/curves.cpp
Common/Math/curves.h
Common/Math/expression_parser.cpp

View File

@ -484,6 +484,7 @@
<ClInclude Include="Input\GestureDetector.h" />
<ClInclude Include="Input\InputState.h" />
<ClInclude Include="Input\KeyCodes.h" />
<ClInclude Include="Math\CrossSIMD.h" />
<ClInclude Include="Math\curves.h" />
<ClInclude Include="Math\expression_parser.h" />
<ClInclude Include="Math\fast\fast_matrix.h" />

View File

@ -518,6 +518,9 @@
<ClInclude Include="GPU\Vulkan\VulkanDescSet.h">
<Filter>GPU\Vulkan</Filter>
</ClInclude>
<ClInclude Include="Math\CrossSIMD.h">
<Filter>Math</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="ABI.cpp" />

58
Common/Math/CrossSIMD.h Normal file
View File

@ -0,0 +1,58 @@
// CrossSIMD
//
// Compatibility wrappers for SIMD dialects.
//
// In the long run, might do a more general single-source-SIMD wrapper here consisting
// of defines that translate to either NEON or SSE. It would be possible to write quite a lot of
// our various color conversion functions and so on in a pretty generic manner.
#include "ppsspp_config.h"
#include <cstdint>
#if PPSSPP_ARCH(SSE2)
#include <emmintrin.h>
#endif
#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif
// Basic types
#if PPSSPP_ARCH(ARM64_NEON)
// No special ones here.
#elif PPSSPP_ARCH(ARM_NEON)
// Compatibility wrappers making ARM64 NEON code run on ARM32
// With optimization on, these should compile down to the optimal code.
inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
switch (lane & 3) {
case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
}
}
inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
switch (lane & 3) {
case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
}
}
inline uint32x4_t vcgezq_f32(float32x4_t v) {
return vcgeq_f32(v, vdupq_n_f32(0.0f));
}
#endif

View File

@ -620,7 +620,6 @@ CollapsibleHeader::CollapsibleHeader(bool *toggle, const std::string &text, Layo
void CollapsibleHeader::Draw(UIContext &dc) {
Style style = dc.theme->itemStyle;
style.background.color = 0;
if (HasFocus()) style = dc.theme->itemFocusedStyle;
if (down_) style = dc.theme->itemDownStyle;
if (!IsEnabled()) style = dc.theme->itemDisabledStyle;

View File

@ -1893,7 +1893,7 @@ void PlayTimeTracker::Load(const Section *section) {
// Parse the string.
PlayTime gameTime{};
if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, &gameTime.lastTimePlayed)) {
if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, (long long *)&gameTime.lastTimePlayed)) {
tracker_[key] = gameTime;
}
}

View File

@ -21,6 +21,7 @@
#include "Common/Data/Convert/ColorConv.h"
#include "Common/Profiler/Profiler.h"
#include "Common/LogReporting.h"
#include "Common/Math/CrossSIMD.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Core/Config.h"
#include "GPU/Common/DrawEngineCommon.h"
@ -197,15 +198,10 @@ void DrawEngineCommon::DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex
// Gated by DIRTY_CULL_PLANES
void DrawEngineCommon::UpdatePlanes() {
float world[16];
float view[16];
float worldview[16];
float worldviewproj[16];
ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
float viewproj[16];
ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
// TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3?
Matrix4ByMatrix4(worldview, world, view);
Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
Matrix4ByMatrix4(viewproj, view, gstate.projMatrix);
// Next, we need to apply viewport, scissor, region, and even offset - but only for X/Y.
// Note that the PSP does not clip against the viewport.
@ -214,6 +210,9 @@ void DrawEngineCommon::UpdatePlanes() {
minOffset_ = baseOffset + Vec2f(std::max(gstate.getRegionRateX() - 0x100, gstate.getScissorX1()), std::max(gstate.getRegionRateY() - 0x100, gstate.getScissorY1())) - Vec2f(1.0f, 1.0f);
maxOffset_ = baseOffset + Vec2f(std::min(gstate.getRegionX2(), gstate.getScissorX2()), std::min(gstate.getRegionY2(), gstate.getScissorY2())) + Vec2f(1.0f, 1.0f);
// Let's not handle these special cases in the fast culler.
offsetOutsideEdge_ = maxOffset_.x >= 4096.0f || minOffset_.x < 1.0f || minOffset_.y < 1.0f || maxOffset_.y >= 4096.0f;
// Now let's apply the viewport to our scissor/region + offset range.
Vec2f inverseViewportScale = Vec2f(1.0f / gstate.getViewportXScale(), 1.0f / gstate.getViewportYScale());
Vec2f minViewport = (minOffset_ - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale;
@ -232,14 +231,14 @@ void DrawEngineCommon::UpdatePlanes() {
applyViewport.wy = -(maxViewport.y + minViewport.y) * viewportInvSize.y;
float mtx[16];
Matrix4ByMatrix4(mtx, worldviewproj, applyViewport.m);
planes_[0].Set(mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]); // Right
planes_[1].Set(mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]); // Left
planes_[2].Set(mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]); // Bottom
planes_[3].Set(mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]); // Top
planes_[4].Set(mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near
planes_[5].Set(mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far
Matrix4ByMatrix4(mtx, viewproj, applyViewport.m);
// I'm sure there's some fairly optimized way to set these.
planes_.Set(0, mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]); // Right
planes_.Set(1, mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]); // Left
planes_.Set(2, mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]); // Bottom
planes_.Set(3, mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]); // Top
planes_.Set(4, mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near
planes_.Set(5, mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far
}
// This code has plenty of potential for optimization.
@ -262,7 +261,6 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
float *verts = (float *)(decoded_ + 65536 * 18);
int vertStride = 3;
// Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
// Let's always say objects are within bounds.
@ -338,17 +336,23 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
}
break;
case GE_VTYPE_POS_FLOAT:
// No need to copy in this case, we can just read directly from the source format with a stride.
verts = (float *)((uint8_t *)vdata + offset);
vertStride = stride / 4;
// Previous code:
// for (int i = 0; i < vertexCount; i++)
// memcpy(&verts[i * 3], (const u8 *)vdata + stride * i + offset, sizeof(float) * 3);
for (int i = 0; i < vertexCount; i++)
memcpy(&verts[i * 3], (const u8 *)vdata + stride * i + offset, sizeof(float) * 3);
break;
}
}
}
// Pretransform the verts in-place so we don't have to do it inside the loop.
// We do this differently in the fast version below since we skip the max/minOffset checks there
// making it easier to get the whole thing ready for SIMD.
for (int i = 0; i < vertexCount; i++) {
float worldpos[3];
Vec3ByMatrix43(worldpos, &verts[i * 3], gstate.worldMatrix);
memcpy(&verts[i * 3], worldpos, 12);
}
// Note: near/far are not checked without clamp/clip enabled, so we skip those planes.
int totalPlanes = gstate.isDepthClampEnabled() ? 6 : 4;
for (int plane = 0; plane < totalPlanes; plane++) {
@ -358,8 +362,8 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
// Test against the frustum planes, and count.
// TODO: We should test 4 vertices at a time using SIMD.
// I guess could also test one vertex against 4 planes at a time, though a lot of waste at the common case of 6.
const float *pos = verts + i * vertStride;
float value = planes_[plane].Test(pos);
const float *worldpos = verts + i * 3;
float value = planes_.Test(plane, worldpos);
if (value <= -FLT_EPSILON) // Not sure why we use exactly this value. Probably '< 0' would do.
out++;
else
@ -388,6 +392,179 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
return true;
}
// NOTE: This doesn't handle through-mode, indexing, morph, or skinning.
bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u32 vertType) {
SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
float *verts = (float *)(decoded_ + 65536 * 18);
// Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
// Let's always say objects are within bounds.
if (gstate_c.Use(GPU_USE_VIRTUAL_REALITY))
return true;
// Due to world matrix updates per "thing", this isn't quite as effective as it could be if we did world transform
// in here as well. Though, it still does cut down on a lot of updates in Tekken 6.
if (gstate_c.IsDirty(DIRTY_CULL_PLANES)) {
UpdatePlanes();
gpuStats.numPlaneUpdates++;
gstate_c.Clean(DIRTY_CULL_PLANES);
}
// Also let's just bail if offsetOutsideEdge_ is set, instead of handling the cases.
// NOTE: This is written to in UpdatePlanes so can't check it before.
if (offsetOutsideEdge_)
return true;
// Simple, most common case.
VertexDecoder *dec = GetVertexDecoder(vertType);
int stride = dec->VertexSize();
int offset = dec->posoff;
int vertStride = 3;
// TODO: Possibly do the plane tests directly against the source formats instead of converting.
switch (vertType & GE_VTYPE_POS_MASK) {
case GE_VTYPE_POS_8BIT:
for (int i = 0; i < vertexCount; i++) {
const s8 *data = (const s8 *)vdata + i * stride + offset;
for (int j = 0; j < 3; j++) {
verts[i * 3 + j] = data[j] * (1.0f / 128.0f);
}
}
break;
case GE_VTYPE_POS_16BIT:
{
#if PPSSPP_ARCH(SSE2)
__m128 scaleFactor = _mm_set1_ps(1.0f / 32768.0f);
for (int i = 0; i < vertexCount; i++) {
const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
__m128i bits = _mm_castpd_si128(_mm_load_sd((const double *)data));
// Sign extension. Hacky without SSE4.
bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16);
__m128 pos = _mm_mul_ps(_mm_cvtepi32_ps(bits), scaleFactor);
_mm_storeu_ps(verts + i * 3, pos); // TODO: use stride 4 to avoid clashing writes?
}
#elif PPSSPP_ARCH(ARM_NEON)
for (int i = 0; i < vertexCount; i++) {
const s16 *dataPtr = ((const s16 *)((const s8 *)vdata + i * stride + offset));
int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
float32x4_t pos = vcvtq_n_f32_s32(data, 15); // >> 15 = division by 32768.0f
vst1q_f32(verts + i * 3, pos);
}
#else
for (int i = 0; i < vertexCount; i++) {
const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
for (int j = 0; j < 3; j++) {
verts[i * 3 + j] = data[j] * (1.0f / 32768.0f);
}
}
#endif
break;
}
case GE_VTYPE_POS_FLOAT:
// No need to copy in this case, we can just read directly from the source format with a stride.
verts = (float *)((uint8_t *)vdata + offset);
vertStride = stride / 4;
break;
}
// We only check the 4 sides. Near/far won't likely make a huge difference.
// We test one vertex against 4 planes to get some SIMD. Vertices need to be transformed to world space
// for testing, don't want to re-do that, so we have to use that "pivot" of the data.
#if PPSSPP_ARCH(SSE2)
const __m128 worldX = _mm_loadu_ps(gstate.worldMatrix);
const __m128 worldY = _mm_loadu_ps(gstate.worldMatrix + 3);
const __m128 worldZ = _mm_loadu_ps(gstate.worldMatrix + 6);
const __m128 worldW = _mm_loadu_ps(gstate.worldMatrix + 9);
const __m128 planeX = _mm_loadu_ps(planes_.x);
const __m128 planeY = _mm_loadu_ps(planes_.y);
const __m128 planeZ = _mm_loadu_ps(planes_.z);
const __m128 planeW = _mm_loadu_ps(planes_.w);
__m128 inside = _mm_set1_ps(0.0f);
for (int i = 0; i < vertexCount; i++) {
const float *pos = verts + i * vertStride;
__m128 worldpos = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(worldX, _mm_set1_ps(pos[0])),
_mm_mul_ps(worldY, _mm_set1_ps(pos[1]))
),
_mm_add_ps(
_mm_mul_ps(worldZ, _mm_set1_ps(pos[2])),
worldW
)
);
// OK, now we check it against the four planes.
// This is really curiously similar to a matrix multiplication (well, it is one).
__m128 posX = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(0, 0, 0, 0));
__m128 posY = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(1, 1, 1, 1));
__m128 posZ = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(2, 2, 2, 2));
__m128 planeDist = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(planeX, posX),
_mm_mul_ps(planeY, posY)
),
_mm_add_ps(
_mm_mul_ps(planeZ, posZ),
planeW
)
);
inside = _mm_or_ps(inside, _mm_cmpge_ps(planeDist, _mm_setzero_ps()));
}
// 0xF means that we found at least one vertex inside every one of the planes.
// We don't bother with counts, though it wouldn't be hard if we had a use for them.
return _mm_movemask_ps(inside) == 0xF;
#elif PPSSPP_ARCH(ARM_NEON)
const float32x4_t worldX = vld1q_f32(gstate.worldMatrix);
const float32x4_t worldY = vld1q_f32(gstate.worldMatrix + 3);
const float32x4_t worldZ = vld1q_f32(gstate.worldMatrix + 6);
const float32x4_t worldW = vld1q_f32(gstate.worldMatrix + 9);
const float32x4_t planeX = vld1q_f32(planes_.x);
const float32x4_t planeY = vld1q_f32(planes_.y);
const float32x4_t planeZ = vld1q_f32(planes_.z);
const float32x4_t planeW = vld1q_f32(planes_.w);
uint32x4_t inside = vdupq_n_u32(0);
for (int i = 0; i < vertexCount; i++) {
const float *pos = verts + i * vertStride;
float32x4_t objpos = vld1q_f32(pos);
float32x4_t worldpos = vaddq_f32(
vmlaq_laneq_f32(
vmulq_laneq_f32(worldX, objpos, 0),
worldY, objpos, 1),
vmlaq_laneq_f32(worldW, worldZ, objpos, 2)
);
// OK, now we check it against the four planes.
// This is really curiously similar to a matrix multiplication (well, it is one).
float32x4_t planeDist = vaddq_f32(
vmlaq_laneq_f32(
vmulq_laneq_f32(planeX, worldpos, 0),
planeY, worldpos, 1),
vmlaq_laneq_f32(planeW, planeZ, worldpos, 2)
);
inside = vorrq_u32(inside, vcgezq_f32(planeDist));
}
uint64_t insideBits = vget_lane_u64(vreinterpret_u64_u16(vmovn_u32(inside)), 0);
return ~insideBits == 0; // InsideBits all ones means that we found at least one vertex inside every one of the planes. We don't bother with counts, though it wouldn't be hard.
#else
int inside[4]{};
for (int i = 0; i < vertexCount; i++) {
const float *pos = verts + i * vertStride;
float worldpos[3];
Vec3ByMatrix43(worldpos, pos, gstate.worldMatrix);
for (int plane = 0; plane < 4; plane++) {
float value = planes_.Test(plane, worldpos);
if (value >= 0.0f)
inside[plane]++;
}
}
for (int plane = 0; plane < 4; plane++) {
if (inside[plane] == 0) {
return false;
}
}
#endif
return true;
}
// TODO: This probably is not the best interface.
bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices) {
// This is always for the current vertices.
@ -670,6 +847,31 @@ int DrawEngineCommon::ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *
return cmd - start;
}
void DrawEngineCommon::SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) {
if (!indexGen.PrimCompatible(prevPrim_, prim)) {
DispatchFlush();
}
// This isn't exactly right, if we flushed, since prims can straddle previous calls.
// But it generally works for common usage.
if (prim == GE_PRIM_KEEP_PREVIOUS) {
// Has to be set to something, let's assume POINTS (0) if no previous.
if (prevPrim_ == GE_PRIM_INVALID)
prevPrim_ = GE_PRIM_POINTS;
prim = prevPrim_;
} else {
prevPrim_ = prim;
}
// If vtype has changed, setup the vertex decoder.
if (vertTypeID != lastVType_ || !dec_) {
dec_ = GetVertexDecoder(vertTypeID);
lastVType_ = vertTypeID;
}
*bytesRead = vertexCount * dec_->VertexSize();
}
// vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
bool DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead) {
if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {

View File

@ -69,11 +69,11 @@ public:
virtual void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) = 0;
};
// Culling plane.
struct Plane {
float x, y, z, w;
void Set(float _x, float _y, float _z, float _w) { x = _x; y = _y; z = _z; w = _w; }
float Test(const float f[3]) const { return x * f[0] + y * f[1] + z * f[2] + w; }
// Culling plane, group of 8.
struct alignas(16) Plane8 {
float x[8], y[8], z[8], w[8];
void Set(int i, float _x, float _y, float _z, float _w) { x[i] = _x; y[i] = _y; z[i] = _z; w[i] = _w; }
float Test(int i, const float f[3]) const { return x[i] * f[0] + y[i] * f[1] + z[i] * f[2] + w[i]; }
};
class DrawEngineCommon {
@ -104,6 +104,10 @@ public:
bool TestBoundingBox(const void *control_points, const void *inds, int vertexCount, u32 vertType);
// This is a less accurate version of TestBoundingBox, but faster. Can have more false positives.
// Doesn't support indexing.
bool TestBoundingBoxFast(const void *control_points, int vertexCount, u32 vertType);
void FlushSkin() {
bool applySkin = (lastVType_ & GE_VTYPE_WEIGHT_MASK) && decOptions_.applySkinInDecode;
if (applySkin) {
@ -113,6 +117,8 @@ public:
int ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *stall, u32 vertTypeID, bool clockwise, int *bytesRead, bool isTriangle);
bool SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead);
void SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead);
template<class Surface>
void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope);
void ClearSplineBezierWeights();
@ -287,7 +293,8 @@ protected:
TessellationDataTransfer *tessDataTransfer;
// Culling
Plane planes_[6];
Plane8 planes_;
Vec2f minOffset_;
Vec2f maxOffset_;
bool offsetOutsideEdge_;
};

View File

@ -76,6 +76,7 @@ struct GPUStatistics {
void ResetFrame() {
numDrawCalls = 0;
numVertexDecodes = 0;
numCulledDraws = 0;
numDrawSyncs = 0;
numListSyncs = 0;
numVertsSubmitted = 0;
@ -111,6 +112,7 @@ struct GPUStatistics {
// Per frame statistics
int numDrawCalls;
int numVertexDecodes;
int numCulledDraws;
int numDrawSyncs;
int numListSyncs;
int numFlushes;

View File

@ -989,9 +989,45 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
int cullMode = gstate.getCullMode();
uint32_t vertTypeID = GetVertTypeID(vertexType, gstate.getUVGenMode(), g_Config.bSoftwareSkinning);
if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
#define MAX_CULL_CHECK_COUNT 6
// For now, turn off culling on platforms where we don't have SIMD bounding box tests, like RISC-V.
#if PPSSPP_ARCH(ARM_NEON) || PPSSPP_ARCH(SSE2)
#define PASSES_CULLING ((vertexType & (GE_VTYPE_THROUGH_MASK | GE_VTYPE_MORPHCOUNT_MASK | GE_VTYPE_WEIGHT_MASK | GE_VTYPE_IDX_MASK)) || count > MAX_CULL_CHECK_COUNT)
#else
#define PASSES_CULLING true
#endif
// If certain conditions are true, do frustum culling.
bool passCulling = PASSES_CULLING;
if (!passCulling) {
// Do software culling.
if (drawEngineCommon_->TestBoundingBoxFast(verts, count, vertexType)) {
passCulling = true;
} else {
gpuStats.numCulledDraws++;
}
}
// If the first one in a batch passes, let's assume the whole batch passes.
// Cuts down on checking, while not losing that much efficiency.
bool onePassed = false;
if (passCulling) {
if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
canExtend = false;
}
onePassed = true;
} else {
// Still need to advance bytesRead.
drawEngineCommon_->SkipPrim(prim, count, vertTypeID, &bytesRead);
canExtend = false;
}
// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
// Some games rely on this, they don't bother reloading VADDR and IADDR.
// The VADDR/IADDR registers are NOT updated.
@ -1027,7 +1063,7 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
bool clockwise = !gstate.isCullEnabled() || gstate.getCullMode() == cullMode;
if (canExtend) {
// Non-indexed draws can be cheaply merged if vertexAddr hasn't changed, that means the vertices
// are consecutive in memory.
// are consecutive in memory. We also ignore culling here.
_dbg_assert_((vertexType & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_NONE);
int commandsExecuted = drawEngineCommon_->ExtendNonIndexedPrim(src, stall, vertTypeID, clockwise, &bytesRead, isTriangle);
if (!commandsExecuted) {
@ -1047,7 +1083,25 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
// We can extend again after submitting a normal draw.
canExtend = isTriangle;
}
if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
bool passCulling = onePassed || PASSES_CULLING;
if (!passCulling) {
// Do software culling.
if (drawEngineCommon_->TestBoundingBox(verts, inds, count, vertexType)) {
passCulling = true;
} else {
gpuStats.numCulledDraws++;
}
}
if (passCulling) {
if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
canExtend = false;
}
// As soon as one passes, assume we don't need to check the rest of this batch.
onePassed = true;
} else {
// Still need to advance bytesRead.
drawEngineCommon_->SkipPrim(newPrim, count, vertTypeID, &bytesRead);
canExtend = false;
}
AdvanceVerts(vertexType, count, bytesRead);
@ -1412,7 +1466,7 @@ void GPUCommonHW::Execute_WorldMtxNum(u32 op, u32 diff) {
if (dst[i] != newVal) {
Flush();
dst[i] = newVal;
gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES);
gstate_c.Dirty(DIRTY_WORLDMATRIX);
}
if (++i >= end) {
break;
@ -1435,7 +1489,7 @@ void GPUCommonHW::Execute_WorldMtxData(u32 op, u32 diff) {
if (num < 12 && newVal != ((const u32 *)gstate.worldMatrix)[num]) {
Flush();
((u32 *)gstate.worldMatrix)[num] = newVal;
gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES);
gstate_c.Dirty(DIRTY_WORLDMATRIX);
}
num++;
gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
@ -1691,7 +1745,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
return snprintf(buffer, size,
"DL processing time: %0.2f ms, %d drawsync, %d listsync\n"
"Draw: %d (%d dec), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
"Draw: %d (%d dec, %d culled), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
"Vertices: %d drawn: %d\n"
"FBOs active: %d (evaluations: %d)\n"
"Textures: %d, dec: %d, invalidated: %d, hashed: %d kB\n"
@ -1705,6 +1759,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
gpuStats.numListSyncs,
gpuStats.numDrawCalls,
gpuStats.numVertexDecodes,
gpuStats.numCulledDraws,
gpuStats.numFlushes,
gpuStats.numClears,
gpuStats.numBBOXJumps,

View File

@ -105,6 +105,7 @@
<ClInclude Include="..\..\Common\File\AndroidStorage.h" />
<ClInclude Include="..\..\Common\GPU\GPUBackendCommon.h" />
<ClInclude Include="..\..\Common\GPU\Vulkan\VulkanLoader.h" />
<ClInclude Include="..\..\Common\Math\CrossSIMD.h" />
<ClInclude Include="..\..\Common\Math\Statistics.h" />
<ClInclude Include="..\..\Common\Net\HTTPNaettRequest.h" />
<ClInclude Include="..\..\Common\Net\HTTPRequest.h" />

View File

@ -862,6 +862,9 @@
<ClInclude Include="..\..\ext\naett\naett.h">
<Filter>ext\naett</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Math\CrossSIMD.h">
<Filter>Math</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="..\..\Common\Math\fast\fast_matrix_neon.S">

View File

@ -11,6 +11,7 @@
#if defined(_M_IX86) || defined(__i386__) || defined (__EMSCRIPTEN__)
#define PPSSPP_ARCH_X86 1
#define PPSSPP_ARCH_32BIT 1
#define PPSSPP_ARCH_SSE2 1
//TODO: Remove this compat define
#ifndef _M_IX86
#define _M_IX86 600
@ -19,6 +20,7 @@
#if (defined(_M_X64) || defined(__amd64__) || defined(__x86_64__)) && !defined(__EMSCRIPTEN__)
#define PPSSPP_ARCH_AMD64 1
#define PPSSPP_ARCH_SSE2 1
#if defined(__ILP32__)
#define PPSSPP_ARCH_32BIT 1
#else