Merge pull request #17808 from hrydgard/frustum-cull-small-draws

Frustum-cull small draws (experiment)
2024-11-22 21:09:52 +00:00 · 2023-12-09 17:23:56 +01:00 · 2023-12-09 17:23:56 +01:00 · 27e47d9899
commit 27e47d9899
parent 7f65db909f 7e85d3d10a
13 changed files with 371 additions and 37 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -737,6 +737,7 @@ add_library(Common STATIC
 	Common/Input/InputState.cpp
 	Common/Input/InputState.h
 	Common/Math/fast/fast_matrix.c
+	Common/Math/CrossSIMD.h
 	Common/Math/curves.cpp
 	Common/Math/curves.h
 	Common/Math/expression_parser.cpp
--- a/Common/Common.vcxproj
+++ b/Common/Common.vcxproj
@ -484,6 +484,7 @@
    <ClInclude Include="Input\GestureDetector.h" />
    <ClInclude Include="Input\InputState.h" />
    <ClInclude Include="Input\KeyCodes.h" />
+    <ClInclude Include="Math\CrossSIMD.h" />
    <ClInclude Include="Math\curves.h" />
    <ClInclude Include="Math\expression_parser.h" />
    <ClInclude Include="Math\fast\fast_matrix.h" />
--- a/Common/Common.vcxproj.filters
+++ b/Common/Common.vcxproj.filters
@ -518,6 +518,9 @@
    <ClInclude Include="GPU\Vulkan\VulkanDescSet.h">
      <Filter>GPU\Vulkan</Filter>
    </ClInclude>
+    <ClInclude Include="Math\CrossSIMD.h">
+      <Filter>Math</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="ABI.cpp" />
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@ -0,0 +1,58 @@
+// CrossSIMD
+//
+// Compatibility wrappers for SIMD dialects.
+//
+// In the long run, might do a more general single-source-SIMD wrapper here consisting
+// of defines that translate to either NEON or SSE. It would be possible to write quite a lot of
+// our various color conversion functions and so on in a pretty generic manner.
+
+#include "ppsspp_config.h"
+
+#include <cstdint>
+
+#if PPSSPP_ARCH(SSE2)
+#include <emmintrin.h>
+#endif
+
+#if PPSSPP_ARCH(ARM_NEON)
+#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+// Basic types
+
+#if PPSSPP_ARCH(ARM64_NEON)
+
+// No special ones here.
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+// Compatibility wrappers making ARM64 NEON code run on ARM32
+// With optimization on, these should compile down to the optimal code.
+
+inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
+	switch (lane & 3) {
+	case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
+	case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
+	case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
+	default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
+	}
+}
+
+inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
+	switch (lane & 3) {
+	case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
+	case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
+	case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
+	default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
+	}
+}
+
+inline uint32x4_t vcgezq_f32(float32x4_t v) {
+	return vcgeq_f32(v, vdupq_n_f32(0.0f));
+}
+
+#endif
--- a/Common/UI/View.cpp
+++ b/Common/UI/View.cpp
@ -620,7 +620,6 @@ CollapsibleHeader::CollapsibleHeader(bool *toggle, const std::string &text, Layo

 void CollapsibleHeader::Draw(UIContext &dc) {
 	Style style = dc.theme->itemStyle;
-	style.background.color = 0;
 	if (HasFocus()) style = dc.theme->itemFocusedStyle;
 	if (down_) style = dc.theme->itemDownStyle;
 	if (!IsEnabled()) style = dc.theme->itemDisabledStyle;
--- a/Core/Config.cpp
+++ b/Core/Config.cpp
@ -1893,7 +1893,7 @@ void PlayTimeTracker::Load(const Section *section) {

 		// Parse the string.
 		PlayTime gameTime{};
-		if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, &gameTime.lastTimePlayed)) {
+		if (2 == sscanf(value.c_str(), "%d,%llu", &gameTime.totalTimePlayed, (long long *)&gameTime.lastTimePlayed)) {
 			tracker_[key] = gameTime;
 		}
 	}
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@ -21,6 +21,7 @@
 #include "Common/Data/Convert/ColorConv.h"
 #include "Common/Profiler/Profiler.h"
 #include "Common/LogReporting.h"
+#include "Common/Math/CrossSIMD.h"
 #include "Common/Math/lin/matrix4x4.h"
 #include "Core/Config.h"
 #include "GPU/Common/DrawEngineCommon.h"
@ -197,15 +198,10 @@ void DrawEngineCommon::DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex

 // Gated by DIRTY_CULL_PLANES
 void DrawEngineCommon::UpdatePlanes() {
-	float world[16];
 	float view[16];
-	float worldview[16];
-	float worldviewproj[16];
-	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
+	float viewproj[16];
 	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
-	// TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3?
-	Matrix4ByMatrix4(worldview, world, view);
-	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
+	Matrix4ByMatrix4(viewproj, view, gstate.projMatrix);

 	// Next, we need to apply viewport, scissor, region, and even offset - but only for X/Y.
 	// Note that the PSP does not clip against the viewport.
@ -214,6 +210,9 @@ void DrawEngineCommon::UpdatePlanes() {
 	minOffset_ = baseOffset + Vec2f(std::max(gstate.getRegionRateX() - 0x100, gstate.getScissorX1()), std::max(gstate.getRegionRateY() - 0x100, gstate.getScissorY1())) - Vec2f(1.0f, 1.0f);
 	maxOffset_ = baseOffset + Vec2f(std::min(gstate.getRegionX2(), gstate.getScissorX2()), std::min(gstate.getRegionY2(), gstate.getScissorY2())) + Vec2f(1.0f, 1.0f);

+	// Let's not handle these special cases in the fast culler.
+	offsetOutsideEdge_ = maxOffset_.x >= 4096.0f || minOffset_.x < 1.0f || minOffset_.y < 1.0f || maxOffset_.y >= 4096.0f;
+
 	// Now let's apply the viewport to our scissor/region + offset range.
 	Vec2f inverseViewportScale = Vec2f(1.0f / gstate.getViewportXScale(), 1.0f / gstate.getViewportYScale());
 	Vec2f minViewport = (minOffset_ - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale;
@ -232,14 +231,14 @@ void DrawEngineCommon::UpdatePlanes() {
 	applyViewport.wy = -(maxViewport.y + minViewport.y) * viewportInvSize.y;

 	float mtx[16];
-	Matrix4ByMatrix4(mtx, worldviewproj, applyViewport.m);
-
-	planes_[0].Set(mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]);  // Right
-	planes_[1].Set(mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]);  // Left
-	planes_[2].Set(mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]);  // Bottom
-	planes_[3].Set(mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]);  // Top
-	planes_[4].Set(mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near
-	planes_[5].Set(mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far
+	Matrix4ByMatrix4(mtx, viewproj, applyViewport.m);
+	// I'm sure there's some fairly optimized way to set these.
+	planes_.Set(0, mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8],  mtx[15] - mtx[12]);  // Right
+	planes_.Set(1, mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8],  mtx[15] + mtx[12]);  // Left
+	planes_.Set(2, mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9],  mtx[15] + mtx[13]);  // Bottom
+	planes_.Set(3, mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9],  mtx[15] - mtx[13]);  // Top
+	planes_.Set(4, mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]);  // Near
+	planes_.Set(5, mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]);  // Far
 }

 // This code has plenty of potential for optimization.
@ -262,7 +261,6 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int

 	SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
 	float *verts = (float *)(decoded_ + 65536 * 18);
-	int vertStride = 3;

 	// Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
 	// Let's always say objects are within bounds.
@ -338,17 +336,23 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
 				}
 				break;
 			case GE_VTYPE_POS_FLOAT:
-				// No need to copy in this case, we can just read directly from the source format with a stride.
-				verts = (float *)((uint8_t *)vdata + offset);
-				vertStride = stride / 4;
 				// Previous code:
-				// for (int i = 0; i < vertexCount; i++)
-				//   memcpy(&verts[i * 3], (const u8 *)vdata + stride * i + offset, sizeof(float) * 3);
+				for (int i = 0; i < vertexCount; i++)
+					memcpy(&verts[i * 3], (const u8 *)vdata + stride * i + offset, sizeof(float) * 3);
 				break;
 			}
 		}
 	}

+	// Pretransform the verts in-place so we don't have to do it inside the loop.
+	// We do this differently in the fast version below since we skip the max/minOffset checks there
+	// making it easier to get the whole thing ready for SIMD.
+	for (int i = 0; i < vertexCount; i++) {
+		float worldpos[3];
+		Vec3ByMatrix43(worldpos, &verts[i * 3], gstate.worldMatrix);
+		memcpy(&verts[i * 3], worldpos, 12);
+	}
+
 	// Note: near/far are not checked without clamp/clip enabled, so we skip those planes.
 	int totalPlanes = gstate.isDepthClampEnabled() ? 6 : 4;
 	for (int plane = 0; plane < totalPlanes; plane++) {
@ -358,8 +362,8 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
 			// Test against the frustum planes, and count.
 			// TODO: We should test 4 vertices at a time using SIMD.
 			// I guess could also test one vertex against 4 planes at a time, though a lot of waste at the common case of 6.
-			const float *pos = verts + i * vertStride;
-			float value = planes_[plane].Test(pos);
+			const float *worldpos = verts + i * 3;
+			float value = planes_.Test(plane, worldpos);
 			if (value <= -FLT_EPSILON)  // Not sure why we use exactly this value. Probably '< 0' would do.
 				out++;
 			else
@ -388,6 +392,179 @@ bool DrawEngineCommon::TestBoundingBox(const void *vdata, const void *inds, int
 	return true;
 }

+// NOTE: This doesn't handle through-mode, indexing, morph, or skinning.
+bool DrawEngineCommon::TestBoundingBoxFast(const void *vdata, int vertexCount, u32 vertType) {
+	SimpleVertex *corners = (SimpleVertex *)(decoded_ + 65536 * 12);
+	float *verts = (float *)(decoded_ + 65536 * 18);
+
+	// Although this may lead to drawing that shouldn't happen, the viewport is more complex on VR.
+	// Let's always say objects are within bounds.
+	if (gstate_c.Use(GPU_USE_VIRTUAL_REALITY))
+		return true;
+
+	// Due to world matrix updates per "thing", this isn't quite as effective as it could be if we did world transform
+	// in here as well. Though, it still does cut down on a lot of updates in Tekken 6.
+	if (gstate_c.IsDirty(DIRTY_CULL_PLANES)) {
+		UpdatePlanes();
+		gpuStats.numPlaneUpdates++;
+		gstate_c.Clean(DIRTY_CULL_PLANES);
+	}
+
+	// Also let's just bail if offsetOutsideEdge_ is set, instead of handling the cases.
+	// NOTE: This is written to in UpdatePlanes so can't check it before.
+	if (offsetOutsideEdge_)
+		return true;
+
+	// Simple, most common case.
+	VertexDecoder *dec = GetVertexDecoder(vertType);
+	int stride = dec->VertexSize();
+	int offset = dec->posoff;
+	int vertStride = 3;
+
+	// TODO: Possibly do the plane tests directly against the source formats instead of converting.
+	switch (vertType & GE_VTYPE_POS_MASK) {
+	case GE_VTYPE_POS_8BIT:
+		for (int i = 0; i < vertexCount; i++) {
+			const s8 *data = (const s8 *)vdata + i * stride + offset;
+			for (int j = 0; j < 3; j++) {
+				verts[i * 3 + j] = data[j] * (1.0f / 128.0f);
+			}
+		}
+		break;
+	case GE_VTYPE_POS_16BIT:
+	{
+#if PPSSPP_ARCH(SSE2)
+		__m128 scaleFactor = _mm_set1_ps(1.0f / 32768.0f);
+		for (int i = 0; i < vertexCount; i++) {
+			const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
+			__m128i bits = _mm_castpd_si128(_mm_load_sd((const double *)data));
+			// Sign extension. Hacky without SSE4.
+			bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16);
+			__m128 pos = _mm_mul_ps(_mm_cvtepi32_ps(bits), scaleFactor);
+			_mm_storeu_ps(verts + i * 3, pos);  // TODO: use stride 4 to avoid clashing writes?
+		}
+#elif PPSSPP_ARCH(ARM_NEON)
+		for (int i = 0; i < vertexCount; i++) {
+			const s16 *dataPtr = ((const s16 *)((const s8 *)vdata + i * stride + offset));
+			int32x4_t data = vmovl_s16(vld1_s16(dataPtr));
+			float32x4_t pos = vcvtq_n_f32_s32(data, 15);  // >> 15 = division by 32768.0f
+			vst1q_f32(verts + i * 3, pos);
+		}
+#else
+		for (int i = 0; i < vertexCount; i++) {
+			const s16 *data = ((const s16 *)((const s8 *)vdata + i * stride + offset));
+			for (int j = 0; j < 3; j++) {
+				verts[i * 3 + j] = data[j] * (1.0f / 32768.0f);
+			}
+		}
+#endif
+		break;
+	}
+	case GE_VTYPE_POS_FLOAT:
+		// No need to copy in this case, we can just read directly from the source format with a stride.
+		verts = (float *)((uint8_t *)vdata + offset);
+		vertStride = stride / 4;
+		break;
+	}
+
+	// We only check the 4 sides. Near/far won't likely make a huge difference.
+	// We test one vertex against 4 planes to get some SIMD. Vertices need to be transformed to world space
+	// for testing, don't want to re-do that, so we have to use that "pivot" of the data.
+#if PPSSPP_ARCH(SSE2)
+	const __m128 worldX = _mm_loadu_ps(gstate.worldMatrix);
+	const __m128 worldY = _mm_loadu_ps(gstate.worldMatrix + 3);
+	const __m128 worldZ = _mm_loadu_ps(gstate.worldMatrix + 6);
+	const __m128 worldW = _mm_loadu_ps(gstate.worldMatrix + 9);
+	const __m128 planeX = _mm_loadu_ps(planes_.x);
+	const __m128 planeY = _mm_loadu_ps(planes_.y);
+	const __m128 planeZ = _mm_loadu_ps(planes_.z);
+	const __m128 planeW = _mm_loadu_ps(planes_.w);
+	__m128 inside = _mm_set1_ps(0.0f);
+	for (int i = 0; i < vertexCount; i++) {
+		const float *pos = verts + i * vertStride;
+		__m128 worldpos = _mm_add_ps(
+			_mm_add_ps(
+				_mm_mul_ps(worldX, _mm_set1_ps(pos[0])),
+				_mm_mul_ps(worldY, _mm_set1_ps(pos[1]))
+			),
+			_mm_add_ps(
+				_mm_mul_ps(worldZ, _mm_set1_ps(pos[2])),
+				worldW
+			)
+		);
+		// OK, now we check it against the four planes.
+		// This is really curiously similar to a matrix multiplication (well, it is one).
+		__m128 posX = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(0, 0, 0, 0));
+		__m128 posY = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(1, 1, 1, 1));
+		__m128 posZ = _mm_shuffle_ps(worldpos, worldpos, _MM_SHUFFLE(2, 2, 2, 2));
+		__m128 planeDist = _mm_add_ps(
+			_mm_add_ps(
+				_mm_mul_ps(planeX, posX),
+				_mm_mul_ps(planeY, posY)
+			),
+			_mm_add_ps(
+				_mm_mul_ps(planeZ, posZ),
+				planeW
+			)
+		);
+		inside = _mm_or_ps(inside, _mm_cmpge_ps(planeDist, _mm_setzero_ps()));
+	}
+	// 0xF means that we found at least one vertex inside every one of the planes.
+	// We don't bother with counts, though it wouldn't be hard if we had a use for them.
+	return _mm_movemask_ps(inside) == 0xF;
+#elif PPSSPP_ARCH(ARM_NEON)
+	const float32x4_t worldX = vld1q_f32(gstate.worldMatrix);
+	const float32x4_t worldY = vld1q_f32(gstate.worldMatrix + 3);
+	const float32x4_t worldZ = vld1q_f32(gstate.worldMatrix + 6);
+	const float32x4_t worldW = vld1q_f32(gstate.worldMatrix + 9);
+	const float32x4_t planeX = vld1q_f32(planes_.x);
+	const float32x4_t planeY = vld1q_f32(planes_.y);
+	const float32x4_t planeZ = vld1q_f32(planes_.z);
+	const float32x4_t planeW = vld1q_f32(planes_.w);
+	uint32x4_t inside = vdupq_n_u32(0);
+	for (int i = 0; i < vertexCount; i++) {
+		const float *pos = verts + i * vertStride;
+		float32x4_t objpos = vld1q_f32(pos);
+		float32x4_t worldpos = vaddq_f32(
+			vmlaq_laneq_f32(
+				vmulq_laneq_f32(worldX, objpos, 0),
+				worldY, objpos, 1),
+			vmlaq_laneq_f32(worldW, worldZ, objpos, 2)
+		);
+		// OK, now we check it against the four planes.
+		// This is really curiously similar to a matrix multiplication (well, it is one).
+		float32x4_t planeDist = vaddq_f32(
+			vmlaq_laneq_f32(
+				vmulq_laneq_f32(planeX, worldpos, 0),
+				planeY, worldpos, 1),
+			vmlaq_laneq_f32(planeW, planeZ, worldpos, 2)
+		);
+		inside = vorrq_u32(inside, vcgezq_f32(planeDist));
+	}
+	uint64_t insideBits = vget_lane_u64(vreinterpret_u64_u16(vmovn_u32(inside)), 0);
+	return ~insideBits == 0;  // InsideBits all ones means that we found at least one vertex inside every one of the planes. We don't bother with counts, though it wouldn't be hard.
+#else
+	int inside[4]{};
+	for (int i = 0; i < vertexCount; i++) {
+		const float *pos = verts + i * vertStride;
+		float worldpos[3];
+		Vec3ByMatrix43(worldpos, pos, gstate.worldMatrix);
+		for (int plane = 0; plane < 4; plane++) {
+			float value = planes_.Test(plane, worldpos);
+			if (value >= 0.0f)
+				inside[plane]++;
+		}
+	}
+
+	for (int plane = 0; plane < 4; plane++) {
+		if (inside[plane] == 0) {
+			return false;
+		}
+	}
+#endif
+	return true;
+}
+
 // TODO: This probably is not the best interface.
 bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector<GPUDebugVertex> &vertices, std::vector<u16> &indices) {
 	// This is always for the current vertices.
@ -670,6 +847,31 @@ int DrawEngineCommon::ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *
 	return cmd - start;
 }

+void DrawEngineCommon::SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead) {
+	if (!indexGen.PrimCompatible(prevPrim_, prim)) {
+		DispatchFlush();
+	}
+
+	// This isn't exactly right, if we flushed, since prims can straddle previous calls.
+	// But it generally works for common usage.
+	if (prim == GE_PRIM_KEEP_PREVIOUS) {
+		// Has to be set to something, let's assume POINTS (0) if no previous.
+		if (prevPrim_ == GE_PRIM_INVALID)
+			prevPrim_ = GE_PRIM_POINTS;
+		prim = prevPrim_;
+	} else {
+		prevPrim_ = prim;
+	}
+
+	// If vtype has changed, setup the vertex decoder.
+	if (vertTypeID != lastVType_ || !dec_) {
+		dec_ = GetVertexDecoder(vertTypeID);
+		lastVType_ = vertTypeID;
+	}
+
+	*bytesRead = vertexCount * dec_->VertexSize();
+}
+
 // vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
 bool DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead) {
 	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawVerts_ >= MAX_DEFERRED_DRAW_VERTS || numDrawInds_ >= MAX_DEFERRED_DRAW_INDS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@ -69,11 +69,11 @@ public:
 	virtual void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) = 0;
 };

-// Culling plane.
-struct Plane {
-	float x, y, z, w;
-	void Set(float _x, float _y, float _z, float _w) { x = _x; y = _y; z = _z; w = _w; }
-	float Test(const float f[3]) const { return x * f[0] + y * f[1] + z * f[2] + w; }
+// Culling plane, group of 8.
+struct alignas(16) Plane8 {
+	float x[8], y[8], z[8], w[8];
+	void Set(int i, float _x, float _y, float _z, float _w) { x[i] = _x; y[i] = _y; z[i] = _z; w[i] = _w; }
+	float Test(int i, const float f[3]) const { return x[i] * f[0] + y[i] * f[1] + z[i] * f[2] + w[i]; }
 };

 class DrawEngineCommon {
@ -104,6 +104,10 @@ public:

 	bool TestBoundingBox(const void *control_points, const void *inds, int vertexCount, u32 vertType);

+	// This is a less accurate version of TestBoundingBox, but faster. Can have more false positives.
+	// Doesn't support indexing.
+	bool TestBoundingBoxFast(const void *control_points, int vertexCount, u32 vertType);
+
 	void FlushSkin() {
 		bool applySkin = (lastVType_ & GE_VTYPE_WEIGHT_MASK) && decOptions_.applySkinInDecode;
 		if (applySkin) {
@ -113,6 +117,8 @@ public:

 	int ExtendNonIndexedPrim(const uint32_t *cmd, const uint32_t *stall, u32 vertTypeID, bool clockwise, int *bytesRead, bool isTriangle);
 	bool SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, bool clockwise, int *bytesRead);
+	void SkipPrim(GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int *bytesRead);
+
 	template<class Surface>
 	void SubmitCurve(const void *control_points, const void *indices, Surface &surface, u32 vertType, int *bytesRead, const char *scope);
 	void ClearSplineBezierWeights();
@ -287,7 +293,8 @@ protected:
 	TessellationDataTransfer *tessDataTransfer;

 	// Culling
-	Plane planes_[6];
+	Plane8 planes_;
 	Vec2f minOffset_;
 	Vec2f maxOffset_;
+	bool offsetOutsideEdge_;
 };
--- a/GPU/GPU.h
+++ b/GPU/GPU.h
@ -76,6 +76,7 @@ struct GPUStatistics {
 	void ResetFrame() {
 		numDrawCalls = 0;
 		numVertexDecodes = 0;
+		numCulledDraws = 0;
 		numDrawSyncs = 0;
 		numListSyncs = 0;
 		numVertsSubmitted = 0;
@ -111,6 +112,7 @@ struct GPUStatistics {
 	// Per frame statistics
 	int numDrawCalls;
 	int numVertexDecodes;
+	int numCulledDraws;
 	int numDrawSyncs;
 	int numListSyncs;
 	int numFlushes;
--- a/GPU/GPUCommonHW.cpp
+++ b/GPU/GPUCommonHW.cpp
@ -989,9 +989,45 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 	int cullMode = gstate.getCullMode();

 	uint32_t vertTypeID = GetVertTypeID(vertexType, gstate.getUVGenMode(), g_Config.bSoftwareSkinning);
-	if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
+
+#define MAX_CULL_CHECK_COUNT 6
+
+// For now, turn off culling on platforms where we don't have SIMD bounding box tests, like RISC-V.
+#if PPSSPP_ARCH(ARM_NEON) || PPSSPP_ARCH(SSE2)
+
+#define PASSES_CULLING ((vertexType & (GE_VTYPE_THROUGH_MASK | GE_VTYPE_MORPHCOUNT_MASK | GE_VTYPE_WEIGHT_MASK | GE_VTYPE_IDX_MASK)) || count > MAX_CULL_CHECK_COUNT)
+
+#else
+
+#define PASSES_CULLING true
+
+#endif
+
+	// If certain conditions are true, do frustum culling.
+	bool passCulling = PASSES_CULLING;
+	if (!passCulling) {
+		// Do software culling.
+		if (drawEngineCommon_->TestBoundingBoxFast(verts, count, vertexType)) {
+			passCulling = true;
+		} else {
+			gpuStats.numCulledDraws++;
+		}
+	}
+
+	// If the first one in a batch passes, let's assume the whole batch passes.
+	// Cuts down on checking, while not losing that much efficiency.
+	bool onePassed = false;
+	if (passCulling) {
+		if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertTypeID, true, &bytesRead)) {
+			canExtend = false;
+		}
+		onePassed = true;
+	} else {
+		// Still need to advance bytesRead.
+		drawEngineCommon_->SkipPrim(prim, count, vertTypeID, &bytesRead);
 		canExtend = false;
 	}
+
 	// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
 	// Some games rely on this, they don't bother reloading VADDR and IADDR.
 	// The VADDR/IADDR registers are NOT updated.
@ -1027,7 +1063,7 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 			bool clockwise = !gstate.isCullEnabled() || gstate.getCullMode() == cullMode;
 			if (canExtend) {
 				// Non-indexed draws can be cheaply merged if vertexAddr hasn't changed, that means the vertices
-				// are consecutive in memory.
+				// are consecutive in memory. We also ignore culling here.
 				_dbg_assert_((vertexType & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_NONE);
 				int commandsExecuted = drawEngineCommon_->ExtendNonIndexedPrim(src, stall, vertTypeID, clockwise, &bytesRead, isTriangle);
 				if (!commandsExecuted) {
@ -1047,7 +1083,25 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 				// We can extend again after submitting a normal draw.
 				canExtend = isTriangle;
 			}
-			if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
+
+			bool passCulling = onePassed || PASSES_CULLING;
+			if (!passCulling) {
+				// Do software culling.
+				if (drawEngineCommon_->TestBoundingBox(verts, inds, count, vertexType)) {
+					passCulling = true;
+				} else {
+					gpuStats.numCulledDraws++;
+				}
+			}
+			if (passCulling) {
+				if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, vertTypeID, clockwise, &bytesRead)) {
+					canExtend = false;
+				}
+				// As soon as one passes, assume we don't need to check the rest of this batch.
+				onePassed = true;
+			} else {
+				// Still need to advance bytesRead.
+				drawEngineCommon_->SkipPrim(newPrim, count, vertTypeID, &bytesRead);
 				canExtend = false;
 			}
 			AdvanceVerts(vertexType, count, bytesRead);
@ -1412,7 +1466,7 @@ void GPUCommonHW::Execute_WorldMtxNum(u32 op, u32 diff) {
 			if (dst[i] != newVal) {
 				Flush();
 				dst[i] = newVal;
-				gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES);
+				gstate_c.Dirty(DIRTY_WORLDMATRIX);
 			}
 			if (++i >= end) {
 				break;
@ -1435,7 +1489,7 @@ void GPUCommonHW::Execute_WorldMtxData(u32 op, u32 diff) {
 	if (num < 12 && newVal != ((const u32 *)gstate.worldMatrix)[num]) {
 		Flush();
 		((u32 *)gstate.worldMatrix)[num] = newVal;
-		gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES);
+		gstate_c.Dirty(DIRTY_WORLDMATRIX);
 	}
 	num++;
 	gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (num & 0x00FFFFFF);
@ -1691,7 +1745,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
 	float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
 	return snprintf(buffer, size,
 		"DL processing time: %0.2f ms, %d drawsync, %d listsync\n"
-		"Draw: %d (%d dec), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
+		"Draw: %d (%d dec, %d culled), flushes %d, clears %d, bbox jumps %d (%d updates)\n"
 		"Vertices: %d drawn: %d\n"
 		"FBOs active: %d (evaluations: %d)\n"
 		"Textures: %d, dec: %d, invalidated: %d, hashed: %d kB\n"
@ -1705,6 +1759,7 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) {
 		gpuStats.numListSyncs,
 		gpuStats.numDrawCalls,
 		gpuStats.numVertexDecodes,
+		gpuStats.numCulledDraws,
 		gpuStats.numFlushes,
 		gpuStats.numClears,
 		gpuStats.numBBOXJumps,
--- a/UWP/CommonUWP/CommonUWP.vcxproj
+++ b/UWP/CommonUWP/CommonUWP.vcxproj
@ -105,6 +105,7 @@
    <ClInclude Include="..\..\Common\File\AndroidStorage.h" />
    <ClInclude Include="..\..\Common\GPU\GPUBackendCommon.h" />
    <ClInclude Include="..\..\Common\GPU\Vulkan\VulkanLoader.h" />
+    <ClInclude Include="..\..\Common\Math\CrossSIMD.h" />
    <ClInclude Include="..\..\Common\Math\Statistics.h" />
    <ClInclude Include="..\..\Common\Net\HTTPNaettRequest.h" />
    <ClInclude Include="..\..\Common\Net\HTTPRequest.h" />
--- a/UWP/CommonUWP/CommonUWP.vcxproj.filters
+++ b/UWP/CommonUWP/CommonUWP.vcxproj.filters
@ -862,6 +862,9 @@
    <ClInclude Include="..\..\ext\naett\naett.h">
      <Filter>ext\naett</Filter>
    </ClInclude>
+    <ClInclude Include="..\..\Common\Math\CrossSIMD.h">
+      <Filter>Math</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="..\..\Common\Math\fast\fast_matrix_neon.S">
--- a/ppsspp_config.h
+++ b/ppsspp_config.h
@ -11,6 +11,7 @@
 #if defined(_M_IX86) || defined(__i386__) || defined (__EMSCRIPTEN__)
    #define PPSSPP_ARCH_X86 1
    #define PPSSPP_ARCH_32BIT 1
+    #define PPSSPP_ARCH_SSE2 1
    //TODO: Remove this compat define
    #ifndef _M_IX86
        #define _M_IX86 600
@ -19,6 +20,7 @@

 #if (defined(_M_X64) || defined(__amd64__) || defined(__x86_64__)) && !defined(__EMSCRIPTEN__)
    #define PPSSPP_ARCH_AMD64 1
+    #define PPSSPP_ARCH_SSE2 1
    #if defined(__ILP32__)
        #define PPSSPP_ARCH_32BIT 1
    #else