diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index eb23d4fdb2..a8ca4ce89e 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -19,6 +19,7 @@ #include #include +#include "Common/CPUDetect.h" #include "Common/Data/Convert/ColorConv.h" #include "Common/Profiler/Profiler.h" #include "Common/StringUtils.h" @@ -41,6 +42,10 @@ #include #endif +#if _M_SSE >= 0x401 +#include +#endif + namespace Rasterizer { // Only OK on x64 where our stack is aligned @@ -565,6 +570,7 @@ static inline void ApplyTexturing(Sampler::Funcs sampler, Vec4 *prim_color, } } +template struct TriangleEdge { Vec4 Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin); inline Vec4 StepX(const Vec4 &w); @@ -577,7 +583,8 @@ struct TriangleEdge { Vec4 stepY; }; -Vec4 TriangleEdge::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) { +template +Vec4 TriangleEdge::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) { // Start at pixel centers. Vec4 initX = Vec4::AssignToAll(origin.x) + Vec4(7, 23, 7, 23); Vec4 initY = Vec4::AssignToAll(origin.y) + Vec4(7, 7, 23, 23); @@ -590,10 +597,18 @@ Vec4 TriangleEdge::Start(const ScreenCoords &v0, const ScreenCoords &v1, co stepX = Vec4::AssignToAll(xf * 16 * 2); stepY = Vec4::AssignToAll(yf * 16 * 2); +#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401 + if (useSSE4) { + initX.ivec = _mm_mullo_epi32(initX.ivec, _mm_set1_epi32(xf)); + initY.ivec = _mm_mullo_epi32(initY.ivec, _mm_set1_epi32(yf)); + return _mm_add_epi32(_mm_add_epi32(initX.ivec, initY.ivec), _mm_set1_epi32(c)); + } +#endif return Vec4::AssignToAll(xf) * initX + Vec4::AssignToAll(yf) * initY + Vec4::AssignToAll(c); } -inline Vec4 TriangleEdge::StepX(const Vec4 &w) { +template +inline Vec4 TriangleEdge::StepX(const Vec4 &w) { #if defined(_M_SSE) && !PPSSPP_ARCH(X86) return _mm_add_epi32(w.ivec, stepX.ivec); #else @@ -601,7 +616,8 @@ inline Vec4 TriangleEdge::StepX(const Vec4 &w) { #endif } -inline Vec4 TriangleEdge::StepY(const Vec4 &w) { +template +inline Vec4 TriangleEdge::StepY(const Vec4 &w) { #if defined(_M_SSE) && !PPSSPP_ARCH(X86) return _mm_add_epi32(w.ivec, stepY.ivec); #else @@ -609,8 +625,20 @@ inline Vec4 TriangleEdge::StepY(const Vec4 &w) { #endif } -void TriangleEdge::NarrowMinMaxX(const Vec4 &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX) { - int wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w)); +template +void TriangleEdge::NarrowMinMaxX(const Vec4 &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX) { + int wmax; +#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401 + if (useSSE4) { + __m128i max01 = _mm_max_epi32(w.ivec, _mm_shuffle_epi32(w.ivec, _MM_SHUFFLE(3, 2, 3, 2))); + __m128i max0 = _mm_max_epi32(max01, _mm_shuffle_epi32(max01, _MM_SHUFFLE(1, 1, 1, 1))); + wmax = _mm_cvtsi128_si32(max0); + } else { + wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w)); + } +#else + wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w)); +#endif if (wmax < 0) { if (stepX.x > 0) { int steps = -wmax / stepX.x; @@ -626,7 +654,12 @@ void TriangleEdge::NarrowMinMaxX(const Vec4 &w, int64_t minX, int64_t &rowM } } -inline Vec4 TriangleEdge::StepXTimes(const Vec4 &w, int c) { +template +inline Vec4 TriangleEdge::StepXTimes(const Vec4 &w, int c) { +#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401 + if (useSSE4) + return _mm_add_epi32(w.ivec, _mm_mullo_epi32(_mm_set1_epi32(c), stepX.ivec)); +#endif return w + stepX * c; } @@ -642,8 +675,16 @@ static inline Vec4 MakeMask(const Vec4 &w0, const Vec4 &w1, const #endif } +template static inline bool AnyMask(const Vec4 &mask) { #if defined(_M_SSE) && !PPSSPP_ARCH(X86) +#if _M_SSE >= 0x401 + if (useSSE4) { + __m128i sig = _mm_srai_epi32(mask.ivec, 31); + return _mm_test_all_ones(sig) == 0; + } +#endif + // In other words: !(mask.x < 0 && mask.y < 0 && mask.z < 0 && mask.w < 0) __m128i low2 = _mm_and_si128(mask.ivec, _mm_shuffle_epi32(mask.ivec, _MM_SHUFFLE(3, 2, 3, 2))); __m128i low1 = _mm_and_si128(low2, _mm_shuffle_epi32(low2, _MM_SHUFFLE(1, 1, 1, 1))); @@ -664,7 +705,7 @@ static inline Vec4 EdgeRecip(const Vec4 &w0, const Vec4 &w1, co #endif } -template +template void DrawTriangleSlice( const VertexData& v0, const VertexData& v1, const VertexData& v2, int x1, int y1, int x2, int y2, @@ -697,9 +738,9 @@ void DrawTriangleSlice( } } - TriangleEdge e0; - TriangleEdge e1; - TriangleEdge e2; + TriangleEdge e0; + TriangleEdge e1; + TriangleEdge e2; int64_t minX = x1, maxX = x2, minY = y1, maxY = y2; @@ -733,11 +774,6 @@ void DrawTriangleSlice( Vec4 w1 = w1_base; Vec4 w2 = w2_base; - // TODO: Maybe we can clip the edges instead? - int scissorYPlus1 = curY + 16 > maxY ? -1 : 0; - Vec4 scissor_mask = Vec4(0, maxX - minX - 16, scissorYPlus1, (maxX - minX - 16) | scissorYPlus1); - Vec4 scissor_step = Vec4(0, -32, 0, -32); - DrawingCoords p = TransformUnit::ScreenToDrawing(ScreenCoords(minX, curY, 0)); int64_t rowMinX = minX, rowMaxX = maxX; @@ -749,9 +785,13 @@ void DrawTriangleSlice( w0 = e0.StepXTimes(w0, skipX); w1 = e1.StepXTimes(w1, skipX); w2 = e2.StepXTimes(w2, skipX); - scissor_mask = scissor_mask + scissor_step * skipX; p.x = (p.x + 2 * skipX) & 0x3FF; + // TODO: Maybe we can clip the edges instead? + int scissorYPlus1 = curY + 16 > maxY ? -1 : 0; + Vec4 scissor_mask = Vec4(0, rowMaxX - rowMinX - 16, scissorYPlus1, (rowMaxX - rowMinX - 16) | scissorYPlus1); + Vec4 scissor_step = Vec4(0, -32, 0, -32); + for (int64_t curX = rowMinX; curX <= rowMaxX; curX += 32, w0 = e0.StepX(w0), w1 = e1.StepX(w1), @@ -761,7 +801,7 @@ void DrawTriangleSlice( // If p is on or inside all edges, render pixel Vec4 mask = MakeMask(w0, w1, w2, bias0, bias1, bias2, scissor_mask); - if (AnyMask(mask)) { + if (AnyMask(mask)) { Vec4 wsum_recip = EdgeRecip(w0, w1, w2); Vec4 prim_color[4]; @@ -915,7 +955,9 @@ void DrawTriangle(const VertexData& v0, const VertexData& v1, const VertexData& Rasterizer::SingleFunc drawPixel = Rasterizer::GetSingleFunc(pixelID); Sampler::Funcs sampler = Sampler::GetFuncs(); - auto drawSlice = gstate.isModeClear() ? &DrawTriangleSlice : &DrawTriangleSlice; + auto drawSlice = cpu_info.bSSE4_1 ? + (gstate.isModeClear() ? &DrawTriangleSlice : &DrawTriangleSlice) : + (gstate.isModeClear() ? &DrawTriangleSlice : &DrawTriangleSlice); const int MIN_LINES_PER_THREAD = 4;