softgpu: Use some SSE4 in triangle interpolation.

This commit is contained in:
Unknown W. Brackets 2022-01-08 09:39:30 -08:00
parent 3b1cc0d3b8
commit c7fc448869

View File

@ -19,6 +19,7 @@
#include <algorithm>
#include <cmath>
#include "Common/CPUDetect.h"
#include "Common/Data/Convert/ColorConv.h"
#include "Common/Profiler/Profiler.h"
#include "Common/StringUtils.h"
@ -41,6 +42,10 @@
#include <emmintrin.h>
#endif
#if _M_SSE >= 0x401
#include <smmintrin.h>
#endif
namespace Rasterizer {
// Only OK on x64 where our stack is aligned
@ -565,6 +570,7 @@ static inline void ApplyTexturing(Sampler::Funcs sampler, Vec4<int> *prim_color,
}
}
template <bool useSSE4>
struct TriangleEdge {
Vec4<int> Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin);
inline Vec4<int> StepX(const Vec4<int> &w);
@ -577,7 +583,8 @@ struct TriangleEdge {
Vec4<int> stepY;
};
Vec4<int> TriangleEdge::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) {
template <bool useSSE4>
Vec4<int> TriangleEdge<useSSE4>::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) {
// Start at pixel centers.
Vec4<int> initX = Vec4<int>::AssignToAll(origin.x) + Vec4<int>(7, 23, 7, 23);
Vec4<int> initY = Vec4<int>::AssignToAll(origin.y) + Vec4<int>(7, 7, 23, 23);
@ -590,10 +597,18 @@ Vec4<int> TriangleEdge::Start(const ScreenCoords &v0, const ScreenCoords &v1, co
stepX = Vec4<int>::AssignToAll(xf * 16 * 2);
stepY = Vec4<int>::AssignToAll(yf * 16 * 2);
#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
if (useSSE4) {
initX.ivec = _mm_mullo_epi32(initX.ivec, _mm_set1_epi32(xf));
initY.ivec = _mm_mullo_epi32(initY.ivec, _mm_set1_epi32(yf));
return _mm_add_epi32(_mm_add_epi32(initX.ivec, initY.ivec), _mm_set1_epi32(c));
}
#endif
return Vec4<int>::AssignToAll(xf) * initX + Vec4<int>::AssignToAll(yf) * initY + Vec4<int>::AssignToAll(c);
}
inline Vec4<int> TriangleEdge::StepX(const Vec4<int> &w) {
template <bool useSSE4>
inline Vec4<int> TriangleEdge<useSSE4>::StepX(const Vec4<int> &w) {
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
return _mm_add_epi32(w.ivec, stepX.ivec);
#else
@ -601,7 +616,8 @@ inline Vec4<int> TriangleEdge::StepX(const Vec4<int> &w) {
#endif
}
inline Vec4<int> TriangleEdge::StepY(const Vec4<int> &w) {
template <bool useSSE4>
inline Vec4<int> TriangleEdge<useSSE4>::StepY(const Vec4<int> &w) {
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
return _mm_add_epi32(w.ivec, stepY.ivec);
#else
@ -609,8 +625,20 @@ inline Vec4<int> TriangleEdge::StepY(const Vec4<int> &w) {
#endif
}
void TriangleEdge::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX) {
int wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));
template <bool useSSE4>
void TriangleEdge<useSSE4>::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX) {
int wmax;
#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
if (useSSE4) {
__m128i max01 = _mm_max_epi32(w.ivec, _mm_shuffle_epi32(w.ivec, _MM_SHUFFLE(3, 2, 3, 2)));
__m128i max0 = _mm_max_epi32(max01, _mm_shuffle_epi32(max01, _MM_SHUFFLE(1, 1, 1, 1)));
wmax = _mm_cvtsi128_si32(max0);
} else {
wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));
}
#else
wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));
#endif
if (wmax < 0) {
if (stepX.x > 0) {
int steps = -wmax / stepX.x;
@ -626,7 +654,12 @@ void TriangleEdge::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowM
}
}
inline Vec4<int> TriangleEdge::StepXTimes(const Vec4<int> &w, int c) {
template <bool useSSE4>
inline Vec4<int> TriangleEdge<useSSE4>::StepXTimes(const Vec4<int> &w, int c) {
#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
if (useSSE4)
return _mm_add_epi32(w.ivec, _mm_mullo_epi32(_mm_set1_epi32(c), stepX.ivec));
#endif
return w + stepX * c;
}
@ -642,8 +675,16 @@ static inline Vec4<int> MakeMask(const Vec4<int> &w0, const Vec4<int> &w1, const
#endif
}
template <bool useSSE4>
static inline bool AnyMask(const Vec4<int> &mask) {
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
#if _M_SSE >= 0x401
if (useSSE4) {
__m128i sig = _mm_srai_epi32(mask.ivec, 31);
return _mm_test_all_ones(sig) == 0;
}
#endif
// In other words: !(mask.x < 0 && mask.y < 0 && mask.z < 0 && mask.w < 0)
__m128i low2 = _mm_and_si128(mask.ivec, _mm_shuffle_epi32(mask.ivec, _MM_SHUFFLE(3, 2, 3, 2)));
__m128i low1 = _mm_and_si128(low2, _mm_shuffle_epi32(low2, _MM_SHUFFLE(1, 1, 1, 1)));
@ -664,7 +705,7 @@ static inline Vec4<float> EdgeRecip(const Vec4<int> &w0, const Vec4<int> &w1, co
#endif
}
template <bool clearMode>
template <bool clearMode, bool useSSE4>
void DrawTriangleSlice(
const VertexData& v0, const VertexData& v1, const VertexData& v2,
int x1, int y1, int x2, int y2,
@ -697,9 +738,9 @@ void DrawTriangleSlice(
}
}
TriangleEdge e0;
TriangleEdge e1;
TriangleEdge e2;
TriangleEdge<useSSE4> e0;
TriangleEdge<useSSE4> e1;
TriangleEdge<useSSE4> e2;
int64_t minX = x1, maxX = x2, minY = y1, maxY = y2;
@ -733,11 +774,6 @@ void DrawTriangleSlice(
Vec4<int> w1 = w1_base;
Vec4<int> w2 = w2_base;
// TODO: Maybe we can clip the edges instead?
int scissorYPlus1 = curY + 16 > maxY ? -1 : 0;
Vec4<int> scissor_mask = Vec4<int>(0, maxX - minX - 16, scissorYPlus1, (maxX - minX - 16) | scissorYPlus1);
Vec4<int> scissor_step = Vec4<int>(0, -32, 0, -32);
DrawingCoords p = TransformUnit::ScreenToDrawing(ScreenCoords(minX, curY, 0));
int64_t rowMinX = minX, rowMaxX = maxX;
@ -749,9 +785,13 @@ void DrawTriangleSlice(
w0 = e0.StepXTimes(w0, skipX);
w1 = e1.StepXTimes(w1, skipX);
w2 = e2.StepXTimes(w2, skipX);
scissor_mask = scissor_mask + scissor_step * skipX;
p.x = (p.x + 2 * skipX) & 0x3FF;
// TODO: Maybe we can clip the edges instead?
int scissorYPlus1 = curY + 16 > maxY ? -1 : 0;
Vec4<int> scissor_mask = Vec4<int>(0, rowMaxX - rowMinX - 16, scissorYPlus1, (rowMaxX - rowMinX - 16) | scissorYPlus1);
Vec4<int> scissor_step = Vec4<int>(0, -32, 0, -32);
for (int64_t curX = rowMinX; curX <= rowMaxX; curX += 32,
w0 = e0.StepX(w0),
w1 = e1.StepX(w1),
@ -761,7 +801,7 @@ void DrawTriangleSlice(
// If p is on or inside all edges, render pixel
Vec4<int> mask = MakeMask(w0, w1, w2, bias0, bias1, bias2, scissor_mask);
if (AnyMask(mask)) {
if (AnyMask<useSSE4>(mask)) {
Vec4<float> wsum_recip = EdgeRecip(w0, w1, w2);
Vec4<int> prim_color[4];
@ -915,7 +955,9 @@ void DrawTriangle(const VertexData& v0, const VertexData& v1, const VertexData&
Rasterizer::SingleFunc drawPixel = Rasterizer::GetSingleFunc(pixelID);
Sampler::Funcs sampler = Sampler::GetFuncs();
auto drawSlice = gstate.isModeClear() ? &DrawTriangleSlice<true> : &DrawTriangleSlice<false>;
auto drawSlice = cpu_info.bSSE4_1 ?
(gstate.isModeClear() ? &DrawTriangleSlice<true, true> : &DrawTriangleSlice<false, true>) :
(gstate.isModeClear() ? &DrawTriangleSlice<true, false> : &DrawTriangleSlice<false, false>);
const int MIN_LINES_PER_THREAD = 4;