mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-03-05 12:47:08 +00:00
softgpu: Use some SSE4 in triangle interpolation.
This commit is contained in:
parent
3b1cc0d3b8
commit
c7fc448869
@ -19,6 +19,7 @@
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
#include "Common/CPUDetect.h"
|
||||
#include "Common/Data/Convert/ColorConv.h"
|
||||
#include "Common/Profiler/Profiler.h"
|
||||
#include "Common/StringUtils.h"
|
||||
@ -41,6 +42,10 @@
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
namespace Rasterizer {
|
||||
|
||||
// Only OK on x64 where our stack is aligned
|
||||
@ -565,6 +570,7 @@ static inline void ApplyTexturing(Sampler::Funcs sampler, Vec4<int> *prim_color,
|
||||
}
|
||||
}
|
||||
|
||||
template <bool useSSE4>
|
||||
struct TriangleEdge {
|
||||
Vec4<int> Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin);
|
||||
inline Vec4<int> StepX(const Vec4<int> &w);
|
||||
@ -577,7 +583,8 @@ struct TriangleEdge {
|
||||
Vec4<int> stepY;
|
||||
};
|
||||
|
||||
Vec4<int> TriangleEdge::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) {
|
||||
template <bool useSSE4>
|
||||
Vec4<int> TriangleEdge<useSSE4>::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) {
|
||||
// Start at pixel centers.
|
||||
Vec4<int> initX = Vec4<int>::AssignToAll(origin.x) + Vec4<int>(7, 23, 7, 23);
|
||||
Vec4<int> initY = Vec4<int>::AssignToAll(origin.y) + Vec4<int>(7, 7, 23, 23);
|
||||
@ -590,10 +597,18 @@ Vec4<int> TriangleEdge::Start(const ScreenCoords &v0, const ScreenCoords &v1, co
|
||||
stepX = Vec4<int>::AssignToAll(xf * 16 * 2);
|
||||
stepY = Vec4<int>::AssignToAll(yf * 16 * 2);
|
||||
|
||||
#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
|
||||
if (useSSE4) {
|
||||
initX.ivec = _mm_mullo_epi32(initX.ivec, _mm_set1_epi32(xf));
|
||||
initY.ivec = _mm_mullo_epi32(initY.ivec, _mm_set1_epi32(yf));
|
||||
return _mm_add_epi32(_mm_add_epi32(initX.ivec, initY.ivec), _mm_set1_epi32(c));
|
||||
}
|
||||
#endif
|
||||
return Vec4<int>::AssignToAll(xf) * initX + Vec4<int>::AssignToAll(yf) * initY + Vec4<int>::AssignToAll(c);
|
||||
}
|
||||
|
||||
inline Vec4<int> TriangleEdge::StepX(const Vec4<int> &w) {
|
||||
template <bool useSSE4>
|
||||
inline Vec4<int> TriangleEdge<useSSE4>::StepX(const Vec4<int> &w) {
|
||||
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
|
||||
return _mm_add_epi32(w.ivec, stepX.ivec);
|
||||
#else
|
||||
@ -601,7 +616,8 @@ inline Vec4<int> TriangleEdge::StepX(const Vec4<int> &w) {
|
||||
#endif
|
||||
}
|
||||
|
||||
inline Vec4<int> TriangleEdge::StepY(const Vec4<int> &w) {
|
||||
template <bool useSSE4>
|
||||
inline Vec4<int> TriangleEdge<useSSE4>::StepY(const Vec4<int> &w) {
|
||||
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
|
||||
return _mm_add_epi32(w.ivec, stepY.ivec);
|
||||
#else
|
||||
@ -609,8 +625,20 @@ inline Vec4<int> TriangleEdge::StepY(const Vec4<int> &w) {
|
||||
#endif
|
||||
}
|
||||
|
||||
void TriangleEdge::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX) {
|
||||
int wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));
|
||||
template <bool useSSE4>
|
||||
void TriangleEdge<useSSE4>::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX) {
|
||||
int wmax;
|
||||
#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
|
||||
if (useSSE4) {
|
||||
__m128i max01 = _mm_max_epi32(w.ivec, _mm_shuffle_epi32(w.ivec, _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
__m128i max0 = _mm_max_epi32(max01, _mm_shuffle_epi32(max01, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
wmax = _mm_cvtsi128_si32(max0);
|
||||
} else {
|
||||
wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));
|
||||
}
|
||||
#else
|
||||
wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));
|
||||
#endif
|
||||
if (wmax < 0) {
|
||||
if (stepX.x > 0) {
|
||||
int steps = -wmax / stepX.x;
|
||||
@ -626,7 +654,12 @@ void TriangleEdge::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowM
|
||||
}
|
||||
}
|
||||
|
||||
inline Vec4<int> TriangleEdge::StepXTimes(const Vec4<int> &w, int c) {
|
||||
template <bool useSSE4>
|
||||
inline Vec4<int> TriangleEdge<useSSE4>::StepXTimes(const Vec4<int> &w, int c) {
|
||||
#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
|
||||
if (useSSE4)
|
||||
return _mm_add_epi32(w.ivec, _mm_mullo_epi32(_mm_set1_epi32(c), stepX.ivec));
|
||||
#endif
|
||||
return w + stepX * c;
|
||||
}
|
||||
|
||||
@ -642,8 +675,16 @@ static inline Vec4<int> MakeMask(const Vec4<int> &w0, const Vec4<int> &w1, const
|
||||
#endif
|
||||
}
|
||||
|
||||
template <bool useSSE4>
|
||||
static inline bool AnyMask(const Vec4<int> &mask) {
|
||||
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
|
||||
#if _M_SSE >= 0x401
|
||||
if (useSSE4) {
|
||||
__m128i sig = _mm_srai_epi32(mask.ivec, 31);
|
||||
return _mm_test_all_ones(sig) == 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
// In other words: !(mask.x < 0 && mask.y < 0 && mask.z < 0 && mask.w < 0)
|
||||
__m128i low2 = _mm_and_si128(mask.ivec, _mm_shuffle_epi32(mask.ivec, _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
__m128i low1 = _mm_and_si128(low2, _mm_shuffle_epi32(low2, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
@ -664,7 +705,7 @@ static inline Vec4<float> EdgeRecip(const Vec4<int> &w0, const Vec4<int> &w1, co
|
||||
#endif
|
||||
}
|
||||
|
||||
template <bool clearMode>
|
||||
template <bool clearMode, bool useSSE4>
|
||||
void DrawTriangleSlice(
|
||||
const VertexData& v0, const VertexData& v1, const VertexData& v2,
|
||||
int x1, int y1, int x2, int y2,
|
||||
@ -697,9 +738,9 @@ void DrawTriangleSlice(
|
||||
}
|
||||
}
|
||||
|
||||
TriangleEdge e0;
|
||||
TriangleEdge e1;
|
||||
TriangleEdge e2;
|
||||
TriangleEdge<useSSE4> e0;
|
||||
TriangleEdge<useSSE4> e1;
|
||||
TriangleEdge<useSSE4> e2;
|
||||
|
||||
int64_t minX = x1, maxX = x2, minY = y1, maxY = y2;
|
||||
|
||||
@ -733,11 +774,6 @@ void DrawTriangleSlice(
|
||||
Vec4<int> w1 = w1_base;
|
||||
Vec4<int> w2 = w2_base;
|
||||
|
||||
// TODO: Maybe we can clip the edges instead?
|
||||
int scissorYPlus1 = curY + 16 > maxY ? -1 : 0;
|
||||
Vec4<int> scissor_mask = Vec4<int>(0, maxX - minX - 16, scissorYPlus1, (maxX - minX - 16) | scissorYPlus1);
|
||||
Vec4<int> scissor_step = Vec4<int>(0, -32, 0, -32);
|
||||
|
||||
DrawingCoords p = TransformUnit::ScreenToDrawing(ScreenCoords(minX, curY, 0));
|
||||
|
||||
int64_t rowMinX = minX, rowMaxX = maxX;
|
||||
@ -749,9 +785,13 @@ void DrawTriangleSlice(
|
||||
w0 = e0.StepXTimes(w0, skipX);
|
||||
w1 = e1.StepXTimes(w1, skipX);
|
||||
w2 = e2.StepXTimes(w2, skipX);
|
||||
scissor_mask = scissor_mask + scissor_step * skipX;
|
||||
p.x = (p.x + 2 * skipX) & 0x3FF;
|
||||
|
||||
// TODO: Maybe we can clip the edges instead?
|
||||
int scissorYPlus1 = curY + 16 > maxY ? -1 : 0;
|
||||
Vec4<int> scissor_mask = Vec4<int>(0, rowMaxX - rowMinX - 16, scissorYPlus1, (rowMaxX - rowMinX - 16) | scissorYPlus1);
|
||||
Vec4<int> scissor_step = Vec4<int>(0, -32, 0, -32);
|
||||
|
||||
for (int64_t curX = rowMinX; curX <= rowMaxX; curX += 32,
|
||||
w0 = e0.StepX(w0),
|
||||
w1 = e1.StepX(w1),
|
||||
@ -761,7 +801,7 @@ void DrawTriangleSlice(
|
||||
|
||||
// If p is on or inside all edges, render pixel
|
||||
Vec4<int> mask = MakeMask(w0, w1, w2, bias0, bias1, bias2, scissor_mask);
|
||||
if (AnyMask(mask)) {
|
||||
if (AnyMask<useSSE4>(mask)) {
|
||||
Vec4<float> wsum_recip = EdgeRecip(w0, w1, w2);
|
||||
|
||||
Vec4<int> prim_color[4];
|
||||
@ -915,7 +955,9 @@ void DrawTriangle(const VertexData& v0, const VertexData& v1, const VertexData&
|
||||
Rasterizer::SingleFunc drawPixel = Rasterizer::GetSingleFunc(pixelID);
|
||||
Sampler::Funcs sampler = Sampler::GetFuncs();
|
||||
|
||||
auto drawSlice = gstate.isModeClear() ? &DrawTriangleSlice<true> : &DrawTriangleSlice<false>;
|
||||
auto drawSlice = cpu_info.bSSE4_1 ?
|
||||
(gstate.isModeClear() ? &DrawTriangleSlice<true, true> : &DrawTriangleSlice<false, true>) :
|
||||
(gstate.isModeClear() ? &DrawTriangleSlice<true, false> : &DrawTriangleSlice<false, false>);
|
||||
|
||||
const int MIN_LINES_PER_THREAD = 4;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user