Replace some signed division in SoftGPU

This also adds a few bitwise operations to Vec4<int> and further
SIMDifies it.
Also fixes an unrelated warning.
This commit is contained in:
fp64 2023-06-29 16:43:21 -04:00
parent 51e71bd726
commit f133739cd0
4 changed files with 84 additions and 17 deletions

View File

@ -258,7 +258,7 @@ void *AllocateAlignedMemory(size_t size, size_t alignment) {
#endif
#endif
_assert_msg_(ptr != nullptr, "Failed to allocate aligned memory of size %lu", size);
_assert_msg_(ptr != nullptr, "Failed to allocate aligned memory of size %llu", (unsigned long long)size);
return ptr;
}

View File

@ -665,6 +665,20 @@ public:
{
return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);
}
// Component-wise bitwise AND of this vector with `other`.
Vec4 operator & (const Vec4 &other) const
{
	Vec4 masked(x & other.x, y & other.y, z & other.z, w & other.w);
	return masked;
}
// Shifts every component left by `amount` bits.
Vec4 operator << (const int amount) const
{
	// NOTE: multiplying by (1 << amount) instead might be the safer
	// choice, because left-shifting a negative value is UB before C++20.
	Vec4 shifted(x << amount, y << amount, z << amount, w << amount);
	return shifted;
}
// Shifts every component right by `amount` bits using the element
// type's built-in `>>`.
Vec4 operator >> (const int amount) const
{
	Vec4 shifted(x >> amount, y >> amount, z >> amount, w >> amount);
	return shifted;
}
template<typename V>
Vec4 operator * (const V& f) const
{
@ -1363,6 +1377,59 @@ inline Vec3<float> Vec3<float>::operator * (const float &other) const {
return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
}
// Vec4<int> operation
// SIMD specialization: lane-wise 32-bit integer addition.
template<>
inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {
	const __m128i lhs = SAFE_M128I(ivec);
	const __m128i rhs = SAFE_M128I(other.ivec);
	return Vec4<int>(_mm_add_epi32(lhs, rhs));
}
// SIMD specialization: lane-wise 32-bit multiply that keeps the low
// 32 bits of each product.
template<>
inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
	const __m128i lhs = SAFE_M128I(ivec);
	const __m128i rhs = SAFE_M128I(other.ivec);
#if _M_SSE >= 0x401
	// A packed low multiply is available directly at this SSE level.
	return Vec4<int>(_mm_mullo_epi32(lhs, rhs));
#else
	// Fallback that mirrors the sequence clang emits; it seems about
	// as good as it gets.
	// _mm_mul_epu32 multiplies lanes 0 and 2 into 64-bit products.
	const __m128i evenProducts = _mm_mul_epu32(lhs, rhs);
	// _MM_SHUFFLE(3, 3, 1, 1) == 0xF5 selects lanes [1, 1, 3, 3], so
	// lanes 1 and 3 land in the slots that _mm_mul_epu32 reads.
	const __m128i oddProducts = _mm_mul_epu32(
		_mm_shuffle_epi32(lhs, _MM_SHUFFLE(3, 3, 1, 1)),
		_mm_shuffle_epi32(rhs, _MM_SHUFFLE(3, 3, 1, 1)));
	// _MM_SHUFFLE(3, 2, 2, 0) == 0xE8 selects lanes [0, 2, 2, 3], which
	// packs the low 32 bits of both 64-bit products into the low half.
	const __m128i evenLow = _mm_shuffle_epi32(evenProducts, _MM_SHUFFLE(3, 2, 2, 0));
	const __m128i oddLow = _mm_shuffle_epi32(oddProducts, _MM_SHUFFLE(3, 2, 2, 0));
	// Interleave even/odd results back into lane order 0, 1, 2, 3.
	return Vec4<int>(_mm_unpacklo_epi32(evenLow, oddLow));
#endif
}
// Scalar multiply: broadcast `other` to all four lanes and reuse the
// vector-by-vector multiply.
template<> template<>
inline Vec4<int> Vec4<int>::operator * (const int &other) const {
	const Vec4<int> broadcast(_mm_set1_epi32(other));
	return (*this) * broadcast;
}
// SIMD specialization: bitwise OR across the whole 128-bit register.
template<>
inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {
	const __m128i lhs = SAFE_M128I(ivec);
	const __m128i rhs = SAFE_M128I(other.ivec);
	return Vec4<int>(_mm_or_si128(lhs, rhs));
}
// SIMD specialization: bitwise AND across the whole 128-bit register.
template<>
inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {
	const __m128i lhs = SAFE_M128I(ivec);
	const __m128i rhs = SAFE_M128I(other.ivec);
	return Vec4<int>(_mm_and_si128(lhs, rhs));
}
// NOTE: current GCC, clang, and MSVC all accept a shift count that is
// not a compile-time constant for the immediate-form shift intrinsics
// such as _mm_slli_epi32.
// SIMD specialization: shifts each 32-bit lane left by `amount` bits.
template<>
inline Vec4<int> Vec4<int>::operator << (const int amount) const {
	const __m128i lanes = SAFE_M128I(ivec);
	return Vec4<int>(_mm_slli_epi32(lanes, amount));
}
// SIMD specialization: arithmetic (sign-propagating) right shift of each
// 32-bit lane by `amount` bits.
//
// The lanes are signed ints, so this uses _mm_srai_epi32 to match what the
// generic per-component `x >> amount` does on signed int (an arithmetic
// shift on the compilers this code targets, and guaranteed since C++20).
// A logical shift (_mm_srli_epi32) would shift in zeros, turning negative
// lanes into large positive values and making SIMD and scalar builds
// disagree.
template<>
inline Vec4<int> Vec4<int>::operator >> (const int amount) const {
	return Vec4<int>(_mm_srai_epi32(SAFE_M128I(ivec), amount));
}
// Vec4<float> operation
template<>
inline void Vec4<float>::operator += (const Vec4<float> &other) {

View File

@ -283,7 +283,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
Vec4<int> mec = Vec4<int>::FromRGBA(gstate.getMaterialEmissive());
Vec4<int> mac = state.colorForAmbient ? colorFactor : state.material.ambientColorFactor;
Vec4<int> ambient = (mac * state.baseAmbientColorFactor) / 1024;
Vec4<int> ambient = (mac * state.baseAmbientColorFactor) >> 10;
Vec4<int> final_color = mec + ambient;
Vec4<int> specular_color = Vec4<int>::AssignToAll(0);

View File

@ -276,13 +276,13 @@ static inline int GetPixelDataOffset(uint32_t row_pitch_pixels, uint32_t u, uint
if (!swizzled)
return (v * (row_pitch_pixels * texel_size_bits >> 3)) + (u * texel_size_bits >> 3);
const int tile_size_bits = 32;
const int tiles_in_block_horizontal = 4;
const int tiles_in_block_vertical = 8;
const uint32_t tile_size_bits = 32;
const uint32_t tiles_in_block_horizontal = 4;
const uint32_t tiles_in_block_vertical = 8;
constexpr int texels_per_tile = tile_size_bits / texel_size_bits;
int tile_u = u / texels_per_tile;
int tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
constexpr uint32_t texels_per_tile = tile_size_bits / texel_size_bits;
uint32_t tile_u = u / texels_per_tile;
uint32_t tile_idx = (v % tiles_in_block_vertical) * (tiles_in_block_horizontal) +
// TODO: not sure if the *texel_size_bits/8 factor is correct
(v / tiles_in_block_vertical) * ((row_pitch_pixels*texel_size_bits/(tile_size_bits))*tiles_in_block_vertical) +
(tile_u % tiles_in_block_horizontal) +
@ -408,22 +408,22 @@ inline static Nearest4 SOFTRAST_CALL SampleNearest(const int u[N], const int v[N
case GE_TFMT_DXT1:
for (int i = 0; i < N; ++i) {
const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
res.v[i] = GetDXT1Texel(block, u[i] % 4, v[i] % 4);
const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
res.v[i] = GetDXT1Texel(block, u[i] & 3, v[i] & 3);
}
return res;
case GE_TFMT_DXT3:
for (int i = 0; i < N; ++i) {
const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
res.v[i] = GetDXT3Texel(block, u[i] % 4, v[i] % 4);
const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
res.v[i] = GetDXT3Texel(block, u[i] & 3, v[i] & 3);
}
return res;
case GE_TFMT_DXT5:
for (int i = 0; i < N; ++i) {
const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4);
res.v[i] = GetDXT5Texel(block, u[i] % 4, v[i] % 4);
const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
res.v[i] = GetDXT5Texel(block, u[i] & 3, v[i] & 3);
}
return res;
@ -613,7 +613,7 @@ static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg pr
GetTexelCoordinates(level + 1, s, t, u, v, samplerID);
Vec4<int> c1 = Vec4<int>::FromRGBA(SampleNearest<1>(&u, &v, tptr[1], bufw[1], level + 1, samplerID).v[0]);
c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16;
c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;
}
return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);
@ -748,7 +748,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, const u8
Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
Vec4<int> top = texcolor_tl * (0x10 - frac_u) + texcolor_tr * frac_u;
Vec4<int> bot = texcolor_bl * (0x10 - frac_u) + texcolor_br * frac_u;
return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) / (16 * 16));
return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) >> 8);
#endif
}
@ -756,7 +756,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg pri
Vec4<int> c0 = SampleLinearLevel(s, t, tptr, bufw, texlevel, samplerID);
if (levelFrac) {
const Vec4<int> c1 = SampleLinearLevel(s, t, tptr + 1, bufw + 1, texlevel + 1, samplerID);
c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16;
c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;
}
return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);
}