diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index 5c3fabb3ce..835869f513 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -773,7 +773,7 @@ static inline bool AlphaTestPassed(int alpha) return true; } -static inline Vec3 GetSourceFactor(int source_a, const Vec4& dst) +static inline Vec3 GetSourceFactor(const Vec4& source, const Vec4& dst) { switch (gstate.getBlendFuncA()) { case GE_SRCBLEND_DSTCOLOR: @@ -783,10 +783,18 @@ static inline Vec3 GetSourceFactor(int source_a, const Vec4& dst) return Vec3::AssignToAll(255) - dst.rgb(); case GE_SRCBLEND_SRCALPHA: - return Vec3::AssignToAll(source_a); +#if defined(_M_SSE) + return Vec3(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))); +#else + return Vec3::AssignToAll(source.a()); +#endif case GE_SRCBLEND_INVSRCALPHA: - return Vec3::AssignToAll(255 - source_a); +#if defined(_M_SSE) + return Vec3(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)))); +#else + return Vec3::AssignToAll(255 - source.a()); +#endif case GE_SRCBLEND_DSTALPHA: return Vec3::AssignToAll(dst.a()); @@ -795,10 +803,10 @@ static inline Vec3 GetSourceFactor(int source_a, const Vec4& dst) return Vec3::AssignToAll(255 - dst.a()); case GE_SRCBLEND_DOUBLESRCALPHA: - return Vec3::AssignToAll(2 * source_a); + return Vec3::AssignToAll(2 * source.a()); case GE_SRCBLEND_DOUBLEINVSRCALPHA: - return Vec3::AssignToAll(255 - 2 * source_a); + return Vec3::AssignToAll(255 - 2 * source.a()); case GE_SRCBLEND_DOUBLEDSTALPHA: return Vec3::AssignToAll(2 * dst.a()); @@ -816,20 +824,28 @@ static inline Vec3 GetSourceFactor(int source_a, const Vec4& dst) } } -static inline Vec3 GetDestFactor(const Vec3& source_rgb, int source_a, const Vec4& dst) +static inline Vec3 GetDestFactor(const Vec4& source, const Vec4& dst) { switch (gstate.getBlendFuncB()) { case GE_DSTBLEND_SRCCOLOR: - return source_rgb; + return source.rgb(); case GE_DSTBLEND_INVSRCCOLOR: - return Vec3::AssignToAll(255) - source_rgb; + return Vec3::AssignToAll(255) - source.rgb(); case GE_DSTBLEND_SRCALPHA: - return Vec3::AssignToAll(source_a); +#if defined(_M_SSE) + return Vec3(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))); +#else + return Vec3::AssignToAll(source.a()); +#endif case GE_DSTBLEND_INVSRCALPHA: - return Vec3::AssignToAll(255 - source_a); +#if defined(_M_SSE) + return Vec3(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)))); +#else + return Vec3::AssignToAll(255 - source.a()); +#endif case GE_DSTBLEND_DSTALPHA: return Vec3::AssignToAll(dst.a()); @@ -838,10 +854,10 @@ static inline Vec3 GetDestFactor(const Vec3& source_rgb, int source_a, return Vec3::AssignToAll(255 - dst.a()); case GE_DSTBLEND_DOUBLESRCALPHA: - return Vec3::AssignToAll(2 * source_a); + return Vec3::AssignToAll(2 * source.a()); case GE_DSTBLEND_DOUBLEINVSRCALPHA: - return Vec3::AssignToAll(255 - 2 * source_a); + return Vec3::AssignToAll(255 - 2 * source.a()); case GE_DSTBLEND_DOUBLEDSTALPHA: return Vec3::AssignToAll(2 * dst.a()); @@ -860,18 +876,42 @@ static inline Vec3 GetDestFactor(const Vec3& source_rgb, int source_a, static inline Vec3 AlphaBlendingResult(const Vec4& source, const Vec4 dst) { - Vec3 srcfactor = GetSourceFactor(source.a(), dst); - Vec3 dstfactor = GetDestFactor(source.rgb(), source.a(), dst); + Vec3 srcfactor = GetSourceFactor(source, dst); + Vec3 dstfactor = GetDestFactor(source, dst); switch (gstate.getBlendEq()) { case GE_BLENDMODE_MUL_AND_ADD: + { +#if defined(_M_SSE) + const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec)); + const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec)); + return Vec3(_mm_cvtps_epi32(_mm_div_ps(_mm_add_ps(s, d), _mm_set_ps1(255.0f)))); +#else return (source.rgb() * srcfactor + dst.rgb() * dstfactor) / 255; +#endif + } case GE_BLENDMODE_MUL_AND_SUBTRACT: + { +#if defined(_M_SSE) + const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec)); + const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec)); + return Vec3(_mm_cvtps_epi32(_mm_div_ps(_mm_sub_ps(s, d), _mm_set_ps1(255.0f)))); +#else return (source.rgb() * srcfactor - dst.rgb() * dstfactor) / 255; +#endif + } case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE: + { +#if defined(_M_SSE) + const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec)); + const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec)); + return Vec3(_mm_cvtps_epi32(_mm_div_ps(_mm_sub_ps(d, s), _mm_set_ps1(255.0f)))); +#else return (dst.rgb() * dstfactor - source.rgb() * srcfactor) / 255; +#endif + } case GE_BLENDMODE_MIN: return Vec3(std::min(source.r(), dst.r()), @@ -952,7 +992,14 @@ inline void DrawSinglePixel(const DrawingCoords &p, u16 z, Vec4 prim_color) if (gstate.isAlphaBlendEnabled() && !clearMode) { const Vec4 dst = Vec4::FromRGBA(old_color); +#if defined(_M_SSE) + const Vec3 blended = AlphaBlendingResult(prim_color, dst); + const __m128i blended16 = _mm_packs_epi32(blended.ivec, blended.ivec); + new_color = _mm_cvtsi128_si32(_mm_packus_epi16(blended16, blended16)); + new_color = (stencil << 24) | (new_color & 0x00FFFFFF); +#else new_color = Vec4(AlphaBlendingResult(prim_color, dst).Clamp(0, 255), stencil).ToRGBA(); +#endif } else { if (!clearMode) prim_color = prim_color.Clamp(0, 255);