// Copyright (c) 2012- PPSSPP Project. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 2.0 or later versions. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License 2.0 for more details. // A copy of the GPL 2.0 should have been included with the program. // If not, see http://www.gnu.org/licenses/ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #include "Common/Common.h" #include "GPU/Math3D.h" namespace Math3D { template<> float Vec2::Length() const { // Doubt this is worth it for a vec2 :/ #if defined(_M_SSE) float ret; __m128 xy = _mm_loadu_ps(&x); __m128 sq = _mm_mul_ps(xy, xy); const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1)); const __m128 res = _mm_add_ss(sq, r2); _mm_store_ss(&ret, _mm_sqrt_ss(res)); return ret; #elif PPSSPP_ARCH(ARM64_NEON) float32x2_t vec = vld1_f32(&x); float32x2_t sq = vmul_f32(vec, vec); float32x2_t add2 = vpadd_f32(sq, sq); float32x2_t res = vsqrt_f32(add2); return vget_lane_f32(res, 0); #else return sqrtf(Length2()); #endif } template<> void Vec2::SetLength(const float l) { (*this) *= l / Length(); } template<> Vec2 Vec2::WithLength(const float l) const { return (*this) * l / Length(); } template<> float Vec2::Distance2To(const Vec2 &other) const { return Vec2(other-(*this)).Length2(); } template<> Vec2 Vec2::Normalized() const { return (*this) / Length(); } template<> float Vec2::Normalize() { float len = Length(); (*this) = (*this)/len; return len; } template<> float Vec3::Length() const { #if defined(_M_SSE) float ret; __m128 xyz = _mm_loadu_ps(&x); __m128 sq = _mm_mul_ps(xyz, xyz); const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1)); const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2)); const __m128 res = _mm_add_ss(sq, _mm_add_ss(r2, r3)); _mm_store_ss(&ret, _mm_sqrt_ss(res)); return ret; #elif PPSSPP_ARCH(ARM64_NEON) float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3); float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq)); float32x2_t add2 = vpadd_f32(add1, add1); float32x2_t res = vsqrt_f32(add2); return vget_lane_f32(res, 0); #else return sqrtf(Length2()); #endif } template<> void Vec3::SetLength(const float l) { (*this) *= l / Length(); } template<> Vec3 Vec3::WithLength(const float l) const { return (*this) * l / Length(); } template<> float Vec3::Distance2To(const Vec3 &other) const { return Vec3(other-(*this)).Length2(); } #if defined(_M_SSE) __m128 SSENormalizeMultiplierSSE2(__m128 v) { const __m128 sq = _mm_mul_ps(v, v); const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1)); const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2)); const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq)); const __m128 rt = _mm_rsqrt_ss(res); return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0)); } #if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) [[gnu::target("sse4.1")]] #endif __m128 SSENormalizeMultiplierSSE4(__m128 v) { // This is only used for Vec3f, so ignore the 4th component, might be garbage. return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0x77)); } __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v) { if (useSSE4) return SSENormalizeMultiplierSSE4(v); return SSENormalizeMultiplierSSE2(v); } template<> Vec3 Vec3::Normalized(bool useSSE4) const { const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec); return _mm_mul_ps(normalize, vec); } template<> Vec3 Vec3::NormalizedOr001(bool useSSE4) const { const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec); const __m128 result = _mm_mul_ps(normalize, vec); const __m128 mask = _mm_cmpunord_ps(result, vec); const __m128 replace = _mm_and_ps(_mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f), mask); // Replace with the constant if the mask matched. return _mm_or_ps(_mm_andnot_ps(mask, result), replace); } #elif PPSSPP_ARCH(ARM64_NEON) template<> Vec3 Vec3::Normalized(bool useSSE4) const { float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3); float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq)); float32x2_t summed = vpadd_f32(add1, add1); float32x2_t e = vrsqrte_f32(summed); e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e); e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e); float32x4_t factor = vdupq_lane_f32(e, 0); return Vec3(vmulq_f32(vec, factor)); } template<> Vec3 Vec3::NormalizedOr001(bool useSSE4) const { float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3); float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq)); float32x2_t summed = vpadd_f32(add1, add1); if (vget_lane_f32(summed, 0) == 0.0f) { return Vec3(vsetq_lane_f32(1.0f, vdupq_lane_f32(summed, 0), 2)); } float32x2_t e = vrsqrte_f32(summed); e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e); e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e); float32x4_t factor = vdupq_lane_f32(e, 0); return Vec3(vmulq_f32(vec, factor)); } #else template<> Vec3 Vec3::Normalized(bool useSSE4) const { return (*this) / Length(); } template<> Vec3 Vec3::NormalizedOr001(bool useSSE4) const { float len = Length(); if (len == 0.0f) { return Vec3(0.0f, 0.0f, 1.0f); } return *this / len; } #endif template<> float Vec3::Normalize() { float len = Length(); (*this) = (*this)/len; return len; } template<> float Vec3::NormalizeOr001() { float len = Length(); if (len == 0.0f) { z = 1.0f; } else { *this /= len; } return len; } template<> Vec3Packed Vec3Packed::FromRGB(unsigned int rgb) { return Vec3Packed((rgb & 0xFF) * (1.0f/255.0f), ((rgb >> 8) & 0xFF) * (1.0f/255.0f), ((rgb >> 16) & 0xFF) * (1.0f/255.0f)); } template<> Vec3Packed Vec3Packed::FromRGB(unsigned int rgb) { return Vec3Packed(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF); } template<> unsigned int Vec3Packed::ToRGB() const { return ((unsigned int)(r()*255.f)) + ((unsigned int)(g()*255.f*256.f)) + ((unsigned int)(b()*255.f*256.f*256.f)); } template<> unsigned int Vec3Packed::ToRGB() const { return (r()&0xFF) | ((g()&0xFF)<<8) | ((b()&0xFF)<<16); } template<> float Vec3Packed::Length() const { return sqrtf(Length2()); } template<> void Vec3Packed::SetLength(const float l) { (*this) *= l / Length(); } template<> Vec3Packed Vec3Packed::WithLength(const float l) const { return (*this) * l / Length(); } template<> float Vec3Packed::Distance2To(const Vec3Packed &other) const { return Vec3Packed(other-(*this)).Length2(); } template<> Vec3Packed Vec3Packed::Normalized() const { return (*this) / Length(); } template<> float Vec3Packed::Normalize() { float len = Length(); (*this) = (*this)/len; return len; } template<> float Vec4::Length() const { #if defined(_M_SSE) float ret; __m128 xyzw = _mm_loadu_ps(&x); __m128 sq = _mm_mul_ps(xyzw, xyzw); const __m128 r2 = _mm_add_ps(sq, _mm_movehl_ps(sq, sq)); const __m128 res = _mm_add_ss(r2, _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(0, 0, 0, 1))); _mm_store_ss(&ret, _mm_sqrt_ss(res)); return ret; #elif PPSSPP_ARCH(ARM64_NEON) float32x4_t sq = vmulq_f32(vec, vec); float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq)); float32x2_t add2 = vpadd_f32(add1, add1); float32x2_t res = vsqrt_f32(add2); return vget_lane_f32(res, 0); #else return sqrtf(Length2()); #endif } template<> void Vec4::SetLength(const float l) { (*this) *= l / Length(); } template<> Vec4 Vec4::WithLength(const float l) const { return (*this) * l / Length(); } template<> float Vec4::Distance2To(const Vec4 &other) const { return Vec4(other-(*this)).Length2(); } template<> Vec4 Vec4::Normalized() const { return (*this) / Length(); } template<> float Vec4::Normalize() { float len = Length(); (*this) = (*this)/len; return len; } }; // namespace Math3D