// Copyright (c) 2012- PPSSPP Project. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 2.0 or later versions. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License 2.0 for more details. // A copy of the GPL 2.0 should have been included with the program. // If not, see http://www.gnu.org/licenses/ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #pragma once #include "ppsspp_config.h" #include #include "Common/Common.h" #include "Core/Util/AudioFormat.h" // for clamp_u8 #include "Common/Math/fast/fast_matrix.h" #if defined(_M_SSE) #include #include #endif #if PPSSPP_ARCH(ARM_NEON) #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64) #include #else #include #endif #endif #if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)) #define MATH3D_CALL __vectorcall #else #define MATH3D_CALL #endif namespace Math3D { // Helper for Vec classes to clamp values. template inline static T VecClamp(const T &v, const T &low, const T &high) { if (v > high) return high; if (v < low) return low; return v; } template class Vec2 { public: struct { T x,y; }; T* AsArray() { return &x; } const T* AsArray() const { return &x; } Vec2() {} Vec2(const T a[2]) : x(a[0]), y(a[1]) {} Vec2(const T& _x, const T& _y) : x(_x), y(_y) {} template Vec2 Cast() const { return Vec2((T2)x, (T2)y); } static Vec2 AssignToAll(const T& f) { return Vec2(f, f); } void Write(T a[2]) { a[0] = x; a[1] = y; } Vec2 operator +(const Vec2& other) const { return Vec2(x+other.x, y+other.y); } void operator += (const Vec2 &other) { x+=other.x; y+=other.y; } Vec2 operator -(const Vec2& other) const { return Vec2(x-other.x, y-other.y); } void operator -= (const Vec2& other) { x-=other.x; y-=other.y; } Vec2 operator -() const { return Vec2(-x,-y); } Vec2 operator * (const Vec2& other) const { return Vec2(x*other.x, y*other.y); } template Vec2 operator * (const V& f) const { return Vec2(x*f,y*f); } template void operator *= (const V& f) { x*=f; y*=f; } template Vec2 operator / (const V& f) const { return Vec2(x/f,y/f); } template void operator /= (const V& f) { *this = *this / f; } T Length2() const { return x*x + y*y; } Vec2 Clamp(const T &l, const T &h) const { return Vec2(VecClamp(x, l, h), VecClamp(y, l, h)); } // Only implemented for T=float float Length() const; void SetLength(const float l); Vec2 WithLength(const float l) const; float Distance2To(const Vec2 &other) const; Vec2 Normalized() const; float Normalize(); // returns the previous length, which is often useful T& operator [] (int i) //allow vector[1] = 3 (vector.y=3) { return *((&x) + i); } T operator [] (const int i) const { return *((&x) + i); } void SetZero() { x=0; y=0; } // Common aliases: UV (texel coordinates), ST (texture coordinates) T& u() { return x; } T& v() { return y; } T& s() { return x; } T& t() { return y; } const T& u() const { return x; } const T& v() const { return y; } const T& s() const { return x; } const T& t() const { return y; } // swizzlers - create a subvector of specific components const Vec2 yx() const { return Vec2(y, x); } const Vec2 vu() const { return Vec2(y, x); } const Vec2 ts() const { return Vec2(y, x); } }; template class Vec3Packed; template class Vec3 { public: union { struct { T x,y,z; }; #if defined(_M_SSE) __m128i ivec; __m128 vec; #elif PPSSPP_ARCH(ARM64_NEON) int32x4_t ivec; float32x4_t vec; #endif }; T* AsArray() { return &x; } const T* AsArray() const { return &x; } Vec3() {} Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {} constexpr Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {} Vec3(const Vec2& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {} #if defined(_M_SSE) constexpr Vec3(const __m128 &_vec) : vec(_vec) {} constexpr Vec3(const __m128i &_ivec) : ivec(_ivec) {} Vec3(const Vec3Packed &_xyz) { vec = _mm_loadu_ps(_xyz.AsArray()); } #elif PPSSPP_ARCH(ARM64_NEON) Vec3(const float32x4_t &_vec) : vec(_vec) {} #if !defined(_MSC_VER) Vec3(const int32x4_t &_ivec) : ivec(_ivec) {} #endif Vec3(const Vec3Packed &_xyz) { vec = vld1q_f32(_xyz.AsArray()); } #else Vec3(const Vec3Packed &_xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {} #endif template constexpr Vec3 Cast() const { return Vec3((T2)x, (T2)y, (T2)z); } // Only implemented for T=int and T=float static Vec3 FromRGB(unsigned int rgb); unsigned int ToRGB() const; // alpha bits set to zero static constexpr Vec3 AssignToAll(const T& f) { return Vec3(f, f, f); } void Write(T a[3]) { a[0] = x; a[1] = y; a[2] = z; } Vec3 operator +(const Vec3 &other) const { return Vec3(x+other.x, y+other.y, z+other.z); } void operator += (const Vec3 &other) { x+=other.x; y+=other.y; z+=other.z; } Vec3 operator -(const Vec3 &other) const { return Vec3(x-other.x, y-other.y, z-other.z); } void operator -= (const Vec3 &other) { x-=other.x; y-=other.y; z-=other.z; } Vec3 operator -() const { return Vec3(-x,-y,-z); } Vec3 operator * (const Vec3 &other) const { return Vec3(x*other.x, y*other.y, z*other.z); } template Vec3 operator * (const V& f) const { return Vec3(x*f,y*f,z*f); } template void operator *= (const V& f) { x*=f; y*=f; z*=f; } template Vec3 operator / (const V& f) const { return Vec3(x/f,y/f,z/f); } template void operator /= (const V& f) { *this = *this / f; } bool operator ==(const Vec3 &other) const { return x == other.x && y == other.y && z == other.z; } T Length2() const { return x*x + y*y + z*z; } Vec3 Clamp(const T &l, const T &h) const { return Vec3(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h)); } // Only implemented for T=float float Length() const; void SetLength(const float l); Vec3 WithLength(const float l) const; float Distance2To(const Vec3 &other) const; Vec3 Normalized(bool useSSE4 = false) const; Vec3 NormalizedOr001(bool useSSE4 = false) const; float Normalize(); // returns the previous length, which is often useful float NormalizeOr001(); T& operator [] (int i) //allow vector[2] = 3 (vector.z=3) { return *((&x) + i); } T operator [] (const int i) const { return *((&x) + i); } void SetZero() { x=0; y=0; z=0; } // Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates) T& u() { return x; } T& v() { return y; } T& w() { return z; } T& r() { return x; } T& g() { return y; } T& b() { return z; } T& s() { return x; } T& t() { return y; } T& q() { return z; } const T& u() const { return x; } const T& v() const { return y; } const T& w() const { return z; } const T& r() const { return x; } const T& g() const { return y; } const T& b() const { return z; } const T& s() const { return x; } const T& t() const { return y; } const T& q() const { return z; } // swizzlers - create a subvector of specific components // e.g. Vec2 uv() { return Vec2(x,y); } // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx) #define _DEFINE_SWIZZLER2(a, b, name) const Vec2 name() const { return Vec2(a, b); } #define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \ _DEFINE_SWIZZLER2(a, b, a##b); \ _DEFINE_SWIZZLER2(a, b, a2##b2); \ _DEFINE_SWIZZLER2(a, b, a3##b3); \ _DEFINE_SWIZZLER2(a, b, a4##b4); \ _DEFINE_SWIZZLER2(b, a, b##a); \ _DEFINE_SWIZZLER2(b, a, b2##a2); \ _DEFINE_SWIZZLER2(b, a, b3##a3); \ _DEFINE_SWIZZLER2(b, a, b4##a4); DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t); DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q); DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q); #undef DEFINE_SWIZZLER2 #undef _DEFINE_SWIZZLER2 }; template class Vec3Packed { public: union { struct { T x,y,z; }; }; T* AsArray() { return &x; } const T* AsArray() const { return &x; } Vec3Packed() {} Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {} Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {} Vec3Packed(const Vec2& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {} Vec3Packed(const Vec3& _xyz) { memcpy(&x, _xyz.AsArray(), sizeof(float) * 3); } template Vec3Packed Cast() const { return Vec3Packed((T2)x, (T2)y, (T2)z); } // Only implemented for T=int and T=float static Vec3Packed FromRGB(unsigned int rgb); unsigned int ToRGB() const; // alpha bits set to zero static Vec3Packed AssignToAll(const T& f) { return Vec3Packed(f, f, f); } void Write(T a[3]) { a[0] = x; a[1] = y; a[2] = z; } Vec3Packed operator +(const Vec3Packed &other) const { return Vec3Packed(x+other.x, y+other.y, z+other.z); } void operator += (const Vec3Packed &other) { x+=other.x; y+=other.y; z+=other.z; } Vec3Packed operator -(const Vec3Packed &other) const { return Vec3Packed(x-other.x, y-other.y, z-other.z); } void operator -= (const Vec3Packed &other) { x-=other.x; y-=other.y; z-=other.z; } Vec3Packed operator -() const { return Vec3Packed(-x,-y,-z); } Vec3Packed operator * (const Vec3Packed &other) const { return Vec3Packed(x*other.x, y*other.y, z*other.z); } template Vec3Packed operator * (const V& f) const { return Vec3Packed(x*f,y*f,z*f); } template void operator *= (const V& f) { x*=f; y*=f; z*=f; } template Vec3Packed operator / (const V& f) const { return Vec3Packed(x/f,y/f,z/f); } template void operator /= (const V& f) { *this = *this / f; } T Length2() const { return x*x + y*y + z*z; } Vec3Packed Clamp(const T &l, const T &h) const { return Vec3Packed(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h)); } // Only implemented for T=float float Length() const; void SetLength(const float l); Vec3Packed WithLength(const float l) const; float Distance2To(const Vec3Packed &other) const; Vec3Packed Normalized() const; float Normalize(); // returns the previous length, which is often useful T& operator [] (int i) //allow vector[2] = 3 (vector.z=3) { return *((&x) + i); } T operator [] (const int i) const { return *((&x) + i); } void SetZero() { x=0; y=0; z=0; } // Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates) T& u() { return x; } T& v() { return y; } T& w() { return z; } T& r() { return x; } T& g() { return y; } T& b() { return z; } T& s() { return x; } T& t() { return y; } T& q() { return z; } const T& u() const { return x; } const T& v() const { return y; } const T& w() const { return z; } const T& r() const { return x; } const T& g() const { return y; } const T& b() const { return z; } const T& s() const { return x; } const T& t() const { return y; } const T& q() const { return z; } // swizzlers - create a subvector of specific components // e.g. Vec2 uv() { return Vec2(x,y); } // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx) #define _DEFINE_SWIZZLER2(a, b, name) const Vec2 name() const { return Vec2(a, b); } #define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \ _DEFINE_SWIZZLER2(a, b, a##b); \ _DEFINE_SWIZZLER2(a, b, a2##b2); \ _DEFINE_SWIZZLER2(a, b, a3##b3); \ _DEFINE_SWIZZLER2(a, b, a4##b4); \ _DEFINE_SWIZZLER2(b, a, b##a); \ _DEFINE_SWIZZLER2(b, a, b2##a2); \ _DEFINE_SWIZZLER2(b, a, b3##a3); \ _DEFINE_SWIZZLER2(b, a, b4##a4); DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t); DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q); DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q); #undef DEFINE_SWIZZLER2 #undef _DEFINE_SWIZZLER2 }; template class Vec4 { public: union { struct { T x,y,z,w; }; #if defined(_M_SSE) __m128i ivec; __m128 vec; #elif PPSSPP_ARCH(ARM64_NEON) int32x4_t ivec; float32x4_t vec; #endif }; T* AsArray() { return &x; } const T* AsArray() const { return &x; } Vec4() {} Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {} Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {} Vec4(const Vec2& _xy, const T& _z, const T& _w) : x(_xy.x), y(_xy.y), z(_z), w(_w) {} Vec4(const Vec3& _xyz, const T& _w) : x(_xyz.x), y(_xyz.y), z(_xyz.z), w(_w) {} #if defined(_M_SSE) Vec4(const __m128 &_vec) : vec(_vec) {} Vec4(const __m128i &_ivec) : ivec(_ivec) {} #elif PPSSPP_ARCH(ARM64_NEON) Vec4(const float32x4_t &_vec) : vec(_vec) {} #if !defined(_MSC_VER) Vec4(const int32x4_t &_ivec) : ivec(_ivec) {} #endif #endif template Vec4 Cast() const { return Vec4((T2)x, (T2)y, (T2)z, (T2)w); } // Only implemented for T=int and T=float static Vec4 FromRGBA(unsigned int rgba); static Vec4 FromRGBA(const u8 *rgba); unsigned int ToRGBA() const; void ToRGBA(u8 *rgba) const; static Vec4 AssignToAll(const T& f) { return Vec4(f, f, f, f); } void Write(T a[4]) { a[0] = x; a[1] = y; a[2] = z; a[3] = w; } Vec4 operator +(const Vec4& other) const { return Vec4(x+other.x, y+other.y, z+other.z, w+other.w); } void operator += (const Vec4& other) { x+=other.x; y+=other.y; z+=other.z; w+=other.w; } Vec4 operator -(const Vec4 &other) const { return Vec4(x-other.x, y-other.y, z-other.z, w-other.w); } void operator -= (const Vec4 &other) { x-=other.x; y-=other.y; z-=other.z; w-=other.w; } Vec4 operator -() const { return Vec4(-x,-y,-z,-w); } Vec4 operator * (const Vec4 &other) const { return Vec4(x*other.x, y*other.y, z*other.z, w*other.w); } Vec4 operator | (const Vec4 &other) const { return Vec4(x | other.x, y | other.y, z | other.z, w | other.w); } template Vec4 operator * (const V& f) const { return Vec4(x*f,y*f,z*f,w*f); } template void operator *= (const V& f) { x*=f; y*=f; z*=f; w*=f; } template Vec4 operator / (const V& f) const { return Vec4(x/f,y/f,z/f,w/f); } template void operator /= (const V& f) { *this = *this / f; } bool operator ==(const Vec4 &other) const { return x == other.x && y == other.y && z == other.z && w == other.w; } T Length2() const { return x*x + y*y + z*z + w*w; } Vec4 Clamp(const T &l, const T &h) const { return Vec4(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h), VecClamp(w, l, h)); } Vec4 Reciprocal() const { const T one = 1.0f; return Vec4(one / x, one / y, one / z, one / w); } // Only implemented for T=float float Length() const; void SetLength(const float l); Vec4 WithLength(const float l) const; float Distance2To(const Vec4 &other) const; Vec4 Normalized() const; float Normalize(); // returns the previous length, which is often useful T& operator [] (int i) //allow vector[2] = 3 (vector.z=3) { return *((&x) + i); } T operator [] (const int i) const { return *((&x) + i); } void SetZero() { x=0; y=0; z=0; w=0; } // Common alias: RGBA (colors) T& r() { return x; } T& g() { return y; } T& b() { return z; } T& a() { return w; } const T& r() const { return x; } const T& g() const { return y; } const T& b() const { return z; } const T& a() const { return w; } // swizzlers - create a subvector of specific components // e.g. Vec2 uv() { return Vec2(x,y); } // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx) #define _DEFINE_SWIZZLER2(a, b, name) const Vec2 name() const { return Vec2(a, b); } #define DEFINE_SWIZZLER2(a, b, a2, b2) \ _DEFINE_SWIZZLER2(a, b, a##b); \ _DEFINE_SWIZZLER2(a, b, a2##b2); \ _DEFINE_SWIZZLER2(b, a, b##a); \ _DEFINE_SWIZZLER2(b, a, b2##a2); DEFINE_SWIZZLER2(x, y, r, g); DEFINE_SWIZZLER2(x, z, r, b); DEFINE_SWIZZLER2(x, w, r, a); DEFINE_SWIZZLER2(y, z, g, b); DEFINE_SWIZZLER2(y, w, g, a); DEFINE_SWIZZLER2(z, w, b, a); #undef DEFINE_SWIZZLER2 #undef _DEFINE_SWIZZLER2 #define _DEFINE_SWIZZLER3(a, b, c, name) const Vec3 name() const { return Vec3(a, b, c); } #define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \ _DEFINE_SWIZZLER3(a, b, c, a##b##c); \ _DEFINE_SWIZZLER3(a, c, b, a##c##b); \ _DEFINE_SWIZZLER3(b, a, c, b##a##c); \ _DEFINE_SWIZZLER3(b, c, a, b##c##a); \ _DEFINE_SWIZZLER3(c, a, b, c##a##b); \ _DEFINE_SWIZZLER3(c, b, a, c##b##a); \ _DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \ _DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \ _DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \ _DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \ _DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \ _DEFINE_SWIZZLER3(c, b, a, c2##b2##a2); DEFINE_SWIZZLER3(x, y, z, r, g, b); DEFINE_SWIZZLER3(x, y, w, r, g, a); DEFINE_SWIZZLER3(x, z, w, r, b, a); DEFINE_SWIZZLER3(y, z, w, g, b, a); #undef DEFINE_SWIZZLER3 #undef _DEFINE_SWIZZLER3 }; template class Mat3x3 { public: // Convention: first three values = first column Mat3x3(const BaseType values[]) { for (unsigned int i = 0; i < 3*3; ++i) { this->values[i] = values[i]; } } Mat3x3(BaseType _00, BaseType _01, BaseType _02, BaseType _10, BaseType _11, BaseType _12, BaseType _20, BaseType _21, BaseType _22) { values[0] = _00; values[1] = _01; values[2] = _02; values[3] = _10; values[4] = _11; values[5] = _12; values[6] = _20; values[7] = _21; values[8] = _22; } template Vec3 operator * (const Vec3& vec) const { Vec3 ret; ret.x = values[0]*vec.x + values[3]*vec.y + values[6]*vec.z; ret.y = values[1]*vec.x + values[4]*vec.y + values[7]*vec.z; ret.z = values[2]*vec.x + values[5]*vec.y + values[8]*vec.z; return ret; } Mat3x3 Inverse() const { float a = values[0]; float b = values[1]; float c = values[2]; float d = values[3]; float e = values[4]; float f = values[5]; float g = values[6]; float h = values[7]; float i = values[8]; return Mat3x3(e*i-f*h, f*g-d*i, d*h-e*g, c*h-b*i, a*i-c*g, b*g-a*h, b*f-c*e, c*d-a*f, a*e-b*d) / Det(); } BaseType Det() const { return values[0]*values[4]*values[8] + values[3]*values[7]*values[2] + values[6]*values[1]*values[5] - values[2]*values[4]*values[6] - values[5]*values[7]*values[0] - values[8]*values[1]*values[3]; } Mat3x3 operator / (const BaseType& val) const { return Mat3x3(values[0]/val, values[1]/val, values[2]/val, values[3]/val, values[4]/val, values[5]/val, values[6]/val, values[7]/val, values[8]/val); } private: BaseType values[3*3]; }; template class Mat4x4 { public: // Convention: first four values in arrow = first column Mat4x4(const BaseType values[]) { for (unsigned int i = 0; i < 4*4; ++i) { this->values[i] = values[i]; } } template Vec4 operator * (const Vec4& vec) const { Vec4 ret; ret.x = values[0]*vec.x + values[4]*vec.y + values[8]*vec.z + values[12]*vec.w; ret.y = values[1]*vec.x + values[5]*vec.y + values[9]*vec.z + values[13]*vec.w; ret.z = values[2]*vec.x + values[6]*vec.y + values[10]*vec.z + values[14]*vec.w; ret.w = values[3]*vec.x + values[7]*vec.y + values[11]*vec.z + values[15]*vec.w; return ret; } private: BaseType values[4*4]; }; }; // namespace Math3D typedef Math3D::Vec2 Vec2f; typedef Math3D::Vec3 Vec3f; typedef Math3D::Vec3Packed Vec3Packedf; typedef Math3D::Vec4 Vec4f; #if defined(_M_SSE) template float MATH3D_CALL vectorGetByIndex(__m128 v) { // shuffle V so that the element that we want is moved to the bottom return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i))); } #endif #if defined(_M_SSE) // x, y, and z should be broadcast. Should only be used through Vec3f version. inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) { __m128 col0 = _mm_loadu_ps(m); __m128 col1 = _mm_loadu_ps(m + 3); __m128 col2 = _mm_loadu_ps(m + 6); __m128 col3 = _mm_loadu_ps(m + 9); __m128 sum = _mm_add_ps( _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), _mm_add_ps(_mm_mul_ps(col2, z), col3)); return sum; } #elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64) inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) { float32x4_t col0 = vld1q_f32(m); float32x4_t col1 = vld1q_f32(m + 3); float32x4_t col2 = vld1q_f32(m + 6); float32x4_t col3 = vld1q_f32(m + 9); float32x4_t sum = vaddq_f32( vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); return sum; } #endif // v and vecOut must point to different memory. inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) { #if defined(_M_SSE) __m128 x = _mm_set1_ps(v[0]); __m128 y = _mm_set1_ps(v[1]); __m128 z = _mm_set1_ps(v[2]); __m128 sum = Vec3ByMatrix43Internal(x, y, z, m); // Not sure what the best way to store 3 elements is. Ideally, we should // probably store all four. vecOut[0] = _mm_cvtss_f32(sum); vecOut[1] = vectorGetByIndex<1>(sum); vecOut[2] = vectorGetByIndex<2>(sum); #elif PPSSPP_ARCH(ARM64_NEON) float vecIn[4] = {v[0], v[1], v[2], 1.0f}; float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m); vecOut[0] = vgetq_lane_f32(sum, 0); vecOut[1] = vgetq_lane_f32(sum, 1); vecOut[2] = vgetq_lane_f32(sum, 2); #else vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9]; vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10]; vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11]; #endif } inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) { #if defined(_M_SSE) && PPSSPP_ARCH(64BIT) __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); return Vec3ByMatrix43Internal(x, y, z, m); #elif PPSSPP_ARCH(ARM64_NEON) return Vec3ByMatrix43Internal(v.vec, m); #else Vec3f vecOut; Vec3ByMatrix43(vecOut.AsArray(), v.AsArray(), m); return vecOut; #endif } #if defined(_M_SSE) // x, y, and z should be broadcast. Should only be used through Vec3f version. inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, const float m[16]) { __m128 col0 = _mm_loadu_ps(m); __m128 col1 = _mm_loadu_ps(m + 4); __m128 col2 = _mm_loadu_ps(m + 8); __m128 col3 = _mm_loadu_ps(m + 12); __m128 sum = _mm_add_ps( _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), _mm_add_ps(_mm_mul_ps(col2, z), col3)); return sum; } #elif PPSSPP_ARCH(ARM64_NEON) inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) { float32x4_t col0 = vld1q_f32(m); float32x4_t col1 = vld1q_f32(m + 4); float32x4_t col2 = vld1q_f32(m + 8); float32x4_t col3 = vld1q_f32(m + 12); float32x4_t sum = vaddq_f32( vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3)); return sum; } #endif inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) { #if defined(_M_SSE) __m128 x = _mm_set1_ps(v[0]); __m128 y = _mm_set1_ps(v[1]); __m128 z = _mm_set1_ps(v[2]); __m128 sum = Vec3ByMatrix44Internal(x, y, z, m); _mm_storeu_ps(vecOut, sum); #elif PPSSPP_ARCH(ARM64_NEON) float vecIn[4] = {v[0], v[1], v[2], 1.0f}; float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m); vst1q_f32(vecOut, sum); #else vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12]; vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13]; vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14]; vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15]; #endif } inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) { #if defined(_M_SSE) && PPSSPP_ARCH(64BIT) __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); return Vec3ByMatrix44Internal(x, y, z, m); #elif PPSSPP_ARCH(ARM64_NEON) return Vec3ByMatrix44Internal(v.vec, m); #else Vec4f vecOut; Vec3ByMatrix44(vecOut.AsArray(), v.AsArray(), m); return vecOut; #endif } #if defined(_M_SSE) // x, y, and z should be broadcast. Should only be used through Vec3f version. inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) { __m128 col0 = _mm_loadu_ps(m); __m128 col1 = _mm_loadu_ps(m + 3); __m128 col2 = _mm_loadu_ps(m + 6); __m128 sum = _mm_add_ps( _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)), _mm_mul_ps(col2, z)); return sum; } #elif PPSSPP_ARCH(ARM64_NEON) inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) { float32x4_t col0 = vld1q_f32(m); float32x4_t col1 = vld1q_f32(m + 3); float32x4_t col2 = vld1q_f32(m + 6); float32x4_t sum = vaddq_f32( vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)), vmulq_laneq_f32(col2, vec, 2)); return sum; } #endif inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) { #if defined(_M_SSE) __m128 x = _mm_set1_ps(v[0]); __m128 y = _mm_set1_ps(v[1]); __m128 z = _mm_set1_ps(v[2]); __m128 sum = Norm3ByMatrix43Internal(x, y, z, m); vecOut[0] = _mm_cvtss_f32(sum); vecOut[1] = vectorGetByIndex<1>(sum); vecOut[2] = vectorGetByIndex<2>(sum); #elif PPSSPP_ARCH(ARM64_NEON) float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m); vecOut[0] = vgetq_lane_f32(sum, 0); vecOut[1] = vgetq_lane_f32(sum, 1); vecOut[2] = vgetq_lane_f32(sum, 2); #else vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6]; vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7]; vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8]; #endif } inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) { #if defined(_M_SSE) && PPSSPP_ARCH(64BIT) __m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0)); __m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1)); __m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2)); return Norm3ByMatrix43Internal(x, y, z, m); #elif PPSSPP_ARCH(ARM64_NEON) return Norm3ByMatrix43Internal(v.vec, m); #else Vec3f vecOut; Norm3ByMatrix43(vecOut.AsArray(), v.AsArray(), m); return vecOut; #endif } inline void Matrix4ByMatrix4(float out[16], const float a[16], const float b[16]) { fast_matrix_mul_4x4(out, b, a); } inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) { m4x4[0] = m4x3[0]; m4x4[1] = m4x3[1]; m4x4[2] = m4x3[2]; m4x4[3] = 0.0f; m4x4[4] = m4x3[3]; m4x4[5] = m4x3[4]; m4x4[6] = m4x3[5]; m4x4[7] = 0.0f; m4x4[8] = m4x3[6]; m4x4[9] = m4x3[7]; m4x4[10] = m4x3[8]; m4x4[11] = 0.0f; m4x4[12] = m4x3[9]; m4x4[13] = m4x3[10]; m4x4[14] = m4x3[11]; m4x4[15] = 1.0f; } inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) { m4x4[0] = m4x3[0]; m4x4[1] = m4x3[3]; m4x4[2] = m4x3[6]; m4x4[3] = m4x3[9]; m4x4[4] = m4x3[1]; m4x4[5] = m4x3[4]; m4x4[6] = m4x3[7]; m4x4[7] = m4x3[10]; m4x4[8] = m4x3[2]; m4x4[9] = m4x3[5]; m4x4[10] = m4x3[8]; m4x4[11] = m4x3[11]; m4x4[12] = 0.0f; m4x4[13] = 0.0f; m4x4[14] = 0.0f; m4x4[15] = 1.0f; } // 0369 // 147A // 258B // ->>- // 0123 // 4567 // 89AB // Don't see a way to SIMD that. Should be pretty fast anyway. inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) { m4x4[0] = m4x3[0]; m4x4[1] = m4x3[3]; m4x4[2] = m4x3[6]; m4x4[3] = m4x3[9]; m4x4[4] = m4x3[1]; m4x4[5] = m4x3[4]; m4x4[6] = m4x3[7]; m4x4[7] = m4x3[10]; m4x4[8] = m4x3[2]; m4x4[9] = m4x3[5]; m4x4[10] = m4x3[8]; m4x4[11] = m4x3[11]; } inline void Transpose4x4(float out[16], const float in[16]) { for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { out[i * 4 + j] = in[j * 4 + i]; } } } inline float Vec3Dot(const float v1[3], const float v2[3]) { return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]; } namespace Math3D { template inline T Dot(const Vec2& a, const Vec2& b) { return a.x*b.x + a.y*b.y; } template inline T Dot(const Vec3& a, const Vec3& b) { return a.x*b.x + a.y*b.y + a.z*b.z; } template inline T Dot(const Vec4& a, const Vec4& b) { return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; } template inline Vec3 Cross(const Vec3& a, const Vec3& b) { return Vec3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x); } template inline Vec3Packed Cross(const Vec3Packed& a, const Vec3Packed& b) { return Vec3Packed(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x); } template<> inline Vec3 Vec3::FromRGB(unsigned int rgb) { #if defined(_M_SSE) __m128i z = _mm_setzero_si128(); __m128i c = _mm_cvtsi32_si128(rgb); c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z); return Vec3(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f))); #elif PPSSPP_ARCH(ARM64_NEON) uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb)); uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c))); return Vec3(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f))); #else return Vec3((rgb & 0xFF) * (1.0f/255.0f), ((rgb >> 8) & 0xFF) * (1.0f/255.0f), ((rgb >> 16) & 0xFF) * (1.0f/255.0f)); #endif } template<> inline Vec3 Vec3::FromRGB(unsigned int rgb) { #if defined(_M_SSE) __m128i z = _mm_setzero_si128(); __m128i c = _mm_cvtsi32_si128(rgb); c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z); return Vec3(c); #elif PPSSPP_ARCH(ARM64_NEON) uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb)); uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c))); return Vec3(vreinterpretq_s32_u32(u)); #else return Vec3(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF); #endif } template<> __forceinline unsigned int Vec3::ToRGB() const { #if defined(_M_SSE) #if PPSSPP_ARCH(64BIT) __m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f))); #else __m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps((float *)&vec), _mm_set_ps1(255.0f))); #endif __m128i c16 = _mm_packs_epi32(c, c); return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF; #elif PPSSPP_ARCH(ARM64_NEON) uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f)))); uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16)); return vget_lane_u32(vreinterpret_u32_u8(c8), 0); #else return (clamp_u8((int)(r() * 255.f)) << 0) | (clamp_u8((int)(g() * 255.f)) << 8) | (clamp_u8((int)(b() * 255.f)) << 16); #endif } template<> __forceinline unsigned int Vec3::ToRGB() const { #if defined(_M_SSE) #if PPSSPP_ARCH(64BIT) __m128i c16 = _mm_packs_epi32(ivec, ivec); #else __m128i c16 = _mm_packs_epi32(_mm_loadu_si128(&ivec), _mm_setzero_si128()); #endif return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF; #elif PPSSPP_ARCH(ARM64_NEON) uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3)); uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16)); return vget_lane_u32(vreinterpret_u32_u8(c8), 0); #else return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16); #endif } template<> inline Vec4 Vec4::FromRGBA(unsigned int rgba) { #if defined(_M_SSE) __m128i z = _mm_setzero_si128(); __m128i c = _mm_cvtsi32_si128(rgba); c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z); return Vec4(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f))); #elif PPSSPP_ARCH(ARM64_NEON) uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba)); uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c))); return Vec4(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f))); #else return Vec4((rgba & 0xFF) * (1.0f/255.0f), ((rgba >> 8) & 0xFF) * (1.0f/255.0f), ((rgba >> 16) & 0xFF) * (1.0f/255.0f), ((rgba >> 24) & 0xFF) * (1.0f/255.0f)); #endif } template inline Vec4 Vec4::FromRGBA(const u8 *rgba) { return Vec4::FromRGBA(*(unsigned int *)rgba); } template<> inline Vec4 Vec4::FromRGBA(unsigned int rgba) { #if defined(_M_SSE) __m128i z = _mm_setzero_si128(); __m128i c = _mm_cvtsi32_si128(rgba); c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z); return Vec4(c); #elif PPSSPP_ARCH(ARM64_NEON) uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba)); uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c))); return Vec4(vreinterpretq_s32_u32(u)); #else return Vec4(rgba & 0xFF, (rgba >> 8) & 0xFF, (rgba >> 16) & 0xFF, (rgba >> 24) & 0xFF); #endif } template<> __forceinline unsigned int Vec4::ToRGBA() const { #if defined(_M_SSE) #if PPSSPP_ARCH(64BIT) __m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f))); #else __m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps((float *)&vec), _mm_set_ps1(255.0f))); #endif __m128i c16 = _mm_packs_epi32(c, c); return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)); #elif PPSSPP_ARCH(ARM64_NEON) uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f)))); uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16)); return vget_lane_u32(vreinterpret_u32_u8(c8), 0); #else return (clamp_u8((int)(r() * 255.f)) << 0) | (clamp_u8((int)(g() * 255.f)) << 8) | (clamp_u8((int)(b() * 255.f)) << 16) | (clamp_u8((int)(a() * 255.f)) << 24); #endif } template<> __forceinline unsigned int Vec4::ToRGBA() const { #if defined(_M_SSE) #if PPSSPP_ARCH(64BIT) __m128i c16 = _mm_packs_epi32(ivec, ivec); #else __m128i c16 = _mm_packs_epi32(_mm_loadu_si128(&ivec), _mm_setzero_si128()); #endif return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)); #elif PPSSPP_ARCH(ARM64_NEON) uint16x4_t c16 = vqmovun_s32(ivec); uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16)); return vget_lane_u32(vreinterpret_u32_u8(c8), 0); #else return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16) | (clamp_u8(a()) << 24); #endif } template __forceinline void Vec4::ToRGBA(u8 *rgba) const { *(u32 *)rgba = ToRGBA(); } #if defined(_M_SSE) // Specialized for SIMD optimization // Vec3 operation template<> inline void Vec3::operator += (const Vec3 &other) { vec = _mm_add_ps(vec, other.vec); } template<> inline Vec3 Vec3::operator + (const Vec3 &other) const { return Vec3(_mm_add_ps(vec, other.vec)); } template<> inline Vec3 Vec3::operator * (const Vec3 &other) const { return Vec3(_mm_mul_ps(vec, other.vec)); } template<> template<> inline Vec3 Vec3::operator * (const float &other) const { return Vec3(_mm_mul_ps(vec, _mm_set_ps1(other))); } // Vec4 operation template<> inline void Vec4::operator += (const Vec4 &other) { vec = _mm_add_ps(vec, other.vec); } template<> inline Vec4 Vec4::operator + (const Vec4 &other) const { return Vec4(_mm_add_ps(vec, other.vec)); } template<> inline Vec4 Vec4::operator * (const Vec4 &other) const { return Vec4(_mm_mul_ps(vec, other.vec)); } template<> template<> inline Vec4 Vec4::operator * (const float &other) const { return Vec4(_mm_mul_ps(vec, _mm_set_ps1(other))); } // Vec3 cross product template<> inline Vec3 Cross(const Vec3 &a, const Vec3 &b) { const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 1, 0, 2))); const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 0, 2, 1))); return _mm_sub_ps(left, right); } #endif }; // namespace Math3D // linear interpolation via float: 0.0=begin, 1.0=end template inline X Lerp(const X& begin, const X& end, const float t) { return begin*(1.f-t) + end*t; } // linear interpolation via int: 0=begin, base=end template inline X LerpInt(const X& begin, const X& end, const int t) { return (begin*(base-t) + end*t) / base; }