// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#pragma once

#include "ppsspp_config.h"
#include <cmath>
#include <cstring>      // for memcpy (used by Vec3Packed)
#include <type_traits>  // for std::is_same (used by Vec4::Cast)

#include "Common/Common.h"
#include "Core/Util/AudioFormat.h" // for clamp_u8
#include "Common/Math/fast/fast_matrix.h"

#if defined(_M_SSE)
#include <emmintrin.h>
#include <smmintrin.h>
#endif

#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
#define MATH3D_CALL __vectorcall
#else
#define MATH3D_CALL
#endif

// There's probably a better place to define these macros.
#if PPSSPP_ARCH(X86)
// On 32-bit x86, MSVC does not guarantee alignment for
// SSE arguments passed on stack (Compiler Error C2719), see e.g.:
// https://stackoverflow.com/questions/10484422/msvc-cannot-send-function-parameters-of-16byte-alignment-on-x86
// https://stackoverflow.com/questions/28488986/formal-parameter-with-declspecalign16-wont-be-aligned
// So, as a workaround, "dangerous" cases are loaded via loadu* on 32-bit x86.
// Compilers are decently ok at eliminating these extra loads, at least
// in trivial cases.
// NOTE: not to be outdone, GCC has its own flavor of broken, see e.g.:
// http://www.peterstock.co.uk/games/mingw_sse/
// https://github.com/nothings/stb/issues/81
// which is probably worse since it breaks alignment of locals and/or
// spills, but that, hopefully, does not affect PPSSPP (modern GCC+Linux
// is 16-byte aligned on x86, and MinGW is not a supported PPSSPP target).
// NOTE: weird double-casts add a bit of type-safety.
#define SAFE_M128(v) _mm_loadu_ps (reinterpret_cast<const float*> (static_cast<const __m128*> (&(v))))
#define SAFE_M128I(v) _mm_loadu_si128(reinterpret_cast<const __m128i*>(static_cast<const __m128i*>(&(v))))
#else // x64, FWIW also works for non-x86.
#define SAFE_M128(v) (v)
#define SAFE_M128I(v) (v)
#endif

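// Illustrative note (not in the original header): SAFE_M128/SAFE_M128I simply wrap a
// possibly-unaligned union member before handing it to an SSE intrinsic, e.g.
//   __m128 sum = _mm_add_ps(SAFE_M128(a.vec), SAFE_M128(b.vec));
// On anything but 32-bit x86 the macros are just the value itself.
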
namespace Math3D {

// Helper for Vec classes to clamp values.
template<typename T>
inline static T VecClamp(const T &v, const T &low, const T &high)
{
	if (v > high)
		return high;
	if (v < low)
		return low;
	return v;
}

template<typename T>
class Vec2 {
public:
	struct {
		T x,y;
	};

	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	Vec2() {}
	Vec2(const T a[2]) : x(a[0]), y(a[1]) {}
	Vec2(const T& _x, const T& _y) : x(_x), y(_y) {}

	template<typename T2>
	Vec2<T2> Cast() const
	{
		return Vec2<T2>((T2)x, (T2)y);
	}

	static Vec2 AssignToAll(const T& f)
	{
		return Vec2<T>(f, f);
	}

	void Write(T a[2])
	{
		a[0] = x; a[1] = y;
	}

	Vec2 operator +(const Vec2& other) const
	{
		return Vec2(x+other.x, y+other.y);
	}
	void operator += (const Vec2 &other)
	{
		x+=other.x; y+=other.y;
	}
	Vec2 operator -(const Vec2& other) const
	{
		return Vec2(x-other.x, y-other.y);
	}
	void operator -= (const Vec2& other)
	{
		x-=other.x; y-=other.y;
	}
	Vec2 operator -() const
	{
		return Vec2(-x,-y);
	}
	Vec2 operator * (const Vec2& other) const
	{
		return Vec2(x*other.x, y*other.y);
	}
	template<typename V>
	Vec2 operator * (const V& f) const
	{
		return Vec2(x*f,y*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f;
	}
	template<typename V>
	Vec2 operator / (const V& f) const
	{
		return Vec2(x/f,y/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	T Length2() const
	{
		return x*x + y*y;
	}

	Vec2 Clamp(const T &l, const T &h) const
	{
		return Vec2(VecClamp(x, l, h), VecClamp(y, l, h));
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec2 WithLength(const float l) const;
	float Distance2To(const Vec2 &other) const;
	Vec2 Normalized() const;
	float Normalize(); // returns the previous length, which is often useful

	T& operator [] (int i) //allow vector[1] = 3 (vector.y=3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x=0; y=0;
	}

	// Common aliases: UV (texel coordinates), ST (texture coordinates)
	T& u() { return x; }
	T& v() { return y; }
	T& s() { return x; }
	T& t() { return y; }

	const T& u() const { return x; }
	const T& v() const { return y; }
	const T& s() const { return x; }
	const T& t() const { return y; }

	// swizzlers - create a subvector of specific components
	const Vec2 yx() const { return Vec2(y, x); }
	const Vec2 vu() const { return Vec2(y, x); }
	const Vec2 ts() const { return Vec2(y, x); }
};

template<typename T>
class Vec3Packed;

template<typename T>
class Vec3
{
public:
	union
	{
		struct
		{
			T x,y,z;
		};
#if defined(_M_SSE)
		__m128i ivec;
		__m128 vec;
#elif PPSSPP_ARCH(ARM_NEON)
		int32x4_t ivec;
		float32x4_t vec;
#endif
	};

	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	Vec3() {}
	Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
	constexpr Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
	Vec3(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
#if defined(_M_SSE)
	constexpr Vec3(const __m128 &_vec) : vec(_vec) {}
	constexpr Vec3(const __m128i &_ivec) : ivec(_ivec) {}
	Vec3(const Vec3Packed<T> &_xyz) {
		vec = _mm_loadu_ps(_xyz.AsArray());
	}
#elif PPSSPP_ARCH(ARM_NEON)
	Vec3(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
	Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
#endif
	Vec3(const Vec3Packed<T> &_xyz) {
		vec = vld1q_f32(_xyz.AsArray());
	}
#else
	Vec3(const Vec3Packed<T> &_xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {}
#endif

	template<typename T2>
	constexpr Vec3<T2> Cast() const
	{
		return Vec3<T2>((T2)x, (T2)y, (T2)z);
	}

	// Only implemented for T=int and T=float
	static Vec3 FromRGB(unsigned int rgb);
	unsigned int ToRGB() const; // alpha bits set to zero

	static constexpr Vec3 AssignToAll(const T& f)
	{
		return Vec3<T>(f, f, f);
	}

	void Write(T a[3])
	{
		a[0] = x; a[1] = y; a[2] = z;
	}

	Vec3 operator +(const Vec3 &other) const
	{
		return Vec3(x+other.x, y+other.y, z+other.z);
	}
	void operator += (const Vec3 &other)
	{
		x+=other.x; y+=other.y; z+=other.z;
	}
	Vec3 operator -(const Vec3 &other) const
	{
		return Vec3(x-other.x, y-other.y, z-other.z);
	}
	void operator -= (const Vec3 &other)
	{
		x-=other.x; y-=other.y; z-=other.z;
	}
	Vec3 operator -() const
	{
		return Vec3(-x,-y,-z);
	}
	Vec3 operator * (const Vec3 &other) const
	{
		return Vec3(x*other.x, y*other.y, z*other.z);
	}
	template<typename V>
	Vec3 operator * (const V& f) const
	{
		return Vec3(x*f,y*f,z*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f; z*=f;
	}
	template<typename V>
	Vec3 operator / (const V& f) const
	{
		return Vec3(x/f,y/f,z/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	bool operator ==(const Vec3 &other) const {
		return x == other.x && y == other.y && z == other.z;
	}

	T Length2() const
	{
		return x*x + y*y + z*z;
	}

	Vec3 Clamp(const T &l, const T &h) const
	{
		return Vec3(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec3 WithLength(const float l) const;
	float Distance2To(const Vec3 &other) const;
	Vec3 Normalized(bool useSSE4 = false) const;
	Vec3 NormalizedOr001(bool useSSE4 = false) const;
	float Normalize(); // returns the previous length, which is often useful
	float NormalizeOr001();

	T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x=0; y=0; z=0;
	}

	// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
	T& u() { return x; }
	T& v() { return y; }
	T& w() { return z; }

	T& r() { return x; }
	T& g() { return y; }
	T& b() { return z; }

	T& s() { return x; }
	T& t() { return y; }
	T& q() { return z; }

	const T& u() const { return x; }
	const T& v() const { return y; }
	const T& w() const { return z; }

	const T& r() const { return x; }
	const T& g() const { return y; }
	const T& b() const { return z; }

	const T& s() const { return x; }
	const T& t() const { return y; }
	const T& q() const { return z; }

	// swizzlers - create a subvector of specific components
	// e.g. Vec2 uv() { return Vec2(x,y); }
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
	_DEFINE_SWIZZLER2(a, b, a##b); \
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
	_DEFINE_SWIZZLER2(a, b, a3##b3); \
	_DEFINE_SWIZZLER2(a, b, a4##b4); \
	_DEFINE_SWIZZLER2(b, a, b##a); \
	_DEFINE_SWIZZLER2(b, a, b2##a2); \
	_DEFINE_SWIZZLER2(b, a, b3##a3); \
	_DEFINE_SWIZZLER2(b, a, b4##a4);

	DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
	DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
	DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
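	// Illustrative note (not in the original header): the first invocation above,
	// DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t), expands to the accessors
	// xy(), rg(), uv(), st() plus the reversed yx(), gr(), vu(), ts().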
#undef DEFINE_SWIZZLER2
#undef _DEFINE_SWIZZLER2
};

template<typename T>
class Vec3Packed
{
public:
	union
	{
		struct
		{
			T x,y,z;
		};
	};

	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	Vec3Packed() {}
	Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
	Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
	Vec3Packed(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
	Vec3Packed(const Vec3<T>& _xyz) {
		memcpy(&x, _xyz.AsArray(), sizeof(float) * 3);
	}

	template<typename T2>
	Vec3Packed<T2> Cast() const
	{
		return Vec3Packed<T2>((T2)x, (T2)y, (T2)z);
	}

	// Only implemented for T=int and T=float
	static Vec3Packed FromRGB(unsigned int rgb);
	unsigned int ToRGB() const; // alpha bits set to zero

	static Vec3Packed AssignToAll(const T& f)
	{
		return Vec3Packed<T>(f, f, f);
	}

	void Write(T a[3])
	{
		a[0] = x; a[1] = y; a[2] = z;
	}

	Vec3Packed operator +(const Vec3Packed &other) const
	{
		return Vec3Packed(x+other.x, y+other.y, z+other.z);
	}
	void operator += (const Vec3Packed &other)
	{
		x+=other.x; y+=other.y; z+=other.z;
	}
	Vec3Packed operator -(const Vec3Packed &other) const
	{
		return Vec3Packed(x-other.x, y-other.y, z-other.z);
	}
	void operator -= (const Vec3Packed &other)
	{
		x-=other.x; y-=other.y; z-=other.z;
	}
	Vec3Packed operator -() const
	{
		return Vec3Packed(-x,-y,-z);
	}
	Vec3Packed operator * (const Vec3Packed &other) const
	{
		return Vec3Packed(x*other.x, y*other.y, z*other.z);
	}
	template<typename V>
	Vec3Packed operator * (const V& f) const
	{
		return Vec3Packed(x*f,y*f,z*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f; z*=f;
	}
	template<typename V>
	Vec3Packed operator / (const V& f) const
	{
		return Vec3Packed(x/f,y/f,z/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	T Length2() const
	{
		return x*x + y*y + z*z;
	}

	Vec3Packed Clamp(const T &l, const T &h) const
	{
		return Vec3Packed(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec3Packed WithLength(const float l) const;
	float Distance2To(const Vec3Packed &other) const;
	Vec3Packed Normalized() const;
	float Normalize(); // returns the previous length, which is often useful

	T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x=0; y=0; z=0;
	}

	// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
	T& u() { return x; }
	T& v() { return y; }
	T& w() { return z; }

	T& r() { return x; }
	T& g() { return y; }
	T& b() { return z; }

	T& s() { return x; }
	T& t() { return y; }
	T& q() { return z; }

	const T& u() const { return x; }
	const T& v() const { return y; }
	const T& w() const { return z; }

	const T& r() const { return x; }
	const T& g() const { return y; }
	const T& b() const { return z; }

	const T& s() const { return x; }
	const T& t() const { return y; }
	const T& q() const { return z; }

	// swizzlers - create a subvector of specific components
	// e.g. Vec2 uv() { return Vec2(x,y); }
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
	_DEFINE_SWIZZLER2(a, b, a##b); \
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
	_DEFINE_SWIZZLER2(a, b, a3##b3); \
	_DEFINE_SWIZZLER2(a, b, a4##b4); \
	_DEFINE_SWIZZLER2(b, a, b##a); \
	_DEFINE_SWIZZLER2(b, a, b2##a2); \
	_DEFINE_SWIZZLER2(b, a, b3##a3); \
	_DEFINE_SWIZZLER2(b, a, b4##a4);

	DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
	DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
	DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
#undef DEFINE_SWIZZLER2
#undef _DEFINE_SWIZZLER2
};

template<typename T>
class Vec4
{
public:
	union
	{
		struct
		{
			T x,y,z,w;
		};
#if defined(_M_SSE)
		__m128i ivec;
		__m128 vec;
#elif PPSSPP_ARCH(ARM_NEON)
		int32x4_t ivec;
		float32x4_t vec;
#endif
	};

	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	Vec4() {}
	Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}
	Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {}
	Vec4(const Vec2<T>& _xy, const T& _z, const T& _w) : x(_xy.x), y(_xy.y), z(_z), w(_w) {}
	Vec4(const Vec3<T>& _xyz, const T& _w) : x(_xyz.x), y(_xyz.y), z(_xyz.z), w(_w) {}
#if defined(_M_SSE)
	Vec4(const __m128 &_vec) : vec(_vec) {}
	Vec4(const __m128i &_ivec) : ivec(_ivec) {}
#elif PPSSPP_ARCH(ARM_NEON)
	Vec4(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
	Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
#endif
#endif

	template<typename T2>
	Vec4<T2> Cast() const {
		if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
#if defined(_M_SSE)
			return _mm_cvtps_epi32(SAFE_M128(vec));
#elif PPSSPP_ARCH(ARM_NEON)
			return vcvtq_s32_f32(vec);
#endif
		}
		if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
#if defined(_M_SSE)
			return _mm_cvtepi32_ps(SAFE_M128I(ivec));
#elif PPSSPP_ARCH(ARM_NEON)
			return vcvtq_f32_s32(ivec);
#endif
		}
		return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);
	}

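	// Added note (not in the original header): the float->int paths above do not all
	// round the same way: _mm_cvtps_epi32 uses the current SSE rounding mode
	// (round-to-nearest by default), while vcvtq_s32_f32 and the scalar (T2)x
	// fallback truncate toward zero.
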
	// Only implemented for T=int and T=float
	static Vec4 FromRGBA(unsigned int rgba);
	static Vec4 FromRGBA(const u8 *rgba);
	unsigned int ToRGBA() const;
	void ToRGBA(u8 *rgba) const;

	static Vec4 AssignToAll(const T& f)
	{
		return Vec4<T>(f, f, f, f);
	}

	void Write(T a[4])
	{
		a[0] = x; a[1] = y; a[2] = z; a[3] = w;
	}

	Vec4 operator +(const Vec4& other) const
	{
		return Vec4(x+other.x, y+other.y, z+other.z, w+other.w);
	}
	void operator += (const Vec4& other)
	{
		x+=other.x; y+=other.y; z+=other.z; w+=other.w;
	}
	Vec4 operator -(const Vec4 &other) const
	{
		return Vec4(x-other.x, y-other.y, z-other.z, w-other.w);
	}
	void operator -= (const Vec4 &other)
	{
		x-=other.x; y-=other.y; z-=other.z; w-=other.w;
	}
	Vec4 operator -() const
	{
		return Vec4(-x,-y,-z,-w);
	}
	Vec4 operator * (const Vec4 &other) const
	{
		return Vec4(x*other.x, y*other.y, z*other.z, w*other.w);
	}
	Vec4 operator | (const Vec4 &other) const
	{
		return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);
	}
	Vec4 operator & (const Vec4 &other) const
	{
		return Vec4(x & other.x, y & other.y, z & other.z, w & other.w);
	}
	Vec4 operator << (const int amount) const
	{
		// NOTE: x*(1<<amount), etc., might be safer, since
		// left-shifting negatives is UB pre-C++20.
		return Vec4(x << amount, y << amount, z << amount, w << amount);
	}
	Vec4 operator >> (const int amount) const
	{
		return Vec4(x >> amount, y >> amount, z >> amount, w >> amount);
	}
	template<typename V>
	Vec4 operator * (const V& f) const
	{
		return Vec4(x*f,y*f,z*f,w*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f; z*=f; w*=f;
	}
	template<typename V>
	Vec4 operator / (const V& f) const
	{
		return Vec4(x/f,y/f,z/f,w/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	bool operator ==(const Vec4 &other) const {
		return x == other.x && y == other.y && z == other.z && w == other.w;
	}

	T Length2() const
	{
		return x*x + y*y + z*z + w*w;
	}

	Vec4 Clamp(const T &l, const T &h) const
	{
		return Vec4(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h), VecClamp(w, l, h));
	}

	Vec4 Reciprocal() const
	{
		const T one = 1.0f;
		return Vec4(one / x, one / y, one / z, one / w);
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec4 WithLength(const float l) const;
	float Distance2To(const Vec4 &other) const;
	Vec4 Normalized() const;
	float Normalize(); // returns the previous length, which is often useful

	T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x=0; y=0; z=0; w=0;
	}

	// Common alias: RGBA (colors)
	T& r() { return x; }
	T& g() { return y; }
	T& b() { return z; }
	T& a() { return w; }

	const T& r() const { return x; }
	const T& g() const { return y; }
	const T& b() const { return z; }
	const T& a() const { return w; }

	// swizzlers - create a subvector of specific components
	// e.g. Vec2 uv() { return Vec2(x,y); }
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
#define DEFINE_SWIZZLER2(a, b, a2, b2) \
	_DEFINE_SWIZZLER2(a, b, a##b); \
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
	_DEFINE_SWIZZLER2(b, a, b##a); \
	_DEFINE_SWIZZLER2(b, a, b2##a2);

	DEFINE_SWIZZLER2(x, y, r, g);
	DEFINE_SWIZZLER2(x, z, r, b);
	DEFINE_SWIZZLER2(x, w, r, a);
	DEFINE_SWIZZLER2(y, z, g, b);
	DEFINE_SWIZZLER2(y, w, g, a);
	DEFINE_SWIZZLER2(z, w, b, a);
#undef DEFINE_SWIZZLER2
#undef _DEFINE_SWIZZLER2

#define _DEFINE_SWIZZLER3(a, b, c, name) const Vec3<T> name() const { return Vec3<T>(a, b, c); }
#define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \
	_DEFINE_SWIZZLER3(a, b, c, a##b##c); \
	_DEFINE_SWIZZLER3(a, c, b, a##c##b); \
	_DEFINE_SWIZZLER3(b, a, c, b##a##c); \
	_DEFINE_SWIZZLER3(b, c, a, b##c##a); \
	_DEFINE_SWIZZLER3(c, a, b, c##a##b); \
	_DEFINE_SWIZZLER3(c, b, a, c##b##a); \
	_DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \
	_DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \
	_DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \
	_DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \
	_DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \
	_DEFINE_SWIZZLER3(c, b, a, c2##b2##a2);

	DEFINE_SWIZZLER3(x, y, z, r, g, b);
	DEFINE_SWIZZLER3(x, y, w, r, g, a);
	DEFINE_SWIZZLER3(x, z, w, r, b, a);
	DEFINE_SWIZZLER3(y, z, w, g, b, a);
#undef DEFINE_SWIZZLER3
#undef _DEFINE_SWIZZLER3
};


template<typename BaseType>
class Mat3x3
{
public:
	// Convention: first three values = first column
	Mat3x3(const BaseType values[])
	{
		for (unsigned int i = 0; i < 3*3; ++i)
		{
			this->values[i] = values[i];
		}
	}

	Mat3x3(BaseType _00, BaseType _01, BaseType _02, BaseType _10, BaseType _11, BaseType _12, BaseType _20, BaseType _21, BaseType _22)
	{
		values[0] = _00;
		values[1] = _01;
		values[2] = _02;
		values[3] = _10;
		values[4] = _11;
		values[5] = _12;
		values[6] = _20;
		values[7] = _21;
		values[8] = _22;
	}

	template<typename T>
	Vec3<T> operator * (const Vec3<T>& vec) const
	{
		Vec3<T> ret;
		ret.x = values[0]*vec.x + values[3]*vec.y + values[6]*vec.z;
		ret.y = values[1]*vec.x + values[4]*vec.y + values[7]*vec.z;
		ret.z = values[2]*vec.x + values[5]*vec.y + values[8]*vec.z;
		return ret;
	}

	Mat3x3 Inverse() const
	{
		float a = values[0];
		float b = values[1];
		float c = values[2];
		float d = values[3];
		float e = values[4];
		float f = values[5];
		float g = values[6];
		float h = values[7];
		float i = values[8];
		return Mat3x3(e*i-f*h, f*g-d*i, d*h-e*g,
		              c*h-b*i, a*i-c*g, b*g-a*h,
		              b*f-c*e, c*d-a*f, a*e-b*d) / Det();
	}

	BaseType Det() const
	{
		return values[0]*values[4]*values[8] + values[3]*values[7]*values[2] +
		       values[6]*values[1]*values[5] - values[2]*values[4]*values[6] -
		       values[5]*values[7]*values[0] - values[8]*values[1]*values[3];
	}

	Mat3x3 operator / (const BaseType& val) const
	{
		return Mat3x3(values[0]/val, values[1]/val, values[2]/val,
		              values[3]/val, values[4]/val, values[5]/val,
		              values[6]/val, values[7]/val, values[8]/val);
	}

private:
	BaseType values[3*3];
};


template<typename BaseType>
class Mat4x4
{
public:
	// Convention: first four values in array = first column
	Mat4x4(const BaseType values[])
	{
		for (unsigned int i = 0; i < 4*4; ++i)
		{
			this->values[i] = values[i];
		}
	}

	template<typename T>
	Vec4<T> operator * (const Vec4<T>& vec) const
	{
		Vec4<T> ret;
		ret.x = values[0]*vec.x + values[4]*vec.y + values[8]*vec.z + values[12]*vec.w;
		ret.y = values[1]*vec.x + values[5]*vec.y + values[9]*vec.z + values[13]*vec.w;
		ret.z = values[2]*vec.x + values[6]*vec.y + values[10]*vec.z + values[14]*vec.w;
		ret.w = values[3]*vec.x + values[7]*vec.y + values[11]*vec.z + values[15]*vec.w;
		return ret;
	}

private:
	BaseType values[4*4];
};

}; // namespace Math3D

typedef Math3D::Vec2<float> Vec2f;
typedef Math3D::Vec3<float> Vec3f;
typedef Math3D::Vec3Packed<float> Vec3Packedf;
typedef Math3D::Vec4<float> Vec4f;

#if defined(_M_SSE)
template<unsigned i>
float MATH3D_CALL vectorGetByIndex(__m128 v) {
	// shuffle V so that the element that we want is moved to the bottom
	return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)));
}
#endif

#if defined(_M_SSE)
// x, y, and z should be broadcast. Should only be used through Vec3f version.
inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
	__m128 col0 = _mm_loadu_ps(m);
	__m128 col1 = _mm_loadu_ps(m + 3);
	__m128 col2 = _mm_loadu_ps(m + 6);
	__m128 col3 = _mm_loadu_ps(m + 9);
	__m128 sum = _mm_add_ps(
		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
		_mm_add_ps(_mm_mul_ps(col2, z), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	float32x4_t col3 = vld1q_f32(m + 9);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	float32x4_t col3 = vld1q_f32(m + 9);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
		vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
	return sum;
}
#endif

// v and vecOut must point to different memory.
inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
#if defined(_M_SSE)
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Vec3ByMatrix43Internal(x, y, z, m);
	// Not sure what the best way to store 3 elements is. Ideally, we should
	// probably store all four.
	vecOut[0] = _mm_cvtss_f32(sum);
	vecOut[1] = vectorGetByIndex<1>(sum);
	vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON)
	float vecIn[4] = {v[0], v[1], v[2], 1.0f};
	float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
	vecOut[0] = vgetq_lane_f32(sum, 0);
	vecOut[1] = vgetq_lane_f32(sum, 1);
	vecOut[2] = vgetq_lane_f32(sum, 2);
#else
	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9];
	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10];
	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11];
#endif
}

inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
	const __m128 vv = SAFE_M128(v.vec);
	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
	return Vec3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Vec3ByMatrix43Internal(v.vec, m);
#else
	Vec3f vecOut;
	Vec3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);
	return vecOut;
#endif
}

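// Illustrative usage (not from the original header): transform a position by a
// column-major 4x3 matrix, whose translation sits in the last three floats, e.g.
//   float worldPos[3];
//   Vec3ByMatrix43(worldPos, localPos, worldMatrix);           // array form
//   Vec3f worldPosV = Vec3ByMatrix43(localPosV, worldMatrix);  // Vec3f form
// localPos/localPosV/worldMatrix are caller-provided names used only for illustration.
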
#if defined(_M_SSE)
// x, y, and z should be broadcast. Should only be used through Vec3f version.
inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, const float m[16]) {
	__m128 col0 = _mm_loadu_ps(m);
	__m128 col1 = _mm_loadu_ps(m + 4);
	__m128 col2 = _mm_loadu_ps(m + 8);
	__m128 col3 = _mm_loadu_ps(m + 12);
	__m128 sum = _mm_add_ps(
		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
		_mm_add_ps(_mm_mul_ps(col2, z), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 4);
	float32x4_t col2 = vld1q_f32(m + 8);
	float32x4_t col3 = vld1q_f32(m + 12);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
		vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
	return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 4);
	float32x4_t col2 = vld1q_f32(m + 8);
	float32x4_t col3 = vld1q_f32(m + 12);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
		vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3));
	return sum;
}
#endif

inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
#if defined(_M_SSE)
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
	_mm_storeu_ps(vecOut, sum);
#elif PPSSPP_ARCH(ARM_NEON)
	float vecIn[4] = {v[0], v[1], v[2], 1.0f};
	float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
	vst1q_f32(vecOut, sum);
#else
	vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];
	vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13];
	vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14];
	vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15];
#endif
}

inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
#if defined(_M_SSE)
	const __m128 vv = SAFE_M128(v.vec);
	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
	return Vec3ByMatrix44Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Vec3ByMatrix44Internal(v.vec, m);
#else
	Vec4f vecOut;
	Vec3ByMatrix44(vecOut.AsArray(), v.AsArray(), m);
	return vecOut;
#endif
}

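// Added note (not in the original header): Vec3ByMatrix44 treats the input as
// (x, y, z, 1) and returns all four components, so it can be used to take a point
// into clip space, e.g.
//   float clip[4];
//   Vec3ByMatrix44(clip, viewPos, projMatrix);  // viewPos/projMatrix: caller names
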
#if defined(_M_SSE)
// x, y, and z should be broadcast. Should only be used through Vec3f version.
inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
	__m128 col0 = _mm_loadu_ps(m);
	__m128 col1 = _mm_loadu_ps(m + 3);
	__m128 col2 = _mm_loadu_ps(m + 6);
	__m128 sum = _mm_add_ps(
		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
		_mm_mul_ps(col2, z));
	return sum;
}
#elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
		vmulq_laneq_f32(col2, vec, 2));
	return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
		vmulq_lane_f32(col2, vget_high_f32(vec), 0));
	return sum;
}
#endif

inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
#if defined(_M_SSE)
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Norm3ByMatrix43Internal(x, y, z, m);
	vecOut[0] = _mm_cvtss_f32(sum);
	vecOut[1] = vectorGetByIndex<1>(sum);
	vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON)
	float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
	vecOut[0] = vgetq_lane_f32(sum, 0);
	vecOut[1] = vgetq_lane_f32(sum, 1);
	vecOut[2] = vgetq_lane_f32(sum, 2);
#else
	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];
	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7];
	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];
#endif
}

inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
	const __m128 vv = SAFE_M128(v.vec);
	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
	return Norm3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Norm3ByMatrix43Internal(v.vec, m);
#else
	Vec3f vecOut;
	Norm3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);
	return vecOut;
#endif
}

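// Added note (not in the original header): Norm3ByMatrix43 applies only the 3x3 part
// of the matrix (no translation column), which is what you want for normals and
// directions. Also note that the NEON path of the array form loads v with vld1q_f32,
// i.e. it reads four floats, so there should be at least one readable float past v[2].
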
inline void Matrix4ByMatrix4(float out[16], const float a[16], const float b[16]) {
	fast_matrix_mul_4x4(out, b, a);
}

inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
	m4x4[0] = m4x3[0];
	m4x4[1] = m4x3[1];
	m4x4[2] = m4x3[2];
	m4x4[3] = 0.0f;
	m4x4[4] = m4x3[3];
	m4x4[5] = m4x3[4];
	m4x4[6] = m4x3[5];
	m4x4[7] = 0.0f;
	m4x4[8] = m4x3[6];
	m4x4[9] = m4x3[7];
	m4x4[10] = m4x3[8];
	m4x4[11] = 0.0f;
	m4x4[12] = m4x3[9];
	m4x4[13] = m4x3[10];
	m4x4[14] = m4x3[11];
	m4x4[15] = 1.0f;
}

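// Added illustration (not in the original header): with the column-major convention
// used in this header, the conversion above pads each 3-float column with a 0 and
// appends 1 after the translation column:
//   m4x4 column k (k = 0..2) = { m4x3[3k], m4x3[3k+1], m4x3[3k+2], 0 }
//   m4x4 column 3            = { m4x3[9],  m4x3[10],   m4x3[11],   1 }
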
inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
	// vld3q is a perfect match here!
	float32x4x3_t packed = vld3q_f32(m4x3);
	vst1q_f32(m4x4, packed.val[0]);
	vst1q_f32(m4x4 + 4, packed.val[1]);
	vst1q_f32(m4x4 + 8, packed.val[2]);
#else
	m4x4[0] = m4x3[0];
	m4x4[1] = m4x3[3];
	m4x4[2] = m4x3[6];
	m4x4[3] = m4x3[9];
	m4x4[4] = m4x3[1];
	m4x4[5] = m4x3[4];
	m4x4[6] = m4x3[7];
	m4x4[7] = m4x3[10];
	m4x4[8] = m4x3[2];
	m4x4[9] = m4x3[5];
	m4x4[10] = m4x3[8];
	m4x4[11] = m4x3[11];
#endif
	m4x4[12] = 0.0f;
	m4x4[13] = 0.0f;
	m4x4[14] = 0.0f;
	m4x4[15] = 1.0f;
}

// 0369
// 147A
// 258B
//  ->
// 0123
// 4567
// 89AB
// No obvious way to SIMD this on x86; on NEON, vld3q handles it (see below).
// Should be pretty fast anyway.
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
	// vld3q is a perfect match here!
	float32x4x3_t packed = vld3q_f32(m4x3);
	vst1q_f32(m4x4, packed.val[0]);
	vst1q_f32(m4x4 + 4, packed.val[1]);
	vst1q_f32(m4x4 + 8, packed.val[2]);
#else
	m4x4[0] = m4x3[0];
	m4x4[1] = m4x3[3];
	m4x4[2] = m4x3[6];
	m4x4[3] = m4x3[9];
	m4x4[4] = m4x3[1];
	m4x4[5] = m4x3[4];
	m4x4[6] = m4x3[7];
	m4x4[7] = m4x3[10];
	m4x4[8] = m4x3[2];
	m4x4[9] = m4x3[5];
	m4x4[10] = m4x3[8];
	m4x4[11] = m4x3[11];
#endif
}

inline void Transpose4x4(float out[16], const float in[16]) {
	for (int i = 0; i < 4; i++) {
		for (int j = 0; j < 4; j++) {
			out[i * 4 + j] = in[j * 4 + i];
		}
	}
}

namespace Math3D {

template<typename T>
inline T Dot(const Vec2<T>& a, const Vec2<T>& b)
{
	return a.x*b.x + a.y*b.y;
}

template<typename T>
inline T Dot(const Vec3<T>& a, const Vec3<T>& b)
{
	return a.x*b.x + a.y*b.y + a.z*b.z;
}

template<typename T>
inline T Dot(const Vec4<T>& a, const Vec4<T>& b)
{
	return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
}

template<typename T>
inline Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b)
{
	return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
}

template<typename T>
inline Vec3Packed<T> Cross(const Vec3Packed<T>& a, const Vec3Packed<T>& b)
{
	return Vec3Packed<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
}

template<>
inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)
{
#if defined(_M_SSE)
	__m128i z = _mm_setzero_si128();
	__m128i c = _mm_cvtsi32_si128(rgb);
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
	return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
#elif PPSSPP_ARCH(ARM_NEON)
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
	return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
#else
	return Vec3((rgb & 0xFF) * (1.0f/255.0f),
		((rgb >> 8) & 0xFF) * (1.0f/255.0f),
		((rgb >> 16) & 0xFF) * (1.0f/255.0f));
#endif
}

template<>
inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
{
#if defined(_M_SSE)
	__m128i z = _mm_setzero_si128();
	__m128i c = _mm_cvtsi32_si128(rgb);
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
	return Vec3<int>(c);
#elif PPSSPP_ARCH(ARM_NEON)
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
	return Vec3<int>(vreinterpretq_s32_u32(u));
#else
	return Vec3(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF);
#endif
}

template<>
__forceinline unsigned int Vec3<float>::ToRGB() const
{
#if defined(_M_SSE)
	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
	__m128i c16 = _mm_packs_epi32(c, c);
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
#elif PPSSPP_ARCH(ARM_NEON)
	uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
#else
	return (clamp_u8((int)(r() * 255.f)) << 0) |
		(clamp_u8((int)(g() * 255.f)) << 8) |
		(clamp_u8((int)(b() * 255.f)) << 16);
#endif
}

template<>
__forceinline unsigned int Vec3<int>::ToRGB() const
{
#if defined(_M_SSE)
	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
#elif PPSSPP_ARCH(ARM_NEON)
	uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
#else
	return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16);
#endif
}

template<>
inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
{
#if defined(_M_SSE)
	__m128i z = _mm_setzero_si128();
	__m128i c = _mm_cvtsi32_si128(rgba);
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
	return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
#elif PPSSPP_ARCH(ARM_NEON)
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
	return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
#else
	return Vec4((rgba & 0xFF) * (1.0f/255.0f),
		((rgba >> 8) & 0xFF) * (1.0f/255.0f),
		((rgba >> 16) & 0xFF) * (1.0f/255.0f),
		((rgba >> 24) & 0xFF) * (1.0f/255.0f));
#endif
}

template<typename T>
inline Vec4<T> Vec4<T>::FromRGBA(const u8 *rgba)
{
	return Vec4<T>::FromRGBA(*(unsigned int *)rgba);
}

template<>
inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
{
#if defined(_M_SSE)
	__m128i z = _mm_setzero_si128();
	__m128i c = _mm_cvtsi32_si128(rgba);
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
	return Vec4<int>(c);
#elif PPSSPP_ARCH(ARM_NEON)
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
	return Vec4<int>(vreinterpretq_s32_u32(u));
#else
	return Vec4(rgba & 0xFF, (rgba >> 8) & 0xFF, (rgba >> 16) & 0xFF, (rgba >> 24) & 0xFF);
#endif
}

template<>
__forceinline unsigned int Vec4<float>::ToRGBA() const
{
#if defined(_M_SSE)
	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
	__m128i c16 = _mm_packs_epi32(c, c);
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
#elif PPSSPP_ARCH(ARM_NEON)
	uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
#else
	return (clamp_u8((int)(r() * 255.f)) << 0) |
		(clamp_u8((int)(g() * 255.f)) << 8) |
		(clamp_u8((int)(b() * 255.f)) << 16) |
		(clamp_u8((int)(a() * 255.f)) << 24);
#endif
}

template<>
__forceinline unsigned int Vec4<int>::ToRGBA() const
{
#if defined(_M_SSE)
	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
#elif PPSSPP_ARCH(ARM_NEON)
	uint16x4_t c16 = vqmovun_s32(ivec);
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
#else
	return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16) | (clamp_u8(a()) << 24);
#endif
}

template<typename T>
__forceinline void Vec4<T>::ToRGBA(u8 *rgba) const
{
	*(u32 *)rgba = ToRGBA();
}

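// Added example (not in the original header): the packed color format used by
// FromRGBA/ToRGBA keeps R in the lowest byte and A in the highest, e.g.
//   Vec4<int> c = Vec4<int>::FromRGBA(0x80FF4020);
//   // c.r() == 0x20, c.g() == 0x40, c.b() == 0xFF, c.a() == 0x80
//   unsigned int packed = c.ToRGBA();  // packs back to 0x80FF4020
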
#if defined(_M_SSE)
// Specialized for SIMD optimization

// Vec3<float> operation
template<>
inline void Vec3<float>::operator += (const Vec3<float> &other) {
	vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
}

template<>
inline Vec3<float> Vec3<float>::operator + (const Vec3 &other) const {
	return Vec3<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<>
inline void Vec3<float>::operator -= (const Vec3<float> &other) {
	vec = _mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec));
}

template<>
inline Vec3<float> Vec3<float>::operator - (const Vec3 &other) const {
	return Vec3<float>(_mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<>
inline Vec3<float> Vec3<float>::operator * (const Vec3 &other) const {
	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<> template<>
inline Vec3<float> Vec3<float>::operator * (const float &other) const {
	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
}

// Vec4<int> operation
template<>
inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {
	return Vec4<int>(_mm_add_epi32(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
}

template<>
inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
	__m128i a = SAFE_M128I(ivec);
	__m128i b = SAFE_M128I(other.ivec);
	// Intel in its immense wisdom decided that
	// SSE2 does not get _mm_mullo_epi32(),
	// so we do it this way. This is what clang does,
	// which seems about as good as it gets.
	__m128i m02 = _mm_mul_epu32(a, b);
	__m128i m13 = _mm_mul_epu32(
		_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
		_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
	__m128i ret = _mm_unpacklo_epi32(
		_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 2, 0)),
		_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 2, 0)));
	return Vec4<int>(ret);
}

template<> template<>
inline Vec4<int> Vec4<int>::operator * (const int &other) const {
	return (*this) * Vec4<int>(_mm_set1_epi32(other));
}

template<>
inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {
	return Vec4<int>(_mm_or_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
}

template<>
inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {
	return Vec4<int>(_mm_and_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
}

// NOTE: modern GCC, clang, and MSVC are all ok with
// non-compile-time-const amount for _mm_slli_epi32/_mm_srli_epi32.
template<>
inline Vec4<int> Vec4<int>::operator << (const int amount) const {
	return Vec4<int>(_mm_slli_epi32(SAFE_M128I(ivec), amount));
}

template<>
inline Vec4<int> Vec4<int>::operator >> (const int amount) const {
	return Vec4<int>(_mm_srli_epi32(SAFE_M128I(ivec), amount));
}

// Vec4<float> operation
template<>
inline void Vec4<float>::operator += (const Vec4<float> &other) {
	vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
}

template<>
inline Vec4<float> Vec4<float>::operator + (const Vec4 &other) const {
	return Vec4<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<>
inline Vec4<float> Vec4<float>::operator * (const Vec4 &other) const {
	return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
}

template<> template<>
inline Vec4<float> Vec4<float>::operator * (const float &other) const {
	return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
}

// Vec3<float> cross product
template<>
inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)
{
#if PPSSPP_ARCH(X86)
	__m128 avec = _mm_loadu_ps(&a.x);
	__m128 bvec = _mm_loadu_ps(&b.x);
#else
	__m128 avec = a.vec;
	__m128 bvec = b.vec;
#endif
	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 1, 0, 2)));
	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 0, 2, 1)));
	return _mm_sub_ps(left, right);
}
#endif

}; // namespace Math3D

// linear interpolation via float: 0.0=begin, 1.0=end
template<typename X>
inline X Lerp(const X& begin, const X& end, const float t)
{
	return begin*(1.f-t) + end*t;
}

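// Illustrative usage (not from the original header):
//   Vec3f mid = Lerp(posA, posB, 0.5f);  // halfway between posA and posB (caller names)
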
// linear interpolation via int: 0=begin, base=end
template<typename X, int base>
inline X LerpInt(const X& begin, const X& end, const int t)
{
	return (begin*(base-t) + end*t) / base;
}
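
// Illustrative usage (not from the original header): an integer lerp with base = 16
// avoids float math, with t running from 0 (begin) to 16 (end), e.g.
//   Vec4<int> c = LerpInt<Vec4<int>, 16>(colorA, colorB, frac);  // caller-provided names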