Just add a packed version of Vec3f.

This way we can have it aligned to memory where needed.  I think it'd be
better to avoid this if possible so that we can actually vectorize
spline/etc. code.

Fixes #5673.
This commit is contained in:
Unknown W. Brackets 2014-03-17 06:58:50 -07:00
parent 38d0bac1df
commit 6630e45eff
4 changed files with 262 additions and 26 deletions

View File

@ -24,6 +24,6 @@
struct SimpleVertex {
float uv[2];
u8 color[4];
Vec3f nrm;
Vec3f pos;
Vec3Packedf nrm;
Vec3Packedf pos;
};

View File

@ -80,16 +80,16 @@ u32 TransformDrawEngine::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inP
float weights[8];
reader.ReadWeights(weights);
// Skinning
Vec3f psum(0,0,0);
Vec3f nsum(0,0,0);
Vec3Packedf psum(0,0,0);
Vec3Packedf nsum(0,0,0);
for (int w = 0; w < numBoneWeights; w++) {
if (weights[w] != 0.0f) {
Vec3ByMatrix43(bpos, pos, gstate.boneMatrix+w*12);
Vec3f tpos(bpos);
Vec3Packedf tpos(bpos);
psum += tpos * weights[w];
Norm3ByMatrix43(bnrm, nrm, gstate.boneMatrix+w*12);
Vec3f tnorm(bnrm);
Vec3Packedf tnorm(bnrm);
nsum += tnorm * weights[w];
}
}
@ -288,11 +288,11 @@ inline float bern2deriv(float x) { return 3 * (2 - 3 * x) * x; }
inline float bern3deriv(float x) { return 3 * x * x; }
// http://en.wikipedia.org/wiki/Bernstein_polynomial
Vec3f Bernstein3D(const Vec3f p0, const Vec3f p1, const Vec3f p2, const Vec3f p3, float x) {
Vec3Packedf Bernstein3D(const Vec3Packedf p0, const Vec3Packedf p1, const Vec3Packedf p2, const Vec3Packedf p3, float x) {
return p0 * bern0(x) + p1 * bern1(x) + p2 * bern2(x) + p3 * bern3(x);
}
Vec3f Bernstein3DDerivative(const Vec3f p0, const Vec3f p1, const Vec3f p2, const Vec3f p3, float x) {
Vec3Packedf Bernstein3DDerivative(const Vec3Packedf p0, const Vec3Packedf p1, const Vec3Packedf p2, const Vec3Packedf p3, float x) {
return p0 * bern0deriv(x) + p1 * bern1deriv(x) + p2 * bern2deriv(x) + p3 * bern3deriv(x);
}
@ -379,7 +379,7 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
// Generate normal if lighting is enabled (otherwise there's no point).
// This is a really poor quality algorithm, we get facet normals.
if (gstate.isLightingEnabled()) {
Vec3f norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos);
Vec3Packedf norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos);
norm.Normalize();
if (gstate.patchfacing & 1)
norm *= -1.0f;
@ -503,8 +503,8 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
int r = std::min(patch_div_s, u + 1);
int b = std::min(patch_div_t, v + 1);
const Vec3f &right = vertices[v * (patch_div_s + 1) + r].pos - vertices[v * (patch_div_s + 1) + l].pos;
const Vec3f &down = vertices[b * (patch_div_s + 1) + u].pos - vertices[t * (patch_div_s + 1) + u].pos;
const Vec3Packedf &right = vertices[v * (patch_div_s + 1) + r].pos - vertices[v * (patch_div_s + 1) + l].pos;
const Vec3Packedf &down = vertices[b * (patch_div_s + 1) + u].pos - vertices[t * (patch_div_s + 1) + u].pos;
vertices[v * (patch_div_s + 1) + u].nrm = Cross(right, down).Normalized();
if (gstate.patchfacing & 1) {
@ -570,7 +570,7 @@ void TesselateBezierPatch(u8 *&dest, int &count, int tess_u, int tess_v, const B
// Generate normal if lighting is enabled (otherwise there's no point).
// This is a really poor quality algorithm, we get facet normals.
if (gstate.isLightingEnabled()) {
Vec3f norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos);
Vec3Packedf norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos);
norm.Normalize();
if (gstate.patchfacing & 1)
norm *= -1.0f;
@ -591,10 +591,10 @@ void TesselateBezierPatch(u8 *&dest, int &count, int tess_u, int tess_v, const B
// First compute all the vertices and put them in an array
SimpleVertex *vertices = new SimpleVertex[(tess_u + 1) * (tess_v + 1)];
Vec3f *horiz = new Vec3f[(tess_u + 1) * 4];
Vec3f *horiz2 = horiz + (tess_u + 1) * 1;
Vec3f *horiz3 = horiz + (tess_u + 1) * 2;
Vec3f *horiz4 = horiz + (tess_u + 1) * 3;
Vec3Packedf *horiz = new Vec3Packedf[(tess_u + 1) * 4];
Vec3Packedf *horiz2 = horiz + (tess_u + 1) * 1;
Vec3Packedf *horiz3 = horiz + (tess_u + 1) * 2;
Vec3Packedf *horiz4 = horiz + (tess_u + 1) * 3;
// Precompute the horizontal curves so we only have to evaluate the vertical ones.
for (int i = 0; i < tess_u + 1; i++) {
@ -615,20 +615,20 @@ void TesselateBezierPatch(u8 *&dest, int &count, int tess_u, int tess_v, const B
float bv = v;
// TODO: Should be able to precompute the four curves per U, then just Bernstein per V. Will benefit large tesselation factors.
const Vec3f &pos1 = horiz[tile_u];
const Vec3f &pos2 = horiz2[tile_u];
const Vec3f &pos3 = horiz3[tile_u];
const Vec3f &pos4 = horiz4[tile_u];
const Vec3Packedf &pos1 = horiz[tile_u];
const Vec3Packedf &pos2 = horiz2[tile_u];
const Vec3Packedf &pos3 = horiz3[tile_u];
const Vec3Packedf &pos4 = horiz4[tile_u];
SimpleVertex &vert = vertices[tile_v * (tess_u + 1) + tile_u];
if (computeNormals) {
Vec3f derivU1 = Bernstein3DDerivative(patch.points[0]->pos, patch.points[1]->pos, patch.points[2]->pos, patch.points[3]->pos, bu);
Vec3f derivU2 = Bernstein3DDerivative(patch.points[4]->pos, patch.points[5]->pos, patch.points[6]->pos, patch.points[7]->pos, bu);
Vec3f derivU3 = Bernstein3DDerivative(patch.points[8]->pos, patch.points[9]->pos, patch.points[10]->pos, patch.points[11]->pos, bu);
Vec3f derivU4 = Bernstein3DDerivative(patch.points[12]->pos, patch.points[13]->pos, patch.points[14]->pos, patch.points[15]->pos, bu);
Vec3f derivU = Bernstein3D(derivU1, derivU2, derivU3, derivU4, bv);
Vec3f derivV = Bernstein3DDerivative(pos1, pos2, pos3, pos4, bv);
Vec3Packedf derivU1 = Bernstein3DDerivative(patch.points[0]->pos, patch.points[1]->pos, patch.points[2]->pos, patch.points[3]->pos, bu);
Vec3Packedf derivU2 = Bernstein3DDerivative(patch.points[4]->pos, patch.points[5]->pos, patch.points[6]->pos, patch.points[7]->pos, bu);
Vec3Packedf derivU3 = Bernstein3DDerivative(patch.points[8]->pos, patch.points[9]->pos, patch.points[10]->pos, patch.points[11]->pos, bu);
Vec3Packedf derivU4 = Bernstein3DDerivative(patch.points[12]->pos, patch.points[13]->pos, patch.points[14]->pos, patch.points[15]->pos, bu);
Vec3Packedf derivU = Bernstein3D(derivU1, derivU2, derivU3, derivU4, bv);
Vec3Packedf derivV = Bernstein3DDerivative(pos1, pos2, pos3, pos4, bv);
// TODO: Interpolate normals instead of generating them, if available?
vert.nrm = Cross(derivU, derivV).Normalized();

View File

@ -167,6 +167,72 @@ float Vec3<float>::Normalize()
return len;
}
// Unpacks the low 24 bits of rgb into floating-point channels: bits 0-7 go to
// x (red), bits 8-15 to y (green) and bits 16-23 to z (blue), each scaled
// from 0..255 down to 0..1.
template<>
Vec3Packed<float> Vec3Packed<float>::FromRGB(unsigned int rgb)
{
	const float byteToUnit = 1.0f/255.0f;
	const unsigned int red = rgb & 0xFF;
	const unsigned int green = (rgb >> 8) & 0xFF;
	const unsigned int blue = (rgb >> 16) & 0xFF;
	return Vec3Packed(red * byteToUnit, green * byteToUnit, blue * byteToUnit);
}
// Unpacks the low 24 bits of rgb into integer channels in the range 0..255:
// bits 0-7 go to x (red), bits 8-15 to y (green), bits 16-23 to z (blue).
template<>
Vec3Packed<int> Vec3Packed<int>::FromRGB(unsigned int rgb)
{
	const unsigned int red = rgb & 0xFF;
	const unsigned int green = (rgb >> 8) & 0xFF;
	const unsigned int blue = (rgb >> 16) & 0xFF;
	return Vec3Packed(red, green, blue);
}
// Packs the vector into a 24-bit color: red (x) in bits 0-7, green (y) in
// bits 8-15, blue (z) in bits 16-23.  The upper bits are left at zero.
//
// Each channel is scaled from 0..1 to 0..255, truncated and clamped to a
// single byte on its own before being shifted into place.  Scaling green and
// blue by 256 / 65536 before truncating (as was done previously) let their
// fractional bits leak into the byte below, and out-of-range input could
// overflow into neighbouring channels.
template<>
unsigned int Vec3Packed<float>::ToRGB() const
{
	const float channels[3] = { r(), g(), b() };
	unsigned int packed = 0;
	for (int i = 0; i < 3; i++) {
		const float scaled = channels[i] * 255.f;
		unsigned int value;
		if (!(scaled > 0.0f))
			value = 0;	// Negative, zero or NaN: casting those to unsigned is not safe.
		else if (scaled >= 255.0f)
			value = 255;
		else
			value = (unsigned int)scaled;
		packed |= value << (8 * i);
	}
	return packed;
}
// Packs the low byte of each component into a 24-bit color: red (x) in
// bits 0-7, green (y) in bits 8-15, blue (z) in bits 16-23.
template<>
unsigned int Vec3Packed<int>::ToRGB() const
{
	const int red = r() & 0xFF;
	const int green = (g() & 0xFF) << 8;
	const int blue = (b() & 0xFF) << 16;
	return red | green | blue;
}
// Euclidean length: the square root of Length2().
template<>
float Vec3Packed<float>::Length() const
{
	const float squared = Length2();
	return sqrtf(squared);
}
// Rescales the vector in place by l / Length().
// There is no guard for a zero-length vector (the division is performed as-is).
template<>
void Vec3Packed<float>::SetLength(const float l)
{
	const float factor = l / Length();
	x *= factor;
	y *= factor;
	z *= factor;
}
// Returns a copy multiplied by l and then divided by the current Length().
// There is no guard for a zero-length vector (the division is performed as-is).
template<>
Vec3Packed<float> Vec3Packed<float>::WithLength(const float l) const
{
	const Vec3Packed<float> scaled = (*this) * l;
	return scaled / Length();
}
// Squared distance between this vector and other (Length2() of the difference).
template<>
float Vec3Packed<float>::Distance2To(Vec3Packed<float> &other)
{
	const Vec3Packed<float> delta = other - (*this);
	return delta.Length2();
}
// Returns a copy with every component divided by Length().
// There is no guard for a zero-length vector (the division is performed as-is).
template<>
Vec3Packed<float> Vec3Packed<float>::Normalized() const
{
	const float len = Length();
	return Vec3Packed<float>(x / len, y / len, z / len);
}
// Divides the vector in place by its length and returns the length it had
// before the division.  There is no guard for a zero-length vector.
template<>
float Vec3Packed<float>::Normalize()
{
	const float previousLength = Length();
	(*this) /= previousLength;
	return previousLength;
}
template<>
Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
{

View File

@ -347,6 +347,169 @@ public:
#undef _DEFINE_SWIZZLER2
};
// Three-component vector that stores nothing but its x, y and z members, for
// use where the in-memory layout of the vector matters (e.g. as a field of a
// vertex struct).  Intended as a drop-in counterpart of Vec3.
template<typename T>
class Vec3Packed
{
public:
	// Plain members instead of an anonymous struct wrapped in a union:
	// anonymous structs are a compiler extension in C++, and nothing else
	// shares this storage.
	T x, y, z;

	// Pointer to the first component (x).
	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	// Leaves the components uninitialized.
	Vec3Packed() {}
	Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
	Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
	Vec3Packed(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}

	// Converts each component to T2 with a C-style cast.
	template<typename T2>
	Vec3Packed<T2> Cast() const
	{
		return Vec3Packed<T2>((T2)x, (T2)y, (T2)z);
	}

	// Only implemented for T=int and T=float
	static Vec3Packed FromRGB(unsigned int rgb);
	unsigned int ToRGB() const; // alpha bits set to zero

	// Vector with all three components set to f.
	static Vec3Packed AssignToAll(const T& f)
	{
		return Vec3Packed<T>(f, f, f);
	}

	// Copies x, y, z into a[0..2].  Const, since it only reads the vector.
	void Write(T a[3]) const
	{
		a[0] = x; a[1] = y; a[2] = z;
	}

	// Component-wise arithmetic.
	Vec3Packed operator +(const Vec3Packed &other) const
	{
		return Vec3Packed(x+other.x, y+other.y, z+other.z);
	}
	void operator += (const Vec3Packed &other)
	{
		x+=other.x; y+=other.y; z+=other.z;
	}
	Vec3Packed operator -(const Vec3Packed &other) const
	{
		return Vec3Packed(x-other.x, y-other.y, z-other.z);
	}
	void operator -= (const Vec3Packed &other)
	{
		x-=other.x; y-=other.y; z-=other.z;
	}
	Vec3Packed operator -() const
	{
		return Vec3Packed(-x,-y,-z);
	}
	// Component-wise product (not a dot or cross product).
	Vec3Packed operator * (const Vec3Packed &other) const
	{
		return Vec3Packed(x*other.x, y*other.y, z*other.z);
	}

	// Scaling by a scalar of any type V that can multiply/divide a T.
	template<typename V>
	Vec3Packed operator * (const V& f) const
	{
		return Vec3Packed(x*f,y*f,z*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f; z*=f;
	}
	template<typename V>
	Vec3Packed operator / (const V& f) const
	{
		return Vec3Packed(x/f,y/f,z/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	// Squared length: x*x + y*y + z*z.
	T Length2() const
	{
		return x*x + y*y + z*z;
	}

	// Copy with each component passed through VecClamp(component, l, h).
	Vec3Packed Clamp(const T &l, const T &h) const
	{
		return Vec3Packed(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
	}

	// Only implemented for T=float
	// (These signatures must stay in sync with the out-of-line specializations.)
	float Length() const;
	void SetLength(const float l);
	Vec3Packed WithLength(const float l) const;
	float Distance2To(Vec3Packed &other);
	Vec3Packed Normalized() const;
	float Normalize(); // returns the previous length, which is often useful

	// Indexed access relative to x; no bounds checking is performed.
	T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x=0; y=0; z=0;
	}

	// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
	T& u() { return x; }
	T& v() { return y; }
	T& w() { return z; }

	T& r() { return x; }
	T& g() { return y; }
	T& b() { return z; }

	T& s() { return x; }
	T& t() { return y; }
	T& q() { return z; }

	const T& u() const { return x; }
	const T& v() const { return y; }
	const T& w() const { return z; }

	const T& r() const { return x; }
	const T& g() const { return y; }
	const T& b() const { return z; }

	const T& s() const { return x; }
	const T& t() const { return y; }
	const T& q() const { return z; }

	// swizzlers - create a subvector of specific components
	// e.g. Vec2 uv() { return Vec2(x,y); }
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
#define _DEFINE_SWIZZLER2(a, b, name) Vec2<T> name() const { return Vec2<T>(a, b); }
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
	_DEFINE_SWIZZLER2(a, b, a##b); \
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
	_DEFINE_SWIZZLER2(a, b, a3##b3); \
	_DEFINE_SWIZZLER2(a, b, a4##b4); \
	_DEFINE_SWIZZLER2(b, a, b##a); \
	_DEFINE_SWIZZLER2(b, a, b2##a2); \
	_DEFINE_SWIZZLER2(b, a, b3##a3); \
	_DEFINE_SWIZZLER2(b, a, b4##a4);
	DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
	DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
	DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
#undef DEFINE_SWIZZLER2
#undef _DEFINE_SWIZZLER2
};
template<typename T>
class Vec4
{
@ -629,6 +792,7 @@ private:
}; // namespace Math3D
typedef Math3D::Vec3<float> Vec3f;
typedef Math3D::Vec3Packed<float> Vec3Packedf;
typedef Math3D::Vec4<float> Vec4f;
@ -721,6 +885,12 @@ inline Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b)
return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
}
// Cross product of two packed vectors (a x b).
template<typename T>
inline Vec3Packed<T> Cross(const Vec3Packed<T>& a, const Vec3Packed<T>& b)
{
	const T cx = a.y*b.z - a.z*b.y;
	const T cy = a.z*b.x - a.x*b.z;
	const T cz = a.x*b.y - a.y*b.x;
	return Vec3Packed<T>(cx, cy, cz);
}
}; // namespace Math3D
// linear interpolation via float: 0.0=begin, 1.0=end