further ironing out the underlying math lib / SIMD intrinsics (theoretically now if -march=native binaries are shipped they shouldn't crash through multi-versioning)

This commit is contained in:
ecker 2025-09-04 19:39:22 -05:00
parent c87eac5e05
commit e1d824a5ac
10 changed files with 886 additions and 517 deletions

View File

@ -1,8 +1,9 @@
#include "pod.inl"
#if UF_USE_CLASS_OF_PODS
#include "class.inl"
#if UF_USE_SIMD
#include "simd.inl"
#endif
#include "pod.inl"
template<typename T, size_t R, size_t C>
uf::stl::string /*UF_API*/ uf::string::toString( const pod::Matrix<T,R,C>& m ) {
return uf::matrix::toString(m);

View File

@ -81,6 +81,11 @@ inline bool pod::Matrix<T,R,C>::operator!=( const Matrix<T,R,C>& matrix ) const
return !uf::matrix::equals( *this, matrix );
}
template<typename T> bool uf::matrix::equals( const T& left, const T& right, float eps ) {
#if UF_USE_SIMD
if constexpr (std::is_same_v<T,float>) {
return uf::simd::matEquals( left, right, eps );
}
#endif
bool result = true;
FOR_EACH(T::rows * T::columns, {
if ( fabs(left[i] - right[i]) > eps ) result = false;
@ -91,27 +96,11 @@ template<typename T> pod::Matrix<T,4,4> uf::matrix::multiply( const pod::Matrix<
pod::Matrix<T,4,4> res;
#if UF_USE_SIMD
auto row1 = uf::simd::load(&left[0]);
auto row2 = uf::simd::load(&left[4]);
auto row3 = uf::simd::load(&left[8]);
auto row4 = uf::simd::load(&left[12]);
FOR_EACH(4, {
auto brod1 = uf::simd::set(right[4*i + 0]);
auto brod2 = uf::simd::set(right[4*i + 1]);
auto brod3 = uf::simd::set(right[4*i + 2]);
auto brod4 = uf::simd::set(right[4*i + 3]);
auto row = uf::simd::add(
uf::simd::add(
uf::simd::mul(brod1, row1),
uf::simd::mul(brod2, row2)),
uf::simd::add(
uf::simd::mul(brod3, row3),
uf::simd::mul(brod4, row4)));
uf::simd::store(row, &res[4*i]);
});
return res;
#elif UF_ENV_DREAMCAST
if constexpr (std::is_same_v<T,float>) {
return uf::simd::matMult( left, right );
}
#endif
#if UF_ENV_DREAMCAST
// kallistios has dedicated SH4 asm for these or something
mat_load( (matrix_t*) &left[0] );
mat_apply( (matrix_t*) &right[0] );
@ -122,7 +111,6 @@ template<typename T> pod::Matrix<T,4,4> uf::matrix::multiply( const pod::Matrix<
// MATH_Store_XMTRX( (ALL_FLOATS_STRUCT*) &res[0]);
return res;
#else
#if 1
FOR_EACH_2D(4, 4, {
T sum = T{0};
for (size_t k = 0; k < 4; ++k) {
@ -131,29 +119,6 @@ template<typename T> pod::Matrix<T,4,4> uf::matrix::multiply( const pod::Matrix<
res(r, c) = sum;
});
return res;
#else
// it works
const pod::Vector<T,4>& srcA0 = *((pod::Vector<T,4>*) &left[0]);
const pod::Vector<T,4>& srcA1 = *((pod::Vector<T,4>*) &left[4]);
const pod::Vector<T,4>& srcA2 = *((pod::Vector<T,4>*) &left[8]);
const pod::Vector<T,4>& srcA3 = *((pod::Vector<T,4>*) &left[12]);
const pod::Vector<T,4>& srcB0 = *((pod::Vector<T,4>*) &right[0]);
const pod::Vector<T,4>& srcB1 = *((pod::Vector<T,4>*) &right[4]);
const pod::Vector<T,4>& srcB2 = *((pod::Vector<T,4>*) &right[8]);
const pod::Vector<T,4>& srcB3 = *((pod::Vector<T,4>*) &right[12]);
pod::Vector<T,4>& dst0 = *((pod::Vector<T,4>*) &res[0]);
pod::Vector<T,4>& dst1 = *((pod::Vector<T,4>*) &res[4]);
pod::Vector<T,4>& dst2 = *((pod::Vector<T,4>*) &res[8]);
pod::Vector<T,4>& dst3 = *((pod::Vector<T,4>*) &res[12]);
dst0 = srcA0 * srcB0[0] + srcA1 * srcB0[1] + srcA2 * srcB0[2] + srcA3 * srcB0[3];
dst1 = srcA0 * srcB1[0] + srcA1 * srcB1[1] + srcA2 * srcB1[2] + srcA3 * srcB1[3];
dst2 = srcA0 * srcB2[0] + srcA1 * srcB2[1] + srcA2 * srcB2[2] + srcA3 * srcB2[3];
dst3 = srcA0 * srcB3[0] + srcA1 * srcB3[1] + srcA2 * srcB3[2] + srcA3 * srcB3[3];
return res;
#endif
#endif
}
template<typename T, typename U> pod::Matrix<typename T::type_t, T::columns, T::columns> uf::matrix::multiply( const T& left, const U& right ) {
@ -200,8 +165,12 @@ template<typename T> T /*UF_API*/ uf::matrix::add( const T& lhs, const T& rhs )
return matrix;
}
template<typename T> T uf::matrix::transpose( const T& matrix ) {
#if UF_USE_SIMD
if constexpr (std::is_same_v<T,float> && T::rows == 4 && T::columns == 4 ) {
return uf::simd::matTranspose( matrix );
}
#endif
T transpose;
FOR_EACH_2D(T::rows, T::columns, {
transpose(c, r) = matrix(r, c);
});
@ -283,6 +252,13 @@ pod::Vector3t<T> uf::matrix::multiply(const pod::Matrix3t<T>& mat, const pod::Ve
};
}
template<typename T> pod::Vector4t<T> uf::matrix::multiply( const pod::Matrix4t<T>& mat, const pod::Vector4t<T>& v, bool div ) {
#if UF_USE_SIMD
if constexpr (std::is_same_v<T,float>) {
pod::Vector4t<T> res = uf::simd::matMult( mat, v );
if ( div && res.w > 0 ) res /= res.w;
return res;
}
#endif
#if UF_ENV_DREAMCAST
MATH_Load_XMTRX( (ALL_FLOATS_STRUCT*) &mat[0] );
auto t = MATH_Matrix_Transform( v[0], v[1], v[2], v[3] );
@ -463,17 +439,15 @@ pod::Matrix4t<T> /*UF_API*/ uf::matrix::perspective( T fov, T raidou, T znear, T
#endif
}
template<typename T> T& uf::matrix::copy( T& destination, const T& source ) {
#pragma unroll // GCC unroll 16
for ( auto i = 0; i < 16; ++i )
FOR_EACH(T::rows * T::columns, {
destination[i] = source[i];
});
return destination;
}
template<typename T> T& uf::matrix::copy( T& destination, typename T::type_t* const source ) {
#pragma unroll // GCC unroll 16
for ( auto i = 0; i < 16; ++i )
FOR_EACH(T::rows * T::columns, {
destination[i] = source[i];
});
return destination;
}

View File

@ -0,0 +1,226 @@
namespace uf {
namespace simd {
template<typename T>
class alignas(16) matrix_value {
public:
typedef typename traits<T>::value value_type;
value_type m[4]; // 4 x 4
inline matrix_value();
inline matrix_value(const pod::Matrix<T,4>& rhs);
inline bool operator==(const matrix_value&) const;
inline operator pod::Matrix<T,4>() const;
};
}
namespace simd {
inline uf::simd::matrix_value<float> matMult( const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B );
inline uf::simd::vector<float> matMult( const uf::simd::matrix_value<float>& A, uf::simd::vector<float> B );
inline uf::simd::matrix_value<float> matTranspose( const uf::simd::matrix_value<float>& M );
inline bool matEquals( const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B, float eps );
}
}
namespace {
__attribute__((target("default")))
uf::simd::matrix_value<float> matMult_impl(const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B) {
uf::simd::matrix_value<float> R;
uf::simd::matrix_value<float> Bt = uf::simd::matTranspose(B);
FOR_EACH(4, {
__m128 bcol = B.m[i];
__m128 vx = _mm_shuffle_ps(bcol, bcol, 0x00); // xxxx
__m128 vy = _mm_shuffle_ps(bcol, bcol, 0x55); // yyyy
__m128 vz = _mm_shuffle_ps(bcol, bcol, 0xAA); // zzzz
__m128 vw = _mm_shuffle_ps(bcol, bcol, 0xFF); // wwww
R.m[i] = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(A.m[0], vx),
_mm_mul_ps(A.m[1], vy)),
_mm_add_ps(
_mm_mul_ps(A.m[2], vz),
_mm_mul_ps(A.m[3], vw))
);
});
return R;
}
#if 1
__attribute__((target("sse4.1")))
uf::simd::matrix_value<float> matMult_impl(const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B) {
uf::simd::matrix_value<float> R;
uf::simd::matrix_value<float> Bt = uf::simd::matTranspose(B);
FOR_EACH(4, {
__m128 bcol = B.m[i];
__m128 vx = _mm_shuffle_ps(bcol, bcol, 0x00); // xxxx
__m128 vy = _mm_shuffle_ps(bcol, bcol, 0x55); // yyyy
__m128 vz = _mm_shuffle_ps(bcol, bcol, 0xAA); // zzzz
__m128 vw = _mm_shuffle_ps(bcol, bcol, 0xFF); // wwww
R.m[i] = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps(A.m[0], vx),
_mm_mul_ps(A.m[1], vy)),
_mm_add_ps(
_mm_mul_ps(A.m[2], vz),
_mm_mul_ps(A.m[3], vw))
);
});
return R;
}
#endif
#if 1
__attribute__((target("avx2,fma")))
uf::simd::matrix_value<float> matMult_impl(const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B) {
uf::simd::matrix_value<float> R;
uf::simd::matrix_value<float> Bt = uf::simd::matTranspose(B);
FOR_EACH(4, {
__m128 bcol = B.m[i];
__m256 vx = _mm256_broadcastss_ps(bcol); // xxxx
__m256 vy = _mm256_broadcastss_ps(_mm_shuffle_ps(bcol, bcol, 0x55)); // yyyy
__m256 vz = _mm256_broadcastss_ps(_mm_shuffle_ps(bcol, bcol, 0xAA)); // zzzz
__m256 vw = _mm256_broadcastss_ps(_mm_shuffle_ps(bcol, bcol, 0xFF)); // wwww
__m256 a0 = _mm256_castps128_ps256(A.m[0]);
__m256 a1 = _mm256_castps128_ps256(A.m[1]);
__m256 a2 = _mm256_castps128_ps256(A.m[2]);
__m256 a3 = _mm256_castps128_ps256(A.m[3]);
__m256 r = _mm256_fmadd_ps(a0, vx,
_mm256_fmadd_ps(a1, vy,
_mm256_fmadd_ps(a2, vz,
_mm256_mul_ps(a3, vw))));
__m128 r128 = _mm_add_ps(
_mm256_castps256_ps128(r),
_mm256_extractf128_ps(r, 1)
);
R.m[i] = r128;
});
return R;
}
#endif
#if 1
__attribute__((target("avx512f")))
uf::simd::matrix_value<float> matMult_impl( const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B) {
uf::simd::matrix_value<float> R;
uf::simd::matrix_value<float> Bt = uf::simd::matTranspose(B);
FOR_EACH(4, {
__m128 bcol = B.m[i];
__m512 vx = _mm512_set1_ps(((const float*)&bcol)[0]); // xxxx
__m512 vy = _mm512_set1_ps(((const float*)&bcol)[1]); // yyyy
__m512 vz = _mm512_set1_ps(((const float*)&bcol)[2]); // zzzz
__m512 vw = _mm512_set1_ps(((const float*)&bcol)[3]); // wwww
__m512 a0 = _mm512_castps128_ps512(A.m[0]);
__m512 a1 = _mm512_castps128_ps512(A.m[1]);
__m512 a2 = _mm512_castps128_ps512(A.m[2]);
__m512 a3 = _mm512_castps128_ps512(A.m[3]);
__m512 r = _mm512_fmadd_ps(a0, vx,
_mm512_fmadd_ps(a1, vy,
_mm512_fmadd_ps(a2, vz,
_mm512_mul_ps(a3, vw))));
__m128 r128 = _mm_add_ps(
_mm_add_ps(
_mm512_castps512_ps128(r), // low 128
_mm512_extractf32x4_ps(r, 1)), // next 128
_mm_add_ps(
_mm512_extractf32x4_ps(r, 2), // next 128
_mm512_extractf32x4_ps(r, 3)) // high 128
);
R.m[i] = r128;
});
return R;
}
#endif
__attribute__((target("default")))
uf::simd::vector<float> matMult_impl( const uf::simd::matrix_value<float>& M, uf::simd::vector<float> v ) {
__m128 vx = _mm_shuffle_ps(v, v, 0x00);
__m128 vy = _mm_shuffle_ps(v, v, 0x55);
__m128 vz = _mm_shuffle_ps(v, v, 0xAA);
__m128 vw = _mm_shuffle_ps(v, v, 0xFF);
__m128 r0 = _mm_mul_ps(M.m[0], vx);
__m128 r1 = _mm_mul_ps(M.m[1], vy);
__m128 r2 = _mm_mul_ps(M.m[2], vz);
__m128 r3 = _mm_mul_ps(M.m[3], vw);
return _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
}
#if 1
__attribute__((target("fma")))
uf::simd::vector<float> matMult_impl( const uf::simd::matrix_value<float>& M, uf::simd::vector<float> v ) {
__m128 vx = _mm_shuffle_ps(v, v, 0x00);
__m128 vy = _mm_shuffle_ps(v, v, 0x55);
__m128 vz = _mm_shuffle_ps(v, v, 0xAA);
__m128 vw = _mm_shuffle_ps(v, v, 0xFF);
return _mm_fmadd_ps(M.m[0], vx,
_mm_fmadd_ps(M.m[1], vy,
_mm_fmadd_ps(M.m[2], vz,
_mm_mul_ps(M.m[3], vw))));
}
#endif
}
template<typename T>
inline uf::simd::matrix_value<T>::matrix_value() {}
template<typename T>
inline uf::simd::matrix_value<T>::matrix_value( const pod::Matrix<T,4>& mat ) {
m[0] = _mm_loadu_ps(&mat[0]);
m[1] = _mm_loadu_ps(&mat[4]);
m[2] = _mm_loadu_ps(&mat[8]);
m[3] = _mm_loadu_ps(&mat[12]);
}
template<typename T>
inline bool uf::simd::matrix_value<T>::operator==(const matrix_value& rhs) const {
return uf::simd::matEquals( *this, rhs );
}
template<typename T>
inline uf::simd::matrix_value<T>::operator pod::Matrix<T,4>() const {
pod::Matrix4f mat;
_mm_storeu_ps(&mat[0], m[0]);
_mm_storeu_ps(&mat[4], m[1]);
_mm_storeu_ps(&mat[8], m[2]);
_mm_storeu_ps(&mat[12], m[3]);
return mat;
}
inline uf::simd::matrix_value<float> uf::simd::matMult( const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B ) {
return ::matMult_impl( A, B );
}
inline uf::simd::vector<float> uf::simd::matMult( const uf::simd::matrix_value<float>& M, uf::simd::vector<float> vec ) {
return ::matMult_impl( M, vec );
}
inline uf::simd::matrix_value<float> uf::simd::matTranspose( const uf::simd::matrix_value<float>& M ) {
uf::simd::matrix_value<float> R = M;
_MM_TRANSPOSE4_PS(R.m[0], R.m[1], R.m[2], R.m[3]);
return R;
}
inline bool uf::simd::matEquals( const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B, float eps ) {
bool result = true;
__m128 e = _mm_set1_ps(eps);
FOR_EACH(4, {
__m128 diff = _mm_sub_ps(A.m[i], B.m[i]);
__m128 mask = _mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.0f), diff), e);
if (_mm_movemask_ps(mask)) result = false;
});
return result;
}

View File

@ -28,9 +28,9 @@ namespace pod {
};
struct SupportPoint {
alignas(16) pod::Vector3f p;
alignas(16) pod::Vector3f pA;
alignas(16) pod::Vector3f pB;
/*alignas(16)*/ pod::Vector3f p;
/*alignas(16)*/ pod::Vector3f pA;
/*alignas(16)*/ pod::Vector3f pB;
};
struct Simplex {
@ -39,7 +39,7 @@ namespace pod {
struct Face {
pod::SupportPoint a, b, c;
alignas(16) pod::Vector3f normal;
/*alignas(16)*/ pod::Vector3f normal;
float distance;
};
@ -62,7 +62,7 @@ namespace pod {
typedef uf::stl::unordered_set<pair_t, PairHash, PairEq> pairs_t;
struct Node {
alignas(16) pod::AABB bounds = {};
/*alignas(16)*/ pod::AABB bounds = {};
int32_t left = -1;
int32_t right = -1;
int32_t start = 0;
@ -71,7 +71,7 @@ namespace pod {
bool asleep = false;
};
struct FlatNode {
alignas(16) pod::AABB bounds = {};
/*alignas(16)*/ pod::AABB bounds = {};
int32_t start = -1;
int32_t count = -1;
int32_t skipIndex = -1;
@ -171,23 +171,23 @@ namespace pod {
float mass = 1.0f;
float inverseMass = 1.0f;
alignas(16) pod::Vector3f offset = {};
/*alignas(16)*/ pod::Vector3f offset = {};
alignas(16) pod::Vector3f velocity = {};
alignas(16) pod::Vector3f forceAccumulator = {};
/*alignas(16)*/ pod::Vector3f velocity = {};
/*alignas(16)*/ pod::Vector3f forceAccumulator = {};
alignas(16) pod::Vector3f angularVelocity = {};
alignas(16) pod::Vector3f torqueAccumulator = {};
/*alignas(16)*/ pod::Vector3f angularVelocity = {};
/*alignas(16)*/ pod::Vector3f torqueAccumulator = {};
alignas(16) pod::Vector3f inertiaTensor = { 1, 1, 1 };
alignas(16) pod::Vector3f inverseInertiaTensor = { 1, 1, 1 };
/*alignas(16)*/ pod::Vector3f inertiaTensor = { 1, 1, 1 };
/*alignas(16)*/ pod::Vector3f inverseInertiaTensor = { 1, 1, 1 };
alignas(16) pod::Vector3f gravity = { NAN, NAN, NAN }; // an invalid gravity will fallback to world gravity
/*alignas(16)*/ pod::Vector3f gravity = { NAN, NAN, NAN }; // an invalid gravity will fallback to world gravity
alignas(16) pod::AABB bounds;
alignas(16) pod::Collider collider;
alignas(16) pod::PhysicsMaterial material;
alignas(16) pod::Activity activity;
/*alignas(16)*/ pod::AABB bounds;
/*alignas(16)*/ pod::Collider collider;
/*alignas(16)*/ pod::PhysicsMaterial material;
/*alignas(16)*/ pod::Activity activity;
};
struct Contact {

View File

@ -9,7 +9,7 @@ template<typename T> pod::Quaternion<T> uf::quaternion::identity() {
return pod::Quaternion<T>{ 0, 0, 0, 1 };
}
template<typename T> T uf::quaternion::multiply( const T& q1, const T& q2 ) {
#if 0 && UF_USE_SIMD
#if UF_USE_SIMD
if constexpr (std::is_same_v<typename T::type_t, float>) {
return uf::simd::quatMul( q1 , q2 );
}
@ -22,9 +22,9 @@ template<typename T> T uf::quaternion::multiply( const T& q1, const T& q2 ) {
};
}
template<typename T> pod::Vector3t<T> uf::quaternion::rotate( const pod::Quaternion<T>& Q, const pod::Vector3t<T>& v ) {
#if 0 && UF_USE_SIMD
#if UF_USE_SIMD
if constexpr (std::is_same_v<T,float>) {
return uf::simd::quatRot( Q, v );
return uf::simd::quatRot_3f( Q, v );
}
#endif
pod::Vector3t<T> q = { Q.x, Q.y, Q.z };

View File

@ -1,5 +1,3 @@
#pragma once
#if UF_USE_SIMD
#include "simd.inl"
#endif

View File

@ -1,94 +1,81 @@
namespace uf {
namespace simd {
inline value<float> /*UF_API*/ quatMul( value<float>, value<float> );
inline value<float> /*UF_API*/ quatRot( value<float>, value<float> );
inline pod::Matrix4f /*UF_API*/ quatMat( value<float> );
inline vector<float> /*UF_API*/ quatMul( vector<float>, vector<float> );
inline vector<float> /*UF_API*/ quatRot_3f( vector<float>, vector<float> );
inline pod::Matrix4f /*UF_API*/ quatMat( vector<float> );
}
}
inline uf::simd::value<float> uf::simd::quatMul( uf::simd::value<float> Q1, uf::simd::value<float> Q2 ) {
//__m128 Q1 = q1;
//__m128 Q2 = q2;
inline uf::simd::vector<float> uf::simd::quatMul( uf::simd::vector<float> Q1, uf::simd::vector<float> Q2 ) {
// broadcast q1 components
__m128 x1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(0,0,0,0));
__m128 y1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(1,1,1,1));
__m128 z1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(2,2,2,2));
__m128 w1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(3,3,3,3));
// Broadcast q1.w, q1.x, q1.y, q1.z
__m128 q1w = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(3,3,3,3));
__m128 q1x = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(0,0,0,0));
__m128 q1y = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(1,1,1,1));
__m128 q1z = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(2,2,2,2));
// broadcast q2 components
__m128 x2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(0,0,0,0));
__m128 y2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(1,1,1,1));
__m128 z2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(2,2,2,2));
__m128 w2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(3,3,3,3));
// Shuffle q2 into (x,y,z,w) permutations
__m128 q2xyzw = Q2; // (x,y,z,w)
__m128 q2wzyx = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(0,1,2,3)); // (w,z,y,x)
__m128 q2yzxw = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(3,0,2,1)); // (y,z,x,w)
__m128 q2zxyw = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(3,1,0,2)); // (z,x,y,w)
// compute each component
__m128 X = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(w1, x2), _mm_mul_ps(x1, w2)),
_mm_sub_ps(_mm_mul_ps(y1, z2), _mm_mul_ps(z1, y2))
);
// Compute terms
__m128 t0 = _mm_mul_ps(q1w, q2xyzw); // w1 * (x2,y2,z2,w2)
__m128 t1 = _mm_mul_ps(q1x, q2wzyx); // x1 * (w2,z2,y2,x2)
__m128 t2 = _mm_mul_ps(q1y, q2yzxw); // y1 * (y2,z2,x2,w2)
__m128 t3 = _mm_mul_ps(q1z, q2zxyw); // z1 * (z2,x2,y2,w2)
__m128 Y = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(w1, y2), _mm_mul_ps(y1, w2)),
_mm_sub_ps(_mm_mul_ps(z1, x2), _mm_mul_ps(x1, z2))
);
// Signs: (+,+,+,+), (+,-,+,-), (-,+,-,+), (+,-,-,+)
const __m128 sign1 = _mm_set_ps( 1.f,-1.f, 1.f,-1.f);
const __m128 sign2 = _mm_set_ps(-1.f, 1.f,-1.f, 1.f);
const __m128 sign3 = _mm_set_ps( 1.f,-1.f,-1.f, 1.f);
__m128 Z = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(w1, z2), _mm_mul_ps(z1, w2)),
_mm_sub_ps(_mm_mul_ps(x1, y2), _mm_mul_ps(y1, x2))
);
t1 = _mm_mul_ps(t1, sign1);
t2 = _mm_mul_ps(t2, sign2);
t3 = _mm_mul_ps(t3, sign3);
__m128 W = _mm_sub_ps(
_mm_mul_ps(w1, w2),
_mm_add_ps(
_mm_add_ps(_mm_mul_ps(x1, x2), _mm_mul_ps(y1, y2)),
_mm_mul_ps(z1, z2)
)
);
__m128 result = _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3));
// pack back into (x,y,z,w)
__m128 result = _mm_movelh_ps(_mm_unpacklo_ps(X, Y), _mm_unpacklo_ps(Z, W));
return result;
}
inline uf::simd::value<float> uf::simd::quatRot( uf::simd::value<float> Q, uf::simd::value<float> V ) {
//__m128 Q = q; // (x,y,z,w)
//__m128 V = v; // (vx,vy,vz,0)
// Extract q.xyz and q.w
inline uf::simd::vector<float> uf::simd::quatRot_3f( uf::simd::vector<float> Q, uf::simd::vector<float> V ) {
// extract q.xyz and q.w
__m128 qxyz = _mm_and_ps(Q, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); // mask out w
__m128 qw = _mm_shuffle_ps(Q, Q, _MM_SHUFFLE(3,3,3,3));
// dot(q.xyz, v)
#if SSE_INSTR_SET >= 4
__m128 dot_qv = _mm_dp_ps(qxyz, V, 0x71); // result in lowest lane
#else
__m128 mul = _mm_mul_ps(qxyz, V);
__m128 shuf = _mm_movehdup_ps(mul);
__m128 sums = _mm_add_ps(mul, shuf);
shuf = _mm_movehl_ps(shuf, sums);
sums = _mm_add_ss(sums, shuf);
__m128 dot_qv = sums;
#endif
__m128 term1 = _mm_mul_ps(_mm_mul_ps(dot_qv, _mm_set1_ps(2.0f)), qxyz);
// dot(q.xyz, q.xyz)
#if SSE_INSTR_SET >= 4
__m128 dot_qq = _mm_dp_ps(qxyz, qxyz, 0x71);
#else
__m128 mul2 = _mm_mul_ps(qxyz, qxyz);
__m128 shuf2 = _mm_movehdup_ps(mul2);
__m128 sums2 = _mm_add_ps(mul2, shuf2);
shuf2 = _mm_movehl_ps(shuf2, sums2);
sums2 = _mm_add_ss(sums2, shuf2);
__m128 dot_qq = sums2;
#endif
__m128 w2 = _mm_mul_ps(qw, qw);
__m128 coeff = _mm_sub_ps(w2, dot_qq);
__m128 term2 = _mm_mul_ps(coeff, V);
// cross(q.xyz, v)
__m128 q_yzx = _mm_shuffle_ps(qxyz, qxyz, _MM_SHUFFLE(3,0,2,1));
__m128 v_yzx = _mm_shuffle_ps(V, V, _MM_SHUFFLE(3,0,2,1));
__m128 cross = _mm_sub_ps(_mm_mul_ps(qxyz, v_yzx), _mm_mul_ps(q_yzx, V));
cross = _mm_shuffle_ps(cross, cross, _MM_SHUFFLE(3,0,2,1));
__m128 term3 = _mm_mul_ps(_mm_mul_ps(cross, qw), _mm_set1_ps(2.0f));
__m128 cross1 = _mm_sub_ps(_mm_mul_ps(qxyz, v_yzx), _mm_mul_ps(q_yzx, V));
cross1 = _mm_shuffle_ps(cross1, cross1, _MM_SHUFFLE(3,0,2,1));
// Final result
__m128 result = _mm_add_ps(_mm_add_ps(term1, term2), term3);
// 2 * w * cross(q,v)
__m128 term1 = _mm_mul_ps(_mm_mul_ps(cross1, qw), _mm_set1_ps(2.0f));
// cross(q, cross(q,v))
__m128 c1_yzx = _mm_shuffle_ps(cross1, cross1, _MM_SHUFFLE(3,0,2,1));
__m128 cross2 = _mm_sub_ps(_mm_mul_ps(qxyz, c1_yzx), _mm_mul_ps(q_yzx, cross1));
cross2 = _mm_shuffle_ps(cross2, cross2, _MM_SHUFFLE(3,0,2,1));
// 2 * cross(q, cross(q,v))
__m128 term2 = _mm_mul_ps(cross2, _mm_set1_ps(2.0f));
// v + term1 + term2
__m128 result = _mm_add_ps(_mm_add_ps(V, term1), term2);
return result;
}
inline pod::Matrix4f uf::simd::quatMat( uf::simd::value<float> Q ) {
inline pod::Matrix4f uf::simd::quatMat( uf::simd::vector<float> Q ) {
// Shuffle out components
__m128 qx = _mm_shuffle_ps(Q, Q, _MM_SHUFFLE(0,0,0,0));
__m128 qy = _mm_shuffle_ps(Q, Q, _MM_SHUFFLE(1,1,1,1));

View File

@ -17,7 +17,7 @@ constexpr void for_each_index(F&& f) {
template<typename T, typename Op>
T elementwise( const T& left, const T& right, Op&& op ) {
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = op(left[i], right[i]);
});
@ -46,7 +46,7 @@ pod::Vector<T, N> uf::vector::copy( const pod::Vector<T, N>& v ) {
}
template<typename T, size_t N, typename U>
pod::Vector<T, N> uf::vector::cast( const U& from ) {
alignas(16) pod::Vector<T, N> to;
pod::Vector<T, N> to;
#pragma unroll // GCC unroll N
for ( auto i = 0; i < N && i < U::size; ++i )
to[i] = from[i];
@ -147,7 +147,7 @@ T uf::vector::add( const T& left, const T& right ) {
return uf::simd::add( left, right );
}
#endif
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = left[i] + right[i];
});
@ -160,7 +160,7 @@ T uf::vector::add( const T& vector, typename T::type_t scalar ) {
return uf::simd::add( vector, scalar );
}
#endif
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = vector[i] + scalar;
});
@ -177,7 +177,7 @@ T uf::vector::subtract( const T& left, const T& right ) {
return uf::simd::sub( left, right );
}
#endif
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = left[i] - right[i];
});
@ -190,7 +190,7 @@ T uf::vector::subtract( const T& vector, typename T::type_t scalar ) {
return uf::simd::sub( vector, scalar );
}
#endif
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = vector[i] - scalar;
});
@ -203,7 +203,7 @@ T uf::vector::subtract( typename T::type_t scalar, const T& vector ) {
return uf::simd::sub( scalar, vector );
}
#endif
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = scalar - vector[i];
});
@ -216,7 +216,7 @@ T uf::vector::multiply( const T& left, const T& right ) {
return uf::simd::mul( left, right );
}
#endif
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = left[i] * right[i];
});
@ -229,7 +229,7 @@ T uf::vector::multiply( const T& vector, typename T::type_t scalar ) {
return uf::simd::mul( vector, scalar );
}
#endif
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = vector[i] * scalar;
});
@ -247,14 +247,14 @@ T uf::vector::divide( const T& left, const T& right ) {
}
#elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
if constexpr ( simd_able_v<typename T::type_t> ) {
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = MATH_Fast_Divide( left[i], right[i] );
});
return res;
}
#endif
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = left[i] / right[i];
});
@ -268,14 +268,14 @@ T uf::vector::divide( const T& vector, typename T::type_t scalar ) {
}
#elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
if constexpr ( simd_able_v<typename T::type_t> ) {
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = MATH_Fast_Divide( vector[i], scalar );
});
return res;
}
#endif
alignas(16) T res;
T res;
scalar = static_cast<typename T::type_t>(1) / scalar;
FOR_EACH(T::size, {
res[i] = vector[i] * scalar;
@ -290,14 +290,14 @@ T uf::vector::divide( typename T::type_t scalar, const T& vector ) {
}
#elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
if constexpr ( simd_able_v<typename T::type_t> ) {
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = MATH_Fast_Divide( scalar, vector[i] );
});
return res;
}
#endif
alignas(16) T res;
T res;
scalar = static_cast<T>(1) / scalar;
FOR_EACH(T::size, {
res[i] = scalar / vector[i];
@ -327,7 +327,7 @@ T uf::vector::negate( const T& vector ) {
return uf::simd::mul( vector, -1.f );
}
#endif
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = -vector[i];
});
@ -335,7 +335,7 @@ T uf::vector::negate( const T& vector ) {
}
template<typename T>
T uf::vector::abs( const T& vector ) {
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = std::abs( vector[i] );
});
@ -575,7 +575,7 @@ T uf::vector::lerp( const T& from, const T& to, double delta, bool clamp ) {
// from + ( ( to - from ) * delta )
#if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
if constexpr ( simd_able_v<typename T::type_t> ) {
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = MATH_Lerp( from[i], to[i], delta );
});
@ -594,7 +594,7 @@ T uf::vector::lerp( const T& from, const T& to, const T& delta, bool clamp ) {
// from + ( ( to - from ) * delta )
#if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
if constexpr ( simd_able_v<typename T::type_t> ) {
alignas(16) T res;
T res;
FOR_EACH(T::size, {
res[i] = MATH_Lerp( from[i], to[i], delta[i] );
});
@ -638,16 +638,16 @@ template<typename T>
typename T::type_t uf::vector::distanceSquared( const T& a, const T& b ) {
#if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
if constexpr ( simd_able_v<typename T::type_t> ) {
alignas(16) T delta = uf::vector::subtract(b, a);
T delta = uf::vector::subtract(b, a);
return MATH_Sum_of_Squares( UF_EZ_VEC4( delta, T::size ) );
}
#elif UF_USE_SIMD
if constexpr ( simd_able_v<typename T::type_t> ) {
uf::simd::value<typename T::type_t> delta = uf::simd::sub( b, a );
uf::simd::vector<typename T::type_t> delta = uf::simd::sub( b, a );
return uf::simd::dot( delta, delta );
}
#endif
alignas(16) T delta = uf::vector::subtract( b, a );
T delta = uf::vector::subtract( b, a );
return uf::vector::dot( delta, delta );
}
template<typename T>
@ -674,10 +674,15 @@ typename T::type_t uf::vector::norm( const T& vector ) {
}
template<typename T>
T uf::vector::normalize( const T& vector ) {
#if UF_USE_SIMD
if constexpr ( std::is_same_v<T,float> ) {
return uf::simd::normalize( vector );
}
#endif
typename T::type_t norm = uf::vector::norm(vector);
if ( norm == 0 ) return vector;
#if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
if constexpr ( simd_able_v<typename T::type_t> ) {
if constexpr ( std::is_same_v<T,float> ) {
return uf::vector::multiply(vector, MATH_fsrra(norm));
}
#endif
@ -698,8 +703,8 @@ T uf::vector::clampMagnitude( const T& v, float maxMag ) {
template<typename T>
void uf::vector::orthonormalize( T& normal, T& tangent ) {
normal = uf::vector::normalize( normal );
alignas(16) T norm = normal;
alignas(16) T tan = uf::vector::normalize( tangent );
T norm = normal;
T tan = uf::vector::normalize( tangent );
tangent = uf::vector::subtract( tan, uf::vector::multiply( norm, uf::vector::dot( norm, tan ) ) );
tangent = uf::vector::normalize( tangent );
}
@ -711,32 +716,15 @@ template<typename T>
T uf::vector::cross( const T& a, const T& b ) {
#if UF_USE_SIMD
if constexpr ( simd_able_v<typename T::type_t> ) {
uf::simd::value<typename T::type_t> x = a;
uf::simd::value<typename T::type_t> y = b;
#if SSE_INSTR_SET >= 7
uf::simd::value<typename T::type_t> tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1));
uf::simd::value<typename T::type_t> tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1));
tmp1 = _mm_mul_ps(tmp1,y);
uf::simd::value<typename T::type_t> tmp2 = _mm_fmsub_ps( tmp0,x, tmp1 );
uf::simd::value<typename T::type_t> res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1));
return res;
#else
uf::simd::value<typename T::type_t> tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1));
uf::simd::value<typename T::type_t> tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1));
tmp0 = _mm_mul_ps(tmp0,x);
tmp1 = _mm_mul_ps(tmp1,y);
uf::simd::value<typename T::type_t> tmp2 = _mm_sub_ps(tmp0,tmp1);
uf::simd::value<typename T::type_t> res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1));
return res;
#endif
return uf::simd::cross( a, b );
}
#elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
if constexpr ( simd_able_v<typename T::type_t> ) {
alignas(16) auto res = MATH_Cross_Product( a.x, a.y, a.z, b.x, b.y, b.z );
auto res = MATH_Cross_Product( a.x, a.y, a.z, b.x, b.y, b.z );
return *((T*) &res);
}
#endif
alignas(16) T res{
T res{
a.y * b.z - b.y * a.z,
a.z * b.x - b.z * a.x,
a.x * b.y - b.x * a.y

View File

@ -7,34 +7,34 @@
#endif
#define DEFINE_SIMD(T)\
inline value<T> /*UF_API*/ load( const T* );\
inline void /*UF_API*/ store( value<T>, T* );\
inline value<T> /*UF_API*/ set( T );\
inline value<T> /*UF_API*/ set( T, T, T, T );\
inline value<T> /*UF_API*/ add( value<T>, value<T> );\
inline value<T> /*UF_API*/ sub( value<T>, value<T> );\
inline value<T> /*UF_API*/ mul( value<T>, value<T> );\
inline value<T> /*UF_API*/ div( value<T>, value<T> );\
inline value<T> /*UF_API*/ min( value<T>, value<T> );\
inline value<T> /*UF_API*/ max( value<T>, value<T> );\
inline bool /*UF_API*/ all( value<T> );\
inline bool /*UF_API*/ any( value<T> );\
inline value<T> /*UF_API*/ less( value<T>, value<T> );\
inline value<T> /*UF_API*/ lessEquals( value<T>, value<T> );\
inline value<T> /*UF_API*/ greater( value<T>, value<T> );\
inline value<T> /*UF_API*/ greaterEquals( value<T>, value<T> );\
inline value<T> /*UF_API*/ equals( value<T>, value<T> );\
inline value<T> /*UF_API*/ notEquals( value<T>, value<T> );\
inline value<T> /*UF_API*/ sqrt( value<T> );\
inline value<T> /*UF_API*/ hadd( value<T>, value<T> );\
inline T /*UF_API*/ dot( value<T>, value<T> );\
template<size_t N = 4> inline pod::Vector<T,N> vector( const value<T> );\
inline vector<T> /*UF_API*/ load( const T* );\
inline void /*UF_API*/ store( vector<T>, T* );\
inline vector<T> /*UF_API*/ set( T );\
inline vector<T> /*UF_API*/ set( T, T, T, T );\
inline vector<T> /*UF_API*/ add( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ sub( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ mul( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ div( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ min( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ max( vector<T>, vector<T> );\
inline bool /*UF_API*/ all( vector<T> );\
inline bool /*UF_API*/ any( vector<T> );\
inline vector<T> /*UF_API*/ less( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ lessEquals( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ greater( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ greaterEquals( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ equals( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ notEquals( vector<T>, vector<T> );\
inline vector<T> /*UF_API*/ sqrt( vector<T> );\
inline vector<T> /*UF_API*/ hadd( vector<T>, vector<T> );\
inline T /*UF_API*/ dot( vector<T>, vector<T> );\
template<size_t N = 4> inline pod::Vector<T,N> cast( const vector<T> );\
namespace uf {
namespace simd {
template<typename T>
struct UF_API traits {
static const size_t size = 4;
static constexpr size_t size = 4;
typedef T type;
typedef __m128 value;
typedef pod::Vector<T,size> vector;
@ -42,60 +42,60 @@ namespace uf {
template<>
struct UF_API traits<int32_t> {
static const size_t size = 4;
static constexpr size_t size = 4;
typedef int32_t type;
typedef __m128i value;
typedef pod::Vector<int32_t,4> vector;
typedef pod::Vector<int32_t,size> vector;
};
template<>
struct UF_API traits<uint32_t> {
static const size_t size = 4;
static constexpr size_t size = 4;
typedef uint32_t type;
typedef __m128i value;
typedef pod::Vector<uint32_t,4> vector;
typedef pod::Vector<uint32_t,size> vector;
};
template<>
struct UF_API traits<float> {
static const size_t size = 4;
static constexpr size_t size = 4;
typedef float type;
typedef __m128 value;
typedef pod::Vector<float,4> vector;
typedef pod::Vector<float,size> vector;
};
template<typename T>
class /*UF_API*/ alignas(16) value {
private:
// __m128 m_value;
typedef typename traits<T>::value value_type;
value_type m_value;
class /*UF_API*/ alignas(16) vector {
public:
inline value();
inline value(const T* f);
inline value(T f);
inline value(T f0, T f1, T f2, T f3);
inline value(const value_type& rhs);
inline value(const value& rhs);
// __m128 m;
typedef typename traits<T>::value value_type;
value_type m;
inline vector();
inline vector(const T* f);
inline vector(T f);
inline vector(T f0, T f1, T f2, T f3);
inline vector(bool f0, bool f1, bool f2, bool f3);
inline vector(const value_type& rhs);
inline vector(const vector& rhs);
inline value(const pod::Vector<T,1>& rhs);
inline value(const pod::Vector<T,2>& rhs);
inline value(const pod::Vector<T,3>& rhs);
inline value(const pod::Vector<T,4>& rhs);
inline vector(const pod::Vector<T,1>& rhs);
inline vector(const pod::Vector<T,2>& rhs);
inline vector(const pod::Vector<T,3>& rhs);
inline vector(const pod::Vector<T,4>& rhs);
inline value operator+( const value& rhs );
inline value operator-( const value& rhs );
inline value operator*( const value& rhs );
inline value operator/( const value& rhs );
inline vector operator+( const vector& rhs );
inline vector operator-( const vector& rhs );
inline vector operator*( const vector& rhs );
inline vector operator/( const vector& rhs );
inline value operator<( const value& rhs );
inline value operator<=( const value& rhs );
inline value operator>( const value& rhs );
inline value operator>=( const value& rhs );
inline value operator==( const value& rhs );
inline value operator!=( const value& rhs );
inline vector operator<( const vector& rhs );
inline vector operator<=( const vector& rhs );
inline vector operator>( const vector& rhs );
inline vector operator>=( const vector& rhs );
inline vector operator==( const vector& rhs );
inline vector operator!=( const vector& rhs );
inline value& operator=(const value_type& rhs);
inline value& operator=(const value& rhs);
inline value& operator=(const pod::Vector<T,4>& rhs);
inline vector& operator=(const value_type& rhs);
inline vector& operator=(const vector& rhs);
inline vector& operator=(const pod::Vector<T,4>& rhs);
inline operator value_type() const;
@ -107,12 +107,23 @@ namespace uf {
DEFINE_SIMD(uint32_t);
// these are effectively NOPs
/*
#if UF_USE_FLOAT16
DEFINE_SIMD(std::float16_t)
#endif
#if UF_USE_BFLOAT16
DEFINE_SIMD(std::bfloat16_t)
#endif
*/
// specializations
inline vector<float> /*UF_API*/ set_f( bool, bool, bool, bool );
inline vector<int32_t> /*UF_API*/ set_i( bool, bool, bool, bool );
inline vector<uint32_t> /*UF_API*/ set_ui( bool, bool, bool, bool );
inline vector<float> /*UF_API*/ cross( vector<float> x, vector<float> y );
inline vector<float> /*UF_API*/ normalize( vector<float> x );
inline vector<float> /*UF_API*/ normalize_fast( vector<float> x );
}
}

View File

@ -5,143 +5,157 @@ namespace {
const __m128i signbit = _mm_set1_epi32(0x80000000);
return _mm_xor_si128(v, signbit);
}
inline int32_t boolMask(bool b) {
return b ? -1 : 0; // 0xFFFFFFFF for true, 0x00000000 for false
}
}
template<typename T>
inline uf::simd::value<T>::value() {}
template<typename T>
inline uf::simd::value<T>::value(const T* f) : m_value(uf::simd::load(f)) {}
template<typename T>
inline uf::simd::value<T>::value(T f) : m_value(uf::simd::set(f)) {}
template<typename T>
inline uf::simd::value<T>::value(T f0, T f1, T f2, T f3) : m_value(uf::simd::set(f0,f1,f2,f3)) {}
template<typename T>
inline uf::simd::value<T>::value(const value_type& rhs) : m_value(rhs) {}
template<typename T>
inline uf::simd::value<T>::value(const value& rhs) : m_value(rhs.m_value) {}
#define MV_INSTR_SET_DEFAULT __attribute__((target("default")))
#define MV_INSTR_SET_2 __attribute__((target("sse2")))
#define MV_INSTR_SET_3 __attribute__((target("sse3")))
#define MV_INSTR_SET_4 __attribute__((target("ssse3")))
#define MV_INSTR_SET_5 __attribute__((target("sse4.1")))
#define MV_INSTR_SET_6 __attribute__((target("sse4.2")))
#define MV_INSTR_SET_7 __attribute__((target("avx")))
template<typename T>
inline uf::simd::value<T>::value(const pod::Vector<T,1>& rhs) : value((T) rhs[0]){}
inline uf::simd::vector<T>::vector() {}
template<typename T>
inline uf::simd::value<T>::value(const pod::Vector<T,2>& rhs) : value((T) rhs[0], (T) rhs[1], 0, 0){}
inline uf::simd::vector<T>::vector(const T* f) : m(uf::simd::load(f)) {}
template<typename T>
inline uf::simd::value<T>::value(const pod::Vector<T,3>& rhs) : value((T) rhs[0], (T) rhs[1], (T) rhs[2], 0){}
inline uf::simd::vector<T>::vector(T f) : m(uf::simd::set(f)) {}
template<typename T>
inline uf::simd::value<T>::value(const pod::Vector<T,4>& rhs) : value((T) rhs[0], (T) rhs[1], (T) rhs[2], (T) rhs[3]){}
inline uf::simd::vector<T>::vector(T f0, T f1, T f2, T f3) : m(uf::simd::set(f0,f1,f2,f3)) {}
template<typename T>
inline uf::simd::vector<T>::vector(bool f0, bool f1, bool f2, bool f3) : m(uf::simd::set(f0,f1,f2,f3)) {}
template<typename T>
inline uf::simd::vector<T>::vector(const value_type& rhs) : m(rhs) {}
template<typename T>
inline uf::simd::vector<T>::vector(const vector& rhs) : m(rhs.m) {}
template<typename T>
inline uf::simd::value<T> uf::simd::value<T>::operator+( const value& rhs ) {
inline uf::simd::vector<T>::vector(const pod::Vector<T,1>& rhs) : vector((T) rhs[0]){}
template<typename T>
inline uf::simd::vector<T>::vector(const pod::Vector<T,2>& rhs) : vector((T) rhs[0], (T) rhs[1], 0, 0){}
template<typename T>
inline uf::simd::vector<T>::vector(const pod::Vector<T,3>& rhs) : vector((T) rhs[0], (T) rhs[1], (T) rhs[2], 0){}
template<typename T>
inline uf::simd::vector<T>::vector(const pod::Vector<T,4>& rhs) : vector((T) rhs[0], (T) rhs[1], (T) rhs[2], (T) rhs[3]){}
template<typename T>
inline uf::simd::vector<T> uf::simd::vector<T>::operator+( const vector& rhs ) {
return uf::simd::add( *this, rhs );
}
template<typename T>
inline uf::simd::value<T> uf::simd::value<T>::operator-( const value& rhs ) {
inline uf::simd::vector<T> uf::simd::vector<T>::operator-( const vector& rhs ) {
return uf::simd::sub( *this, rhs );
}
template<typename T>
inline uf::simd::value<T> uf::simd::value<T>::operator*( const value& rhs ) {
inline uf::simd::vector<T> uf::simd::vector<T>::operator*( const vector& rhs ) {
return uf::simd::mul( *this, rhs );
}
template<typename T>
inline uf::simd::value<T> uf::simd::value<T>::operator/( const value& rhs ) {
inline uf::simd::vector<T> uf::simd::vector<T>::operator/( const vector& rhs ) {
return uf::simd::div( *this, rhs );
}
template<typename T>
inline uf::simd::value<T> uf::simd::value<T>::operator<( const value& rhs ) {
inline uf::simd::vector<T> uf::simd::vector<T>::operator<( const vector& rhs ) {
return uf::simd::less( *this, rhs );
}
template<typename T>
inline uf::simd::value<T> uf::simd::value<T>::operator<=( const value& rhs ) {
inline uf::simd::vector<T> uf::simd::vector<T>::operator<=( const vector& rhs ) {
return uf::simd::lessEquals( *this, rhs );
}
template<typename T>
inline uf::simd::value<T> uf::simd::value<T>::operator>( const value& rhs ) {
inline uf::simd::vector<T> uf::simd::vector<T>::operator>( const vector& rhs ) {
return uf::simd::greater( *this, rhs );
}
template<typename T>
inline uf::simd::value<T> uf::simd::value<T>::operator>=( const value& rhs ) {
inline uf::simd::vector<T> uf::simd::vector<T>::operator>=( const vector& rhs ) {
return uf::simd::greaterEquals( *this, rhs );
}
template<typename T>
inline uf::simd::value<T> uf::simd::value<T>::operator==( const value& rhs ) {
inline uf::simd::vector<T> uf::simd::vector<T>::operator==( const vector& rhs ) {
return uf::simd::equals( *this, rhs );
}
template<typename T>
inline uf::simd::value<T> uf::simd::value<T>::operator!=( const value& rhs ) {
inline uf::simd::vector<T> uf::simd::vector<T>::operator!=( const vector& rhs ) {
return uf::simd::notEquals( *this, rhs );
}
template<typename T>
inline uf::simd::value<T>& uf::simd::value<T>::operator=(const uf::simd::value<T>::value_type& rhs) {
m_value = rhs;
inline uf::simd::vector<T>& uf::simd::vector<T>::operator=(const uf::simd::vector<T>::value_type& rhs) {
m = rhs;
return *this;
}
template<typename T>
inline uf::simd::value<T>& uf::simd::value<T>::operator=(const value& rhs) {
m_value = rhs.m_value;
inline uf::simd::vector<T>& uf::simd::vector<T>::operator=(const vector& rhs) {
m = rhs.m;
return *this;
}
template<typename T>
inline uf::simd::value<T>& uf::simd::value<T>::operator=(const pod::Vector<T,4>& rhs) {
m_value = uf::simd::load(&rhs[0]);
inline uf::simd::vector<T>& uf::simd::vector<T>::operator=(const pod::Vector<T,4>& rhs) {
m = uf::simd::load(&rhs[0]);
return *this;
}
template<typename T>
inline uf::simd::value<T>::operator uf::simd::value<T>::value_type() const {
return m_value;
inline uf::simd::vector<T>::operator uf::simd::vector<T>::value_type() const {
return m;
}
template<typename T>
template<size_t N>
inline uf::simd::value<T>::operator pod::Vector<T,N>() const {
return uf::simd::vector<N>(*this);
inline uf::simd::vector<T>::operator pod::Vector<T,N>() const {
return uf::simd::cast<N>(*this);
}
template<size_t N>
inline pod::Vector<float,N> uf::simd::vector( const uf::simd::value<float> v ){
inline pod::Vector<float,N> uf::simd::cast( const uf::simd::vector<float> v ){
pod::Vector4f r;
uf::simd::store( v, &r[0] );
return uf::vector::cast<float,N>(r);
}
template<size_t N>
inline pod::Vector<int32_t,N> uf::simd::vector( const uf::simd::value<int32_t> v ){
inline pod::Vector<int32_t,N> uf::simd::cast( const uf::simd::vector<int32_t> v ){
pod::Vector4i r;
uf::simd::store( v, &r[0] );
return uf::vector::cast<int32_t,N>(r);
}
template<size_t N>
inline pod::Vector<uint32_t,N> uf::simd::vector( const uf::simd::value<uint32_t> v ){
inline pod::Vector<uint32_t,N> uf::simd::cast( const uf::simd::vector<uint32_t> v ){
pod::Vector4ui r;
uf::simd::store( v, &r[0] );
return uf::vector::cast<uint32_t,N>(r);
}
inline uf::simd::value<float> uf::simd::load( const float* f ) {
inline uf::simd::vector<float> uf::simd::load( const float* f ) {
// if ( uf::aligned(f, 16) ) return _mm_load_ps(f);
return _mm_loadu_ps(f);
return _mm_loadu_ps( f );
}
inline void uf::simd::store( uf::simd::value<float> v, float* f ) {
inline void uf::simd::store( uf::simd::vector<float> v, float* f ) {
/* if ( uf::aligned(f, 16) ) _mm_store_ps(f, v);
else */ _mm_storeu_ps(f, v);
else */ _mm_storeu_ps( f, v );
}
inline uf::simd::value<float> uf::simd::set( float f ) {
return _mm_set1_ps(f);
inline uf::simd::vector<float> uf::simd::set( float f ) {
return _mm_set1_ps( f );
}
inline uf::simd::value<float> uf::simd::set( float x, float y, float z, float w ) {
return _mm_setr_ps(x, y, z, w);
inline uf::simd::vector<float> uf::simd::set( float x, float y, float z, float w ) {
return _mm_setr_ps( x, y, z, w );
}
inline uf::simd::value<float> uf::simd::add( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::add( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_add_ps( x, y );
}
inline uf::simd::value<float> uf::simd::sub( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::sub( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_sub_ps( x, y );
}
inline uf::simd::value<float> uf::simd::mul( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::mul( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_mul_ps( x, y );
}
inline uf::simd::value<float> uf::simd::div( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::div( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_div_ps( x, y );
}
/*
inline uf::simd::value<float> uf::simd::hadd( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::hadd( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
#if 0
return _mm_hadd_ps( x, y );
#else
@ -154,312 +168,482 @@ inline uf::simd::value<float> uf::simd::hadd( uf::simd::value<float> x, uf::simd
}
*/
inline uf::simd::value<float> uf::simd::min( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::min( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_min_ps( x, y );
}
inline uf::simd::value<float> uf::simd::max( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::max( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_max_ps( x, y );
}
inline bool uf::simd::all( uf::simd::value<float> mask) {
return _mm_movemask_ps(mask) == 0xF; // all 4 bits set
inline bool uf::simd::all( uf::simd::vector<float> mask) {
return _mm_movemask_ps( mask ) == 0xF; // all 4 bits set
}
inline bool uf::simd::any( uf::simd::value<float> mask) {
return _mm_movemask_ps(mask) != 0x0; // any bit set
inline bool uf::simd::any( uf::simd::vector<float> mask) {
return _mm_movemask_ps( mask ) != 0x0; // any bit set
}
inline uf::simd::value<float> uf::simd::less( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::less( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_cmplt_ps( x, y );
}
inline uf::simd::value<float> uf::simd::lessEquals( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::lessEquals( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_cmple_ps( x, y );
}
inline uf::simd::value<float> uf::simd::greater( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::greater( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_cmpgt_ps( x, y );
}
inline uf::simd::value<float> uf::simd::greaterEquals( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::greaterEquals( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_cmpge_ps( x, y );
}
inline uf::simd::value<float> uf::simd::equals( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::equals( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_cmpeq_ps( x, y );
}
inline uf::simd::value<float> uf::simd::notEquals( uf::simd::value<float> x, uf::simd::value<float> y ) {
inline uf::simd::vector<float> uf::simd::notEquals( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_cmpneq_ps( x, y );
}
inline uf::simd::value<float> uf::simd::sqrt( uf::simd::value<float> v ) {
inline uf::simd::vector<float> uf::simd::sqrt( uf::simd::vector<float> v ) {
return _mm_sqrt_ps( v );
}
inline float uf::simd::dot( uf::simd::value<float> x, uf::simd::value<float> y ) {
#if SSE_INSTR_SET >= 5
__m128 result = _mm_dp_ps(x, y, 0xF1);
return _mm_cvtss_f32(result);
#elif SSE_INSTR_SET >= 3
__m128 mulRes = _mm_mul_ps(x, y);
__m128 shufReg = _mm_movehdup_ps(mulRes);
__m128 sumsReg = _mm_add_ps(mulRes, shufReg);
shufReg = _mm_movehl_ps(shufReg, sumsReg);
sumsReg = _mm_add_ss(sumsReg, shufReg);
return _mm_cvtss_f32(sumsReg);
#else
return uf::vector::sum( uf::simd::vector( uf::simd::mul( x, y ) ) );
#endif
namespace {
MV_INSTR_SET_DEFAULT
uf::simd::vector<float> dot_impl( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return uf::simd::mul( x, y );
}
MV_INSTR_SET_3
uf::simd::vector<float> dot_impl( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
__m128 mulRes = _mm_mul_ps(x, y);
__m128 shufReg = _mm_movehdup_ps(mulRes);
__m128 sumsReg = _mm_add_ps(mulRes, shufReg);
shufReg = _mm_movehl_ps(shufReg, sumsReg);
return _mm_add_ss(sumsReg, shufReg);
}
MV_INSTR_SET_5
uf::simd::vector<float> dot_impl( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_dp_ps(x, y, 0xF1);
}
}
inline uf::simd::value<int32_t> uf::simd::load( const int32_t* f ) {
#if SSE_INSTR_SET >= 3
// if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast<const __m128i*>(f));
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(f));
#else
return uf::simd::value<int32_t>( f[0], f[1], f[2], f[3] );
#endif
inline float uf::simd::dot( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return _mm_cvtss_f32( ::dot_impl( x, y ) );
}
inline void uf::simd::store( uf::simd::value<int32_t> v, int32_t* f ) {
#if SSE_INSTR_SET >= 3
/*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v);
else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v);
#else
union { __m128i x; int32_t y[4]; } kludge;
kludge.x = v;
f[0] = kludge.y[0];
f[1] = kludge.y[1];
f[2] = kludge.y[2];
f[3] = kludge.y[3];
#endif
namespace {
MV_INSTR_SET_DEFAULT
uf::simd::vector<int32_t> load_impl( const int32_t* f ) {
return uf::simd::vector<int32_t>( f[0], f[1], f[2], f[3] );
}
MV_INSTR_SET_3
uf::simd::vector<int32_t> load_impl( const int32_t* f ) {
// if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast<const __m128i*>(f));
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(f));
}
}
inline uf::simd::value<int32_t> uf::simd::set( int32_t f ) {
inline uf::simd::vector<int32_t> uf::simd::load( const int32_t* f ) {
return ::load_impl( f );
}
namespace {
MV_INSTR_SET_DEFAULT
void store_impl( uf::simd::vector<int32_t> v, int32_t* f ) {
union { __m128i x; int32_t y[4]; } kludge;
kludge.x = v;
f[0] = kludge.y[0];
f[1] = kludge.y[1];
f[2] = kludge.y[2];
f[3] = kludge.y[3];
}
MV_INSTR_SET_3
void store_impl( uf::simd::vector<int32_t> v, int32_t* f ) {
/*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v);
else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v);
}
}
inline void uf::simd::store( uf::simd::vector<int32_t> v, int32_t* f ) {
return ::store_impl( v, f );
}
inline uf::simd::vector<int32_t> uf::simd::set( int32_t f ) {
return _mm_set1_epi32(f);
}
inline uf::simd::value<int32_t> uf::simd::set( int32_t x, int32_t y, int32_t z, int32_t w ) {
inline uf::simd::vector<int32_t> uf::simd::set( int32_t x, int32_t y, int32_t z, int32_t w ) {
return _mm_setr_epi32(x, y, z, w);
}
inline uf::simd::value<int32_t> uf::simd::add( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
inline uf::simd::vector<int32_t> uf::simd::add( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return _mm_add_epi32(x, y);
}
inline uf::simd::value<int32_t> uf::simd::sub( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
inline uf::simd::vector<int32_t> uf::simd::sub( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return _mm_sub_epi32(x, y);
}
inline uf::simd::value<int32_t> uf::simd::mul( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
#if SSE_INSTR_SET >= 4
return _mm_mullo_epi32(x, y);
#else
auto X = uf::simd::vector(x);
auto Y = uf::simd::vector(y);
return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]);
#endif
namespace {
MV_INSTR_SET_DEFAULT
uf::simd::vector<int32_t> mul_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
auto X = uf::simd::cast(x);
auto Y = uf::simd::cast(y);
return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]);
}
MV_INSTR_SET_4
uf::simd::vector<int32_t> mul_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return _mm_mullo_epi32(x, y);
}
}
inline uf::simd::value<int32_t> uf::simd::div( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
auto X = uf::simd::vector( x );
auto Y = uf::simd::vector( y );
inline uf::simd::vector<int32_t> uf::simd::mul( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return ::mul_impl( x, y );
}
inline uf::simd::vector<int32_t> uf::simd::div( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
auto X = uf::simd::cast( x );
auto Y = uf::simd::cast( y );
return uf::simd::set( X[0] / Y[0], X[1] / Y[1], X[2] / Y[2], X[3] / Y[3] );
}
/*
inline uf::simd::value<int32_t> uf::simd::hadd( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
auto X = uf::simd::vector( x );
auto Y = uf::simd::vector( y );
inline uf::simd::vector<int32_t> uf::simd::hadd( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
auto X = uf::simd::cast( x );
auto Y = uf::simd::cast( y );
return uf::simd::set( X[0] + Y[0], X[1] + Y[1], X[2] + Y[2], X[3] + Y[3] );
}
*/
inline uf::simd::value<int32_t> uf::simd::min( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
#if SSE_INSTR_SET >= 4
return _mm_min_epi32(x, y);
#else
auto X = uf::simd::vector(x);
auto Y = uf::simd::vector(y);
return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3]));
#endif
namespace {
MV_INSTR_SET_DEFAULT
uf::simd::vector<int32_t> min_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
auto X = uf::simd::cast(x);
auto Y = uf::simd::cast(y);
return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3]));
}
MV_INSTR_SET_4
uf::simd::vector<int32_t> min_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return _mm_min_epi32(x, y);
}
MV_INSTR_SET_DEFAULT
uf::simd::vector<int32_t> max_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
auto X = uf::simd::cast(x);
auto Y = uf::simd::cast(y);
return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3]));
}
MV_INSTR_SET_4
uf::simd::vector<int32_t> max_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return _mm_max_epi32(x, y);
}
}
inline uf::simd::value<int32_t> uf::simd::max( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
#if SSE_INSTR_SET >= 4
return _mm_max_epi32(x, y);
#else
auto X = uf::simd::vector(x);
auto Y = uf::simd::vector(y);
return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3]));
#endif
inline uf::simd::vector<int32_t> uf::simd::min( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return ::min_impl(x, y);
}
inline bool uf::simd::all( uf::simd::value<int32_t> mask) {
inline uf::simd::vector<int32_t> uf::simd::max( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return ::max_impl(x, y);
}
inline bool uf::simd::all( uf::simd::vector<int32_t> mask) {
return _mm_movemask_epi8( mask ) == 0xFFFF; // all 4 bits set
}
inline bool uf::simd::any( uf::simd::value<int32_t> mask) {
inline bool uf::simd::any( uf::simd::vector<int32_t> mask) {
return _mm_movemask_epi8( mask ) != 0x0; // any bit set
}
inline uf::simd::value<int32_t> uf::simd::less( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
#if SSE_INSTR_SET >= 4
return _mm_cmplt_epi32( x, y );
#else
auto X = vector( x ), Y = vector( y );
return set(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]);
#endif
namespace {
MV_INSTR_SET_DEFAULT
uf::simd::vector<int32_t> less_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
return uf::simd::set_i(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]);
}
MV_INSTR_SET_4
uf::simd::vector<int32_t> less_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return _mm_cmplt_epi32( x, y );
}
MV_INSTR_SET_DEFAULT
uf::simd::vector<int32_t> lessEquals_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
return uf::simd::set_i(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]);
}
MV_INSTR_SET_4
uf::simd::vector<int32_t> lessEquals_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
__m128i gt = _mm_cmpgt_epi32(x, y);
return _mm_xor_si128(gt, _mm_set1_epi32(-1));
}
MV_INSTR_SET_DEFAULT
uf::simd::vector<int32_t> greater_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
return uf::simd::set_i(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]);
}
MV_INSTR_SET_4
uf::simd::vector<int32_t> greater_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return _mm_cmpgt_epi32(x, y);
}
MV_INSTR_SET_DEFAULT
uf::simd::vector<int32_t> greaterEquals_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
return uf::simd::set_i(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]);
}
MV_INSTR_SET_4
uf::simd::vector<int32_t> greaterEquals_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
__m128i gt = _mm_cmplt_epi32(x, y);
return _mm_xor_si128(gt, _mm_set1_epi32(-1));
}
}
inline uf::simd::value<int32_t> uf::simd::lessEquals( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
#if SSE_INSTR_SET >= 4
__m128i gt = _mm_cmpgt_epi32(x, y);
return _mm_xor_si128(gt, _mm_set1_epi32(-1));
#else
auto X = vector( x ), Y = vector( y );
return uf::simd::set(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]);
#endif
inline uf::simd::vector<int32_t> uf::simd::less( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return ::less_impl( x, y );
}
inline uf::simd::value<int32_t> uf::simd::greater( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
#if SSE_INSTR_SET >= 4
return _mm_cmpgt_epi32( x, y );
#else
auto X = vector( x ), Y = vector( y );
return uf::simd::set(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]);
#endif
inline uf::simd::vector<int32_t> uf::simd::lessEquals( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return ::lessEquals_impl( x, y );
}
inline uf::simd::value<int32_t> uf::simd::greaterEquals( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
#if SSE_INSTR_SET >= 4
__m128i gt = _mm_cmplt_epi32(x, y);
return _mm_xor_si128(gt, _mm_set1_epi32(-1));
#else
auto X = vector( x ), Y = vector( y );
return uf::simd::set(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]);
#endif
inline uf::simd::vector<int32_t> uf::simd::greater( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return ::greater_impl( x, y );
}
inline uf::simd::value<int32_t> uf::simd::equals( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
inline uf::simd::vector<int32_t> uf::simd::greaterEquals( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return ::greaterEquals_impl( x, y );
}
inline uf::simd::vector<int32_t> uf::simd::equals( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return _mm_cmpeq_epi32(x, y);
}
inline uf::simd::value<int32_t> uf::simd::notEquals( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
inline uf::simd::vector<int32_t> uf::simd::notEquals( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
return _mm_xor_si128(_mm_cmpeq_epi32(x, y), _mm_set1_epi32(-1));
}
inline uf::simd::value<int32_t> uf::simd::sqrt( uf::simd::value<int32_t> v ) {
auto V = uf::simd::vector( v );
inline uf::simd::vector<int32_t> uf::simd::sqrt( uf::simd::vector<int32_t> v ) {
auto V = uf::simd::cast( v );
return uf::simd::set( (int32_t) std::sqrt(V[0]), (int32_t) std::sqrt(V[1]), (int32_t) std::sqrt(V[2]), (int32_t) std::sqrt(V[3]) );
}
inline int32_t uf::simd::dot( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
auto X = uf::simd::vector( x );
auto Y = uf::simd::vector( y );
inline int32_t uf::simd::dot( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
auto X = uf::simd::cast( x );
auto Y = uf::simd::cast( y );
return X[0] * Y[0] + X[1] * Y[1] + X[2] * Y[2] + X[3] * Y[3];
}
inline uf::simd::value<uint32_t> uf::simd::load( const uint32_t* f ) {
#if SSE_INSTR_SET >= 3
// if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast<const __m128i*>(f));
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(f));
#else
return uf::simd::value<uint32_t>( f[0], f[1], f[2], f[3] );
#endif
namespace {
MV_INSTR_SET_DEFAULT
uf::simd::vector<uint32_t> load_impl( const uint32_t* f ) {
return uf::simd::vector<uint32_t>( f[0], f[1], f[2], f[3] );
}
MV_INSTR_SET_3
uf::simd::vector<uint32_t> load_impl( const uint32_t* f ) {
// if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast<const __m128i*>(f));
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(f));
}
MV_INSTR_SET_DEFAULT
void store_impl( uf::simd::vector<uint32_t> v, uint32_t* f ) {
union { __m128i x; uint32_t y[4]; } kludge;
kludge.x = v;
f[0] = kludge.y[0];
f[1] = kludge.y[1];
f[2] = kludge.y[2];
f[3] = kludge.y[3];
}
MV_INSTR_SET_3
void store_impl( uf::simd::vector<uint32_t> v, uint32_t* f ) {
/*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v);
else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v);
}
}
inline void uf::simd::store( uf::simd::value<uint32_t> v, uint32_t* f ) {
#if SSE_INSTR_SET >= 3
/*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v);
else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v);
#else
union { __m128i x; uint32_t y[4]; } kludge;
kludge.x = v;
f[0] = kludge.y[0];
f[1] = kludge.y[1];
f[2] = kludge.y[2];
f[3] = kludge.y[3];
#endif
inline uf::simd::vector<uint32_t> uf::simd::load( const uint32_t* f ) {
return ::load_impl( f );
}
inline uf::simd::value<uint32_t> uf::simd::set( uint32_t f ) {
inline void uf::simd::store( uf::simd::vector<uint32_t> v, uint32_t* f ) {
return ::store_impl( v, f );
}
inline uf::simd::vector<uint32_t> uf::simd::set( uint32_t f ) {
return _mm_set1_epi32(f);
}
inline uf::simd::value<uint32_t> uf::simd::set( uint32_t x, uint32_t y, uint32_t z, uint32_t w ) {
inline uf::simd::vector<uint32_t> uf::simd::set( uint32_t x, uint32_t y, uint32_t z, uint32_t w ) {
return _mm_setr_epi32(x, y, z, w);
}
inline uf::simd::value<uint32_t> uf::simd::add( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
inline uf::simd::vector<uint32_t> uf::simd::add( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return _mm_add_epi32(x, y);
}
inline uf::simd::value<uint32_t> uf::simd::sub( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
inline uf::simd::vector<uint32_t> uf::simd::sub( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return _mm_sub_epi32(x, y);
}
inline uf::simd::value<uint32_t> uf::simd::mul( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
#if SSE_INSTR_SET >= 4
return _mm_mullo_epi32(x, y);
#else
auto X = uf::simd::vector(x);
auto Y = uf::simd::vector(y);
return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]);
#endif
namespace {
MV_INSTR_SET_DEFAULT
uf::simd::vector<uint32_t> mul_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
auto X = uf::simd::cast(x);
auto Y = uf::simd::cast(y);
return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]);
}
MV_INSTR_SET_4
uf::simd::vector<uint32_t> mul_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return _mm_mullo_epi32(x, y);
}
}
inline uf::simd::value<uint32_t> uf::simd::div( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
auto X = uf::simd::vector( x );
auto Y = uf::simd::vector( y );
inline uf::simd::vector<uint32_t> uf::simd::mul( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return ::mul_impl( x, y );
}
inline uf::simd::vector<uint32_t> uf::simd::div( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
auto X = uf::simd::cast( x );
auto Y = uf::simd::cast( y );
return uf::simd::set( X[0] / Y[0], X[1] / Y[1], X[2] / Y[2], X[3] / Y[3] );
}
/*
inline uf::simd::value<uint32_t> uf::simd::hadd( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
auto X = uf::simd::vector( x );
auto Y = uf::simd::vector( y );
inline uf::simd::vector<uint32_t> uf::simd::hadd( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
auto X = uf::simd::cast( x );
auto Y = uf::simd::cast( y );
return uf::simd::set( X[0] + Y[0], X[1] + Y[1], X[2] + Y[2], X[3] + Y[3] );
}
*/
inline uf::simd::value<uint32_t> uf::simd::min( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
#if SSE_INSTR_SET >= 4
return _mm_min_epu32(x, y); // unsigned min
#else
auto X = uf::simd::vector(x);
auto Y = uf::simd::vector(y);
return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3]));
#endif
namespace {
MV_INSTR_SET_DEFAULT
uf::simd::vector<uint32_t> min_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
auto X = uf::simd::cast(x);
auto Y = uf::simd::cast(y);
return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3]));
}
MV_INSTR_SET_4
uf::simd::vector<uint32_t> min_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return _mm_min_epu32(x, y); // unsigned min
}
MV_INSTR_SET_DEFAULT
uf::simd::vector<uint32_t> max_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
auto X = uf::simd::cast(x);
auto Y = uf::simd::cast(y);
return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3]));
}
MV_INSTR_SET_4
uf::simd::vector<uint32_t> max_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return _mm_max_epu32(x, y); // unsigned max
}
}
inline uf::simd::value<uint32_t> uf::simd::max( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
#if SSE_INSTR_SET >= 4
return _mm_max_epu32(x, y); // unsigned max
#else
auto X = uf::simd::vector(x);
auto Y = uf::simd::vector(y);
return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3]));
#endif
inline uf::simd::vector<uint32_t> uf::simd::min( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return ::min_impl(x, y);
}
inline bool uf::simd::all( uf::simd::value<uint32_t> mask) {
inline uf::simd::vector<uint32_t> uf::simd::max( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return ::max_impl(x, y);
}
inline bool uf::simd::all( uf::simd::vector<uint32_t> mask) {
return _mm_movemask_epi8( mask ) == 0xFFFF; // all 4 bits set
}
inline bool uf::simd::any( uf::simd::value<uint32_t> mask) {
inline bool uf::simd::any( uf::simd::vector<uint32_t> mask) {
return _mm_movemask_epi8( mask ) != 0x0; // any bit set
}
inline uf::simd::value<uint32_t> uf::simd::less( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
#if SSE_INSTR_SET >= 4
return _mm_cmplt_epi32( ::bias_unsigned( x ), ::bias_unsigned( y ) );
#else
auto X = vector( x ), Y = vector( y );
return set(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]);
#endif
namespace {
MV_INSTR_SET_DEFAULT
uf::simd::vector<uint32_t> less_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
return uf::simd::set_ui(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]);
}
MV_INSTR_SET_4
uf::simd::vector<uint32_t> less_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return _mm_cmplt_epi32( ::bias_unsigned(x), ::bias_unsigned(y) );
}
MV_INSTR_SET_DEFAULT
uf::simd::vector<uint32_t> lessEquals_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y) {
auto X = uf::simd::cast(x), Y = uf::simd::cast(y);
return uf::simd::set_ui(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]);
}
MV_INSTR_SET_2
uf::simd::vector<uint32_t> lessEquals_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y) {
// a <= b <=> !(a > b)
__m128i bx = ::bias_unsigned(x);
__m128i by = ::bias_unsigned(y);
__m128i gt = _mm_cmpgt_epi32(bx, by); // signed compare
return _mm_xor_si128(gt, _mm_set1_epi32(-1)); // invert mask
}
MV_INSTR_SET_DEFAULT
uf::simd::vector<uint32_t> greater_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
return uf::simd::set_ui(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]);
}
MV_INSTR_SET_4
uf::simd::vector<uint32_t> greater_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return _mm_cmpgt_epi32( ::bias_unsigned(x), ::bias_unsigned(y) );
}
MV_INSTR_SET_DEFAULT
uf::simd::vector<uint32_t> greaterEquals_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y) {
auto X = uf::simd::cast(x), Y = uf::simd::cast(y);
return uf::simd::set_ui(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]);
}
MV_INSTR_SET_2
uf::simd::vector<uint32_t> greaterEquals_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y) {
// a >= b <=> !(a < b)
__m128i bx = ::bias_unsigned(x);
__m128i by = ::bias_unsigned(y);
__m128i lt = _mm_cmplt_epi32(bx, by); // signed compare
return _mm_xor_si128(lt, _mm_set1_epi32(-1)); // invert mask
}
}
inline uf::simd::value<uint32_t> uf::simd::lessEquals(value<uint32_t> x, value<uint32_t> y) {
#if SSE_INSTR_SET >= 2
// a <= b <=> !(a > b)
__m128i bx = ::bias_unsigned(x);
__m128i by = ::bias_unsigned(y);
__m128i gt = _mm_cmpgt_epi32(bx, by); // signed compare
return _mm_xor_si128(gt, _mm_set1_epi32(-1)); // invert mask
#else
auto X = vector(x), Y = vector(y);
return set(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]);
#endif
inline uf::simd::vector<uint32_t> uf::simd::less( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return ::less_impl( x, y );
}
inline uf::simd::value<uint32_t> uf::simd::greater( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
#if SSE_INSTR_SET >= 4
return _mm_cmpgt_epi32( ::bias_unsigned( x ), ::bias_unsigned( y ) );
#else
auto X = vector( x ), Y = vector( y );
return uf::simd::set(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]);
#endif
inline uf::simd::vector<uint32_t> uf::simd::lessEquals( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return ::lessEquals_impl( x, y );
}
inline uf::simd::value<uint32_t> uf::simd::greaterEquals(value<uint32_t> x, value<uint32_t> y) {
#if SSE_INSTR_SET >= 2
// a >= b <=> !(a < b)
__m128i bx = ::bias_unsigned(x);
__m128i by = ::bias_unsigned(y);
__m128i lt = _mm_cmplt_epi32(bx, by); // signed compare
return _mm_xor_si128(lt, _mm_set1_epi32(-1)); // invert mask
#else
auto X = vector(x), Y = vector(y);
return set(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]);
#endif
inline uf::simd::vector<uint32_t> uf::simd::greater( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return ::greater_impl( x, y );
}
inline uf::simd::value<uint32_t> uf::simd::equals( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
inline uf::simd::vector<uint32_t> uf::simd::greaterEquals( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return ::greaterEquals_impl( x, y );
}
inline uf::simd::vector<uint32_t> uf::simd::equals( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return _mm_cmpeq_epi32(x, y);
}
inline uf::simd::value<uint32_t> uf::simd::notEquals( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
inline uf::simd::vector<uint32_t> uf::simd::notEquals( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
return _mm_xor_si128(_mm_cmpeq_epi32(x, y), _mm_set1_epi32(-1));
}
inline uf::simd::value<uint32_t> uf::simd::sqrt( uf::simd::value<uint32_t> v ) {
auto V = uf::simd::vector( v );
inline uf::simd::vector<uint32_t> uf::simd::sqrt( uf::simd::vector<uint32_t> v ) {
auto V = uf::simd::cast( v );
return uf::simd::set( (uint32_t) std::sqrt(V[0]), (uint32_t) std::sqrt(V[1]), (uint32_t) std::sqrt(V[2]), (uint32_t) std::sqrt(V[3]) );
}
inline uint32_t uf::simd::dot( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
auto X = uf::simd::vector( x );
auto Y = uf::simd::vector( y );
inline uint32_t uf::simd::dot( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
auto X = uf::simd::cast( x );
auto Y = uf::simd::cast( y );
return X[0] * Y[0] + X[1] * Y[1] + X[2] * Y[2] + X[3] * Y[3];
}
inline uf::simd::vector<float> uf::simd::set_f( bool x, bool y, bool z, bool w ) {
return _mm_castsi128_ps(_mm_setr_epi32(::boolMask(x), ::boolMask(y), ::boolMask(z), ::boolMask(w)));
}
inline uf::simd::vector<int32_t> uf::simd::set_i( bool x, bool y, bool z, bool w ) {
return _mm_setr_epi32(::boolMask(x), ::boolMask(y), ::boolMask(z), ::boolMask(w));
}
inline uf::simd::vector<uint32_t> uf::simd::set_ui( bool x, bool y, bool z, bool w ) {
return _mm_setr_epi32(::boolMask(x), ::boolMask(y), ::boolMask(z), ::boolMask(w));
}
namespace {
MV_INSTR_SET_DEFAULT
uf::simd::vector<float> cross_impl( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
__m128 tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1));
__m128 tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1));
tmp0 = _mm_mul_ps(tmp0,x);
tmp1 = _mm_mul_ps(tmp1,y);
__m128 tmp2 = _mm_sub_ps(tmp0,tmp1);
__m128 res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1));
return res;
}
MV_INSTR_SET_7
uf::simd::vector<float> cross_impl( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
__m128 tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1));
__m128 tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1));
tmp1 = _mm_mul_ps(tmp1,y);
__m128 tmp2 = _mm_fmsub_ps( tmp0,x, tmp1 );
__m128 res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1));
return res;
}
}
inline uf::simd::vector<float> uf::simd::cross( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
return ::cross_impl( x, y );
}
inline uf::simd::vector<float> uf::simd::normalize( uf::simd::vector<float> v ) {
__m128 len = _mm_sqrt_ss( ::dot_impl( v,v ) );
len = _mm_shuffle_ps(len, len, 0x00);
return _mm_div_ps(v, len);
}
inline uf::simd::vector<float> uf::simd::normalize_fast( uf::simd::vector<float> v ) {
__m128 invLen = _mm_rsqrt_ps(::dot_impl(v, v));
return _mm_mul_ps(v, invLen);
}