further ironing out the underlying math lib / SIMD intrinsics (theoretically now if -march=native binaries are shipped they shouldn't crash through multi-versioning)
This commit is contained in:
parent
c87eac5e05
commit
e1d824a5ac
@ -1,8 +1,9 @@
|
||||
#include "pod.inl"
|
||||
#if UF_USE_CLASS_OF_PODS
|
||||
#include "class.inl"
|
||||
#if UF_USE_SIMD
|
||||
#include "simd.inl"
|
||||
#endif
|
||||
|
||||
#include "pod.inl"
|
||||
|
||||
template<typename T, size_t R, size_t C>
|
||||
uf::stl::string /*UF_API*/ uf::string::toString( const pod::Matrix<T,R,C>& m ) {
|
||||
return uf::matrix::toString(m);
|
||||
|
@ -81,6 +81,11 @@ inline bool pod::Matrix<T,R,C>::operator!=( const Matrix<T,R,C>& matrix ) const
|
||||
return !uf::matrix::equals( *this, matrix );
|
||||
}
|
||||
template<typename T> bool uf::matrix::equals( const T& left, const T& right, float eps ) {
|
||||
#if UF_USE_SIMD
|
||||
if constexpr (std::is_same_v<T,float>) {
|
||||
return uf::simd::matEquals( left, right, eps );
|
||||
}
|
||||
#endif
|
||||
bool result = true;
|
||||
FOR_EACH(T::rows * T::columns, {
|
||||
if ( fabs(left[i] - right[i]) > eps ) result = false;
|
||||
@ -91,27 +96,11 @@ template<typename T> pod::Matrix<T,4,4> uf::matrix::multiply( const pod::Matrix<
|
||||
pod::Matrix<T,4,4> res;
|
||||
|
||||
#if UF_USE_SIMD
|
||||
auto row1 = uf::simd::load(&left[0]);
|
||||
auto row2 = uf::simd::load(&left[4]);
|
||||
auto row3 = uf::simd::load(&left[8]);
|
||||
auto row4 = uf::simd::load(&left[12]);
|
||||
FOR_EACH(4, {
|
||||
auto brod1 = uf::simd::set(right[4*i + 0]);
|
||||
auto brod2 = uf::simd::set(right[4*i + 1]);
|
||||
auto brod3 = uf::simd::set(right[4*i + 2]);
|
||||
auto brod4 = uf::simd::set(right[4*i + 3]);
|
||||
auto row = uf::simd::add(
|
||||
uf::simd::add(
|
||||
uf::simd::mul(brod1, row1),
|
||||
uf::simd::mul(brod2, row2)),
|
||||
uf::simd::add(
|
||||
uf::simd::mul(brod3, row3),
|
||||
uf::simd::mul(brod4, row4)));
|
||||
uf::simd::store(row, &res[4*i]);
|
||||
});
|
||||
|
||||
return res;
|
||||
#elif UF_ENV_DREAMCAST
|
||||
if constexpr (std::is_same_v<T,float>) {
|
||||
return uf::simd::matMult( left, right );
|
||||
}
|
||||
#endif
|
||||
#if UF_ENV_DREAMCAST
|
||||
// kallistios has dedicated SH4 asm for these or something
|
||||
mat_load( (matrix_t*) &left[0] );
|
||||
mat_apply( (matrix_t*) &right[0] );
|
||||
@ -122,7 +111,6 @@ template<typename T> pod::Matrix<T,4,4> uf::matrix::multiply( const pod::Matrix<
|
||||
// MATH_Store_XMTRX( (ALL_FLOATS_STRUCT*) &res[0]);
|
||||
return res;
|
||||
#else
|
||||
#if 1
|
||||
FOR_EACH_2D(4, 4, {
|
||||
T sum = T{0};
|
||||
for (size_t k = 0; k < 4; ++k) {
|
||||
@ -131,29 +119,6 @@ template<typename T> pod::Matrix<T,4,4> uf::matrix::multiply( const pod::Matrix<
|
||||
res(r, c) = sum;
|
||||
});
|
||||
return res;
|
||||
#else
|
||||
// it works
|
||||
const pod::Vector<T,4>& srcA0 = *((pod::Vector<T,4>*) &left[0]);
|
||||
const pod::Vector<T,4>& srcA1 = *((pod::Vector<T,4>*) &left[4]);
|
||||
const pod::Vector<T,4>& srcA2 = *((pod::Vector<T,4>*) &left[8]);
|
||||
const pod::Vector<T,4>& srcA3 = *((pod::Vector<T,4>*) &left[12]);
|
||||
|
||||
const pod::Vector<T,4>& srcB0 = *((pod::Vector<T,4>*) &right[0]);
|
||||
const pod::Vector<T,4>& srcB1 = *((pod::Vector<T,4>*) &right[4]);
|
||||
const pod::Vector<T,4>& srcB2 = *((pod::Vector<T,4>*) &right[8]);
|
||||
const pod::Vector<T,4>& srcB3 = *((pod::Vector<T,4>*) &right[12]);
|
||||
|
||||
pod::Vector<T,4>& dst0 = *((pod::Vector<T,4>*) &res[0]);
|
||||
pod::Vector<T,4>& dst1 = *((pod::Vector<T,4>*) &res[4]);
|
||||
pod::Vector<T,4>& dst2 = *((pod::Vector<T,4>*) &res[8]);
|
||||
pod::Vector<T,4>& dst3 = *((pod::Vector<T,4>*) &res[12]);
|
||||
|
||||
dst0 = srcA0 * srcB0[0] + srcA1 * srcB0[1] + srcA2 * srcB0[2] + srcA3 * srcB0[3];
|
||||
dst1 = srcA0 * srcB1[0] + srcA1 * srcB1[1] + srcA2 * srcB1[2] + srcA3 * srcB1[3];
|
||||
dst2 = srcA0 * srcB2[0] + srcA1 * srcB2[1] + srcA2 * srcB2[2] + srcA3 * srcB2[3];
|
||||
dst3 = srcA0 * srcB3[0] + srcA1 * srcB3[1] + srcA2 * srcB3[2] + srcA3 * srcB3[3];
|
||||
return res;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
template<typename T, typename U> pod::Matrix<typename T::type_t, T::columns, T::columns> uf::matrix::multiply( const T& left, const U& right ) {
|
||||
@ -200,8 +165,12 @@ template<typename T> T /*UF_API*/ uf::matrix::add( const T& lhs, const T& rhs )
|
||||
return matrix;
|
||||
}
|
||||
template<typename T> T uf::matrix::transpose( const T& matrix ) {
|
||||
#if UF_USE_SIMD
|
||||
if constexpr (std::is_same_v<T,float> && T::rows == 4 && T::columns == 4 ) {
|
||||
return uf::simd::matTranspose( matrix );
|
||||
}
|
||||
#endif
|
||||
T transpose;
|
||||
|
||||
FOR_EACH_2D(T::rows, T::columns, {
|
||||
transpose(c, r) = matrix(r, c);
|
||||
});
|
||||
@ -283,6 +252,13 @@ pod::Vector3t<T> uf::matrix::multiply(const pod::Matrix3t<T>& mat, const pod::Ve
|
||||
};
|
||||
}
|
||||
template<typename T> pod::Vector4t<T> uf::matrix::multiply( const pod::Matrix4t<T>& mat, const pod::Vector4t<T>& v, bool div ) {
|
||||
#if UF_USE_SIMD
|
||||
if constexpr (std::is_same_v<T,float>) {
|
||||
pod::Vector4t<T> res = uf::simd::matMult( mat, v );
|
||||
if ( div && res.w > 0 ) res /= res.w;
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
#if UF_ENV_DREAMCAST
|
||||
MATH_Load_XMTRX( (ALL_FLOATS_STRUCT*) &mat[0] );
|
||||
auto t = MATH_Matrix_Transform( v[0], v[1], v[2], v[3] );
|
||||
@ -463,17 +439,15 @@ pod::Matrix4t<T> /*UF_API*/ uf::matrix::perspective( T fov, T raidou, T znear, T
|
||||
#endif
|
||||
}
|
||||
template<typename T> T& uf::matrix::copy( T& destination, const T& source ) {
|
||||
#pragma unroll // GCC unroll 16
|
||||
for ( auto i = 0; i < 16; ++i )
|
||||
FOR_EACH(T::rows * T::columns, {
|
||||
destination[i] = source[i];
|
||||
|
||||
});
|
||||
return destination;
|
||||
}
|
||||
template<typename T> T& uf::matrix::copy( T& destination, typename T::type_t* const source ) {
|
||||
#pragma unroll // GCC unroll 16
|
||||
for ( auto i = 0; i < 16; ++i )
|
||||
FOR_EACH(T::rows * T::columns, {
|
||||
destination[i] = source[i];
|
||||
|
||||
});
|
||||
return destination;
|
||||
}
|
||||
|
||||
|
226
engine/inc/uf/utils/math/matrix/simd.inl
Normal file
226
engine/inc/uf/utils/math/matrix/simd.inl
Normal file
@ -0,0 +1,226 @@
|
||||
namespace uf {
|
||||
namespace simd {
|
||||
template<typename T>
|
||||
class alignas(16) matrix_value {
|
||||
public:
|
||||
typedef typename traits<T>::value value_type;
|
||||
value_type m[4]; // 4 x 4
|
||||
|
||||
inline matrix_value();
|
||||
inline matrix_value(const pod::Matrix<T,4>& rhs);
|
||||
|
||||
inline bool operator==(const matrix_value&) const;
|
||||
inline operator pod::Matrix<T,4>() const;
|
||||
};
|
||||
}
|
||||
|
||||
namespace simd {
|
||||
inline uf::simd::matrix_value<float> matMult( const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B );
|
||||
inline uf::simd::vector<float> matMult( const uf::simd::matrix_value<float>& A, uf::simd::vector<float> B );
|
||||
inline uf::simd::matrix_value<float> matTranspose( const uf::simd::matrix_value<float>& M );
|
||||
inline bool matEquals( const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B, float eps );
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
__attribute__((target("default")))
|
||||
uf::simd::matrix_value<float> matMult_impl(const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B) {
|
||||
uf::simd::matrix_value<float> R;
|
||||
uf::simd::matrix_value<float> Bt = uf::simd::matTranspose(B);
|
||||
FOR_EACH(4, {
|
||||
__m128 bcol = B.m[i];
|
||||
|
||||
__m128 vx = _mm_shuffle_ps(bcol, bcol, 0x00); // xxxx
|
||||
__m128 vy = _mm_shuffle_ps(bcol, bcol, 0x55); // yyyy
|
||||
__m128 vz = _mm_shuffle_ps(bcol, bcol, 0xAA); // zzzz
|
||||
__m128 vw = _mm_shuffle_ps(bcol, bcol, 0xFF); // wwww
|
||||
|
||||
R.m[i] = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(A.m[0], vx),
|
||||
_mm_mul_ps(A.m[1], vy)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(A.m[2], vz),
|
||||
_mm_mul_ps(A.m[3], vw))
|
||||
);
|
||||
});
|
||||
|
||||
return R;
|
||||
|
||||
}
|
||||
#if 1
|
||||
__attribute__((target("sse4.1")))
|
||||
uf::simd::matrix_value<float> matMult_impl(const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B) {
|
||||
uf::simd::matrix_value<float> R;
|
||||
uf::simd::matrix_value<float> Bt = uf::simd::matTranspose(B);
|
||||
|
||||
FOR_EACH(4, {
|
||||
__m128 bcol = B.m[i];
|
||||
|
||||
__m128 vx = _mm_shuffle_ps(bcol, bcol, 0x00); // xxxx
|
||||
__m128 vy = _mm_shuffle_ps(bcol, bcol, 0x55); // yyyy
|
||||
__m128 vz = _mm_shuffle_ps(bcol, bcol, 0xAA); // zzzz
|
||||
__m128 vw = _mm_shuffle_ps(bcol, bcol, 0xFF); // wwww
|
||||
|
||||
R.m[i] = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(A.m[0], vx),
|
||||
_mm_mul_ps(A.m[1], vy)),
|
||||
_mm_add_ps(
|
||||
_mm_mul_ps(A.m[2], vz),
|
||||
_mm_mul_ps(A.m[3], vw))
|
||||
);
|
||||
});
|
||||
|
||||
return R;
|
||||
}
|
||||
#endif
|
||||
#if 1
|
||||
__attribute__((target("avx2,fma")))
|
||||
uf::simd::matrix_value<float> matMult_impl(const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B) {
|
||||
uf::simd::matrix_value<float> R;
|
||||
uf::simd::matrix_value<float> Bt = uf::simd::matTranspose(B);
|
||||
|
||||
FOR_EACH(4, {
|
||||
__m128 bcol = B.m[i];
|
||||
|
||||
__m256 vx = _mm256_broadcastss_ps(bcol); // xxxx
|
||||
__m256 vy = _mm256_broadcastss_ps(_mm_shuffle_ps(bcol, bcol, 0x55)); // yyyy
|
||||
__m256 vz = _mm256_broadcastss_ps(_mm_shuffle_ps(bcol, bcol, 0xAA)); // zzzz
|
||||
__m256 vw = _mm256_broadcastss_ps(_mm_shuffle_ps(bcol, bcol, 0xFF)); // wwww
|
||||
|
||||
__m256 a0 = _mm256_castps128_ps256(A.m[0]);
|
||||
__m256 a1 = _mm256_castps128_ps256(A.m[1]);
|
||||
__m256 a2 = _mm256_castps128_ps256(A.m[2]);
|
||||
__m256 a3 = _mm256_castps128_ps256(A.m[3]);
|
||||
|
||||
__m256 r = _mm256_fmadd_ps(a0, vx,
|
||||
_mm256_fmadd_ps(a1, vy,
|
||||
_mm256_fmadd_ps(a2, vz,
|
||||
_mm256_mul_ps(a3, vw))));
|
||||
|
||||
__m128 r128 = _mm_add_ps(
|
||||
_mm256_castps256_ps128(r),
|
||||
_mm256_extractf128_ps(r, 1)
|
||||
);
|
||||
|
||||
R.m[i] = r128;
|
||||
});
|
||||
|
||||
return R;
|
||||
}
|
||||
#endif
|
||||
#if 1
|
||||
__attribute__((target("avx512f")))
|
||||
uf::simd::matrix_value<float> matMult_impl( const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B) {
|
||||
uf::simd::matrix_value<float> R;
|
||||
uf::simd::matrix_value<float> Bt = uf::simd::matTranspose(B);
|
||||
|
||||
FOR_EACH(4, {
|
||||
__m128 bcol = B.m[i];
|
||||
|
||||
__m512 vx = _mm512_set1_ps(((const float*)&bcol)[0]); // xxxx
|
||||
__m512 vy = _mm512_set1_ps(((const float*)&bcol)[1]); // yyyy
|
||||
__m512 vz = _mm512_set1_ps(((const float*)&bcol)[2]); // zzzz
|
||||
__m512 vw = _mm512_set1_ps(((const float*)&bcol)[3]); // wwww
|
||||
|
||||
__m512 a0 = _mm512_castps128_ps512(A.m[0]);
|
||||
__m512 a1 = _mm512_castps128_ps512(A.m[1]);
|
||||
__m512 a2 = _mm512_castps128_ps512(A.m[2]);
|
||||
__m512 a3 = _mm512_castps128_ps512(A.m[3]);
|
||||
|
||||
__m512 r = _mm512_fmadd_ps(a0, vx,
|
||||
_mm512_fmadd_ps(a1, vy,
|
||||
_mm512_fmadd_ps(a2, vz,
|
||||
_mm512_mul_ps(a3, vw))));
|
||||
|
||||
__m128 r128 = _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
_mm512_castps512_ps128(r), // low 128
|
||||
_mm512_extractf32x4_ps(r, 1)), // next 128
|
||||
_mm_add_ps(
|
||||
_mm512_extractf32x4_ps(r, 2), // next 128
|
||||
_mm512_extractf32x4_ps(r, 3)) // high 128
|
||||
);
|
||||
|
||||
R.m[i] = r128;
|
||||
});
|
||||
|
||||
return R;
|
||||
}
|
||||
#endif
|
||||
|
||||
__attribute__((target("default")))
|
||||
uf::simd::vector<float> matMult_impl( const uf::simd::matrix_value<float>& M, uf::simd::vector<float> v ) {
|
||||
__m128 vx = _mm_shuffle_ps(v, v, 0x00);
|
||||
__m128 vy = _mm_shuffle_ps(v, v, 0x55);
|
||||
__m128 vz = _mm_shuffle_ps(v, v, 0xAA);
|
||||
__m128 vw = _mm_shuffle_ps(v, v, 0xFF);
|
||||
|
||||
__m128 r0 = _mm_mul_ps(M.m[0], vx);
|
||||
__m128 r1 = _mm_mul_ps(M.m[1], vy);
|
||||
__m128 r2 = _mm_mul_ps(M.m[2], vz);
|
||||
__m128 r3 = _mm_mul_ps(M.m[3], vw);
|
||||
|
||||
return _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
|
||||
}
|
||||
#if 1
|
||||
__attribute__((target("fma")))
|
||||
uf::simd::vector<float> matMult_impl( const uf::simd::matrix_value<float>& M, uf::simd::vector<float> v ) {
|
||||
__m128 vx = _mm_shuffle_ps(v, v, 0x00);
|
||||
__m128 vy = _mm_shuffle_ps(v, v, 0x55);
|
||||
__m128 vz = _mm_shuffle_ps(v, v, 0xAA);
|
||||
__m128 vw = _mm_shuffle_ps(v, v, 0xFF);
|
||||
|
||||
return _mm_fmadd_ps(M.m[0], vx,
|
||||
_mm_fmadd_ps(M.m[1], vy,
|
||||
_mm_fmadd_ps(M.m[2], vz,
|
||||
_mm_mul_ps(M.m[3], vw))));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline uf::simd::matrix_value<T>::matrix_value() {}
|
||||
template<typename T>
|
||||
inline uf::simd::matrix_value<T>::matrix_value( const pod::Matrix<T,4>& mat ) {
|
||||
m[0] = _mm_loadu_ps(&mat[0]);
|
||||
m[1] = _mm_loadu_ps(&mat[4]);
|
||||
m[2] = _mm_loadu_ps(&mat[8]);
|
||||
m[3] = _mm_loadu_ps(&mat[12]);
|
||||
}
|
||||
template<typename T>
|
||||
inline bool uf::simd::matrix_value<T>::operator==(const matrix_value& rhs) const {
|
||||
return uf::simd::matEquals( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::matrix_value<T>::operator pod::Matrix<T,4>() const {
|
||||
pod::Matrix4f mat;
|
||||
_mm_storeu_ps(&mat[0], m[0]);
|
||||
_mm_storeu_ps(&mat[4], m[1]);
|
||||
_mm_storeu_ps(&mat[8], m[2]);
|
||||
_mm_storeu_ps(&mat[12], m[3]);
|
||||
return mat;
|
||||
}
|
||||
|
||||
inline uf::simd::matrix_value<float> uf::simd::matMult( const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B ) {
|
||||
return ::matMult_impl( A, B );
|
||||
}
|
||||
inline uf::simd::vector<float> uf::simd::matMult( const uf::simd::matrix_value<float>& M, uf::simd::vector<float> vec ) {
|
||||
return ::matMult_impl( M, vec );
|
||||
}
|
||||
inline uf::simd::matrix_value<float> uf::simd::matTranspose( const uf::simd::matrix_value<float>& M ) {
|
||||
uf::simd::matrix_value<float> R = M;
|
||||
_MM_TRANSPOSE4_PS(R.m[0], R.m[1], R.m[2], R.m[3]);
|
||||
return R;
|
||||
}
|
||||
inline bool uf::simd::matEquals( const uf::simd::matrix_value<float>& A, const uf::simd::matrix_value<float>& B, float eps ) {
|
||||
bool result = true;
|
||||
__m128 e = _mm_set1_ps(eps);
|
||||
FOR_EACH(4, {
|
||||
__m128 diff = _mm_sub_ps(A.m[i], B.m[i]);
|
||||
__m128 mask = _mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.0f), diff), e);
|
||||
if (_mm_movemask_ps(mask)) result = false;
|
||||
});
|
||||
return result;
|
||||
}
|
@ -28,9 +28,9 @@ namespace pod {
|
||||
};
|
||||
|
||||
struct SupportPoint {
|
||||
alignas(16) pod::Vector3f p;
|
||||
alignas(16) pod::Vector3f pA;
|
||||
alignas(16) pod::Vector3f pB;
|
||||
/*alignas(16)*/ pod::Vector3f p;
|
||||
/*alignas(16)*/ pod::Vector3f pA;
|
||||
/*alignas(16)*/ pod::Vector3f pB;
|
||||
};
|
||||
|
||||
struct Simplex {
|
||||
@ -39,7 +39,7 @@ namespace pod {
|
||||
|
||||
struct Face {
|
||||
pod::SupportPoint a, b, c;
|
||||
alignas(16) pod::Vector3f normal;
|
||||
/*alignas(16)*/ pod::Vector3f normal;
|
||||
float distance;
|
||||
};
|
||||
|
||||
@ -62,7 +62,7 @@ namespace pod {
|
||||
typedef uf::stl::unordered_set<pair_t, PairHash, PairEq> pairs_t;
|
||||
|
||||
struct Node {
|
||||
alignas(16) pod::AABB bounds = {};
|
||||
/*alignas(16)*/ pod::AABB bounds = {};
|
||||
int32_t left = -1;
|
||||
int32_t right = -1;
|
||||
int32_t start = 0;
|
||||
@ -71,7 +71,7 @@ namespace pod {
|
||||
bool asleep = false;
|
||||
};
|
||||
struct FlatNode {
|
||||
alignas(16) pod::AABB bounds = {};
|
||||
/*alignas(16)*/ pod::AABB bounds = {};
|
||||
int32_t start = -1;
|
||||
int32_t count = -1;
|
||||
int32_t skipIndex = -1;
|
||||
@ -171,23 +171,23 @@ namespace pod {
|
||||
float mass = 1.0f;
|
||||
float inverseMass = 1.0f;
|
||||
|
||||
alignas(16) pod::Vector3f offset = {};
|
||||
/*alignas(16)*/ pod::Vector3f offset = {};
|
||||
|
||||
alignas(16) pod::Vector3f velocity = {};
|
||||
alignas(16) pod::Vector3f forceAccumulator = {};
|
||||
/*alignas(16)*/ pod::Vector3f velocity = {};
|
||||
/*alignas(16)*/ pod::Vector3f forceAccumulator = {};
|
||||
|
||||
alignas(16) pod::Vector3f angularVelocity = {};
|
||||
alignas(16) pod::Vector3f torqueAccumulator = {};
|
||||
/*alignas(16)*/ pod::Vector3f angularVelocity = {};
|
||||
/*alignas(16)*/ pod::Vector3f torqueAccumulator = {};
|
||||
|
||||
alignas(16) pod::Vector3f inertiaTensor = { 1, 1, 1 };
|
||||
alignas(16) pod::Vector3f inverseInertiaTensor = { 1, 1, 1 };
|
||||
/*alignas(16)*/ pod::Vector3f inertiaTensor = { 1, 1, 1 };
|
||||
/*alignas(16)*/ pod::Vector3f inverseInertiaTensor = { 1, 1, 1 };
|
||||
|
||||
alignas(16) pod::Vector3f gravity = { NAN, NAN, NAN }; // an invalid gravity will fallback to world gravity
|
||||
/*alignas(16)*/ pod::Vector3f gravity = { NAN, NAN, NAN }; // an invalid gravity will fallback to world gravity
|
||||
|
||||
alignas(16) pod::AABB bounds;
|
||||
alignas(16) pod::Collider collider;
|
||||
alignas(16) pod::PhysicsMaterial material;
|
||||
alignas(16) pod::Activity activity;
|
||||
/*alignas(16)*/ pod::AABB bounds;
|
||||
/*alignas(16)*/ pod::Collider collider;
|
||||
/*alignas(16)*/ pod::PhysicsMaterial material;
|
||||
/*alignas(16)*/ pod::Activity activity;
|
||||
};
|
||||
|
||||
struct Contact {
|
||||
|
@ -9,7 +9,7 @@ template<typename T> pod::Quaternion<T> uf::quaternion::identity() {
|
||||
return pod::Quaternion<T>{ 0, 0, 0, 1 };
|
||||
}
|
||||
template<typename T> T uf::quaternion::multiply( const T& q1, const T& q2 ) {
|
||||
#if 0 && UF_USE_SIMD
|
||||
#if UF_USE_SIMD
|
||||
if constexpr (std::is_same_v<typename T::type_t, float>) {
|
||||
return uf::simd::quatMul( q1 , q2 );
|
||||
}
|
||||
@ -22,9 +22,9 @@ template<typename T> T uf::quaternion::multiply( const T& q1, const T& q2 ) {
|
||||
};
|
||||
}
|
||||
template<typename T> pod::Vector3t<T> uf::quaternion::rotate( const pod::Quaternion<T>& Q, const pod::Vector3t<T>& v ) {
|
||||
#if 0 && UF_USE_SIMD
|
||||
#if UF_USE_SIMD
|
||||
if constexpr (std::is_same_v<T,float>) {
|
||||
return uf::simd::quatRot( Q, v );
|
||||
return uf::simd::quatRot_3f( Q, v );
|
||||
}
|
||||
#endif
|
||||
pod::Vector3t<T> q = { Q.x, Q.y, Q.z };
|
||||
|
@ -1,5 +1,3 @@
|
||||
#pragma once
|
||||
|
||||
#if UF_USE_SIMD
|
||||
#include "simd.inl"
|
||||
#endif
|
||||
|
@ -1,94 +1,81 @@
|
||||
namespace uf {
|
||||
namespace simd {
|
||||
inline value<float> /*UF_API*/ quatMul( value<float>, value<float> );
|
||||
inline value<float> /*UF_API*/ quatRot( value<float>, value<float> );
|
||||
inline pod::Matrix4f /*UF_API*/ quatMat( value<float> );
|
||||
inline vector<float> /*UF_API*/ quatMul( vector<float>, vector<float> );
|
||||
inline vector<float> /*UF_API*/ quatRot_3f( vector<float>, vector<float> );
|
||||
inline pod::Matrix4f /*UF_API*/ quatMat( vector<float> );
|
||||
}
|
||||
}
|
||||
|
||||
inline uf::simd::value<float> uf::simd::quatMul( uf::simd::value<float> Q1, uf::simd::value<float> Q2 ) {
|
||||
//__m128 Q1 = q1;
|
||||
//__m128 Q2 = q2;
|
||||
inline uf::simd::vector<float> uf::simd::quatMul( uf::simd::vector<float> Q1, uf::simd::vector<float> Q2 ) {
|
||||
// broadcast q1 components
|
||||
__m128 x1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(0,0,0,0));
|
||||
__m128 y1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(1,1,1,1));
|
||||
__m128 z1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(2,2,2,2));
|
||||
__m128 w1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(3,3,3,3));
|
||||
|
||||
// Broadcast q1.w, q1.x, q1.y, q1.z
|
||||
__m128 q1w = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(3,3,3,3));
|
||||
__m128 q1x = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(0,0,0,0));
|
||||
__m128 q1y = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(1,1,1,1));
|
||||
__m128 q1z = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(2,2,2,2));
|
||||
// broadcast q2 components
|
||||
__m128 x2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(0,0,0,0));
|
||||
__m128 y2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(1,1,1,1));
|
||||
__m128 z2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(2,2,2,2));
|
||||
__m128 w2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(3,3,3,3));
|
||||
|
||||
// Shuffle q2 into (x,y,z,w) permutations
|
||||
__m128 q2xyzw = Q2; // (x,y,z,w)
|
||||
__m128 q2wzyx = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(0,1,2,3)); // (w,z,y,x)
|
||||
__m128 q2yzxw = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(3,0,2,1)); // (y,z,x,w)
|
||||
__m128 q2zxyw = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(3,1,0,2)); // (z,x,y,w)
|
||||
// compute each component
|
||||
__m128 X = _mm_add_ps(
|
||||
_mm_add_ps(_mm_mul_ps(w1, x2), _mm_mul_ps(x1, w2)),
|
||||
_mm_sub_ps(_mm_mul_ps(y1, z2), _mm_mul_ps(z1, y2))
|
||||
);
|
||||
|
||||
// Compute terms
|
||||
__m128 t0 = _mm_mul_ps(q1w, q2xyzw); // w1 * (x2,y2,z2,w2)
|
||||
__m128 t1 = _mm_mul_ps(q1x, q2wzyx); // x1 * (w2,z2,y2,x2)
|
||||
__m128 t2 = _mm_mul_ps(q1y, q2yzxw); // y1 * (y2,z2,x2,w2)
|
||||
__m128 t3 = _mm_mul_ps(q1z, q2zxyw); // z1 * (z2,x2,y2,w2)
|
||||
__m128 Y = _mm_add_ps(
|
||||
_mm_add_ps(_mm_mul_ps(w1, y2), _mm_mul_ps(y1, w2)),
|
||||
_mm_sub_ps(_mm_mul_ps(z1, x2), _mm_mul_ps(x1, z2))
|
||||
);
|
||||
|
||||
// Signs: (+,+,+,+), (+,-,+,-), (-,+,-,+), (+,-,-,+)
|
||||
const __m128 sign1 = _mm_set_ps( 1.f,-1.f, 1.f,-1.f);
|
||||
const __m128 sign2 = _mm_set_ps(-1.f, 1.f,-1.f, 1.f);
|
||||
const __m128 sign3 = _mm_set_ps( 1.f,-1.f,-1.f, 1.f);
|
||||
__m128 Z = _mm_add_ps(
|
||||
_mm_add_ps(_mm_mul_ps(w1, z2), _mm_mul_ps(z1, w2)),
|
||||
_mm_sub_ps(_mm_mul_ps(x1, y2), _mm_mul_ps(y1, x2))
|
||||
);
|
||||
|
||||
t1 = _mm_mul_ps(t1, sign1);
|
||||
t2 = _mm_mul_ps(t2, sign2);
|
||||
t3 = _mm_mul_ps(t3, sign3);
|
||||
__m128 W = _mm_sub_ps(
|
||||
_mm_mul_ps(w1, w2),
|
||||
_mm_add_ps(
|
||||
_mm_add_ps(_mm_mul_ps(x1, x2), _mm_mul_ps(y1, y2)),
|
||||
_mm_mul_ps(z1, z2)
|
||||
)
|
||||
);
|
||||
|
||||
__m128 result = _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3));
|
||||
// pack back into (x,y,z,w)
|
||||
__m128 result = _mm_movelh_ps(_mm_unpacklo_ps(X, Y), _mm_unpacklo_ps(Z, W));
|
||||
return result;
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::quatRot( uf::simd::value<float> Q, uf::simd::value<float> V ) {
|
||||
//__m128 Q = q; // (x,y,z,w)
|
||||
//__m128 V = v; // (vx,vy,vz,0)
|
||||
|
||||
// Extract q.xyz and q.w
|
||||
inline uf::simd::vector<float> uf::simd::quatRot_3f( uf::simd::vector<float> Q, uf::simd::vector<float> V ) {
|
||||
// extract q.xyz and q.w
|
||||
__m128 qxyz = _mm_and_ps(Q, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); // mask out w
|
||||
__m128 qw = _mm_shuffle_ps(Q, Q, _MM_SHUFFLE(3,3,3,3));
|
||||
|
||||
// dot(q.xyz, v)
|
||||
#if SSE_INSTR_SET >= 4
|
||||
__m128 dot_qv = _mm_dp_ps(qxyz, V, 0x71); // result in lowest lane
|
||||
#else
|
||||
__m128 mul = _mm_mul_ps(qxyz, V);
|
||||
__m128 shuf = _mm_movehdup_ps(mul);
|
||||
__m128 sums = _mm_add_ps(mul, shuf);
|
||||
shuf = _mm_movehl_ps(shuf, sums);
|
||||
sums = _mm_add_ss(sums, shuf);
|
||||
__m128 dot_qv = sums;
|
||||
#endif
|
||||
__m128 term1 = _mm_mul_ps(_mm_mul_ps(dot_qv, _mm_set1_ps(2.0f)), qxyz);
|
||||
|
||||
// dot(q.xyz, q.xyz)
|
||||
#if SSE_INSTR_SET >= 4
|
||||
__m128 dot_qq = _mm_dp_ps(qxyz, qxyz, 0x71);
|
||||
#else
|
||||
__m128 mul2 = _mm_mul_ps(qxyz, qxyz);
|
||||
__m128 shuf2 = _mm_movehdup_ps(mul2);
|
||||
__m128 sums2 = _mm_add_ps(mul2, shuf2);
|
||||
shuf2 = _mm_movehl_ps(shuf2, sums2);
|
||||
sums2 = _mm_add_ss(sums2, shuf2);
|
||||
__m128 dot_qq = sums2;
|
||||
#endif
|
||||
__m128 w2 = _mm_mul_ps(qw, qw);
|
||||
__m128 coeff = _mm_sub_ps(w2, dot_qq);
|
||||
__m128 term2 = _mm_mul_ps(coeff, V);
|
||||
|
||||
// cross(q.xyz, v)
|
||||
__m128 q_yzx = _mm_shuffle_ps(qxyz, qxyz, _MM_SHUFFLE(3,0,2,1));
|
||||
__m128 v_yzx = _mm_shuffle_ps(V, V, _MM_SHUFFLE(3,0,2,1));
|
||||
__m128 cross = _mm_sub_ps(_mm_mul_ps(qxyz, v_yzx), _mm_mul_ps(q_yzx, V));
|
||||
cross = _mm_shuffle_ps(cross, cross, _MM_SHUFFLE(3,0,2,1));
|
||||
__m128 term3 = _mm_mul_ps(_mm_mul_ps(cross, qw), _mm_set1_ps(2.0f));
|
||||
__m128 cross1 = _mm_sub_ps(_mm_mul_ps(qxyz, v_yzx), _mm_mul_ps(q_yzx, V));
|
||||
cross1 = _mm_shuffle_ps(cross1, cross1, _MM_SHUFFLE(3,0,2,1));
|
||||
|
||||
// Final result
|
||||
__m128 result = _mm_add_ps(_mm_add_ps(term1, term2), term3);
|
||||
// 2 * w * cross(q,v)
|
||||
__m128 term1 = _mm_mul_ps(_mm_mul_ps(cross1, qw), _mm_set1_ps(2.0f));
|
||||
|
||||
// cross(q, cross(q,v))
|
||||
__m128 c1_yzx = _mm_shuffle_ps(cross1, cross1, _MM_SHUFFLE(3,0,2,1));
|
||||
__m128 cross2 = _mm_sub_ps(_mm_mul_ps(qxyz, c1_yzx), _mm_mul_ps(q_yzx, cross1));
|
||||
cross2 = _mm_shuffle_ps(cross2, cross2, _MM_SHUFFLE(3,0,2,1));
|
||||
|
||||
// 2 * cross(q, cross(q,v))
|
||||
__m128 term2 = _mm_mul_ps(cross2, _mm_set1_ps(2.0f));
|
||||
|
||||
// v + term1 + term2
|
||||
__m128 result = _mm_add_ps(_mm_add_ps(V, term1), term2);
|
||||
|
||||
return result;
|
||||
}
|
||||
inline pod::Matrix4f uf::simd::quatMat( uf::simd::value<float> Q ) {
|
||||
|
||||
inline pod::Matrix4f uf::simd::quatMat( uf::simd::vector<float> Q ) {
|
||||
// Shuffle out components
|
||||
__m128 qx = _mm_shuffle_ps(Q, Q, _MM_SHUFFLE(0,0,0,0));
|
||||
__m128 qy = _mm_shuffle_ps(Q, Q, _MM_SHUFFLE(1,1,1,1));
|
||||
|
@ -17,7 +17,7 @@ constexpr void for_each_index(F&& f) {
|
||||
|
||||
template<typename T, typename Op>
|
||||
T elementwise( const T& left, const T& right, Op&& op ) {
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = op(left[i], right[i]);
|
||||
});
|
||||
@ -46,7 +46,7 @@ pod::Vector<T, N> uf::vector::copy( const pod::Vector<T, N>& v ) {
|
||||
}
|
||||
template<typename T, size_t N, typename U>
|
||||
pod::Vector<T, N> uf::vector::cast( const U& from ) {
|
||||
alignas(16) pod::Vector<T, N> to;
|
||||
pod::Vector<T, N> to;
|
||||
#pragma unroll // GCC unroll N
|
||||
for ( auto i = 0; i < N && i < U::size; ++i )
|
||||
to[i] = from[i];
|
||||
@ -147,7 +147,7 @@ T uf::vector::add( const T& left, const T& right ) {
|
||||
return uf::simd::add( left, right );
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = left[i] + right[i];
|
||||
});
|
||||
@ -160,7 +160,7 @@ T uf::vector::add( const T& vector, typename T::type_t scalar ) {
|
||||
return uf::simd::add( vector, scalar );
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = vector[i] + scalar;
|
||||
});
|
||||
@ -177,7 +177,7 @@ T uf::vector::subtract( const T& left, const T& right ) {
|
||||
return uf::simd::sub( left, right );
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = left[i] - right[i];
|
||||
});
|
||||
@ -190,7 +190,7 @@ T uf::vector::subtract( const T& vector, typename T::type_t scalar ) {
|
||||
return uf::simd::sub( vector, scalar );
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = vector[i] - scalar;
|
||||
});
|
||||
@ -203,7 +203,7 @@ T uf::vector::subtract( typename T::type_t scalar, const T& vector ) {
|
||||
return uf::simd::sub( scalar, vector );
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = scalar - vector[i];
|
||||
});
|
||||
@ -216,7 +216,7 @@ T uf::vector::multiply( const T& left, const T& right ) {
|
||||
return uf::simd::mul( left, right );
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = left[i] * right[i];
|
||||
});
|
||||
@ -229,7 +229,7 @@ T uf::vector::multiply( const T& vector, typename T::type_t scalar ) {
|
||||
return uf::simd::mul( vector, scalar );
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = vector[i] * scalar;
|
||||
});
|
||||
@ -247,14 +247,14 @@ T uf::vector::divide( const T& left, const T& right ) {
|
||||
}
|
||||
#elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
|
||||
if constexpr ( simd_able_v<typename T::type_t> ) {
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = MATH_Fast_Divide( left[i], right[i] );
|
||||
});
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = left[i] / right[i];
|
||||
});
|
||||
@ -268,14 +268,14 @@ T uf::vector::divide( const T& vector, typename T::type_t scalar ) {
|
||||
}
|
||||
#elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
|
||||
if constexpr ( simd_able_v<typename T::type_t> ) {
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = MATH_Fast_Divide( vector[i], scalar );
|
||||
});
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
scalar = static_cast<typename T::type_t>(1) / scalar;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = vector[i] * scalar;
|
||||
@ -290,14 +290,14 @@ T uf::vector::divide( typename T::type_t scalar, const T& vector ) {
|
||||
}
|
||||
#elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
|
||||
if constexpr ( simd_able_v<typename T::type_t> ) {
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = MATH_Fast_Divide( scalar, vector[i] );
|
||||
});
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
scalar = static_cast<T>(1) / scalar;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = scalar / vector[i];
|
||||
@ -327,7 +327,7 @@ T uf::vector::negate( const T& vector ) {
|
||||
return uf::simd::mul( vector, -1.f );
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = -vector[i];
|
||||
});
|
||||
@ -335,7 +335,7 @@ T uf::vector::negate( const T& vector ) {
|
||||
}
|
||||
template<typename T>
|
||||
T uf::vector::abs( const T& vector ) {
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = std::abs( vector[i] );
|
||||
});
|
||||
@ -575,7 +575,7 @@ T uf::vector::lerp( const T& from, const T& to, double delta, bool clamp ) {
|
||||
// from + ( ( to - from ) * delta )
|
||||
#if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
|
||||
if constexpr ( simd_able_v<typename T::type_t> ) {
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = MATH_Lerp( from[i], to[i], delta );
|
||||
});
|
||||
@ -594,7 +594,7 @@ T uf::vector::lerp( const T& from, const T& to, const T& delta, bool clamp ) {
|
||||
// from + ( ( to - from ) * delta )
|
||||
#if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
|
||||
if constexpr ( simd_able_v<typename T::type_t> ) {
|
||||
alignas(16) T res;
|
||||
T res;
|
||||
FOR_EACH(T::size, {
|
||||
res[i] = MATH_Lerp( from[i], to[i], delta[i] );
|
||||
});
|
||||
@ -638,16 +638,16 @@ template<typename T>
|
||||
typename T::type_t uf::vector::distanceSquared( const T& a, const T& b ) {
|
||||
#if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
|
||||
if constexpr ( simd_able_v<typename T::type_t> ) {
|
||||
alignas(16) T delta = uf::vector::subtract(b, a);
|
||||
T delta = uf::vector::subtract(b, a);
|
||||
return MATH_Sum_of_Squares( UF_EZ_VEC4( delta, T::size ) );
|
||||
}
|
||||
#elif UF_USE_SIMD
|
||||
if constexpr ( simd_able_v<typename T::type_t> ) {
|
||||
uf::simd::value<typename T::type_t> delta = uf::simd::sub( b, a );
|
||||
uf::simd::vector<typename T::type_t> delta = uf::simd::sub( b, a );
|
||||
return uf::simd::dot( delta, delta );
|
||||
}
|
||||
#endif
|
||||
alignas(16) T delta = uf::vector::subtract( b, a );
|
||||
T delta = uf::vector::subtract( b, a );
|
||||
return uf::vector::dot( delta, delta );
|
||||
}
|
||||
template<typename T>
|
||||
@ -674,10 +674,15 @@ typename T::type_t uf::vector::norm( const T& vector ) {
|
||||
}
|
||||
template<typename T>
|
||||
T uf::vector::normalize( const T& vector ) {
|
||||
#if UF_USE_SIMD
|
||||
if constexpr ( std::is_same_v<T,float> ) {
|
||||
return uf::simd::normalize( vector );
|
||||
}
|
||||
#endif
|
||||
typename T::type_t norm = uf::vector::norm(vector);
|
||||
if ( norm == 0 ) return vector;
|
||||
#if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
|
||||
if constexpr ( simd_able_v<typename T::type_t> ) {
|
||||
if constexpr ( std::is_same_v<T,float> ) {
|
||||
return uf::vector::multiply(vector, MATH_fsrra(norm));
|
||||
}
|
||||
#endif
|
||||
@ -698,8 +703,8 @@ T uf::vector::clampMagnitude( const T& v, float maxMag ) {
|
||||
template<typename T>
|
||||
void uf::vector::orthonormalize( T& normal, T& tangent ) {
|
||||
normal = uf::vector::normalize( normal );
|
||||
alignas(16) T norm = normal;
|
||||
alignas(16) T tan = uf::vector::normalize( tangent );
|
||||
T norm = normal;
|
||||
T tan = uf::vector::normalize( tangent );
|
||||
tangent = uf::vector::subtract( tan, uf::vector::multiply( norm, uf::vector::dot( norm, tan ) ) );
|
||||
tangent = uf::vector::normalize( tangent );
|
||||
}
|
||||
@ -711,32 +716,15 @@ template<typename T>
|
||||
T uf::vector::cross( const T& a, const T& b ) {
|
||||
#if UF_USE_SIMD
|
||||
if constexpr ( simd_able_v<typename T::type_t> ) {
|
||||
uf::simd::value<typename T::type_t> x = a;
|
||||
uf::simd::value<typename T::type_t> y = b;
|
||||
#if SSE_INSTR_SET >= 7
|
||||
uf::simd::value<typename T::type_t> tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1));
|
||||
uf::simd::value<typename T::type_t> tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1));
|
||||
tmp1 = _mm_mul_ps(tmp1,y);
|
||||
uf::simd::value<typename T::type_t> tmp2 = _mm_fmsub_ps( tmp0,x, tmp1 );
|
||||
uf::simd::value<typename T::type_t> res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1));
|
||||
return res;
|
||||
#else
|
||||
uf::simd::value<typename T::type_t> tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1));
|
||||
uf::simd::value<typename T::type_t> tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1));
|
||||
tmp0 = _mm_mul_ps(tmp0,x);
|
||||
tmp1 = _mm_mul_ps(tmp1,y);
|
||||
uf::simd::value<typename T::type_t> tmp2 = _mm_sub_ps(tmp0,tmp1);
|
||||
uf::simd::value<typename T::type_t> res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1));
|
||||
return res;
|
||||
#endif
|
||||
return uf::simd::cross( a, b );
|
||||
}
|
||||
#elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD
|
||||
if constexpr ( simd_able_v<typename T::type_t> ) {
|
||||
alignas(16) auto res = MATH_Cross_Product( a.x, a.y, a.z, b.x, b.y, b.z );
|
||||
auto res = MATH_Cross_Product( a.x, a.y, a.z, b.x, b.y, b.z );
|
||||
return *((T*) &res);
|
||||
}
|
||||
#endif
|
||||
alignas(16) T res{
|
||||
T res{
|
||||
a.y * b.z - b.y * a.z,
|
||||
a.z * b.x - b.z * a.x,
|
||||
a.x * b.y - b.x * a.y
|
||||
|
@ -7,34 +7,34 @@
|
||||
#endif
|
||||
|
||||
#define DEFINE_SIMD(T)\
|
||||
inline value<T> /*UF_API*/ load( const T* );\
|
||||
inline void /*UF_API*/ store( value<T>, T* );\
|
||||
inline value<T> /*UF_API*/ set( T );\
|
||||
inline value<T> /*UF_API*/ set( T, T, T, T );\
|
||||
inline value<T> /*UF_API*/ add( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ sub( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ mul( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ div( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ min( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ max( value<T>, value<T> );\
|
||||
inline bool /*UF_API*/ all( value<T> );\
|
||||
inline bool /*UF_API*/ any( value<T> );\
|
||||
inline value<T> /*UF_API*/ less( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ lessEquals( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ greater( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ greaterEquals( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ equals( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ notEquals( value<T>, value<T> );\
|
||||
inline value<T> /*UF_API*/ sqrt( value<T> );\
|
||||
inline value<T> /*UF_API*/ hadd( value<T>, value<T> );\
|
||||
inline T /*UF_API*/ dot( value<T>, value<T> );\
|
||||
template<size_t N = 4> inline pod::Vector<T,N> vector( const value<T> );\
|
||||
inline vector<T> /*UF_API*/ load( const T* );\
|
||||
inline void /*UF_API*/ store( vector<T>, T* );\
|
||||
inline vector<T> /*UF_API*/ set( T );\
|
||||
inline vector<T> /*UF_API*/ set( T, T, T, T );\
|
||||
inline vector<T> /*UF_API*/ add( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ sub( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ mul( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ div( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ min( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ max( vector<T>, vector<T> );\
|
||||
inline bool /*UF_API*/ all( vector<T> );\
|
||||
inline bool /*UF_API*/ any( vector<T> );\
|
||||
inline vector<T> /*UF_API*/ less( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ lessEquals( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ greater( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ greaterEquals( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ equals( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ notEquals( vector<T>, vector<T> );\
|
||||
inline vector<T> /*UF_API*/ sqrt( vector<T> );\
|
||||
inline vector<T> /*UF_API*/ hadd( vector<T>, vector<T> );\
|
||||
inline T /*UF_API*/ dot( vector<T>, vector<T> );\
|
||||
template<size_t N = 4> inline pod::Vector<T,N> cast( const vector<T> );\
|
||||
|
||||
namespace uf {
|
||||
namespace simd {
|
||||
template<typename T>
|
||||
struct UF_API traits {
|
||||
static const size_t size = 4;
|
||||
static constexpr size_t size = 4;
|
||||
typedef T type;
|
||||
typedef __m128 value;
|
||||
typedef pod::Vector<T,size> vector;
|
||||
@ -42,60 +42,60 @@ namespace uf {
|
||||
|
||||
template<>
|
||||
struct UF_API traits<int32_t> {
|
||||
static const size_t size = 4;
|
||||
static constexpr size_t size = 4;
|
||||
typedef int32_t type;
|
||||
typedef __m128i value;
|
||||
typedef pod::Vector<int32_t,4> vector;
|
||||
typedef pod::Vector<int32_t,size> vector;
|
||||
};
|
||||
template<>
|
||||
struct UF_API traits<uint32_t> {
|
||||
static const size_t size = 4;
|
||||
static constexpr size_t size = 4;
|
||||
typedef uint32_t type;
|
||||
typedef __m128i value;
|
||||
typedef pod::Vector<uint32_t,4> vector;
|
||||
typedef pod::Vector<uint32_t,size> vector;
|
||||
};
|
||||
template<>
|
||||
struct UF_API traits<float> {
|
||||
static const size_t size = 4;
|
||||
static constexpr size_t size = 4;
|
||||
typedef float type;
|
||||
typedef __m128 value;
|
||||
typedef pod::Vector<float,4> vector;
|
||||
typedef pod::Vector<float,size> vector;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class /*UF_API*/ alignas(16) value {
|
||||
private:
|
||||
// __m128 m_value;
|
||||
typedef typename traits<T>::value value_type;
|
||||
value_type m_value;
|
||||
class /*UF_API*/ alignas(16) vector {
|
||||
public:
|
||||
inline value();
|
||||
inline value(const T* f);
|
||||
inline value(T f);
|
||||
inline value(T f0, T f1, T f2, T f3);
|
||||
inline value(const value_type& rhs);
|
||||
inline value(const value& rhs);
|
||||
// __m128 m;
|
||||
typedef typename traits<T>::value value_type;
|
||||
value_type m;
|
||||
inline vector();
|
||||
inline vector(const T* f);
|
||||
inline vector(T f);
|
||||
inline vector(T f0, T f1, T f2, T f3);
|
||||
inline vector(bool f0, bool f1, bool f2, bool f3);
|
||||
inline vector(const value_type& rhs);
|
||||
inline vector(const vector& rhs);
|
||||
|
||||
inline value(const pod::Vector<T,1>& rhs);
|
||||
inline value(const pod::Vector<T,2>& rhs);
|
||||
inline value(const pod::Vector<T,3>& rhs);
|
||||
inline value(const pod::Vector<T,4>& rhs);
|
||||
inline vector(const pod::Vector<T,1>& rhs);
|
||||
inline vector(const pod::Vector<T,2>& rhs);
|
||||
inline vector(const pod::Vector<T,3>& rhs);
|
||||
inline vector(const pod::Vector<T,4>& rhs);
|
||||
|
||||
inline value operator+( const value& rhs );
|
||||
inline value operator-( const value& rhs );
|
||||
inline value operator*( const value& rhs );
|
||||
inline value operator/( const value& rhs );
|
||||
inline vector operator+( const vector& rhs );
|
||||
inline vector operator-( const vector& rhs );
|
||||
inline vector operator*( const vector& rhs );
|
||||
inline vector operator/( const vector& rhs );
|
||||
|
||||
inline value operator<( const value& rhs );
|
||||
inline value operator<=( const value& rhs );
|
||||
inline value operator>( const value& rhs );
|
||||
inline value operator>=( const value& rhs );
|
||||
inline value operator==( const value& rhs );
|
||||
inline value operator!=( const value& rhs );
|
||||
inline vector operator<( const vector& rhs );
|
||||
inline vector operator<=( const vector& rhs );
|
||||
inline vector operator>( const vector& rhs );
|
||||
inline vector operator>=( const vector& rhs );
|
||||
inline vector operator==( const vector& rhs );
|
||||
inline vector operator!=( const vector& rhs );
|
||||
|
||||
inline value& operator=(const value_type& rhs);
|
||||
inline value& operator=(const value& rhs);
|
||||
inline value& operator=(const pod::Vector<T,4>& rhs);
|
||||
inline vector& operator=(const value_type& rhs);
|
||||
inline vector& operator=(const vector& rhs);
|
||||
inline vector& operator=(const pod::Vector<T,4>& rhs);
|
||||
|
||||
inline operator value_type() const;
|
||||
|
||||
@ -107,12 +107,23 @@ namespace uf {
|
||||
DEFINE_SIMD(uint32_t);
|
||||
|
||||
// these are effectively NOPs
|
||||
/*
|
||||
#if UF_USE_FLOAT16
|
||||
DEFINE_SIMD(std::float16_t)
|
||||
#endif
|
||||
#if UF_USE_BFLOAT16
|
||||
DEFINE_SIMD(std::bfloat16_t)
|
||||
#endif
|
||||
*/
|
||||
|
||||
// specializations
|
||||
inline vector<float> /*UF_API*/ set_f( bool, bool, bool, bool );
|
||||
inline vector<int32_t> /*UF_API*/ set_i( bool, bool, bool, bool );
|
||||
inline vector<uint32_t> /*UF_API*/ set_ui( bool, bool, bool, bool );
|
||||
|
||||
inline vector<float> /*UF_API*/ cross( vector<float> x, vector<float> y );
|
||||
inline vector<float> /*UF_API*/ normalize( vector<float> x );
|
||||
inline vector<float> /*UF_API*/ normalize_fast( vector<float> x );
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5,143 +5,157 @@ namespace {
|
||||
const __m128i signbit = _mm_set1_epi32(0x80000000);
|
||||
return _mm_xor_si128(v, signbit);
|
||||
}
|
||||
|
||||
inline int32_t boolMask(bool b) {
|
||||
return b ? -1 : 0; // 0xFFFFFFFF for true, 0x00000000 for false
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::value() {}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::value(const T* f) : m_value(uf::simd::load(f)) {}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::value(T f) : m_value(uf::simd::set(f)) {}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::value(T f0, T f1, T f2, T f3) : m_value(uf::simd::set(f0,f1,f2,f3)) {}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::value(const value_type& rhs) : m_value(rhs) {}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::value(const value& rhs) : m_value(rhs.m_value) {}
|
||||
#define MV_INSTR_SET_DEFAULT __attribute__((target("default")))
|
||||
#define MV_INSTR_SET_2 __attribute__((target("sse2")))
|
||||
#define MV_INSTR_SET_3 __attribute__((target("sse3")))
|
||||
#define MV_INSTR_SET_4 __attribute__((target("ssse3")))
|
||||
#define MV_INSTR_SET_5 __attribute__((target("sse4.1")))
|
||||
#define MV_INSTR_SET_6 __attribute__((target("sse4.2")))
|
||||
#define MV_INSTR_SET_7 __attribute__((target("avx")))
|
||||
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::value(const pod::Vector<T,1>& rhs) : value((T) rhs[0]){}
|
||||
inline uf::simd::vector<T>::vector() {}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::value(const pod::Vector<T,2>& rhs) : value((T) rhs[0], (T) rhs[1], 0, 0){}
|
||||
inline uf::simd::vector<T>::vector(const T* f) : m(uf::simd::load(f)) {}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::value(const pod::Vector<T,3>& rhs) : value((T) rhs[0], (T) rhs[1], (T) rhs[2], 0){}
|
||||
inline uf::simd::vector<T>::vector(T f) : m(uf::simd::set(f)) {}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::value(const pod::Vector<T,4>& rhs) : value((T) rhs[0], (T) rhs[1], (T) rhs[2], (T) rhs[3]){}
|
||||
inline uf::simd::vector<T>::vector(T f0, T f1, T f2, T f3) : m(uf::simd::set(f0,f1,f2,f3)) {}
|
||||
template<typename T>
|
||||
inline uf::simd::vector<T>::vector(bool f0, bool f1, bool f2, bool f3) : m(uf::simd::set(f0,f1,f2,f3)) {}
|
||||
template<typename T>
|
||||
inline uf::simd::vector<T>::vector(const value_type& rhs) : m(rhs) {}
|
||||
template<typename T>
|
||||
inline uf::simd::vector<T>::vector(const vector& rhs) : m(rhs.m) {}
|
||||
|
||||
template<typename T>
|
||||
inline uf::simd::value<T> uf::simd::value<T>::operator+( const value& rhs ) {
|
||||
inline uf::simd::vector<T>::vector(const pod::Vector<T,1>& rhs) : vector((T) rhs[0]){}
|
||||
template<typename T>
|
||||
inline uf::simd::vector<T>::vector(const pod::Vector<T,2>& rhs) : vector((T) rhs[0], (T) rhs[1], 0, 0){}
|
||||
template<typename T>
|
||||
inline uf::simd::vector<T>::vector(const pod::Vector<T,3>& rhs) : vector((T) rhs[0], (T) rhs[1], (T) rhs[2], 0){}
|
||||
template<typename T>
|
||||
inline uf::simd::vector<T>::vector(const pod::Vector<T,4>& rhs) : vector((T) rhs[0], (T) rhs[1], (T) rhs[2], (T) rhs[3]){}
|
||||
|
||||
template<typename T>
|
||||
inline uf::simd::vector<T> uf::simd::vector<T>::operator+( const vector& rhs ) {
|
||||
return uf::simd::add( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T> uf::simd::value<T>::operator-( const value& rhs ) {
|
||||
inline uf::simd::vector<T> uf::simd::vector<T>::operator-( const vector& rhs ) {
|
||||
return uf::simd::sub( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T> uf::simd::value<T>::operator*( const value& rhs ) {
|
||||
inline uf::simd::vector<T> uf::simd::vector<T>::operator*( const vector& rhs ) {
|
||||
return uf::simd::mul( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T> uf::simd::value<T>::operator/( const value& rhs ) {
|
||||
inline uf::simd::vector<T> uf::simd::vector<T>::operator/( const vector& rhs ) {
|
||||
return uf::simd::div( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T> uf::simd::value<T>::operator<( const value& rhs ) {
|
||||
inline uf::simd::vector<T> uf::simd::vector<T>::operator<( const vector& rhs ) {
|
||||
return uf::simd::less( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T> uf::simd::value<T>::operator<=( const value& rhs ) {
|
||||
inline uf::simd::vector<T> uf::simd::vector<T>::operator<=( const vector& rhs ) {
|
||||
return uf::simd::lessEquals( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T> uf::simd::value<T>::operator>( const value& rhs ) {
|
||||
inline uf::simd::vector<T> uf::simd::vector<T>::operator>( const vector& rhs ) {
|
||||
return uf::simd::greater( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T> uf::simd::value<T>::operator>=( const value& rhs ) {
|
||||
inline uf::simd::vector<T> uf::simd::vector<T>::operator>=( const vector& rhs ) {
|
||||
return uf::simd::greaterEquals( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T> uf::simd::value<T>::operator==( const value& rhs ) {
|
||||
inline uf::simd::vector<T> uf::simd::vector<T>::operator==( const vector& rhs ) {
|
||||
return uf::simd::equals( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T> uf::simd::value<T>::operator!=( const value& rhs ) {
|
||||
inline uf::simd::vector<T> uf::simd::vector<T>::operator!=( const vector& rhs ) {
|
||||
return uf::simd::notEquals( *this, rhs );
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>& uf::simd::value<T>::operator=(const uf::simd::value<T>::value_type& rhs) {
|
||||
m_value = rhs;
|
||||
inline uf::simd::vector<T>& uf::simd::vector<T>::operator=(const uf::simd::vector<T>::value_type& rhs) {
|
||||
m = rhs;
|
||||
return *this;
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>& uf::simd::value<T>::operator=(const value& rhs) {
|
||||
m_value = rhs.m_value;
|
||||
inline uf::simd::vector<T>& uf::simd::vector<T>::operator=(const vector& rhs) {
|
||||
m = rhs.m;
|
||||
return *this;
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>& uf::simd::value<T>::operator=(const pod::Vector<T,4>& rhs) {
|
||||
m_value = uf::simd::load(&rhs[0]);
|
||||
inline uf::simd::vector<T>& uf::simd::vector<T>::operator=(const pod::Vector<T,4>& rhs) {
|
||||
m = uf::simd::load(&rhs[0]);
|
||||
return *this;
|
||||
}
|
||||
template<typename T>
|
||||
inline uf::simd::value<T>::operator uf::simd::value<T>::value_type() const {
|
||||
return m_value;
|
||||
inline uf::simd::vector<T>::operator uf::simd::vector<T>::value_type() const {
|
||||
return m;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
template<size_t N>
|
||||
inline uf::simd::value<T>::operator pod::Vector<T,N>() const {
|
||||
return uf::simd::vector<N>(*this);
|
||||
inline uf::simd::vector<T>::operator pod::Vector<T,N>() const {
|
||||
return uf::simd::cast<N>(*this);
|
||||
}
|
||||
|
||||
template<size_t N>
|
||||
inline pod::Vector<float,N> uf::simd::vector( const uf::simd::value<float> v ){
|
||||
inline pod::Vector<float,N> uf::simd::cast( const uf::simd::vector<float> v ){
|
||||
pod::Vector4f r;
|
||||
uf::simd::store( v, &r[0] );
|
||||
return uf::vector::cast<float,N>(r);
|
||||
}
|
||||
template<size_t N>
|
||||
inline pod::Vector<int32_t,N> uf::simd::vector( const uf::simd::value<int32_t> v ){
|
||||
inline pod::Vector<int32_t,N> uf::simd::cast( const uf::simd::vector<int32_t> v ){
|
||||
pod::Vector4i r;
|
||||
uf::simd::store( v, &r[0] );
|
||||
return uf::vector::cast<int32_t,N>(r);
|
||||
}
|
||||
template<size_t N>
|
||||
inline pod::Vector<uint32_t,N> uf::simd::vector( const uf::simd::value<uint32_t> v ){
|
||||
inline pod::Vector<uint32_t,N> uf::simd::cast( const uf::simd::vector<uint32_t> v ){
|
||||
pod::Vector4ui r;
|
||||
uf::simd::store( v, &r[0] );
|
||||
return uf::vector::cast<uint32_t,N>(r);
|
||||
}
|
||||
|
||||
inline uf::simd::value<float> uf::simd::load( const float* f ) {
|
||||
inline uf::simd::vector<float> uf::simd::load( const float* f ) {
|
||||
// if ( uf::aligned(f, 16) ) return _mm_load_ps(f);
|
||||
return _mm_loadu_ps(f);
|
||||
return _mm_loadu_ps( f );
|
||||
}
|
||||
inline void uf::simd::store( uf::simd::value<float> v, float* f ) {
|
||||
inline void uf::simd::store( uf::simd::vector<float> v, float* f ) {
|
||||
/* if ( uf::aligned(f, 16) ) _mm_store_ps(f, v);
|
||||
else */ _mm_storeu_ps(f, v);
|
||||
else */ _mm_storeu_ps( f, v );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::set( float f ) {
|
||||
return _mm_set1_ps(f);
|
||||
inline uf::simd::vector<float> uf::simd::set( float f ) {
|
||||
return _mm_set1_ps( f );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::set( float x, float y, float z, float w ) {
|
||||
return _mm_setr_ps(x, y, z, w);
|
||||
inline uf::simd::vector<float> uf::simd::set( float x, float y, float z, float w ) {
|
||||
return _mm_setr_ps( x, y, z, w );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::add( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::add( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_add_ps( x, y );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::sub( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::sub( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_sub_ps( x, y );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::mul( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::mul( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_mul_ps( x, y );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::div( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::div( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_div_ps( x, y );
|
||||
}
|
||||
/*
|
||||
inline uf::simd::value<float> uf::simd::hadd( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::hadd( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
#if 0
|
||||
return _mm_hadd_ps( x, y );
|
||||
#else
|
||||
@ -154,312 +168,482 @@ inline uf::simd::value<float> uf::simd::hadd( uf::simd::value<float> x, uf::simd
|
||||
}
|
||||
*/
|
||||
|
||||
inline uf::simd::value<float> uf::simd::min( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::min( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_min_ps( x, y );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::max( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::max( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_max_ps( x, y );
|
||||
}
|
||||
inline bool uf::simd::all( uf::simd::value<float> mask) {
|
||||
return _mm_movemask_ps(mask) == 0xF; // all 4 bits set
|
||||
inline bool uf::simd::all( uf::simd::vector<float> mask) {
|
||||
return _mm_movemask_ps( mask ) == 0xF; // all 4 bits set
|
||||
}
|
||||
inline bool uf::simd::any( uf::simd::value<float> mask) {
|
||||
return _mm_movemask_ps(mask) != 0x0; // any bit set
|
||||
inline bool uf::simd::any( uf::simd::vector<float> mask) {
|
||||
return _mm_movemask_ps( mask ) != 0x0; // any bit set
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::less( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::less( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_cmplt_ps( x, y );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::lessEquals( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::lessEquals( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_cmple_ps( x, y );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::greater( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::greater( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_cmpgt_ps( x, y );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::greaterEquals( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::greaterEquals( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_cmpge_ps( x, y );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::equals( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::equals( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_cmpeq_ps( x, y );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::notEquals( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
inline uf::simd::vector<float> uf::simd::notEquals( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_cmpneq_ps( x, y );
|
||||
}
|
||||
inline uf::simd::value<float> uf::simd::sqrt( uf::simd::value<float> v ) {
|
||||
inline uf::simd::vector<float> uf::simd::sqrt( uf::simd::vector<float> v ) {
|
||||
return _mm_sqrt_ps( v );
|
||||
}
|
||||
inline float uf::simd::dot( uf::simd::value<float> x, uf::simd::value<float> y ) {
|
||||
#if SSE_INSTR_SET >= 5
|
||||
__m128 result = _mm_dp_ps(x, y, 0xF1);
|
||||
return _mm_cvtss_f32(result);
|
||||
#elif SSE_INSTR_SET >= 3
|
||||
__m128 mulRes = _mm_mul_ps(x, y);
|
||||
__m128 shufReg = _mm_movehdup_ps(mulRes);
|
||||
__m128 sumsReg = _mm_add_ps(mulRes, shufReg);
|
||||
shufReg = _mm_movehl_ps(shufReg, sumsReg);
|
||||
sumsReg = _mm_add_ss(sumsReg, shufReg);
|
||||
return _mm_cvtss_f32(sumsReg);
|
||||
#else
|
||||
return uf::vector::sum( uf::simd::vector( uf::simd::mul( x, y ) ) );
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<float> dot_impl( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return uf::simd::mul( x, y );
|
||||
}
|
||||
MV_INSTR_SET_3
|
||||
uf::simd::vector<float> dot_impl( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
__m128 mulRes = _mm_mul_ps(x, y);
|
||||
__m128 shufReg = _mm_movehdup_ps(mulRes);
|
||||
__m128 sumsReg = _mm_add_ps(mulRes, shufReg);
|
||||
shufReg = _mm_movehl_ps(shufReg, sumsReg);
|
||||
return _mm_add_ss(sumsReg, shufReg);
|
||||
}
|
||||
MV_INSTR_SET_5
|
||||
uf::simd::vector<float> dot_impl( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_dp_ps(x, y, 0xF1);
|
||||
}
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::load( const int32_t* f ) {
|
||||
#if SSE_INSTR_SET >= 3
|
||||
// if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast<const __m128i*>(f));
|
||||
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(f));
|
||||
#else
|
||||
return uf::simd::value<int32_t>( f[0], f[1], f[2], f[3] );
|
||||
#endif
|
||||
inline float uf::simd::dot( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return _mm_cvtss_f32( ::dot_impl( x, y ) );
|
||||
}
|
||||
inline void uf::simd::store( uf::simd::value<int32_t> v, int32_t* f ) {
|
||||
#if SSE_INSTR_SET >= 3
|
||||
/*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v);
|
||||
else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v);
|
||||
#else
|
||||
union { __m128i x; int32_t y[4]; } kludge;
|
||||
kludge.x = v;
|
||||
f[0] = kludge.y[0];
|
||||
f[1] = kludge.y[1];
|
||||
f[2] = kludge.y[2];
|
||||
f[3] = kludge.y[3];
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<int32_t> load_impl( const int32_t* f ) {
|
||||
return uf::simd::vector<int32_t>( f[0], f[1], f[2], f[3] );
|
||||
}
|
||||
MV_INSTR_SET_3
|
||||
uf::simd::vector<int32_t> load_impl( const int32_t* f ) {
|
||||
// if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast<const __m128i*>(f));
|
||||
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(f));
|
||||
}
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::set( int32_t f ) {
|
||||
inline uf::simd::vector<int32_t> uf::simd::load( const int32_t* f ) {
|
||||
return ::load_impl( f );
|
||||
}
|
||||
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
void store_impl( uf::simd::vector<int32_t> v, int32_t* f ) {
|
||||
union { __m128i x; int32_t y[4]; } kludge;
|
||||
kludge.x = v;
|
||||
f[0] = kludge.y[0];
|
||||
f[1] = kludge.y[1];
|
||||
f[2] = kludge.y[2];
|
||||
f[3] = kludge.y[3];
|
||||
}
|
||||
MV_INSTR_SET_3
|
||||
void store_impl( uf::simd::vector<int32_t> v, int32_t* f ) {
|
||||
/*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v);
|
||||
else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v);
|
||||
}
|
||||
}
|
||||
inline void uf::simd::store( uf::simd::vector<int32_t> v, int32_t* f ) {
|
||||
return ::store_impl( v, f );
|
||||
}
|
||||
|
||||
|
||||
inline uf::simd::vector<int32_t> uf::simd::set( int32_t f ) {
|
||||
return _mm_set1_epi32(f);
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::set( int32_t x, int32_t y, int32_t z, int32_t w ) {
|
||||
inline uf::simd::vector<int32_t> uf::simd::set( int32_t x, int32_t y, int32_t z, int32_t w ) {
|
||||
return _mm_setr_epi32(x, y, z, w);
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::add( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
inline uf::simd::vector<int32_t> uf::simd::add( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return _mm_add_epi32(x, y);
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::sub( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
inline uf::simd::vector<int32_t> uf::simd::sub( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return _mm_sub_epi32(x, y);
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::mul( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
return _mm_mullo_epi32(x, y);
|
||||
#else
|
||||
auto X = uf::simd::vector(x);
|
||||
auto Y = uf::simd::vector(y);
|
||||
return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]);
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<int32_t> mul_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
auto X = uf::simd::cast(x);
|
||||
auto Y = uf::simd::cast(y);
|
||||
return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]);
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<int32_t> mul_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return _mm_mullo_epi32(x, y);
|
||||
}
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::div( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
auto X = uf::simd::vector( x );
|
||||
auto Y = uf::simd::vector( y );
|
||||
inline uf::simd::vector<int32_t> uf::simd::mul( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return ::mul_impl( x, y );
|
||||
}
|
||||
|
||||
|
||||
inline uf::simd::vector<int32_t> uf::simd::div( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
auto X = uf::simd::cast( x );
|
||||
auto Y = uf::simd::cast( y );
|
||||
return uf::simd::set( X[0] / Y[0], X[1] / Y[1], X[2] / Y[2], X[3] / Y[3] );
|
||||
}
|
||||
/*
|
||||
inline uf::simd::value<int32_t> uf::simd::hadd( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
auto X = uf::simd::vector( x );
|
||||
auto Y = uf::simd::vector( y );
|
||||
inline uf::simd::vector<int32_t> uf::simd::hadd( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
auto X = uf::simd::cast( x );
|
||||
auto Y = uf::simd::cast( y );
|
||||
return uf::simd::set( X[0] + Y[0], X[1] + Y[1], X[2] + Y[2], X[3] + Y[3] );
|
||||
}
|
||||
*/
|
||||
inline uf::simd::value<int32_t> uf::simd::min( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
return _mm_min_epi32(x, y);
|
||||
#else
|
||||
auto X = uf::simd::vector(x);
|
||||
auto Y = uf::simd::vector(y);
|
||||
return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3]));
|
||||
#endif
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<int32_t> min_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
auto X = uf::simd::cast(x);
|
||||
auto Y = uf::simd::cast(y);
|
||||
return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3]));
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<int32_t> min_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return _mm_min_epi32(x, y);
|
||||
}
|
||||
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<int32_t> max_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
auto X = uf::simd::cast(x);
|
||||
auto Y = uf::simd::cast(y);
|
||||
return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3]));
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<int32_t> max_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return _mm_max_epi32(x, y);
|
||||
}
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::max( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
return _mm_max_epi32(x, y);
|
||||
#else
|
||||
auto X = uf::simd::vector(x);
|
||||
auto Y = uf::simd::vector(y);
|
||||
return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3]));
|
||||
#endif
|
||||
inline uf::simd::vector<int32_t> uf::simd::min( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return ::min_impl(x, y);
|
||||
}
|
||||
inline bool uf::simd::all( uf::simd::value<int32_t> mask) {
|
||||
inline uf::simd::vector<int32_t> uf::simd::max( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return ::max_impl(x, y);
|
||||
}
|
||||
|
||||
|
||||
inline bool uf::simd::all( uf::simd::vector<int32_t> mask) {
|
||||
return _mm_movemask_epi8( mask ) == 0xFFFF; // all 4 bits set
|
||||
}
|
||||
inline bool uf::simd::any( uf::simd::value<int32_t> mask) {
|
||||
inline bool uf::simd::any( uf::simd::vector<int32_t> mask) {
|
||||
return _mm_movemask_epi8( mask ) != 0x0; // any bit set
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::less( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
return _mm_cmplt_epi32( x, y );
|
||||
#else
|
||||
auto X = vector( x ), Y = vector( y );
|
||||
return set(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]);
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<int32_t> less_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
|
||||
return uf::simd::set_i(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]);
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<int32_t> less_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return _mm_cmplt_epi32( x, y );
|
||||
}
|
||||
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<int32_t> lessEquals_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
|
||||
return uf::simd::set_i(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]);
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<int32_t> lessEquals_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
__m128i gt = _mm_cmpgt_epi32(x, y);
|
||||
return _mm_xor_si128(gt, _mm_set1_epi32(-1));
|
||||
}
|
||||
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<int32_t> greater_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
|
||||
return uf::simd::set_i(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]);
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<int32_t> greater_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return _mm_cmpgt_epi32(x, y);
|
||||
}
|
||||
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<int32_t> greaterEquals_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
|
||||
return uf::simd::set_i(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]);
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<int32_t> greaterEquals_impl( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
__m128i gt = _mm_cmplt_epi32(x, y);
|
||||
return _mm_xor_si128(gt, _mm_set1_epi32(-1));
|
||||
}
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::lessEquals( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
__m128i gt = _mm_cmpgt_epi32(x, y);
|
||||
return _mm_xor_si128(gt, _mm_set1_epi32(-1));
|
||||
#else
|
||||
auto X = vector( x ), Y = vector( y );
|
||||
return uf::simd::set(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]);
|
||||
#endif
|
||||
inline uf::simd::vector<int32_t> uf::simd::less( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return ::less_impl( x, y );
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::greater( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
return _mm_cmpgt_epi32( x, y );
|
||||
#else
|
||||
auto X = vector( x ), Y = vector( y );
|
||||
return uf::simd::set(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]);
|
||||
#endif
|
||||
inline uf::simd::vector<int32_t> uf::simd::lessEquals( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return ::lessEquals_impl( x, y );
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::greaterEquals( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
__m128i gt = _mm_cmplt_epi32(x, y);
|
||||
return _mm_xor_si128(gt, _mm_set1_epi32(-1));
|
||||
#else
|
||||
auto X = vector( x ), Y = vector( y );
|
||||
return uf::simd::set(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]);
|
||||
#endif
|
||||
inline uf::simd::vector<int32_t> uf::simd::greater( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return ::greater_impl( x, y );
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::equals( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
inline uf::simd::vector<int32_t> uf::simd::greaterEquals( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return ::greaterEquals_impl( x, y );
|
||||
}
|
||||
|
||||
inline uf::simd::vector<int32_t> uf::simd::equals( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return _mm_cmpeq_epi32(x, y);
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::notEquals( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
inline uf::simd::vector<int32_t> uf::simd::notEquals( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
return _mm_xor_si128(_mm_cmpeq_epi32(x, y), _mm_set1_epi32(-1));
|
||||
}
|
||||
inline uf::simd::value<int32_t> uf::simd::sqrt( uf::simd::value<int32_t> v ) {
|
||||
auto V = uf::simd::vector( v );
|
||||
inline uf::simd::vector<int32_t> uf::simd::sqrt( uf::simd::vector<int32_t> v ) {
|
||||
auto V = uf::simd::cast( v );
|
||||
return uf::simd::set( (int32_t) std::sqrt(V[0]), (int32_t) std::sqrt(V[1]), (int32_t) std::sqrt(V[2]), (int32_t) std::sqrt(V[3]) );
|
||||
}
|
||||
inline int32_t uf::simd::dot( uf::simd::value<int32_t> x, uf::simd::value<int32_t> y ) {
|
||||
auto X = uf::simd::vector( x );
|
||||
auto Y = uf::simd::vector( y );
|
||||
inline int32_t uf::simd::dot( uf::simd::vector<int32_t> x, uf::simd::vector<int32_t> y ) {
|
||||
auto X = uf::simd::cast( x );
|
||||
auto Y = uf::simd::cast( y );
|
||||
return X[0] * Y[0] + X[1] * Y[1] + X[2] * Y[2] + X[3] * Y[3];
|
||||
}
|
||||
|
||||
inline uf::simd::value<uint32_t> uf::simd::load( const uint32_t* f ) {
|
||||
#if SSE_INSTR_SET >= 3
|
||||
// if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast<const __m128i*>(f));
|
||||
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(f));
|
||||
#else
|
||||
return uf::simd::value<uint32_t>( f[0], f[1], f[2], f[3] );
|
||||
#endif
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<uint32_t> load_impl( const uint32_t* f ) {
|
||||
return uf::simd::vector<uint32_t>( f[0], f[1], f[2], f[3] );
|
||||
}
|
||||
MV_INSTR_SET_3
|
||||
uf::simd::vector<uint32_t> load_impl( const uint32_t* f ) {
|
||||
// if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast<const __m128i*>(f));
|
||||
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(f));
|
||||
}
|
||||
|
||||
MV_INSTR_SET_DEFAULT
|
||||
void store_impl( uf::simd::vector<uint32_t> v, uint32_t* f ) {
|
||||
union { __m128i x; uint32_t y[4]; } kludge;
|
||||
kludge.x = v;
|
||||
f[0] = kludge.y[0];
|
||||
f[1] = kludge.y[1];
|
||||
f[2] = kludge.y[2];
|
||||
f[3] = kludge.y[3];
|
||||
}
|
||||
MV_INSTR_SET_3
|
||||
void store_impl( uf::simd::vector<uint32_t> v, uint32_t* f ) {
|
||||
/*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v);
|
||||
else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v);
|
||||
}
|
||||
}
|
||||
inline void uf::simd::store( uf::simd::value<uint32_t> v, uint32_t* f ) {
|
||||
#if SSE_INSTR_SET >= 3
|
||||
/*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v);
|
||||
else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v);
|
||||
#else
|
||||
union { __m128i x; uint32_t y[4]; } kludge;
|
||||
kludge.x = v;
|
||||
f[0] = kludge.y[0];
|
||||
f[1] = kludge.y[1];
|
||||
f[2] = kludge.y[2];
|
||||
f[3] = kludge.y[3];
|
||||
#endif
|
||||
inline uf::simd::vector<uint32_t> uf::simd::load( const uint32_t* f ) {
|
||||
return ::load_impl( f );
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::set( uint32_t f ) {
|
||||
inline void uf::simd::store( uf::simd::vector<uint32_t> v, uint32_t* f ) {
|
||||
return ::store_impl( v, f );
|
||||
}
|
||||
|
||||
inline uf::simd::vector<uint32_t> uf::simd::set( uint32_t f ) {
|
||||
return _mm_set1_epi32(f);
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::set( uint32_t x, uint32_t y, uint32_t z, uint32_t w ) {
|
||||
inline uf::simd::vector<uint32_t> uf::simd::set( uint32_t x, uint32_t y, uint32_t z, uint32_t w ) {
|
||||
return _mm_setr_epi32(x, y, z, w);
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::add( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
inline uf::simd::vector<uint32_t> uf::simd::add( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return _mm_add_epi32(x, y);
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::sub( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
inline uf::simd::vector<uint32_t> uf::simd::sub( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return _mm_sub_epi32(x, y);
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::mul( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
return _mm_mullo_epi32(x, y);
|
||||
#else
|
||||
auto X = uf::simd::vector(x);
|
||||
auto Y = uf::simd::vector(y);
|
||||
return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]);
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<uint32_t> mul_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
auto X = uf::simd::cast(x);
|
||||
auto Y = uf::simd::cast(y);
|
||||
return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]);
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<uint32_t> mul_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return _mm_mullo_epi32(x, y);
|
||||
}
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::div( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
auto X = uf::simd::vector( x );
|
||||
auto Y = uf::simd::vector( y );
|
||||
inline uf::simd::vector<uint32_t> uf::simd::mul( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return ::mul_impl( x, y );
|
||||
}
|
||||
|
||||
inline uf::simd::vector<uint32_t> uf::simd::div( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
auto X = uf::simd::cast( x );
|
||||
auto Y = uf::simd::cast( y );
|
||||
return uf::simd::set( X[0] / Y[0], X[1] / Y[1], X[2] / Y[2], X[3] / Y[3] );
|
||||
}
|
||||
/*
|
||||
inline uf::simd::value<uint32_t> uf::simd::hadd( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
auto X = uf::simd::vector( x );
|
||||
auto Y = uf::simd::vector( y );
|
||||
inline uf::simd::vector<uint32_t> uf::simd::hadd( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
auto X = uf::simd::cast( x );
|
||||
auto Y = uf::simd::cast( y );
|
||||
return uf::simd::set( X[0] + Y[0], X[1] + Y[1], X[2] + Y[2], X[3] + Y[3] );
|
||||
}
|
||||
*/
|
||||
inline uf::simd::value<uint32_t> uf::simd::min( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
return _mm_min_epu32(x, y); // unsigned min
|
||||
#else
|
||||
auto X = uf::simd::vector(x);
|
||||
auto Y = uf::simd::vector(y);
|
||||
return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3]));
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<uint32_t> min_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
auto X = uf::simd::cast(x);
|
||||
auto Y = uf::simd::cast(y);
|
||||
return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3]));
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<uint32_t> min_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return _mm_min_epu32(x, y); // unsigned min
|
||||
}
|
||||
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<uint32_t> max_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
auto X = uf::simd::cast(x);
|
||||
auto Y = uf::simd::cast(y);
|
||||
return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3]));
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<uint32_t> max_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return _mm_max_epu32(x, y); // unsigned max
|
||||
}
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::max( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
return _mm_max_epu32(x, y); // unsigned max
|
||||
#else
|
||||
auto X = uf::simd::vector(x);
|
||||
auto Y = uf::simd::vector(y);
|
||||
return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3]));
|
||||
#endif
|
||||
inline uf::simd::vector<uint32_t> uf::simd::min( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return ::min_impl(x, y);
|
||||
}
|
||||
inline bool uf::simd::all( uf::simd::value<uint32_t> mask) {
|
||||
inline uf::simd::vector<uint32_t> uf::simd::max( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return ::max_impl(x, y);
|
||||
}
|
||||
|
||||
inline bool uf::simd::all( uf::simd::vector<uint32_t> mask) {
|
||||
return _mm_movemask_epi8( mask ) == 0xFFFF; // all 4 bits set
|
||||
}
|
||||
inline bool uf::simd::any( uf::simd::value<uint32_t> mask) {
|
||||
inline bool uf::simd::any( uf::simd::vector<uint32_t> mask) {
|
||||
return _mm_movemask_epi8( mask ) != 0x0; // any bit set
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::less( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
return _mm_cmplt_epi32( ::bias_unsigned( x ), ::bias_unsigned( y ) );
|
||||
#else
|
||||
auto X = vector( x ), Y = vector( y );
|
||||
return set(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]);
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<uint32_t> less_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
|
||||
return uf::simd::set_ui(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]);
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<uint32_t> less_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return _mm_cmplt_epi32( ::bias_unsigned(x), ::bias_unsigned(y) );
|
||||
}
|
||||
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<uint32_t> lessEquals_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y) {
|
||||
auto X = uf::simd::cast(x), Y = uf::simd::cast(y);
|
||||
return uf::simd::set_ui(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]);
|
||||
}
|
||||
MV_INSTR_SET_2
|
||||
uf::simd::vector<uint32_t> lessEquals_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y) {
|
||||
// a <= b <=> !(a > b)
|
||||
__m128i bx = ::bias_unsigned(x);
|
||||
__m128i by = ::bias_unsigned(y);
|
||||
__m128i gt = _mm_cmpgt_epi32(bx, by); // signed compare
|
||||
return _mm_xor_si128(gt, _mm_set1_epi32(-1)); // invert mask
|
||||
}
|
||||
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<uint32_t> greater_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
auto X = uf::simd::cast( x ), Y = uf::simd::cast( y );
|
||||
return uf::simd::set_ui(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]);
|
||||
}
|
||||
MV_INSTR_SET_4
|
||||
uf::simd::vector<uint32_t> greater_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return _mm_cmpgt_epi32( ::bias_unsigned(x), ::bias_unsigned(y) );
|
||||
}
|
||||
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<uint32_t> greaterEquals_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y) {
|
||||
auto X = uf::simd::cast(x), Y = uf::simd::cast(y);
|
||||
return uf::simd::set_ui(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]);
|
||||
}
|
||||
MV_INSTR_SET_2
|
||||
uf::simd::vector<uint32_t> greaterEquals_impl( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y) {
|
||||
// a >= b <=> !(a < b)
|
||||
__m128i bx = ::bias_unsigned(x);
|
||||
__m128i by = ::bias_unsigned(y);
|
||||
__m128i lt = _mm_cmplt_epi32(bx, by); // signed compare
|
||||
return _mm_xor_si128(lt, _mm_set1_epi32(-1)); // invert mask
|
||||
}
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::lessEquals(value<uint32_t> x, value<uint32_t> y) {
|
||||
#if SSE_INSTR_SET >= 2
|
||||
// a <= b <=> !(a > b)
|
||||
__m128i bx = ::bias_unsigned(x);
|
||||
__m128i by = ::bias_unsigned(y);
|
||||
__m128i gt = _mm_cmpgt_epi32(bx, by); // signed compare
|
||||
return _mm_xor_si128(gt, _mm_set1_epi32(-1)); // invert mask
|
||||
#else
|
||||
auto X = vector(x), Y = vector(y);
|
||||
return set(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]);
|
||||
#endif
|
||||
inline uf::simd::vector<uint32_t> uf::simd::less( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return ::less_impl( x, y );
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::greater( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
#if SSE_INSTR_SET >= 4
|
||||
return _mm_cmpgt_epi32( ::bias_unsigned( x ), ::bias_unsigned( y ) );
|
||||
#else
|
||||
auto X = vector( x ), Y = vector( y );
|
||||
return uf::simd::set(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]);
|
||||
#endif
|
||||
inline uf::simd::vector<uint32_t> uf::simd::lessEquals( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return ::lessEquals_impl( x, y );
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::greaterEquals(value<uint32_t> x, value<uint32_t> y) {
|
||||
#if SSE_INSTR_SET >= 2
|
||||
// a >= b <=> !(a < b)
|
||||
__m128i bx = ::bias_unsigned(x);
|
||||
__m128i by = ::bias_unsigned(y);
|
||||
__m128i lt = _mm_cmplt_epi32(bx, by); // signed compare
|
||||
return _mm_xor_si128(lt, _mm_set1_epi32(-1)); // invert mask
|
||||
#else
|
||||
auto X = vector(x), Y = vector(y);
|
||||
return set(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]);
|
||||
#endif
|
||||
inline uf::simd::vector<uint32_t> uf::simd::greater( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return ::greater_impl( x, y );
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::equals( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
inline uf::simd::vector<uint32_t> uf::simd::greaterEquals( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return ::greaterEquals_impl( x, y );
|
||||
}
|
||||
|
||||
|
||||
inline uf::simd::vector<uint32_t> uf::simd::equals( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return _mm_cmpeq_epi32(x, y);
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::notEquals( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
inline uf::simd::vector<uint32_t> uf::simd::notEquals( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
return _mm_xor_si128(_mm_cmpeq_epi32(x, y), _mm_set1_epi32(-1));
|
||||
}
|
||||
inline uf::simd::value<uint32_t> uf::simd::sqrt( uf::simd::value<uint32_t> v ) {
|
||||
auto V = uf::simd::vector( v );
|
||||
inline uf::simd::vector<uint32_t> uf::simd::sqrt( uf::simd::vector<uint32_t> v ) {
|
||||
auto V = uf::simd::cast( v );
|
||||
return uf::simd::set( (uint32_t) std::sqrt(V[0]), (uint32_t) std::sqrt(V[1]), (uint32_t) std::sqrt(V[2]), (uint32_t) std::sqrt(V[3]) );
|
||||
}
|
||||
inline uint32_t uf::simd::dot( uf::simd::value<uint32_t> x, uf::simd::value<uint32_t> y ) {
|
||||
auto X = uf::simd::vector( x );
|
||||
auto Y = uf::simd::vector( y );
|
||||
inline uint32_t uf::simd::dot( uf::simd::vector<uint32_t> x, uf::simd::vector<uint32_t> y ) {
|
||||
auto X = uf::simd::cast( x );
|
||||
auto Y = uf::simd::cast( y );
|
||||
return X[0] * Y[0] + X[1] * Y[1] + X[2] * Y[2] + X[3] * Y[3];
|
||||
}
|
||||
|
||||
inline uf::simd::vector<float> uf::simd::set_f( bool x, bool y, bool z, bool w ) {
|
||||
return _mm_castsi128_ps(_mm_setr_epi32(::boolMask(x), ::boolMask(y), ::boolMask(z), ::boolMask(w)));
|
||||
}
|
||||
inline uf::simd::vector<int32_t> uf::simd::set_i( bool x, bool y, bool z, bool w ) {
|
||||
return _mm_setr_epi32(::boolMask(x), ::boolMask(y), ::boolMask(z), ::boolMask(w));
|
||||
}
|
||||
inline uf::simd::vector<uint32_t> uf::simd::set_ui( bool x, bool y, bool z, bool w ) {
|
||||
return _mm_setr_epi32(::boolMask(x), ::boolMask(y), ::boolMask(z), ::boolMask(w));
|
||||
}
|
||||
|
||||
namespace {
|
||||
MV_INSTR_SET_DEFAULT
|
||||
uf::simd::vector<float> cross_impl( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
__m128 tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1));
|
||||
__m128 tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1));
|
||||
tmp0 = _mm_mul_ps(tmp0,x);
|
||||
tmp1 = _mm_mul_ps(tmp1,y);
|
||||
__m128 tmp2 = _mm_sub_ps(tmp0,tmp1);
|
||||
__m128 res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1));
|
||||
return res;
|
||||
}
|
||||
MV_INSTR_SET_7
|
||||
uf::simd::vector<float> cross_impl( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
__m128 tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1));
|
||||
__m128 tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1));
|
||||
tmp1 = _mm_mul_ps(tmp1,y);
|
||||
__m128 tmp2 = _mm_fmsub_ps( tmp0,x, tmp1 );
|
||||
__m128 res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1));
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
inline uf::simd::vector<float> uf::simd::cross( uf::simd::vector<float> x, uf::simd::vector<float> y ) {
|
||||
return ::cross_impl( x, y );
|
||||
}
|
||||
inline uf::simd::vector<float> uf::simd::normalize( uf::simd::vector<float> v ) {
|
||||
__m128 len = _mm_sqrt_ss( ::dot_impl( v,v ) );
|
||||
len = _mm_shuffle_ps(len, len, 0x00);
|
||||
return _mm_div_ps(v, len);
|
||||
}
|
||||
inline uf::simd::vector<float> uf::simd::normalize_fast( uf::simd::vector<float> v ) {
|
||||
__m128 invLen = _mm_rsqrt_ps(::dot_impl(v, v));
|
||||
return _mm_mul_ps(v, invLen);
|
||||
}
|
Loading…
Reference in New Issue
Block a user