diff --git a/engine/inc/uf/utils/math/matrix/matrix.inl b/engine/inc/uf/utils/math/matrix/matrix.inl index 439ec9f0..2268fb18 100644 --- a/engine/inc/uf/utils/math/matrix/matrix.inl +++ b/engine/inc/uf/utils/math/matrix/matrix.inl @@ -1,8 +1,9 @@ -#include "pod.inl" -#if UF_USE_CLASS_OF_PODS - #include "class.inl" +#if UF_USE_SIMD + #include "simd.inl" #endif +#include "pod.inl" + template uf::stl::string /*UF_API*/ uf::string::toString( const pod::Matrix& m ) { return uf::matrix::toString(m); diff --git a/engine/inc/uf/utils/math/matrix/pod.inl b/engine/inc/uf/utils/math/matrix/pod.inl index c1d03f87..9ae45090 100644 --- a/engine/inc/uf/utils/math/matrix/pod.inl +++ b/engine/inc/uf/utils/math/matrix/pod.inl @@ -81,6 +81,11 @@ inline bool pod::Matrix::operator!=( const Matrix& matrix ) const return !uf::matrix::equals( *this, matrix ); } template bool uf::matrix::equals( const T& left, const T& right, float eps ) { +#if UF_USE_SIMD + if constexpr (std::is_same_v) { + return uf::simd::matEquals( left, right, eps ); + } +#endif bool result = true; FOR_EACH(T::rows * T::columns, { if ( fabs(left[i] - right[i]) > eps ) result = false; @@ -91,27 +96,11 @@ template pod::Matrix uf::matrix::multiply( const pod::Matrix< pod::Matrix res; #if UF_USE_SIMD - auto row1 = uf::simd::load(&left[0]); - auto row2 = uf::simd::load(&left[4]); - auto row3 = uf::simd::load(&left[8]); - auto row4 = uf::simd::load(&left[12]); - FOR_EACH(4, { - auto brod1 = uf::simd::set(right[4*i + 0]); - auto brod2 = uf::simd::set(right[4*i + 1]); - auto brod3 = uf::simd::set(right[4*i + 2]); - auto brod4 = uf::simd::set(right[4*i + 3]); - auto row = uf::simd::add( - uf::simd::add( - uf::simd::mul(brod1, row1), - uf::simd::mul(brod2, row2)), - uf::simd::add( - uf::simd::mul(brod3, row3), - uf::simd::mul(brod4, row4))); - uf::simd::store(row, &res[4*i]); - }); - - return res; -#elif UF_ENV_DREAMCAST + if constexpr (std::is_same_v) { + return uf::simd::matMult( left, right ); + } +#endif +#if UF_ENV_DREAMCAST // kallistios has dedicated SH4 asm for these or something mat_load( (matrix_t*) &left[0] ); mat_apply( (matrix_t*) &right[0] ); @@ -122,7 +111,6 @@ template pod::Matrix uf::matrix::multiply( const pod::Matrix< // MATH_Store_XMTRX( (ALL_FLOATS_STRUCT*) &res[0]); return res; #else -#if 1 FOR_EACH_2D(4, 4, { T sum = T{0}; for (size_t k = 0; k < 4; ++k) { @@ -131,29 +119,6 @@ template pod::Matrix uf::matrix::multiply( const pod::Matrix< res(r, c) = sum; }); return res; -#else - // it works - const pod::Vector& srcA0 = *((pod::Vector*) &left[0]); - const pod::Vector& srcA1 = *((pod::Vector*) &left[4]); - const pod::Vector& srcA2 = *((pod::Vector*) &left[8]); - const pod::Vector& srcA3 = *((pod::Vector*) &left[12]); - - const pod::Vector& srcB0 = *((pod::Vector*) &right[0]); - const pod::Vector& srcB1 = *((pod::Vector*) &right[4]); - const pod::Vector& srcB2 = *((pod::Vector*) &right[8]); - const pod::Vector& srcB3 = *((pod::Vector*) &right[12]); - - pod::Vector& dst0 = *((pod::Vector*) &res[0]); - pod::Vector& dst1 = *((pod::Vector*) &res[4]); - pod::Vector& dst2 = *((pod::Vector*) &res[8]); - pod::Vector& dst3 = *((pod::Vector*) &res[12]); - - dst0 = srcA0 * srcB0[0] + srcA1 * srcB0[1] + srcA2 * srcB0[2] + srcA3 * srcB0[3]; - dst1 = srcA0 * srcB1[0] + srcA1 * srcB1[1] + srcA2 * srcB1[2] + srcA3 * srcB1[3]; - dst2 = srcA0 * srcB2[0] + srcA1 * srcB2[1] + srcA2 * srcB2[2] + srcA3 * srcB2[3]; - dst3 = srcA0 * srcB3[0] + srcA1 * srcB3[1] + srcA2 * srcB3[2] + srcA3 * srcB3[3]; - return res; -#endif #endif } template pod::Matrix uf::matrix::multiply( const T& left, const U& right ) { @@ -200,8 +165,12 @@ template T /*UF_API*/ uf::matrix::add( const T& lhs, const T& rhs ) return matrix; } template T uf::matrix::transpose( const T& matrix ) { +#if UF_USE_SIMD + if constexpr (std::is_same_v && T::rows == 4 && T::columns == 4 ) { + return uf::simd::matTranspose( matrix ); + } +#endif T transpose; - FOR_EACH_2D(T::rows, T::columns, { transpose(c, r) = matrix(r, c); }); @@ -283,6 +252,13 @@ pod::Vector3t uf::matrix::multiply(const pod::Matrix3t& mat, const pod::Ve }; } template pod::Vector4t uf::matrix::multiply( const pod::Matrix4t& mat, const pod::Vector4t& v, bool div ) { +#if UF_USE_SIMD + if constexpr (std::is_same_v) { + pod::Vector4t res = uf::simd::matMult( mat, v ); + if ( div && res.w > 0 ) res /= res.w; + return res; + } +#endif #if UF_ENV_DREAMCAST MATH_Load_XMTRX( (ALL_FLOATS_STRUCT*) &mat[0] ); auto t = MATH_Matrix_Transform( v[0], v[1], v[2], v[3] ); @@ -463,17 +439,15 @@ pod::Matrix4t /*UF_API*/ uf::matrix::perspective( T fov, T raidou, T znear, T #endif } template T& uf::matrix::copy( T& destination, const T& source ) { - #pragma unroll // GCC unroll 16 - for ( auto i = 0; i < 16; ++i ) + FOR_EACH(T::rows * T::columns, { destination[i] = source[i]; - + }); return destination; } template T& uf::matrix::copy( T& destination, typename T::type_t* const source ) { - #pragma unroll // GCC unroll 16 - for ( auto i = 0; i < 16; ++i ) + FOR_EACH(T::rows * T::columns, { destination[i] = source[i]; - + }); return destination; } diff --git a/engine/inc/uf/utils/math/matrix/simd.inl b/engine/inc/uf/utils/math/matrix/simd.inl new file mode 100644 index 00000000..65bf0ae9 --- /dev/null +++ b/engine/inc/uf/utils/math/matrix/simd.inl @@ -0,0 +1,226 @@ +namespace uf { + namespace simd { + template + class alignas(16) matrix_value { + public: + typedef typename traits::value value_type; + value_type m[4]; // 4 x 4 + + inline matrix_value(); + inline matrix_value(const pod::Matrix& rhs); + + inline bool operator==(const matrix_value&) const; + inline operator pod::Matrix() const; + }; + } + + namespace simd { + inline uf::simd::matrix_value matMult( const uf::simd::matrix_value& A, const uf::simd::matrix_value& B ); + inline uf::simd::vector matMult( const uf::simd::matrix_value& A, uf::simd::vector B ); + inline uf::simd::matrix_value matTranspose( const uf::simd::matrix_value& M ); + inline bool matEquals( const uf::simd::matrix_value& A, const uf::simd::matrix_value& B, float eps ); + } +} + +namespace { + __attribute__((target("default"))) + uf::simd::matrix_value matMult_impl(const uf::simd::matrix_value& A, const uf::simd::matrix_value& B) { + uf::simd::matrix_value R; + uf::simd::matrix_value Bt = uf::simd::matTranspose(B); + FOR_EACH(4, { + __m128 bcol = B.m[i]; + + __m128 vx = _mm_shuffle_ps(bcol, bcol, 0x00); // xxxx + __m128 vy = _mm_shuffle_ps(bcol, bcol, 0x55); // yyyy + __m128 vz = _mm_shuffle_ps(bcol, bcol, 0xAA); // zzzz + __m128 vw = _mm_shuffle_ps(bcol, bcol, 0xFF); // wwww + + R.m[i] = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(A.m[0], vx), + _mm_mul_ps(A.m[1], vy)), + _mm_add_ps( + _mm_mul_ps(A.m[2], vz), + _mm_mul_ps(A.m[3], vw)) + ); + }); + + return R; + + } + #if 1 + __attribute__((target("sse4.1"))) + uf::simd::matrix_value matMult_impl(const uf::simd::matrix_value& A, const uf::simd::matrix_value& B) { + uf::simd::matrix_value R; + uf::simd::matrix_value Bt = uf::simd::matTranspose(B); + + FOR_EACH(4, { + __m128 bcol = B.m[i]; + + __m128 vx = _mm_shuffle_ps(bcol, bcol, 0x00); // xxxx + __m128 vy = _mm_shuffle_ps(bcol, bcol, 0x55); // yyyy + __m128 vz = _mm_shuffle_ps(bcol, bcol, 0xAA); // zzzz + __m128 vw = _mm_shuffle_ps(bcol, bcol, 0xFF); // wwww + + R.m[i] = _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(A.m[0], vx), + _mm_mul_ps(A.m[1], vy)), + _mm_add_ps( + _mm_mul_ps(A.m[2], vz), + _mm_mul_ps(A.m[3], vw)) + ); + }); + + return R; + } + #endif + #if 1 + __attribute__((target("avx2,fma"))) + uf::simd::matrix_value matMult_impl(const uf::simd::matrix_value& A, const uf::simd::matrix_value& B) { + uf::simd::matrix_value R; + uf::simd::matrix_value Bt = uf::simd::matTranspose(B); + + FOR_EACH(4, { + __m128 bcol = B.m[i]; + + __m256 vx = _mm256_broadcastss_ps(bcol); // xxxx + __m256 vy = _mm256_broadcastss_ps(_mm_shuffle_ps(bcol, bcol, 0x55)); // yyyy + __m256 vz = _mm256_broadcastss_ps(_mm_shuffle_ps(bcol, bcol, 0xAA)); // zzzz + __m256 vw = _mm256_broadcastss_ps(_mm_shuffle_ps(bcol, bcol, 0xFF)); // wwww + + __m256 a0 = _mm256_castps128_ps256(A.m[0]); + __m256 a1 = _mm256_castps128_ps256(A.m[1]); + __m256 a2 = _mm256_castps128_ps256(A.m[2]); + __m256 a3 = _mm256_castps128_ps256(A.m[3]); + + __m256 r = _mm256_fmadd_ps(a0, vx, + _mm256_fmadd_ps(a1, vy, + _mm256_fmadd_ps(a2, vz, + _mm256_mul_ps(a3, vw)))); + + __m128 r128 = _mm_add_ps( + _mm256_castps256_ps128(r), + _mm256_extractf128_ps(r, 1) + ); + + R.m[i] = r128; + }); + + return R; + } + #endif + #if 1 + __attribute__((target("avx512f"))) + uf::simd::matrix_value matMult_impl( const uf::simd::matrix_value& A, const uf::simd::matrix_value& B) { + uf::simd::matrix_value R; + uf::simd::matrix_value Bt = uf::simd::matTranspose(B); + + FOR_EACH(4, { + __m128 bcol = B.m[i]; + + __m512 vx = _mm512_set1_ps(((const float*)&bcol)[0]); // xxxx + __m512 vy = _mm512_set1_ps(((const float*)&bcol)[1]); // yyyy + __m512 vz = _mm512_set1_ps(((const float*)&bcol)[2]); // zzzz + __m512 vw = _mm512_set1_ps(((const float*)&bcol)[3]); // wwww + + __m512 a0 = _mm512_castps128_ps512(A.m[0]); + __m512 a1 = _mm512_castps128_ps512(A.m[1]); + __m512 a2 = _mm512_castps128_ps512(A.m[2]); + __m512 a3 = _mm512_castps128_ps512(A.m[3]); + + __m512 r = _mm512_fmadd_ps(a0, vx, + _mm512_fmadd_ps(a1, vy, + _mm512_fmadd_ps(a2, vz, + _mm512_mul_ps(a3, vw)))); + + __m128 r128 = _mm_add_ps( + _mm_add_ps( + _mm512_castps512_ps128(r), // low 128 + _mm512_extractf32x4_ps(r, 1)), // next 128 + _mm_add_ps( + _mm512_extractf32x4_ps(r, 2), // next 128 + _mm512_extractf32x4_ps(r, 3)) // high 128 + ); + + R.m[i] = r128; + }); + + return R; + } + #endif + + __attribute__((target("default"))) + uf::simd::vector matMult_impl( const uf::simd::matrix_value& M, uf::simd::vector v ) { + __m128 vx = _mm_shuffle_ps(v, v, 0x00); + __m128 vy = _mm_shuffle_ps(v, v, 0x55); + __m128 vz = _mm_shuffle_ps(v, v, 0xAA); + __m128 vw = _mm_shuffle_ps(v, v, 0xFF); + + __m128 r0 = _mm_mul_ps(M.m[0], vx); + __m128 r1 = _mm_mul_ps(M.m[1], vy); + __m128 r2 = _mm_mul_ps(M.m[2], vz); + __m128 r3 = _mm_mul_ps(M.m[3], vw); + + return _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3)); + } + #if 1 + __attribute__((target("fma"))) + uf::simd::vector matMult_impl( const uf::simd::matrix_value& M, uf::simd::vector v ) { + __m128 vx = _mm_shuffle_ps(v, v, 0x00); + __m128 vy = _mm_shuffle_ps(v, v, 0x55); + __m128 vz = _mm_shuffle_ps(v, v, 0xAA); + __m128 vw = _mm_shuffle_ps(v, v, 0xFF); + + return _mm_fmadd_ps(M.m[0], vx, + _mm_fmadd_ps(M.m[1], vy, + _mm_fmadd_ps(M.m[2], vz, + _mm_mul_ps(M.m[3], vw)))); + } + #endif +} + +template +inline uf::simd::matrix_value::matrix_value() {} +template +inline uf::simd::matrix_value::matrix_value( const pod::Matrix& mat ) { + m[0] = _mm_loadu_ps(&mat[0]); + m[1] = _mm_loadu_ps(&mat[4]); + m[2] = _mm_loadu_ps(&mat[8]); + m[3] = _mm_loadu_ps(&mat[12]); +} +template +inline bool uf::simd::matrix_value::operator==(const matrix_value& rhs) const { + return uf::simd::matEquals( *this, rhs ); +} +template +inline uf::simd::matrix_value::operator pod::Matrix() const { + pod::Matrix4f mat; + _mm_storeu_ps(&mat[0], m[0]); + _mm_storeu_ps(&mat[4], m[1]); + _mm_storeu_ps(&mat[8], m[2]); + _mm_storeu_ps(&mat[12], m[3]); + return mat; +} + +inline uf::simd::matrix_value uf::simd::matMult( const uf::simd::matrix_value& A, const uf::simd::matrix_value& B ) { + return ::matMult_impl( A, B ); +} +inline uf::simd::vector uf::simd::matMult( const uf::simd::matrix_value& M, uf::simd::vector vec ) { + return ::matMult_impl( M, vec ); +} +inline uf::simd::matrix_value uf::simd::matTranspose( const uf::simd::matrix_value& M ) { + uf::simd::matrix_value R = M; + _MM_TRANSPOSE4_PS(R.m[0], R.m[1], R.m[2], R.m[3]); + return R; +} +inline bool uf::simd::matEquals( const uf::simd::matrix_value& A, const uf::simd::matrix_value& B, float eps ) { + bool result = true; + __m128 e = _mm_set1_ps(eps); + FOR_EACH(4, { + __m128 diff = _mm_sub_ps(A.m[i], B.m[i]); + __m128 mask = _mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.0f), diff), e); + if (_mm_movemask_ps(mask)) result = false; + }); + return result; +} \ No newline at end of file diff --git a/engine/inc/uf/utils/math/physics/impl.h b/engine/inc/uf/utils/math/physics/impl.h index 0e367dd1..4cbf65bd 100644 --- a/engine/inc/uf/utils/math/physics/impl.h +++ b/engine/inc/uf/utils/math/physics/impl.h @@ -28,9 +28,9 @@ namespace pod { }; struct SupportPoint { - alignas(16) pod::Vector3f p; - alignas(16) pod::Vector3f pA; - alignas(16) pod::Vector3f pB; + /*alignas(16)*/ pod::Vector3f p; + /*alignas(16)*/ pod::Vector3f pA; + /*alignas(16)*/ pod::Vector3f pB; }; struct Simplex { @@ -39,7 +39,7 @@ namespace pod { struct Face { pod::SupportPoint a, b, c; - alignas(16) pod::Vector3f normal; + /*alignas(16)*/ pod::Vector3f normal; float distance; }; @@ -62,7 +62,7 @@ namespace pod { typedef uf::stl::unordered_set pairs_t; struct Node { - alignas(16) pod::AABB bounds = {}; + /*alignas(16)*/ pod::AABB bounds = {}; int32_t left = -1; int32_t right = -1; int32_t start = 0; @@ -71,7 +71,7 @@ namespace pod { bool asleep = false; }; struct FlatNode { - alignas(16) pod::AABB bounds = {}; + /*alignas(16)*/ pod::AABB bounds = {}; int32_t start = -1; int32_t count = -1; int32_t skipIndex = -1; @@ -171,23 +171,23 @@ namespace pod { float mass = 1.0f; float inverseMass = 1.0f; - alignas(16) pod::Vector3f offset = {}; + /*alignas(16)*/ pod::Vector3f offset = {}; - alignas(16) pod::Vector3f velocity = {}; - alignas(16) pod::Vector3f forceAccumulator = {}; + /*alignas(16)*/ pod::Vector3f velocity = {}; + /*alignas(16)*/ pod::Vector3f forceAccumulator = {}; - alignas(16) pod::Vector3f angularVelocity = {}; - alignas(16) pod::Vector3f torqueAccumulator = {}; + /*alignas(16)*/ pod::Vector3f angularVelocity = {}; + /*alignas(16)*/ pod::Vector3f torqueAccumulator = {}; - alignas(16) pod::Vector3f inertiaTensor = { 1, 1, 1 }; - alignas(16) pod::Vector3f inverseInertiaTensor = { 1, 1, 1 }; + /*alignas(16)*/ pod::Vector3f inertiaTensor = { 1, 1, 1 }; + /*alignas(16)*/ pod::Vector3f inverseInertiaTensor = { 1, 1, 1 }; - alignas(16) pod::Vector3f gravity = { NAN, NAN, NAN }; // an invalid gravity will fallback to world gravity + /*alignas(16)*/ pod::Vector3f gravity = { NAN, NAN, NAN }; // an invalid gravity will fallback to world gravity - alignas(16) pod::AABB bounds; - alignas(16) pod::Collider collider; - alignas(16) pod::PhysicsMaterial material; - alignas(16) pod::Activity activity; + /*alignas(16)*/ pod::AABB bounds; + /*alignas(16)*/ pod::Collider collider; + /*alignas(16)*/ pod::PhysicsMaterial material; + /*alignas(16)*/ pod::Activity activity; }; struct Contact { diff --git a/engine/inc/uf/utils/math/quaternion/pod.inl b/engine/inc/uf/utils/math/quaternion/pod.inl index e791e123..60c4c9f2 100644 --- a/engine/inc/uf/utils/math/quaternion/pod.inl +++ b/engine/inc/uf/utils/math/quaternion/pod.inl @@ -9,7 +9,7 @@ template pod::Quaternion uf::quaternion::identity() { return pod::Quaternion{ 0, 0, 0, 1 }; } template T uf::quaternion::multiply( const T& q1, const T& q2 ) { -#if 0 && UF_USE_SIMD +#if UF_USE_SIMD if constexpr (std::is_same_v) { return uf::simd::quatMul( q1 , q2 ); } @@ -22,9 +22,9 @@ template T uf::quaternion::multiply( const T& q1, const T& q2 ) { }; } template pod::Vector3t uf::quaternion::rotate( const pod::Quaternion& Q, const pod::Vector3t& v ) { -#if 0 && UF_USE_SIMD +#if UF_USE_SIMD if constexpr (std::is_same_v) { - return uf::simd::quatRot( Q, v ); + return uf::simd::quatRot_3f( Q, v ); } #endif pod::Vector3t q = { Q.x, Q.y, Q.z }; diff --git a/engine/inc/uf/utils/math/quaternion/quaternion.inl b/engine/inc/uf/utils/math/quaternion/quaternion.inl index f29984c5..13539a06 100644 --- a/engine/inc/uf/utils/math/quaternion/quaternion.inl +++ b/engine/inc/uf/utils/math/quaternion/quaternion.inl @@ -1,5 +1,3 @@ -#pragma once - #if UF_USE_SIMD #include "simd.inl" #endif diff --git a/engine/inc/uf/utils/math/quaternion/simd.inl b/engine/inc/uf/utils/math/quaternion/simd.inl index d504abbb..a3f71f48 100644 --- a/engine/inc/uf/utils/math/quaternion/simd.inl +++ b/engine/inc/uf/utils/math/quaternion/simd.inl @@ -1,94 +1,81 @@ namespace uf { namespace simd { - inline value /*UF_API*/ quatMul( value, value ); - inline value /*UF_API*/ quatRot( value, value ); - inline pod::Matrix4f /*UF_API*/ quatMat( value ); + inline vector /*UF_API*/ quatMul( vector, vector ); + inline vector /*UF_API*/ quatRot_3f( vector, vector ); + inline pod::Matrix4f /*UF_API*/ quatMat( vector ); } } -inline uf::simd::value uf::simd::quatMul( uf::simd::value Q1, uf::simd::value Q2 ) { - //__m128 Q1 = q1; - //__m128 Q2 = q2; +inline uf::simd::vector uf::simd::quatMul( uf::simd::vector Q1, uf::simd::vector Q2 ) { + // broadcast q1 components + __m128 x1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(0,0,0,0)); + __m128 y1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(1,1,1,1)); + __m128 z1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(2,2,2,2)); + __m128 w1 = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(3,3,3,3)); - // Broadcast q1.w, q1.x, q1.y, q1.z - __m128 q1w = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(3,3,3,3)); - __m128 q1x = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(0,0,0,0)); - __m128 q1y = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(1,1,1,1)); - __m128 q1z = _mm_shuffle_ps(Q1, Q1, _MM_SHUFFLE(2,2,2,2)); + // broadcast q2 components + __m128 x2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(0,0,0,0)); + __m128 y2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(1,1,1,1)); + __m128 z2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(2,2,2,2)); + __m128 w2 = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(3,3,3,3)); - // Shuffle q2 into (x,y,z,w) permutations - __m128 q2xyzw = Q2; // (x,y,z,w) - __m128 q2wzyx = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(0,1,2,3)); // (w,z,y,x) - __m128 q2yzxw = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(3,0,2,1)); // (y,z,x,w) - __m128 q2zxyw = _mm_shuffle_ps(Q2, Q2, _MM_SHUFFLE(3,1,0,2)); // (z,x,y,w) + // compute each component + __m128 X = _mm_add_ps( + _mm_add_ps(_mm_mul_ps(w1, x2), _mm_mul_ps(x1, w2)), + _mm_sub_ps(_mm_mul_ps(y1, z2), _mm_mul_ps(z1, y2)) + ); - // Compute terms - __m128 t0 = _mm_mul_ps(q1w, q2xyzw); // w1 * (x2,y2,z2,w2) - __m128 t1 = _mm_mul_ps(q1x, q2wzyx); // x1 * (w2,z2,y2,x2) - __m128 t2 = _mm_mul_ps(q1y, q2yzxw); // y1 * (y2,z2,x2,w2) - __m128 t3 = _mm_mul_ps(q1z, q2zxyw); // z1 * (z2,x2,y2,w2) + __m128 Y = _mm_add_ps( + _mm_add_ps(_mm_mul_ps(w1, y2), _mm_mul_ps(y1, w2)), + _mm_sub_ps(_mm_mul_ps(z1, x2), _mm_mul_ps(x1, z2)) + ); - // Signs: (+,+,+,+), (+,-,+,-), (-,+,-,+), (+,-,-,+) - const __m128 sign1 = _mm_set_ps( 1.f,-1.f, 1.f,-1.f); - const __m128 sign2 = _mm_set_ps(-1.f, 1.f,-1.f, 1.f); - const __m128 sign3 = _mm_set_ps( 1.f,-1.f,-1.f, 1.f); + __m128 Z = _mm_add_ps( + _mm_add_ps(_mm_mul_ps(w1, z2), _mm_mul_ps(z1, w2)), + _mm_sub_ps(_mm_mul_ps(x1, y2), _mm_mul_ps(y1, x2)) + ); - t1 = _mm_mul_ps(t1, sign1); - t2 = _mm_mul_ps(t2, sign2); - t3 = _mm_mul_ps(t3, sign3); + __m128 W = _mm_sub_ps( + _mm_mul_ps(w1, w2), + _mm_add_ps( + _mm_add_ps(_mm_mul_ps(x1, x2), _mm_mul_ps(y1, y2)), + _mm_mul_ps(z1, z2) + ) + ); - __m128 result = _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)); + // pack back into (x,y,z,w) + __m128 result = _mm_movelh_ps(_mm_unpacklo_ps(X, Y), _mm_unpacklo_ps(Z, W)); return result; } -inline uf::simd::value uf::simd::quatRot( uf::simd::value Q, uf::simd::value V ) { - //__m128 Q = q; // (x,y,z,w) - //__m128 V = v; // (vx,vy,vz,0) - - // Extract q.xyz and q.w +inline uf::simd::vector uf::simd::quatRot_3f( uf::simd::vector Q, uf::simd::vector V ) { + // extract q.xyz and q.w __m128 qxyz = _mm_and_ps(Q, _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))); // mask out w __m128 qw = _mm_shuffle_ps(Q, Q, _MM_SHUFFLE(3,3,3,3)); - // dot(q.xyz, v) -#if SSE_INSTR_SET >= 4 - __m128 dot_qv = _mm_dp_ps(qxyz, V, 0x71); // result in lowest lane -#else - __m128 mul = _mm_mul_ps(qxyz, V); - __m128 shuf = _mm_movehdup_ps(mul); - __m128 sums = _mm_add_ps(mul, shuf); - shuf = _mm_movehl_ps(shuf, sums); - sums = _mm_add_ss(sums, shuf); - __m128 dot_qv = sums; -#endif - __m128 term1 = _mm_mul_ps(_mm_mul_ps(dot_qv, _mm_set1_ps(2.0f)), qxyz); - - // dot(q.xyz, q.xyz) -#if SSE_INSTR_SET >= 4 - __m128 dot_qq = _mm_dp_ps(qxyz, qxyz, 0x71); -#else - __m128 mul2 = _mm_mul_ps(qxyz, qxyz); - __m128 shuf2 = _mm_movehdup_ps(mul2); - __m128 sums2 = _mm_add_ps(mul2, shuf2); - shuf2 = _mm_movehl_ps(shuf2, sums2); - sums2 = _mm_add_ss(sums2, shuf2); - __m128 dot_qq = sums2; -#endif - __m128 w2 = _mm_mul_ps(qw, qw); - __m128 coeff = _mm_sub_ps(w2, dot_qq); - __m128 term2 = _mm_mul_ps(coeff, V); - // cross(q.xyz, v) __m128 q_yzx = _mm_shuffle_ps(qxyz, qxyz, _MM_SHUFFLE(3,0,2,1)); __m128 v_yzx = _mm_shuffle_ps(V, V, _MM_SHUFFLE(3,0,2,1)); - __m128 cross = _mm_sub_ps(_mm_mul_ps(qxyz, v_yzx), _mm_mul_ps(q_yzx, V)); - cross = _mm_shuffle_ps(cross, cross, _MM_SHUFFLE(3,0,2,1)); - __m128 term3 = _mm_mul_ps(_mm_mul_ps(cross, qw), _mm_set1_ps(2.0f)); + __m128 cross1 = _mm_sub_ps(_mm_mul_ps(qxyz, v_yzx), _mm_mul_ps(q_yzx, V)); + cross1 = _mm_shuffle_ps(cross1, cross1, _MM_SHUFFLE(3,0,2,1)); - // Final result - __m128 result = _mm_add_ps(_mm_add_ps(term1, term2), term3); + // 2 * w * cross(q,v) + __m128 term1 = _mm_mul_ps(_mm_mul_ps(cross1, qw), _mm_set1_ps(2.0f)); + + // cross(q, cross(q,v)) + __m128 c1_yzx = _mm_shuffle_ps(cross1, cross1, _MM_SHUFFLE(3,0,2,1)); + __m128 cross2 = _mm_sub_ps(_mm_mul_ps(qxyz, c1_yzx), _mm_mul_ps(q_yzx, cross1)); + cross2 = _mm_shuffle_ps(cross2, cross2, _MM_SHUFFLE(3,0,2,1)); + + // 2 * cross(q, cross(q,v)) + __m128 term2 = _mm_mul_ps(cross2, _mm_set1_ps(2.0f)); + + // v + term1 + term2 + __m128 result = _mm_add_ps(_mm_add_ps(V, term1), term2); return result; } -inline pod::Matrix4f uf::simd::quatMat( uf::simd::value Q ) { + +inline pod::Matrix4f uf::simd::quatMat( uf::simd::vector Q ) { // Shuffle out components __m128 qx = _mm_shuffle_ps(Q, Q, _MM_SHUFFLE(0,0,0,0)); __m128 qy = _mm_shuffle_ps(Q, Q, _MM_SHUFFLE(1,1,1,1)); diff --git a/engine/inc/uf/utils/math/vector/pod.inl b/engine/inc/uf/utils/math/vector/pod.inl index 370571ea..f78e13d5 100644 --- a/engine/inc/uf/utils/math/vector/pod.inl +++ b/engine/inc/uf/utils/math/vector/pod.inl @@ -17,7 +17,7 @@ constexpr void for_each_index(F&& f) { template T elementwise( const T& left, const T& right, Op&& op ) { - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = op(left[i], right[i]); }); @@ -46,7 +46,7 @@ pod::Vector uf::vector::copy( const pod::Vector& v ) { } template pod::Vector uf::vector::cast( const U& from ) { - alignas(16) pod::Vector to; + pod::Vector to; #pragma unroll // GCC unroll N for ( auto i = 0; i < N && i < U::size; ++i ) to[i] = from[i]; @@ -147,7 +147,7 @@ T uf::vector::add( const T& left, const T& right ) { return uf::simd::add( left, right ); } #endif - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = left[i] + right[i]; }); @@ -160,7 +160,7 @@ T uf::vector::add( const T& vector, typename T::type_t scalar ) { return uf::simd::add( vector, scalar ); } #endif - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = vector[i] + scalar; }); @@ -177,7 +177,7 @@ T uf::vector::subtract( const T& left, const T& right ) { return uf::simd::sub( left, right ); } #endif - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = left[i] - right[i]; }); @@ -190,7 +190,7 @@ T uf::vector::subtract( const T& vector, typename T::type_t scalar ) { return uf::simd::sub( vector, scalar ); } #endif - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = vector[i] - scalar; }); @@ -203,7 +203,7 @@ T uf::vector::subtract( typename T::type_t scalar, const T& vector ) { return uf::simd::sub( scalar, vector ); } #endif - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = scalar - vector[i]; }); @@ -216,7 +216,7 @@ T uf::vector::multiply( const T& left, const T& right ) { return uf::simd::mul( left, right ); } #endif - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = left[i] * right[i]; }); @@ -229,7 +229,7 @@ T uf::vector::multiply( const T& vector, typename T::type_t scalar ) { return uf::simd::mul( vector, scalar ); } #endif - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = vector[i] * scalar; }); @@ -247,14 +247,14 @@ T uf::vector::divide( const T& left, const T& right ) { } #elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD if constexpr ( simd_able_v ) { - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = MATH_Fast_Divide( left[i], right[i] ); }); return res; } #endif - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = left[i] / right[i]; }); @@ -268,14 +268,14 @@ T uf::vector::divide( const T& vector, typename T::type_t scalar ) { } #elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD if constexpr ( simd_able_v ) { - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = MATH_Fast_Divide( vector[i], scalar ); }); return res; } #endif - alignas(16) T res; + T res; scalar = static_cast(1) / scalar; FOR_EACH(T::size, { res[i] = vector[i] * scalar; @@ -290,14 +290,14 @@ T uf::vector::divide( typename T::type_t scalar, const T& vector ) { } #elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD if constexpr ( simd_able_v ) { - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = MATH_Fast_Divide( scalar, vector[i] ); }); return res; } #endif - alignas(16) T res; + T res; scalar = static_cast(1) / scalar; FOR_EACH(T::size, { res[i] = scalar / vector[i]; @@ -327,7 +327,7 @@ T uf::vector::negate( const T& vector ) { return uf::simd::mul( vector, -1.f ); } #endif - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = -vector[i]; }); @@ -335,7 +335,7 @@ T uf::vector::negate( const T& vector ) { } template T uf::vector::abs( const T& vector ) { - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = std::abs( vector[i] ); }); @@ -575,7 +575,7 @@ T uf::vector::lerp( const T& from, const T& to, double delta, bool clamp ) { // from + ( ( to - from ) * delta ) #if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD if constexpr ( simd_able_v ) { - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = MATH_Lerp( from[i], to[i], delta ); }); @@ -594,7 +594,7 @@ T uf::vector::lerp( const T& from, const T& to, const T& delta, bool clamp ) { // from + ( ( to - from ) * delta ) #if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD if constexpr ( simd_able_v ) { - alignas(16) T res; + T res; FOR_EACH(T::size, { res[i] = MATH_Lerp( from[i], to[i], delta[i] ); }); @@ -638,16 +638,16 @@ template typename T::type_t uf::vector::distanceSquared( const T& a, const T& b ) { #if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD if constexpr ( simd_able_v ) { - alignas(16) T delta = uf::vector::subtract(b, a); + T delta = uf::vector::subtract(b, a); return MATH_Sum_of_Squares( UF_EZ_VEC4( delta, T::size ) ); } #elif UF_USE_SIMD if constexpr ( simd_able_v ) { - uf::simd::value delta = uf::simd::sub( b, a ); + uf::simd::vector delta = uf::simd::sub( b, a ); return uf::simd::dot( delta, delta ); } #endif - alignas(16) T delta = uf::vector::subtract( b, a ); + T delta = uf::vector::subtract( b, a ); return uf::vector::dot( delta, delta ); } template @@ -674,10 +674,15 @@ typename T::type_t uf::vector::norm( const T& vector ) { } template T uf::vector::normalize( const T& vector ) { +#if UF_USE_SIMD + if constexpr ( std::is_same_v ) { + return uf::simd::normalize( vector ); + } +#endif typename T::type_t norm = uf::vector::norm(vector); if ( norm == 0 ) return vector; #if UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD - if constexpr ( simd_able_v ) { + if constexpr ( std::is_same_v ) { return uf::vector::multiply(vector, MATH_fsrra(norm)); } #endif @@ -698,8 +703,8 @@ T uf::vector::clampMagnitude( const T& v, float maxMag ) { template void uf::vector::orthonormalize( T& normal, T& tangent ) { normal = uf::vector::normalize( normal ); - alignas(16) T norm = normal; - alignas(16) T tan = uf::vector::normalize( tangent ); + T norm = normal; + T tan = uf::vector::normalize( tangent ); tangent = uf::vector::subtract( tan, uf::vector::multiply( norm, uf::vector::dot( norm, tan ) ) ); tangent = uf::vector::normalize( tangent ); } @@ -711,32 +716,15 @@ template T uf::vector::cross( const T& a, const T& b ) { #if UF_USE_SIMD if constexpr ( simd_able_v ) { - uf::simd::value x = a; - uf::simd::value y = b; - #if SSE_INSTR_SET >= 7 - uf::simd::value tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1)); - uf::simd::value tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1)); - tmp1 = _mm_mul_ps(tmp1,y); - uf::simd::value tmp2 = _mm_fmsub_ps( tmp0,x, tmp1 ); - uf::simd::value res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1)); - return res; - #else - uf::simd::value tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1)); - uf::simd::value tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1)); - tmp0 = _mm_mul_ps(tmp0,x); - tmp1 = _mm_mul_ps(tmp1,y); - uf::simd::value tmp2 = _mm_sub_ps(tmp0,tmp1); - uf::simd::value res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1)); - return res; - #endif + return uf::simd::cross( a, b ); } #elif UF_ENV_DREAMCAST && UF_ENV_DREAMCAST_SIMD if constexpr ( simd_able_v ) { - alignas(16) auto res = MATH_Cross_Product( a.x, a.y, a.z, b.x, b.y, b.z ); + auto res = MATH_Cross_Product( a.x, a.y, a.z, b.x, b.y, b.z ); return *((T*) &res); } #endif - alignas(16) T res{ + T res{ a.y * b.z - b.y * a.z, a.z * b.x - b.z * a.x, a.x * b.y - b.x * a.y diff --git a/engine/inc/uf/utils/math/vector/simd.h b/engine/inc/uf/utils/math/vector/simd.h index 6359eedc..0e8248e7 100644 --- a/engine/inc/uf/utils/math/vector/simd.h +++ b/engine/inc/uf/utils/math/vector/simd.h @@ -7,34 +7,34 @@ #endif #define DEFINE_SIMD(T)\ - inline value /*UF_API*/ load( const T* );\ - inline void /*UF_API*/ store( value, T* );\ - inline value /*UF_API*/ set( T );\ - inline value /*UF_API*/ set( T, T, T, T );\ - inline value /*UF_API*/ add( value, value );\ - inline value /*UF_API*/ sub( value, value );\ - inline value /*UF_API*/ mul( value, value );\ - inline value /*UF_API*/ div( value, value );\ - inline value /*UF_API*/ min( value, value );\ - inline value /*UF_API*/ max( value, value );\ - inline bool /*UF_API*/ all( value );\ - inline bool /*UF_API*/ any( value );\ - inline value /*UF_API*/ less( value, value );\ - inline value /*UF_API*/ lessEquals( value, value );\ - inline value /*UF_API*/ greater( value, value );\ - inline value /*UF_API*/ greaterEquals( value, value );\ - inline value /*UF_API*/ equals( value, value );\ - inline value /*UF_API*/ notEquals( value, value );\ - inline value /*UF_API*/ sqrt( value );\ - inline value /*UF_API*/ hadd( value, value );\ - inline T /*UF_API*/ dot( value, value );\ - template inline pod::Vector vector( const value );\ + inline vector /*UF_API*/ load( const T* );\ + inline void /*UF_API*/ store( vector, T* );\ + inline vector /*UF_API*/ set( T );\ + inline vector /*UF_API*/ set( T, T, T, T );\ + inline vector /*UF_API*/ add( vector, vector );\ + inline vector /*UF_API*/ sub( vector, vector );\ + inline vector /*UF_API*/ mul( vector, vector );\ + inline vector /*UF_API*/ div( vector, vector );\ + inline vector /*UF_API*/ min( vector, vector );\ + inline vector /*UF_API*/ max( vector, vector );\ + inline bool /*UF_API*/ all( vector );\ + inline bool /*UF_API*/ any( vector );\ + inline vector /*UF_API*/ less( vector, vector );\ + inline vector /*UF_API*/ lessEquals( vector, vector );\ + inline vector /*UF_API*/ greater( vector, vector );\ + inline vector /*UF_API*/ greaterEquals( vector, vector );\ + inline vector /*UF_API*/ equals( vector, vector );\ + inline vector /*UF_API*/ notEquals( vector, vector );\ + inline vector /*UF_API*/ sqrt( vector );\ + inline vector /*UF_API*/ hadd( vector, vector );\ + inline T /*UF_API*/ dot( vector, vector );\ + template inline pod::Vector cast( const vector );\ namespace uf { namespace simd { template struct UF_API traits { - static const size_t size = 4; + static constexpr size_t size = 4; typedef T type; typedef __m128 value; typedef pod::Vector vector; @@ -42,60 +42,60 @@ namespace uf { template<> struct UF_API traits { - static const size_t size = 4; + static constexpr size_t size = 4; typedef int32_t type; typedef __m128i value; - typedef pod::Vector vector; + typedef pod::Vector vector; }; template<> struct UF_API traits { - static const size_t size = 4; + static constexpr size_t size = 4; typedef uint32_t type; typedef __m128i value; - typedef pod::Vector vector; + typedef pod::Vector vector; }; template<> struct UF_API traits { - static const size_t size = 4; + static constexpr size_t size = 4; typedef float type; typedef __m128 value; - typedef pod::Vector vector; + typedef pod::Vector vector; }; template - class /*UF_API*/ alignas(16) value { - private: - // __m128 m_value; - typedef typename traits::value value_type; - value_type m_value; + class /*UF_API*/ alignas(16) vector { public: - inline value(); - inline value(const T* f); - inline value(T f); - inline value(T f0, T f1, T f2, T f3); - inline value(const value_type& rhs); - inline value(const value& rhs); + // __m128 m; + typedef typename traits::value value_type; + value_type m; + inline vector(); + inline vector(const T* f); + inline vector(T f); + inline vector(T f0, T f1, T f2, T f3); + inline vector(bool f0, bool f1, bool f2, bool f3); + inline vector(const value_type& rhs); + inline vector(const vector& rhs); - inline value(const pod::Vector& rhs); - inline value(const pod::Vector& rhs); - inline value(const pod::Vector& rhs); - inline value(const pod::Vector& rhs); + inline vector(const pod::Vector& rhs); + inline vector(const pod::Vector& rhs); + inline vector(const pod::Vector& rhs); + inline vector(const pod::Vector& rhs); - inline value operator+( const value& rhs ); - inline value operator-( const value& rhs ); - inline value operator*( const value& rhs ); - inline value operator/( const value& rhs ); + inline vector operator+( const vector& rhs ); + inline vector operator-( const vector& rhs ); + inline vector operator*( const vector& rhs ); + inline vector operator/( const vector& rhs ); - inline value operator<( const value& rhs ); - inline value operator<=( const value& rhs ); - inline value operator>( const value& rhs ); - inline value operator>=( const value& rhs ); - inline value operator==( const value& rhs ); - inline value operator!=( const value& rhs ); + inline vector operator<( const vector& rhs ); + inline vector operator<=( const vector& rhs ); + inline vector operator>( const vector& rhs ); + inline vector operator>=( const vector& rhs ); + inline vector operator==( const vector& rhs ); + inline vector operator!=( const vector& rhs ); - inline value& operator=(const value_type& rhs); - inline value& operator=(const value& rhs); - inline value& operator=(const pod::Vector& rhs); + inline vector& operator=(const value_type& rhs); + inline vector& operator=(const vector& rhs); + inline vector& operator=(const pod::Vector& rhs); inline operator value_type() const; @@ -107,12 +107,23 @@ namespace uf { DEFINE_SIMD(uint32_t); // these are effectively NOPs + /* #if UF_USE_FLOAT16 DEFINE_SIMD(std::float16_t) #endif #if UF_USE_BFLOAT16 DEFINE_SIMD(std::bfloat16_t) #endif + */ + + // specializations + inline vector /*UF_API*/ set_f( bool, bool, bool, bool ); + inline vector /*UF_API*/ set_i( bool, bool, bool, bool ); + inline vector /*UF_API*/ set_ui( bool, bool, bool, bool ); + + inline vector /*UF_API*/ cross( vector x, vector y ); + inline vector /*UF_API*/ normalize( vector x ); + inline vector /*UF_API*/ normalize_fast( vector x ); } } diff --git a/engine/inc/uf/utils/math/vector/simd.inl b/engine/inc/uf/utils/math/vector/simd.inl index 4717bced..c71cc347 100644 --- a/engine/inc/uf/utils/math/vector/simd.inl +++ b/engine/inc/uf/utils/math/vector/simd.inl @@ -5,143 +5,157 @@ namespace { const __m128i signbit = _mm_set1_epi32(0x80000000); return _mm_xor_si128(v, signbit); } + + inline int32_t boolMask(bool b) { + return b ? -1 : 0; // 0xFFFFFFFF for true, 0x00000000 for false + } } -template -inline uf::simd::value::value() {} -template -inline uf::simd::value::value(const T* f) : m_value(uf::simd::load(f)) {} -template -inline uf::simd::value::value(T f) : m_value(uf::simd::set(f)) {} -template -inline uf::simd::value::value(T f0, T f1, T f2, T f3) : m_value(uf::simd::set(f0,f1,f2,f3)) {} -template -inline uf::simd::value::value(const value_type& rhs) : m_value(rhs) {} -template -inline uf::simd::value::value(const value& rhs) : m_value(rhs.m_value) {} +#define MV_INSTR_SET_DEFAULT __attribute__((target("default"))) +#define MV_INSTR_SET_2 __attribute__((target("sse2"))) +#define MV_INSTR_SET_3 __attribute__((target("sse3"))) +#define MV_INSTR_SET_4 __attribute__((target("ssse3"))) +#define MV_INSTR_SET_5 __attribute__((target("sse4.1"))) +#define MV_INSTR_SET_6 __attribute__((target("sse4.2"))) +#define MV_INSTR_SET_7 __attribute__((target("avx"))) template -inline uf::simd::value::value(const pod::Vector& rhs) : value((T) rhs[0]){} +inline uf::simd::vector::vector() {} template -inline uf::simd::value::value(const pod::Vector& rhs) : value((T) rhs[0], (T) rhs[1], 0, 0){} +inline uf::simd::vector::vector(const T* f) : m(uf::simd::load(f)) {} template -inline uf::simd::value::value(const pod::Vector& rhs) : value((T) rhs[0], (T) rhs[1], (T) rhs[2], 0){} +inline uf::simd::vector::vector(T f) : m(uf::simd::set(f)) {} template -inline uf::simd::value::value(const pod::Vector& rhs) : value((T) rhs[0], (T) rhs[1], (T) rhs[2], (T) rhs[3]){} +inline uf::simd::vector::vector(T f0, T f1, T f2, T f3) : m(uf::simd::set(f0,f1,f2,f3)) {} +template +inline uf::simd::vector::vector(bool f0, bool f1, bool f2, bool f3) : m(uf::simd::set(f0,f1,f2,f3)) {} +template +inline uf::simd::vector::vector(const value_type& rhs) : m(rhs) {} +template +inline uf::simd::vector::vector(const vector& rhs) : m(rhs.m) {} template -inline uf::simd::value uf::simd::value::operator+( const value& rhs ) { +inline uf::simd::vector::vector(const pod::Vector& rhs) : vector((T) rhs[0]){} +template +inline uf::simd::vector::vector(const pod::Vector& rhs) : vector((T) rhs[0], (T) rhs[1], 0, 0){} +template +inline uf::simd::vector::vector(const pod::Vector& rhs) : vector((T) rhs[0], (T) rhs[1], (T) rhs[2], 0){} +template +inline uf::simd::vector::vector(const pod::Vector& rhs) : vector((T) rhs[0], (T) rhs[1], (T) rhs[2], (T) rhs[3]){} + +template +inline uf::simd::vector uf::simd::vector::operator+( const vector& rhs ) { return uf::simd::add( *this, rhs ); } template -inline uf::simd::value uf::simd::value::operator-( const value& rhs ) { +inline uf::simd::vector uf::simd::vector::operator-( const vector& rhs ) { return uf::simd::sub( *this, rhs ); } template -inline uf::simd::value uf::simd::value::operator*( const value& rhs ) { +inline uf::simd::vector uf::simd::vector::operator*( const vector& rhs ) { return uf::simd::mul( *this, rhs ); } template -inline uf::simd::value uf::simd::value::operator/( const value& rhs ) { +inline uf::simd::vector uf::simd::vector::operator/( const vector& rhs ) { return uf::simd::div( *this, rhs ); } template -inline uf::simd::value uf::simd::value::operator<( const value& rhs ) { +inline uf::simd::vector uf::simd::vector::operator<( const vector& rhs ) { return uf::simd::less( *this, rhs ); } template -inline uf::simd::value uf::simd::value::operator<=( const value& rhs ) { +inline uf::simd::vector uf::simd::vector::operator<=( const vector& rhs ) { return uf::simd::lessEquals( *this, rhs ); } template -inline uf::simd::value uf::simd::value::operator>( const value& rhs ) { +inline uf::simd::vector uf::simd::vector::operator>( const vector& rhs ) { return uf::simd::greater( *this, rhs ); } template -inline uf::simd::value uf::simd::value::operator>=( const value& rhs ) { +inline uf::simd::vector uf::simd::vector::operator>=( const vector& rhs ) { return uf::simd::greaterEquals( *this, rhs ); } template -inline uf::simd::value uf::simd::value::operator==( const value& rhs ) { +inline uf::simd::vector uf::simd::vector::operator==( const vector& rhs ) { return uf::simd::equals( *this, rhs ); } template -inline uf::simd::value uf::simd::value::operator!=( const value& rhs ) { +inline uf::simd::vector uf::simd::vector::operator!=( const vector& rhs ) { return uf::simd::notEquals( *this, rhs ); } template -inline uf::simd::value& uf::simd::value::operator=(const uf::simd::value::value_type& rhs) { - m_value = rhs; +inline uf::simd::vector& uf::simd::vector::operator=(const uf::simd::vector::value_type& rhs) { + m = rhs; return *this; } template -inline uf::simd::value& uf::simd::value::operator=(const value& rhs) { - m_value = rhs.m_value; +inline uf::simd::vector& uf::simd::vector::operator=(const vector& rhs) { + m = rhs.m; return *this; } template -inline uf::simd::value& uf::simd::value::operator=(const pod::Vector& rhs) { - m_value = uf::simd::load(&rhs[0]); +inline uf::simd::vector& uf::simd::vector::operator=(const pod::Vector& rhs) { + m = uf::simd::load(&rhs[0]); return *this; } template -inline uf::simd::value::operator uf::simd::value::value_type() const { - return m_value; +inline uf::simd::vector::operator uf::simd::vector::value_type() const { + return m; } template template -inline uf::simd::value::operator pod::Vector() const { - return uf::simd::vector(*this); +inline uf::simd::vector::operator pod::Vector() const { + return uf::simd::cast(*this); } template -inline pod::Vector uf::simd::vector( const uf::simd::value v ){ +inline pod::Vector uf::simd::cast( const uf::simd::vector v ){ pod::Vector4f r; uf::simd::store( v, &r[0] ); return uf::vector::cast(r); } template -inline pod::Vector uf::simd::vector( const uf::simd::value v ){ +inline pod::Vector uf::simd::cast( const uf::simd::vector v ){ pod::Vector4i r; uf::simd::store( v, &r[0] ); return uf::vector::cast(r); } template -inline pod::Vector uf::simd::vector( const uf::simd::value v ){ +inline pod::Vector uf::simd::cast( const uf::simd::vector v ){ pod::Vector4ui r; uf::simd::store( v, &r[0] ); return uf::vector::cast(r); } -inline uf::simd::value uf::simd::load( const float* f ) { +inline uf::simd::vector uf::simd::load( const float* f ) { // if ( uf::aligned(f, 16) ) return _mm_load_ps(f); - return _mm_loadu_ps(f); + return _mm_loadu_ps( f ); } -inline void uf::simd::store( uf::simd::value v, float* f ) { +inline void uf::simd::store( uf::simd::vector v, float* f ) { /* if ( uf::aligned(f, 16) ) _mm_store_ps(f, v); - else */ _mm_storeu_ps(f, v); + else */ _mm_storeu_ps( f, v ); } -inline uf::simd::value uf::simd::set( float f ) { - return _mm_set1_ps(f); +inline uf::simd::vector uf::simd::set( float f ) { + return _mm_set1_ps( f ); } -inline uf::simd::value uf::simd::set( float x, float y, float z, float w ) { - return _mm_setr_ps(x, y, z, w); +inline uf::simd::vector uf::simd::set( float x, float y, float z, float w ) { + return _mm_setr_ps( x, y, z, w ); } -inline uf::simd::value uf::simd::add( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::add( uf::simd::vector x, uf::simd::vector y ) { return _mm_add_ps( x, y ); } -inline uf::simd::value uf::simd::sub( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::sub( uf::simd::vector x, uf::simd::vector y ) { return _mm_sub_ps( x, y ); } -inline uf::simd::value uf::simd::mul( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::mul( uf::simd::vector x, uf::simd::vector y ) { return _mm_mul_ps( x, y ); } -inline uf::simd::value uf::simd::div( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::div( uf::simd::vector x, uf::simd::vector y ) { return _mm_div_ps( x, y ); } /* -inline uf::simd::value uf::simd::hadd( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::hadd( uf::simd::vector x, uf::simd::vector y ) { #if 0 return _mm_hadd_ps( x, y ); #else @@ -154,312 +168,482 @@ inline uf::simd::value uf::simd::hadd( uf::simd::value x, uf::simd } */ -inline uf::simd::value uf::simd::min( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::min( uf::simd::vector x, uf::simd::vector y ) { return _mm_min_ps( x, y ); } -inline uf::simd::value uf::simd::max( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::max( uf::simd::vector x, uf::simd::vector y ) { return _mm_max_ps( x, y ); } -inline bool uf::simd::all( uf::simd::value mask) { - return _mm_movemask_ps(mask) == 0xF; // all 4 bits set +inline bool uf::simd::all( uf::simd::vector mask) { + return _mm_movemask_ps( mask ) == 0xF; // all 4 bits set } -inline bool uf::simd::any( uf::simd::value mask) { - return _mm_movemask_ps(mask) != 0x0; // any bit set +inline bool uf::simd::any( uf::simd::vector mask) { + return _mm_movemask_ps( mask ) != 0x0; // any bit set } -inline uf::simd::value uf::simd::less( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::less( uf::simd::vector x, uf::simd::vector y ) { return _mm_cmplt_ps( x, y ); } -inline uf::simd::value uf::simd::lessEquals( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::lessEquals( uf::simd::vector x, uf::simd::vector y ) { return _mm_cmple_ps( x, y ); } -inline uf::simd::value uf::simd::greater( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::greater( uf::simd::vector x, uf::simd::vector y ) { return _mm_cmpgt_ps( x, y ); } -inline uf::simd::value uf::simd::greaterEquals( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::greaterEquals( uf::simd::vector x, uf::simd::vector y ) { return _mm_cmpge_ps( x, y ); } -inline uf::simd::value uf::simd::equals( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::equals( uf::simd::vector x, uf::simd::vector y ) { return _mm_cmpeq_ps( x, y ); } -inline uf::simd::value uf::simd::notEquals( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::notEquals( uf::simd::vector x, uf::simd::vector y ) { return _mm_cmpneq_ps( x, y ); } -inline uf::simd::value uf::simd::sqrt( uf::simd::value v ) { +inline uf::simd::vector uf::simd::sqrt( uf::simd::vector v ) { return _mm_sqrt_ps( v ); } -inline float uf::simd::dot( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 5 - __m128 result = _mm_dp_ps(x, y, 0xF1); - return _mm_cvtss_f32(result); -#elif SSE_INSTR_SET >= 3 - __m128 mulRes = _mm_mul_ps(x, y); - __m128 shufReg = _mm_movehdup_ps(mulRes); - __m128 sumsReg = _mm_add_ps(mulRes, shufReg); - shufReg = _mm_movehl_ps(shufReg, sumsReg); - sumsReg = _mm_add_ss(sumsReg, shufReg); - return _mm_cvtss_f32(sumsReg); -#else - return uf::vector::sum( uf::simd::vector( uf::simd::mul( x, y ) ) ); -#endif + +namespace { + MV_INSTR_SET_DEFAULT + uf::simd::vector dot_impl( uf::simd::vector x, uf::simd::vector y ) { + return uf::simd::mul( x, y ); + } + MV_INSTR_SET_3 + uf::simd::vector dot_impl( uf::simd::vector x, uf::simd::vector y ) { + __m128 mulRes = _mm_mul_ps(x, y); + __m128 shufReg = _mm_movehdup_ps(mulRes); + __m128 sumsReg = _mm_add_ps(mulRes, shufReg); + shufReg = _mm_movehl_ps(shufReg, sumsReg); + return _mm_add_ss(sumsReg, shufReg); + } + MV_INSTR_SET_5 + uf::simd::vector dot_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_dp_ps(x, y, 0xF1); + } } -inline uf::simd::value uf::simd::load( const int32_t* f ) { -#if SSE_INSTR_SET >= 3 - // if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast(f)); - return _mm_loadu_si128(reinterpret_cast(f)); -#else - return uf::simd::value( f[0], f[1], f[2], f[3] ); -#endif +inline float uf::simd::dot( uf::simd::vector x, uf::simd::vector y ) { + return _mm_cvtss_f32( ::dot_impl( x, y ) ); } -inline void uf::simd::store( uf::simd::value v, int32_t* f ) { -#if SSE_INSTR_SET >= 3 - /*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v); - else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v); -#else - union { __m128i x; int32_t y[4]; } kludge; - kludge.x = v; - f[0] = kludge.y[0]; - f[1] = kludge.y[1]; - f[2] = kludge.y[2]; - f[3] = kludge.y[3]; -#endif + +namespace { + MV_INSTR_SET_DEFAULT + uf::simd::vector load_impl( const int32_t* f ) { + return uf::simd::vector( f[0], f[1], f[2], f[3] ); + } + MV_INSTR_SET_3 + uf::simd::vector load_impl( const int32_t* f ) { + // if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast(f)); + return _mm_loadu_si128(reinterpret_cast(f)); + } } -inline uf::simd::value uf::simd::set( int32_t f ) { +inline uf::simd::vector uf::simd::load( const int32_t* f ) { + return ::load_impl( f ); +} + +namespace { + MV_INSTR_SET_DEFAULT + void store_impl( uf::simd::vector v, int32_t* f ) { + union { __m128i x; int32_t y[4]; } kludge; + kludge.x = v; + f[0] = kludge.y[0]; + f[1] = kludge.y[1]; + f[2] = kludge.y[2]; + f[3] = kludge.y[3]; + } + MV_INSTR_SET_3 + void store_impl( uf::simd::vector v, int32_t* f ) { + /*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v); + else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v); + } +} +inline void uf::simd::store( uf::simd::vector v, int32_t* f ) { + return ::store_impl( v, f ); +} + + +inline uf::simd::vector uf::simd::set( int32_t f ) { return _mm_set1_epi32(f); } -inline uf::simd::value uf::simd::set( int32_t x, int32_t y, int32_t z, int32_t w ) { +inline uf::simd::vector uf::simd::set( int32_t x, int32_t y, int32_t z, int32_t w ) { return _mm_setr_epi32(x, y, z, w); } -inline uf::simd::value uf::simd::add( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::add( uf::simd::vector x, uf::simd::vector y ) { return _mm_add_epi32(x, y); } -inline uf::simd::value uf::simd::sub( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::sub( uf::simd::vector x, uf::simd::vector y ) { return _mm_sub_epi32(x, y); } -inline uf::simd::value uf::simd::mul( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - return _mm_mullo_epi32(x, y); -#else - auto X = uf::simd::vector(x); - auto Y = uf::simd::vector(y); - return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]); -#endif + +namespace { + MV_INSTR_SET_DEFAULT + uf::simd::vector mul_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast(x); + auto Y = uf::simd::cast(y); + return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]); + } + MV_INSTR_SET_4 + uf::simd::vector mul_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_mullo_epi32(x, y); + } } -inline uf::simd::value uf::simd::div( uf::simd::value x, uf::simd::value y ) { - auto X = uf::simd::vector( x ); - auto Y = uf::simd::vector( y ); +inline uf::simd::vector uf::simd::mul( uf::simd::vector x, uf::simd::vector y ) { + return ::mul_impl( x, y ); +} + + +inline uf::simd::vector uf::simd::div( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ); + auto Y = uf::simd::cast( y ); return uf::simd::set( X[0] / Y[0], X[1] / Y[1], X[2] / Y[2], X[3] / Y[3] ); } /* -inline uf::simd::value uf::simd::hadd( uf::simd::value x, uf::simd::value y ) { - auto X = uf::simd::vector( x ); - auto Y = uf::simd::vector( y ); +inline uf::simd::vector uf::simd::hadd( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ); + auto Y = uf::simd::cast( y ); return uf::simd::set( X[0] + Y[0], X[1] + Y[1], X[2] + Y[2], X[3] + Y[3] ); } */ -inline uf::simd::value uf::simd::min( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - return _mm_min_epi32(x, y); -#else - auto X = uf::simd::vector(x); - auto Y = uf::simd::vector(y); - return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3])); -#endif +namespace { + MV_INSTR_SET_DEFAULT + uf::simd::vector min_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast(x); + auto Y = uf::simd::cast(y); + return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3])); + } + MV_INSTR_SET_4 + uf::simd::vector min_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_min_epi32(x, y); + } + + MV_INSTR_SET_DEFAULT + uf::simd::vector max_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast(x); + auto Y = uf::simd::cast(y); + return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3])); + } + MV_INSTR_SET_4 + uf::simd::vector max_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_max_epi32(x, y); + } } -inline uf::simd::value uf::simd::max( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - return _mm_max_epi32(x, y); -#else - auto X = uf::simd::vector(x); - auto Y = uf::simd::vector(y); - return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3])); -#endif +inline uf::simd::vector uf::simd::min( uf::simd::vector x, uf::simd::vector y ) { + return ::min_impl(x, y); } -inline bool uf::simd::all( uf::simd::value mask) { +inline uf::simd::vector uf::simd::max( uf::simd::vector x, uf::simd::vector y ) { + return ::max_impl(x, y); +} + + +inline bool uf::simd::all( uf::simd::vector mask) { return _mm_movemask_epi8( mask ) == 0xFFFF; // all 4 bits set } -inline bool uf::simd::any( uf::simd::value mask) { +inline bool uf::simd::any( uf::simd::vector mask) { return _mm_movemask_epi8( mask ) != 0x0; // any bit set } -inline uf::simd::value uf::simd::less( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - return _mm_cmplt_epi32( x, y ); -#else - auto X = vector( x ), Y = vector( y ); - return set(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]); -#endif + +namespace { + MV_INSTR_SET_DEFAULT + uf::simd::vector less_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ), Y = uf::simd::cast( y ); + return uf::simd::set_i(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]); + } + MV_INSTR_SET_4 + uf::simd::vector less_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_cmplt_epi32( x, y ); + } + + MV_INSTR_SET_DEFAULT + uf::simd::vector lessEquals_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ), Y = uf::simd::cast( y ); + return uf::simd::set_i(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]); + } + MV_INSTR_SET_4 + uf::simd::vector lessEquals_impl( uf::simd::vector x, uf::simd::vector y ) { + __m128i gt = _mm_cmpgt_epi32(x, y); + return _mm_xor_si128(gt, _mm_set1_epi32(-1)); + } + + MV_INSTR_SET_DEFAULT + uf::simd::vector greater_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ), Y = uf::simd::cast( y ); + return uf::simd::set_i(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]); + } + MV_INSTR_SET_4 + uf::simd::vector greater_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_cmpgt_epi32(x, y); + } + + MV_INSTR_SET_DEFAULT + uf::simd::vector greaterEquals_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ), Y = uf::simd::cast( y ); + return uf::simd::set_i(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]); + } + MV_INSTR_SET_4 + uf::simd::vector greaterEquals_impl( uf::simd::vector x, uf::simd::vector y ) { + __m128i gt = _mm_cmplt_epi32(x, y); + return _mm_xor_si128(gt, _mm_set1_epi32(-1)); + } } -inline uf::simd::value uf::simd::lessEquals( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - __m128i gt = _mm_cmpgt_epi32(x, y); - return _mm_xor_si128(gt, _mm_set1_epi32(-1)); -#else - auto X = vector( x ), Y = vector( y ); - return uf::simd::set(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]); -#endif +inline uf::simd::vector uf::simd::less( uf::simd::vector x, uf::simd::vector y ) { + return ::less_impl( x, y ); } -inline uf::simd::value uf::simd::greater( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - return _mm_cmpgt_epi32( x, y ); -#else - auto X = vector( x ), Y = vector( y ); - return uf::simd::set(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]); -#endif +inline uf::simd::vector uf::simd::lessEquals( uf::simd::vector x, uf::simd::vector y ) { + return ::lessEquals_impl( x, y ); } -inline uf::simd::value uf::simd::greaterEquals( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - __m128i gt = _mm_cmplt_epi32(x, y); - return _mm_xor_si128(gt, _mm_set1_epi32(-1)); -#else - auto X = vector( x ), Y = vector( y ); - return uf::simd::set(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]); -#endif +inline uf::simd::vector uf::simd::greater( uf::simd::vector x, uf::simd::vector y ) { + return ::greater_impl( x, y ); } -inline uf::simd::value uf::simd::equals( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::greaterEquals( uf::simd::vector x, uf::simd::vector y ) { + return ::greaterEquals_impl( x, y ); +} + +inline uf::simd::vector uf::simd::equals( uf::simd::vector x, uf::simd::vector y ) { return _mm_cmpeq_epi32(x, y); } -inline uf::simd::value uf::simd::notEquals( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::notEquals( uf::simd::vector x, uf::simd::vector y ) { return _mm_xor_si128(_mm_cmpeq_epi32(x, y), _mm_set1_epi32(-1)); } -inline uf::simd::value uf::simd::sqrt( uf::simd::value v ) { - auto V = uf::simd::vector( v ); +inline uf::simd::vector uf::simd::sqrt( uf::simd::vector v ) { + auto V = uf::simd::cast( v ); return uf::simd::set( (int32_t) std::sqrt(V[0]), (int32_t) std::sqrt(V[1]), (int32_t) std::sqrt(V[2]), (int32_t) std::sqrt(V[3]) ); } -inline int32_t uf::simd::dot( uf::simd::value x, uf::simd::value y ) { - auto X = uf::simd::vector( x ); - auto Y = uf::simd::vector( y ); +inline int32_t uf::simd::dot( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ); + auto Y = uf::simd::cast( y ); return X[0] * Y[0] + X[1] * Y[1] + X[2] * Y[2] + X[3] * Y[3]; } -inline uf::simd::value uf::simd::load( const uint32_t* f ) { -#if SSE_INSTR_SET >= 3 - // if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast(f)); - return _mm_loadu_si128(reinterpret_cast(f)); -#else - return uf::simd::value( f[0], f[1], f[2], f[3] ); -#endif +namespace { + MV_INSTR_SET_DEFAULT + uf::simd::vector load_impl( const uint32_t* f ) { + return uf::simd::vector( f[0], f[1], f[2], f[3] ); + } + MV_INSTR_SET_3 + uf::simd::vector load_impl( const uint32_t* f ) { + // if ( uf::aligned(f, 16) ) return _mm_load_si128(reinterpret_cast(f)); + return _mm_loadu_si128(reinterpret_cast(f)); + } + + MV_INSTR_SET_DEFAULT + void store_impl( uf::simd::vector v, uint32_t* f ) { + union { __m128i x; uint32_t y[4]; } kludge; + kludge.x = v; + f[0] = kludge.y[0]; + f[1] = kludge.y[1]; + f[2] = kludge.y[2]; + f[3] = kludge.y[3]; + } + MV_INSTR_SET_3 + void store_impl( uf::simd::vector v, uint32_t* f ) { + /*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v); + else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v); + } } -inline void uf::simd::store( uf::simd::value v, uint32_t* f ) { -#if SSE_INSTR_SET >= 3 - /*if ( uf::aligned(f, 16) ) _mm_store_si128(reinterpret_cast<__m128i*>(f), v); - else*/ _mm_storeu_si128(reinterpret_cast<__m128i*>(f), v); -#else - union { __m128i x; uint32_t y[4]; } kludge; - kludge.x = v; - f[0] = kludge.y[0]; - f[1] = kludge.y[1]; - f[2] = kludge.y[2]; - f[3] = kludge.y[3]; -#endif +inline uf::simd::vector uf::simd::load( const uint32_t* f ) { + return ::load_impl( f ); } -inline uf::simd::value uf::simd::set( uint32_t f ) { +inline void uf::simd::store( uf::simd::vector v, uint32_t* f ) { + return ::store_impl( v, f ); +} + +inline uf::simd::vector uf::simd::set( uint32_t f ) { return _mm_set1_epi32(f); } -inline uf::simd::value uf::simd::set( uint32_t x, uint32_t y, uint32_t z, uint32_t w ) { +inline uf::simd::vector uf::simd::set( uint32_t x, uint32_t y, uint32_t z, uint32_t w ) { return _mm_setr_epi32(x, y, z, w); } -inline uf::simd::value uf::simd::add( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::add( uf::simd::vector x, uf::simd::vector y ) { return _mm_add_epi32(x, y); } -inline uf::simd::value uf::simd::sub( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::sub( uf::simd::vector x, uf::simd::vector y ) { return _mm_sub_epi32(x, y); } -inline uf::simd::value uf::simd::mul( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - return _mm_mullo_epi32(x, y); -#else - auto X = uf::simd::vector(x); - auto Y = uf::simd::vector(y); - return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]); -#endif + +namespace { + MV_INSTR_SET_DEFAULT + uf::simd::vector mul_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast(x); + auto Y = uf::simd::cast(y); + return uf::simd::set(X[0]*Y[0], X[1]*Y[1], X[2]*Y[2], X[3]*Y[3]); + } + MV_INSTR_SET_4 + uf::simd::vector mul_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_mullo_epi32(x, y); + } } -inline uf::simd::value uf::simd::div( uf::simd::value x, uf::simd::value y ) { - auto X = uf::simd::vector( x ); - auto Y = uf::simd::vector( y ); +inline uf::simd::vector uf::simd::mul( uf::simd::vector x, uf::simd::vector y ) { + return ::mul_impl( x, y ); +} + +inline uf::simd::vector uf::simd::div( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ); + auto Y = uf::simd::cast( y ); return uf::simd::set( X[0] / Y[0], X[1] / Y[1], X[2] / Y[2], X[3] / Y[3] ); } /* -inline uf::simd::value uf::simd::hadd( uf::simd::value x, uf::simd::value y ) { - auto X = uf::simd::vector( x ); - auto Y = uf::simd::vector( y ); +inline uf::simd::vector uf::simd::hadd( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ); + auto Y = uf::simd::cast( y ); return uf::simd::set( X[0] + Y[0], X[1] + Y[1], X[2] + Y[2], X[3] + Y[3] ); } */ -inline uf::simd::value uf::simd::min( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - return _mm_min_epu32(x, y); // unsigned min -#else - auto X = uf::simd::vector(x); - auto Y = uf::simd::vector(y); - return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3])); -#endif + +namespace { + MV_INSTR_SET_DEFAULT + uf::simd::vector min_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast(x); + auto Y = uf::simd::cast(y); + return uf::simd::set(std::min(X[0],Y[0]), std::min(X[1],Y[1]), std::min(X[2],Y[2]), std::min(X[3],Y[3])); + } + MV_INSTR_SET_4 + uf::simd::vector min_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_min_epu32(x, y); // unsigned min + } + + MV_INSTR_SET_DEFAULT + uf::simd::vector max_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast(x); + auto Y = uf::simd::cast(y); + return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3])); + } + MV_INSTR_SET_4 + uf::simd::vector max_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_max_epu32(x, y); // unsigned max + } } -inline uf::simd::value uf::simd::max( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - return _mm_max_epu32(x, y); // unsigned max -#else - auto X = uf::simd::vector(x); - auto Y = uf::simd::vector(y); - return uf::simd::set(std::max(X[0],Y[0]), std::max(X[1],Y[1]), std::max(X[2],Y[2]), std::max(X[3],Y[3])); -#endif +inline uf::simd::vector uf::simd::min( uf::simd::vector x, uf::simd::vector y ) { + return ::min_impl(x, y); } -inline bool uf::simd::all( uf::simd::value mask) { +inline uf::simd::vector uf::simd::max( uf::simd::vector x, uf::simd::vector y ) { + return ::max_impl(x, y); +} + +inline bool uf::simd::all( uf::simd::vector mask) { return _mm_movemask_epi8( mask ) == 0xFFFF; // all 4 bits set } -inline bool uf::simd::any( uf::simd::value mask) { +inline bool uf::simd::any( uf::simd::vector mask) { return _mm_movemask_epi8( mask ) != 0x0; // any bit set } -inline uf::simd::value uf::simd::less( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - return _mm_cmplt_epi32( ::bias_unsigned( x ), ::bias_unsigned( y ) ); -#else - auto X = vector( x ), Y = vector( y ); - return set(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]); -#endif + +namespace { + MV_INSTR_SET_DEFAULT + uf::simd::vector less_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ), Y = uf::simd::cast( y ); + return uf::simd::set_ui(X[0] < Y[0], X[1] < Y[1], X[2] < Y[2], X[3] < Y[3]); + } + MV_INSTR_SET_4 + uf::simd::vector less_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_cmplt_epi32( ::bias_unsigned(x), ::bias_unsigned(y) ); + } + + MV_INSTR_SET_DEFAULT + uf::simd::vector lessEquals_impl( uf::simd::vector x, uf::simd::vector y) { + auto X = uf::simd::cast(x), Y = uf::simd::cast(y); + return uf::simd::set_ui(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]); + } + MV_INSTR_SET_2 + uf::simd::vector lessEquals_impl( uf::simd::vector x, uf::simd::vector y) { + // a <= b <=> !(a > b) + __m128i bx = ::bias_unsigned(x); + __m128i by = ::bias_unsigned(y); + __m128i gt = _mm_cmpgt_epi32(bx, by); // signed compare + return _mm_xor_si128(gt, _mm_set1_epi32(-1)); // invert mask + } + + MV_INSTR_SET_DEFAULT + uf::simd::vector greater_impl( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ), Y = uf::simd::cast( y ); + return uf::simd::set_ui(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]); + } + MV_INSTR_SET_4 + uf::simd::vector greater_impl( uf::simd::vector x, uf::simd::vector y ) { + return _mm_cmpgt_epi32( ::bias_unsigned(x), ::bias_unsigned(y) ); + } + + MV_INSTR_SET_DEFAULT + uf::simd::vector greaterEquals_impl( uf::simd::vector x, uf::simd::vector y) { + auto X = uf::simd::cast(x), Y = uf::simd::cast(y); + return uf::simd::set_ui(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]); + } + MV_INSTR_SET_2 + uf::simd::vector greaterEquals_impl( uf::simd::vector x, uf::simd::vector y) { + // a >= b <=> !(a < b) + __m128i bx = ::bias_unsigned(x); + __m128i by = ::bias_unsigned(y); + __m128i lt = _mm_cmplt_epi32(bx, by); // signed compare + return _mm_xor_si128(lt, _mm_set1_epi32(-1)); // invert mask + } } -inline uf::simd::value uf::simd::lessEquals(value x, value y) { -#if SSE_INSTR_SET >= 2 - // a <= b <=> !(a > b) - __m128i bx = ::bias_unsigned(x); - __m128i by = ::bias_unsigned(y); - __m128i gt = _mm_cmpgt_epi32(bx, by); // signed compare - return _mm_xor_si128(gt, _mm_set1_epi32(-1)); // invert mask -#else - auto X = vector(x), Y = vector(y); - return set(X[0] <= Y[0], X[1] <= Y[1], X[2] <= Y[2], X[3] <= Y[3]); -#endif +inline uf::simd::vector uf::simd::less( uf::simd::vector x, uf::simd::vector y ) { + return ::less_impl( x, y ); } -inline uf::simd::value uf::simd::greater( uf::simd::value x, uf::simd::value y ) { -#if SSE_INSTR_SET >= 4 - return _mm_cmpgt_epi32( ::bias_unsigned( x ), ::bias_unsigned( y ) ); -#else - auto X = vector( x ), Y = vector( y ); - return uf::simd::set(X[0] > Y[0], X[1] > Y[1], X[2] > Y[2], X[3] > Y[3]); -#endif +inline uf::simd::vector uf::simd::lessEquals( uf::simd::vector x, uf::simd::vector y ) { + return ::lessEquals_impl( x, y ); } -inline uf::simd::value uf::simd::greaterEquals(value x, value y) { -#if SSE_INSTR_SET >= 2 - // a >= b <=> !(a < b) - __m128i bx = ::bias_unsigned(x); - __m128i by = ::bias_unsigned(y); - __m128i lt = _mm_cmplt_epi32(bx, by); // signed compare - return _mm_xor_si128(lt, _mm_set1_epi32(-1)); // invert mask -#else - auto X = vector(x), Y = vector(y); - return set(X[0] >= Y[0], X[1] >= Y[1], X[2] >= Y[2], X[3] >= Y[3]); -#endif +inline uf::simd::vector uf::simd::greater( uf::simd::vector x, uf::simd::vector y ) { + return ::greater_impl( x, y ); } -inline uf::simd::value uf::simd::equals( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::greaterEquals( uf::simd::vector x, uf::simd::vector y ) { + return ::greaterEquals_impl( x, y ); +} + + +inline uf::simd::vector uf::simd::equals( uf::simd::vector x, uf::simd::vector y ) { return _mm_cmpeq_epi32(x, y); } -inline uf::simd::value uf::simd::notEquals( uf::simd::value x, uf::simd::value y ) { +inline uf::simd::vector uf::simd::notEquals( uf::simd::vector x, uf::simd::vector y ) { return _mm_xor_si128(_mm_cmpeq_epi32(x, y), _mm_set1_epi32(-1)); } -inline uf::simd::value uf::simd::sqrt( uf::simd::value v ) { - auto V = uf::simd::vector( v ); +inline uf::simd::vector uf::simd::sqrt( uf::simd::vector v ) { + auto V = uf::simd::cast( v ); return uf::simd::set( (uint32_t) std::sqrt(V[0]), (uint32_t) std::sqrt(V[1]), (uint32_t) std::sqrt(V[2]), (uint32_t) std::sqrt(V[3]) ); } -inline uint32_t uf::simd::dot( uf::simd::value x, uf::simd::value y ) { - auto X = uf::simd::vector( x ); - auto Y = uf::simd::vector( y ); +inline uint32_t uf::simd::dot( uf::simd::vector x, uf::simd::vector y ) { + auto X = uf::simd::cast( x ); + auto Y = uf::simd::cast( y ); return X[0] * Y[0] + X[1] * Y[1] + X[2] * Y[2] + X[3] * Y[3]; +} + +inline uf::simd::vector uf::simd::set_f( bool x, bool y, bool z, bool w ) { + return _mm_castsi128_ps(_mm_setr_epi32(::boolMask(x), ::boolMask(y), ::boolMask(z), ::boolMask(w))); +} +inline uf::simd::vector uf::simd::set_i( bool x, bool y, bool z, bool w ) { + return _mm_setr_epi32(::boolMask(x), ::boolMask(y), ::boolMask(z), ::boolMask(w)); +} +inline uf::simd::vector uf::simd::set_ui( bool x, bool y, bool z, bool w ) { + return _mm_setr_epi32(::boolMask(x), ::boolMask(y), ::boolMask(z), ::boolMask(w)); +} + +namespace { + MV_INSTR_SET_DEFAULT + uf::simd::vector cross_impl( uf::simd::vector x, uf::simd::vector y ) { + __m128 tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1)); + __m128 tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1)); + tmp0 = _mm_mul_ps(tmp0,x); + tmp1 = _mm_mul_ps(tmp1,y); + __m128 tmp2 = _mm_sub_ps(tmp0,tmp1); + __m128 res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1)); + return res; + } + MV_INSTR_SET_7 + uf::simd::vector cross_impl( uf::simd::vector x, uf::simd::vector y ) { + __m128 tmp0 = _mm_shuffle_ps(y,y,_MM_SHUFFLE(3,0,2,1)); + __m128 tmp1 = _mm_shuffle_ps(x,x,_MM_SHUFFLE(3,0,2,1)); + tmp1 = _mm_mul_ps(tmp1,y); + __m128 tmp2 = _mm_fmsub_ps( tmp0,x, tmp1 ); + __m128 res = _mm_shuffle_ps(tmp2,tmp2,_MM_SHUFFLE(3,0,2,1)); + return res; + } +} + +inline uf::simd::vector uf::simd::cross( uf::simd::vector x, uf::simd::vector y ) { + return ::cross_impl( x, y ); +} +inline uf::simd::vector uf::simd::normalize( uf::simd::vector v ) { + __m128 len = _mm_sqrt_ss( ::dot_impl( v,v ) ); + len = _mm_shuffle_ps(len, len, 0x00); + return _mm_div_ps(v, len); +} +inline uf::simd::vector uf::simd::normalize_fast( uf::simd::vector v ) { + __m128 invLen = _mm_rsqrt_ps(::dot_impl(v, v)); + return _mm_mul_ps(v, invLen); } \ No newline at end of file