#ifndef __D_VECTOR_X86_SIMD_H__
#define __D_VECTOR_X86_SIMD_H__

#ifndef DG_SCALAR_VECTOR_CLASS

#ifdef D_NEWTON_USE_DOUBLE
    #define dVector dBigVector
#endif

class dBigVector;

D_MSV_NEWTON_ALIGN_16
class dVector
{
    #define PERMUTE_MASK(w, z, y, x) _MM_SHUFFLE(w, z, y, x)
    public:
    D_INLINE dVector()
    {
    }
    D_INLINE dVector(const __m128i type)
        :m_typeInt(type)
    {
    }

    D_INLINE dVector(const __m128 type)
        :m_type(type)
    {
    }
    D_INLINE dVector(const dFloat32 a)
        :m_type(_mm_set_ps1(a))
    {
    }

    D_INLINE dVector(const dFloat32* const ptr)
        :m_type(_mm_loadu_ps(ptr))
    {
    }
#ifndef D_NEWTON_USE_DOUBLE
    D_INLINE dVector(const dFloat64* const ptr)
        :m_type(_mm_set_ps(dFloat32(ptr[3]), dFloat32(ptr[2]), dFloat32(ptr[1]), dFloat32(ptr[0])))
    {
    }

    D_INLINE dVector(const dBigVector& copy)
        :m_type(_mm_shuffle_ps(_mm_cvtpd_ps(((__m128d*)&copy)[0]), _mm_cvtpd_ps(((__m128d*)&copy)[1]), PERMUTE_MASK(1, 0, 1, 0)))
    {
        dAssert(dCheckVector((*this)));
    }
#endif
    D_INLINE dVector(dFloat32 x, dFloat32 y, dFloat32 z, dFloat32 w)
        :m_type(_mm_set_ps(w, z, y, x))
    {
    }
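    // builds the vector from raw IEEE-754 bit patterns: the integer arguments are
    // reinterpreted, not converted, so dVector(1, 0, 0, 0) is not (1.0f, 0.0f, 0.0f, 0.0f)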
    D_INLINE dVector(dInt32 ix, dInt32 iy, dInt32 iz, dInt32 iw)
        :m_type(_mm_set_ps(*(dFloat32*)&iw, *(dFloat32*)&iz, *(dFloat32*)&iy, *(dFloat32*)&ix))
    {
    }
    // class-specific allocators keep dVector instances 16-byte aligned for SSE access
    // (bodies assumed: allocation routed through the engine's dMemory allocator)
    D_INLINE void* operator new[] (size_t size)
    {
        return dMemory::Malloc(size);
    }

    D_INLINE void* operator new (size_t size)
    {
        return dMemory::Malloc(size);
    }

    D_INLINE void operator delete[] (void* ptr)
    {
        dMemory::Free(ptr);
    }

    D_INLINE void operator delete (void* ptr)
    {
        dMemory::Free(ptr);
    }
    D_INLINE dFloat32 GetScalar() const
    {
        return _mm_cvtss_f32(m_type);
    }

    D_INLINE void Store(dFloat32* const dst) const
    {
        _mm_storeu_ps(dst, m_type);
    }
    D_INLINE dVector BroadcastX() const
    {
        return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(0, 0, 0, 0));
    }

    D_INLINE dVector BroadcastY() const
    {
        return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(1, 1, 1, 1));
    }

    D_INLINE dVector BroadcastZ() const
    {
        return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(2, 2, 2, 2));
    }

    D_INLINE dVector BroadcastW() const
    {
        return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(3, 3, 3, 3));
    }

    D_INLINE dVector Scale(dFloat32 s) const
    {
        return _mm_mul_ps(m_type, _mm_set_ps1(s));
    }
    D_INLINE dFloat32& operator[] (dInt32 i)
    {
        dAssert(i < 4);
        dAssert(i >= 0);
        return m_f[i];
    }

    D_INLINE const dFloat32& operator[] (dInt32 i) const
    {
        dAssert(i < 4);
        dAssert(i >= 0);
        return m_f[i];
    }
    D_INLINE dVector operator+ (const dVector& A) const
    {
        return _mm_add_ps(m_type, A.m_type);
    }

    D_INLINE dVector operator- (const dVector& A) const
    {
        return _mm_sub_ps(m_type, A.m_type);
    }

    D_INLINE dVector operator* (const dVector& A) const
    {
        return _mm_mul_ps(m_type, A.m_type);
    }

    D_INLINE dVector& operator+= (const dVector& A)
    {
        return (*this = _mm_add_ps(m_type, A.m_type));
    }

    D_INLINE dVector& operator-= (const dVector& A)
    {
        return (*this = _mm_sub_ps(m_type, A.m_type));
    }

    D_INLINE dVector& operator*= (const dVector& A)
    {
        return (*this = _mm_mul_ps(m_type, A.m_type));
    }
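    // 3d cross product computed entirely in registers: the two shuffle pairs build
    // (y, z, x, w) * (Bz, Bx, By, Bw) - (z, x, y, w) * (By, Bz, Bx, Bw), which is the
    // textbook cross product lane by lane; the w lane cancels to zero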
    D_INLINE dVector CrossProduct(const dVector& B) const
    {
        return _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(3, 0, 2, 1)), _mm_shuffle_ps(B.m_type, B.m_type, PERMUTE_MASK(3, 1, 0, 2))),
                          _mm_mul_ps(_mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(3, 1, 0, 2)), _mm_shuffle_ps(B.m_type, B.m_type, PERMUTE_MASK(3, 0, 2, 1))));
    }
    D_INLINE dVector DotProduct(const dVector& A) const
    {
        dVector tmp(_mm_mul_ps(m_type, A.m_type));
        return tmp.AddHorizontal();
    }
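    // 4d "cross product" of *this, A and B: component i is the 3x3 cofactor determinant
    // obtained by deleting column i of the 4x4 matrix whose rows are *this, A, B and 1,
    // with alternating sign (a Laplace expansion evaluated in scalar code)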
    D_INLINE dVector CrossProduct(const dVector& A, const dVector& B) const
    {
        dFloat32 cofactor[3][3];
        dFloat32 array[4][4];

        const dVector& me = *this;
        for (dInt32 i = 0; i < 4; i++)
        {
            array[0][i] = me[i];
            array[1][i] = A[i];
            array[2][i] = B[i];
            array[3][i] = dFloat32(1.0f);
        }

        dVector normal;
        dFloat32 sign = dFloat32(-1.0f);
        for (dInt32 i = 0; i < 4; i++)
        {
            for (dInt32 j = 0; j < 3; j++)
            {
                dInt32 k0 = 0;
                for (dInt32 k = 0; k < 4; k++)
                {
                    if (k != i)
                    {
                        cofactor[j][k0] = array[j][k];
                        k0++;
                    }
                }
            }
            dFloat32 x = cofactor[0][0] * (cofactor[1][1] * cofactor[2][2] - cofactor[1][2] * cofactor[2][1]);
            dFloat32 y = cofactor[0][1] * (cofactor[1][2] * cofactor[2][0] - cofactor[1][0] * cofactor[2][2]);
            dFloat32 z = cofactor[0][2] * (cofactor[1][0] * cofactor[2][1] - cofactor[1][1] * cofactor[2][0]);
            dFloat32 det = x + y + z;

            normal[i] = sign * det;
            sign *= dFloat32(-1.0f);
        }
        return normal;
    }
    D_INLINE dVector Reciproc() const
    {
        return _mm_div_ps(m_one.m_type, m_type);
    }

    D_INLINE dVector MulAdd(const dVector& A, const dVector& B) const
    {
        return _mm_add_ps(m_type, _mm_mul_ps(A.m_type, B.m_type));
    }

    D_INLINE dVector MulSub(const dVector& A, const dVector& B) const
    {
        return _mm_sub_ps(m_type, _mm_mul_ps(A.m_type, B.m_type));
    }
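    // two hadd passes: the first folds (x + y) and (z + w), the second adds those
    // partial sums, leaving the total of all four lanes replicated in every lane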
    D_INLINE dVector AddHorizontal() const
    {
        __m128 tmp(_mm_hadd_ps(m_type, m_type));
        return _mm_hadd_ps(tmp, tmp);
    }
    D_INLINE dVector Abs() const
    {
        // m_signMask clears the IEEE sign bit of every lane
        return _mm_and_ps(m_type, m_signMask.m_type);
    }

    D_INLINE dFloat32 GetMax() const
    {
        __m128 tmp(_mm_max_ps(m_type, _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(3, 2, 3, 2))));
        return _mm_cvtss_f32(_mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, PERMUTE_MASK(3, 2, 0, 1))));
    }

    D_INLINE dVector GetMax(const dVector& data) const
    {
        return _mm_max_ps(m_type, data.m_type);
    }

    D_INLINE dVector GetMin(const dVector& data) const
    {
        return _mm_min_ps(m_type, data.m_type);
    }
    D_INLINE dVector GetInt() const
    {
        return dVector(_mm_cvtps_epi32(Floor().m_type));
    }

    D_INLINE dVector TestZero() const
    {
        return m_negOne & (*this == m_zero);
    }
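    // _mm_cvttps_epi32 truncates toward zero, which is one too high for negative
    // non-integers; the masked subtraction of 1.0 corrects exactly those lanes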
    D_INLINE dVector Floor() const
    {
        dVector truncated(_mm_cvtepi32_ps(_mm_cvttps_epi32(m_type)));
        dVector ret(truncated - (dVector::m_one & (*this < truncated)));
        dAssert(ret.m_f[0] == dFloor(m_f[0]));
        dAssert(ret.m_f[1] == dFloor(m_f[1]));
        dAssert(ret.m_f[2] == dFloor(m_f[2]));
        dAssert(ret.m_f[3] == dFloor(m_f[3]));
        return ret;
    }
    D_INLINE dVector Sqrt() const
    {
        return _mm_sqrt_ps(m_type);
    }
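    // _mm_rsqrt_ps is only a ~12-bit estimate; one Newton-Raphson step,
    // y1 = y0 * (3 - x * y0 * y0) / 2, roughly doubles the usable precision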
    D_INLINE dVector InvSqrt() const
    {
        dVector tmp0(_mm_rsqrt_ps(m_type));
        return m_half * tmp0 * (m_three - *this * tmp0 * tmp0);
    }

    D_INLINE dVector InvMagSqrt() const
    {
        return DotProduct(*this).InvSqrt();
    }

    D_INLINE dVector Normalize() const
    {
        dAssert(m_w == dFloat32(0.0f));
        return Scale(dFloat32(1.0f) / dSqrt(DotProduct(*this).GetScalar()));
    }
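    // the comparison operators return per-lane bit masks (all ones where the predicate
    // holds, all zeros elsewhere), intended for the bitwise operators and Select below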
    D_INLINE dVector operator> (const dVector& data) const
    {
        return _mm_cmpgt_ps(m_type, data.m_type);
    }

    D_INLINE dVector operator== (const dVector& data) const
    {
        return _mm_cmpeq_ps(m_type, data.m_type);
    }

    D_INLINE dVector operator< (const dVector& data) const
    {
        return _mm_cmplt_ps(m_type, data.m_type);
    }

    D_INLINE dVector operator>= (const dVector& data) const
    {
        return _mm_cmpge_ps(m_type, data.m_type);
    }

    D_INLINE dVector operator<= (const dVector& data) const
    {
        return _mm_cmple_ps(m_type, data.m_type);
    }

    D_INLINE dVector operator& (const dVector& data) const
    {
        return _mm_and_ps(m_type, data.m_type);
    }

    D_INLINE dVector operator| (const dVector& data) const
    {
        return _mm_or_ps(m_type, data.m_type);
    }

    D_INLINE dVector operator^ (const dVector& data) const
    {
        return _mm_xor_ps(m_type, data.m_type);
    }

    D_INLINE dVector AndNot(const dVector& data) const
    {
        return _mm_andnot_ps(data.m_type, m_type);
    }

    D_INLINE dVector Select(const dVector& data, const dVector& mask) const
    {
        // (((data ^ *this) & mask) ^ *this): data where mask bits are set, *this elsewhere
        return _mm_xor_ps(m_type, _mm_and_ps(mask.m_type, _mm_xor_ps(m_type, data.m_type)));
    }
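    // hypothetical usage sketch: the masks above enable branchless per-lane selection,
    // e.g. clamping v to [lo, hi] and falling back to a default on non-positive lanes:
    //   dVector clamped(v.GetMax(lo).GetMin(hi));
    //   dVector mask(v > dVector::m_zero);
    //   dVector result(fallback.Select(v, mask));   // v where mask is set, fallback elsewhere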
    D_INLINE dInt32 GetSignMask() const
    {
        return _mm_movemask_ps(m_type);
    }
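    // lane rotations built on one shuffle each: ShiftRight maps (x, y, z, w) to (w, x, y, z),
    // while the Triple variants rotate only the x, y, z lanes and keep w in place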
    D_INLINE dVector ShiftRight() const
    {
        return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(2, 1, 0, 3));
    }

    D_INLINE dVector ShiftTripleRight() const
    {
        return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(3, 1, 0, 2));
    }

    D_INLINE dVector ShiftTripleLeft() const
    {
        return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(3, 0, 2, 1));
    }

    D_INLINE dVector ShiftRightLogical(dInt32 bits) const
    {
        return dVector(_mm_srli_epi32(m_typeInt, bits));
    }
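    // classic SSE 4x4 transpose: unpacklo/unpackhi interleave row pairs,
    // then movelh/movehl reassemble the 64-bit halves into output columns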
    D_INLINE static void Transpose4x4(dVector& dst0, dVector& dst1, dVector& dst2, dVector& dst3, const dVector& src0, const dVector& src1, const dVector& src2, const dVector& src3)
    {
        __m128 tmp0(_mm_unpacklo_ps(src0.m_type, src1.m_type));
        __m128 tmp1(_mm_unpacklo_ps(src2.m_type, src3.m_type));
        __m128 tmp2(_mm_unpackhi_ps(src0.m_type, src1.m_type));
        __m128 tmp3(_mm_unpackhi_ps(src2.m_type, src3.m_type));

        dst0 = dVector(_mm_movelh_ps(tmp0, tmp1));
        dst1 = dVector(_mm_movehl_ps(tmp1, tmp0));
        dst2 = dVector(_mm_movelh_ps(tmp2, tmp3));
        dst3 = dVector(_mm_movehl_ps(tmp3, tmp2));
    }
#ifdef _DEBUG
    // debug-only trace of the four lanes (the guard macro is an assumption; the
    // original pairs a printing overload with the empty release stub below)
    D_INLINE void Trace(char* const name) const
    {
        dTrace(("%s %f %f %f %f\n", name, m_x, m_y, m_z, m_w));
    }
#else
    D_INLINE void Trace(char* const name) const {}
#endif
    union
    {
        dFloat32 m_f[4];
        __m128 m_type;
        __m128i m_typeInt;
        struct
        {
            dFloat32 m_x;
            dFloat32 m_y;
            dFloat32 m_z;
            dFloat32 m_w;
        };
    };

    D_CORE_API static dVector m_zero;
    D_CORE_API static dVector m_one;
    D_CORE_API static dVector m_wOne;
    D_CORE_API static dVector m_two;
    D_CORE_API static dVector m_half;
    D_CORE_API static dVector m_three;
    D_CORE_API static dVector m_negOne;
    D_CORE_API static dVector m_xMask;
    D_CORE_API static dVector m_yMask;
    D_CORE_API static dVector m_zMask;
    D_CORE_API static dVector m_wMask;
    D_CORE_API static dVector m_epsilon;
    D_CORE_API static dVector m_signMask;
    D_CORE_API static dVector m_triplexMask;
} D_GCC_NEWTON_ALIGN_16;
#ifdef D_USE_VECTOR_AVX
D_MSV_NEWTON_ALIGN_32
class dBigVector
{
    public:
    D_INLINE dBigVector()
    {
    }

    D_INLINE dBigVector(const __m256d type)
        :m_type(type)
    {
    }

    D_INLINE dBigVector(dFloat64 a)
        :m_type(_mm256_set1_pd(a))
    {
    }

#ifdef D_NEWTON_USE_DOUBLE
    D_INLINE dBigVector(const dFloat32* const ptr)
        :m_type(_mm256_set_pd(ptr[3], ptr[2], ptr[1], ptr[0]))
    {
    }
#else
    D_INLINE dBigVector(const dVector& v)
        :m_type(_mm256_cvtps_pd(v.m_type))
    {
        dAssert(dCheckVector((*this)));
    }
#endif
    D_INLINE dBigVector(const dFloat64* const ptr)
        :m_type(_mm256_loadu_pd(ptr))
    {
    }

    D_INLINE dBigVector(dFloat64 x, dFloat64 y, dFloat64 z, dFloat64 w)
        :m_type(_mm256_set_pd(w, z, y, x))
    {
    }

    D_INLINE dBigVector(dInt32 ix, dInt32 iy, dInt32 iz, dInt32 iw)
        :m_ix(dInt64(ix)), m_iy(dInt64(iy)), m_iz(dInt64(iz)), m_iw(dInt64(iw))
    {
    }

    D_INLINE dBigVector(dInt64 ix, dInt64 iy, dInt64 iz, dInt64 iw)
        :m_ix(ix), m_iy(iy), m_iz(iz), m_iw(iw)
    {
    }
    D_INLINE dFloat64& operator[] (dInt32 i)
    {
        dAssert(i < 4);
        dAssert(i >= 0);
        return m_f[i];
    }

    D_INLINE const dFloat64& operator[] (dInt32 i) const
    {
        dAssert(i < 4);
        dAssert(i >= 0);
        return m_f[i];
    }

    D_INLINE dFloat64 GetScalar() const
    {
        return m_x;
    }

    D_INLINE dBigVector operator+ (const dBigVector& A) const
    {
        return _mm256_add_pd(m_type, A.m_type);
    }

    D_INLINE dBigVector operator- (const dBigVector& A) const
    {
        return _mm256_sub_pd(m_type, A.m_type);
    }

    D_INLINE dBigVector operator* (const dBigVector& A) const
    {
        return _mm256_mul_pd(m_type, A.m_type);
    }

    D_INLINE dBigVector& operator+= (const dBigVector& A)
    {
        m_type = _mm256_add_pd(m_type, A.m_type);
        return *this;
    }

    D_INLINE dBigVector& operator-= (const dBigVector& A)
    {
        m_type = _mm256_sub_pd(m_type, A.m_type);
        return *this;
    }

    D_INLINE dBigVector& operator*= (const dBigVector& A)
    {
        m_type = _mm256_mul_pd(m_type, A.m_type);
        return *this;
    }

    D_INLINE dBigVector MulAdd(const dBigVector& A, const dBigVector& B) const
    {
        return *this + A * B;
    }

    D_INLINE dBigVector MulSub(const dBigVector& A, const dBigVector& B) const
    {
        return *this - A * B;
    }
    D_INLINE dBigVector CrossProduct(const dBigVector& B) const
    {
        return dBigVector(m_y * B.m_z - m_z * B.m_y, m_z * B.m_x - m_x * B.m_z, m_x * B.m_y - m_y * B.m_x, m_w);
    }
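    // _mm256_hadd_pd only sums within each 128-bit half, so the permute2f128 swap
    // brings the opposite half across before the final add completes the 4-lane total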
    D_INLINE dBigVector AddHorizontal() const
    {
        __m256d tmp0(_mm256_hadd_pd(m_type, m_type));
        __m256d tmp1(_mm256_permute2f128_pd(tmp0, tmp0, 3));
        return _mm256_add_pd(tmp0, tmp1);
    }

    D_INLINE dBigVector Scale(dFloat64 s) const
    {
        __m256d tmp0(_mm256_set1_pd(s));
        return _mm256_mul_pd(m_type, tmp0);
    }
    D_INLINE dBigVector Abs() const
    {
        return _mm256_and_pd(m_type, m_signMask.m_type);
    }

    D_INLINE dBigVector Reciproc() const
    {
        return _mm256_div_pd(m_one.m_type, m_type);
    }

    D_INLINE dBigVector Sqrt() const
    {
        return _mm256_sqrt_pd(m_type);
    }

    D_INLINE dBigVector InvSqrt() const
    {
        return Sqrt().Reciproc();
    }

    D_INLINE dBigVector Normalize() const
    {
        dAssert(m_w == dFloat32(0.0f));
        dFloat64 mag2 = DotProduct(*this).GetScalar();
        return Scale(dFloat64(1.0f) / sqrt(mag2));
    }
    D_INLINE dFloat64 GetMax() const
    {
        __m256d tmp0(_mm256_permute2f128_pd(m_type, m_type, 5));
        __m256d tmp1(_mm256_max_pd(m_type, tmp0));
        __m256d tmp2(_mm256_unpackhi_pd(tmp1, tmp1));
        __m256d tmp3(_mm256_max_pd(tmp1, tmp2));
        dBigVector tmp4(tmp3);
        return tmp4.GetScalar();
    }

    D_INLINE dBigVector GetMax(const dBigVector& data) const
    {
        return _mm256_max_pd(m_type, data.m_type);
    }

    D_INLINE dBigVector GetMin(const dBigVector& data) const
    {
        return _mm256_min_pd(m_type, data.m_type);
    }
    D_INLINE dBigVector GetInt() const
    {
        dBigVector temp(Floor());
        // truncate the four double lanes to 32-bit integers in one instruction
        __m128i tmp(_mm256_cvttpd_epi32(temp.m_type));
        // sketch of the remainder: widen the results back to the vector's 64-bit
        // integer lanes via the dInt64 constructor (assumed; mirrors the SSE path below)
        return dBigVector(dInt64(_mm_cvtsi128_si32(tmp)),
            dInt64(_mm_extract_epi32(tmp, 1)),
            dInt64(_mm_extract_epi32(tmp, 2)),
            dInt64(_mm_extract_epi32(tmp, 3)));
    }
    D_INLINE dBigVector operator> (const dBigVector& data) const
    {
        return _mm256_cmp_pd(m_type, data.m_type, _CMP_GT_OQ);
    }

    D_INLINE dBigVector operator== (const dBigVector& data) const
    {
        return _mm256_cmp_pd(m_type, data.m_type, _CMP_EQ_OQ);
    }

    D_INLINE dBigVector operator< (const dBigVector& data) const
    {
        return _mm256_cmp_pd(m_type, data.m_type, _CMP_LT_OQ);
    }

    D_INLINE dBigVector operator>= (const dBigVector& data) const
    {
        return _mm256_cmp_pd(m_type, data.m_type, _CMP_GE_OQ);
    }

    D_INLINE dBigVector operator<= (const dBigVector& data) const
    {
        return _mm256_cmp_pd(m_type, data.m_type, _CMP_LE_OQ);
    }

    D_INLINE dBigVector operator& (const dBigVector& data) const
    {
        return _mm256_and_pd(m_type, data.m_type);
    }

    D_INLINE dBigVector operator| (const dBigVector& data) const
    {
        return _mm256_or_pd(m_type, data.m_type);
    }

    D_INLINE dBigVector operator^ (const dBigVector& data) const
    {
        return _mm256_xor_pd(m_type, data.m_type);
    }

    D_INLINE dBigVector AndNot(const dBigVector& data) const
    {
        return _mm256_andnot_pd(data.m_type, m_type);
    }

    D_INLINE dBigVector Select(const dBigVector& data, const dBigVector& mask) const
    {
        // (((data ^ *this) & mask) ^ *this): data where mask bits are set, *this elsewhere
        return _mm256_xor_pd(m_type, _mm256_and_pd(mask.m_type, _mm256_xor_pd(m_type, data.m_type)));
    }
    D_INLINE dBigVector ShiftRight() const
    {
        __m256d tmp0(_mm256_permute2f128_pd(m_type, m_type, 5));
        __m256d tmp1(_mm256_blend_pd(m_type, tmp0, 10));
        __m256d tmp2(_mm256_shuffle_pd(tmp1, tmp1, 5));
        return tmp2;
    }

    D_INLINE dBigVector ShiftTripleRight() const
    {
        __m256d tmp0(_mm256_permute2f128_pd(m_type, m_type, 5));
        __m256d tmp1(_mm256_shuffle_pd(m_type, m_type, 5));
        __m256d tmp2(_mm256_blend_pd(tmp0, tmp1, 6));
        __m256d tmp3(_mm256_shuffle_pd(tmp2, tmp2, 6));
        return tmp3;
    }

    D_INLINE dBigVector ShiftTripleLeft() const
    {
        __m256d tmp0(_mm256_permute2f128_pd(m_type, m_type, 5));
        __m256d tmp1(_mm256_blend_pd(m_type, tmp0, 10));
        __m256d tmp2(_mm256_permute2f128_pd(tmp1, tmp1, 5));
        __m256d tmp3(_mm256_shuffle_pd(tmp2, tmp2, 9));
        return tmp3;
    }
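    // plain AVX has no 256-bit integer shifts (they arrived with AVX2),
    // so the logical shift below falls back to four scalar 64-bit shifts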
    D_INLINE dBigVector ShiftRightLogical(dInt32 bits) const
    {
        dUnsigned64 x = ((dUnsigned64)m_ix) >> bits;
        dUnsigned64 y = ((dUnsigned64)m_iy) >> bits;
        dUnsigned64 z = ((dUnsigned64)m_iz) >> bits;
        dUnsigned64 w = ((dUnsigned64)m_iw) >> bits;
        return dBigVector((dInt64)x, (dInt64)y, (dInt64)z, (dInt64)w);
    }

    D_INLINE dInt32 GetSignMask() const
    {
        return _mm256_movemask_pd(m_type);
    }
    D_INLINE dBigVector Floor() const
    {
        // reconstruction sketch: plain AVX provides a per-lane floor directly
        dBigVector ret(_mm256_floor_pd(m_type));
        dAssert(ret.m_f[0] == floor(m_f[0]));
        dAssert(ret.m_f[1] == floor(m_f[1]));
        dAssert(ret.m_f[2] == floor(m_f[2]));
        dAssert(ret.m_f[3] == floor(m_f[3]));
        return ret;
    }

    D_INLINE dBigVector TestZero() const
    {
        return m_negOne & (*this == dBigVector::m_zero);
    }
    D_INLINE static void Transpose4x4(dBigVector& dst0, dBigVector& dst1, dBigVector& dst2, dBigVector& dst3, const dBigVector& src0, const dBigVector& src1, const dBigVector& src2, const dBigVector& src3)
    {
        __m256d tmp0(_mm256_unpacklo_pd(src0.m_type, src1.m_type));
        __m256d tmp1(_mm256_unpackhi_pd(src0.m_type, src1.m_type));
        dst2 = _mm256_unpacklo_pd(src2.m_type, src3.m_type);
        dst3 = _mm256_unpackhi_pd(src2.m_type, src3.m_type);

        dst0 = _mm256_permute2f128_pd(dst2.m_type, tmp0, 2);
        dst1 = _mm256_permute2f128_pd(dst3.m_type, tmp1, 2);
        tmp0 = _mm256_permute2f128_pd(tmp0, tmp0, 1);
        tmp1 = _mm256_permute2f128_pd(tmp1, tmp1, 1);
        dst2 = _mm256_blend_pd(tmp0, dst2.m_type, 12);
        dst3 = _mm256_blend_pd(tmp1, dst3.m_type, 12);
    }

    D_INLINE dBigVector DotProduct(const dBigVector& A) const
    {
        dBigVector tmp(_mm256_mul_pd(m_type, A.m_type));
        return tmp.AddHorizontal();
    }
    // 4d cross product by cofactor expansion, same construction as the dVector version above
    D_INLINE dBigVector CrossProduct(const dBigVector& A, const dBigVector& B) const
    {
        dFloat64 array[4][4];
        dFloat64 cofactor[3][3];

        const dBigVector& me = *this;
        for (dInt32 i = 0; i < 4; i++)
        {
            array[0][i] = me[i];
            array[1][i] = A[i];
            array[2][i] = B[i];
            array[3][i] = dFloat64(1.0f);
        }

        dBigVector normal;
        dFloat64 sign = dFloat64(-1.0f);
        for (dInt32 i = 0; i < 4; i++)
        {
            for (dInt32 j = 0; j < 3; j++)
            {
                dInt32 k0 = 0;
                for (dInt32 k = 0; k < 4; k++)
                {
                    if (k != i)
                    {
                        cofactor[j][k0] = array[j][k];
                        k0++;
                    }
                }
            }
            dFloat64 x = cofactor[0][0] * (cofactor[1][1] * cofactor[2][2] - cofactor[1][2] * cofactor[2][1]);
            dFloat64 y = cofactor[0][1] * (cofactor[1][2] * cofactor[2][0] - cofactor[1][0] * cofactor[2][2]);
            dFloat64 z = cofactor[0][2] * (cofactor[1][0] * cofactor[2][1] - cofactor[1][1] * cofactor[2][0]);
            dFloat64 det = x + y + z;

            normal[i] = sign * det;
            sign *= dFloat64(-1.0f);
        }
        return normal;
    }
    union
    {
        dFloat64 m_f[4];
        __m256d m_type;
        struct
        {
            dFloat64 m_x;
            dFloat64 m_y;
            dFloat64 m_z;
            dFloat64 m_w;
        };
        struct
        {
            dInt64 m_ix;
            dInt64 m_iy;
            dInt64 m_iz;
            dInt64 m_iw;
        };
    };

    D_CORE_API static dBigVector m_zero;
    D_CORE_API static dBigVector m_one;
    D_CORE_API static dBigVector m_negOne;
    D_CORE_API static dBigVector m_signMask;
} D_GCC_NEWTON_ALIGN_32;
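// dSpatialVector packs the 6-element spatial vectors used by the joint solver:
// this AVX layout spends two __m256d (eight lanes, the last two kept at zero),
// while the SSE fallback further down uses three __m128d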
D_MSV_NEWTON_ALIGN_32
class dSpatialVector
{
    public:
    D_INLINE dSpatialVector()
    {
    }

    D_INLINE dSpatialVector(const dFloat32 a)
        :m_d0(_mm256_set1_pd(dFloat64(a)))
        ,m_d1(_mm256_set1_pd(dFloat64(a)))
    {
    }

    D_INLINE dSpatialVector(const __m256d d0, const __m256d d1)
        :m_d0(d0)
        ,m_d1(d1)
    {
    }

#ifndef D_NEWTON_USE_DOUBLE
    D_INLINE dSpatialVector(const dVector& low, const dVector& high)
        :m_d0(_mm256_cvtps_pd(low.m_type))
        ,m_d1(_mm256_cvtps_pd(high.ShiftTripleLeft().m_type))
    {
        m_f[6] = dFloat64(0.0f);
        m_f[7] = dFloat64(0.0f);
    }
#endif
    D_INLINE dFloat64& operator[] (dInt32 i)
    {
        dAssert(i < 8);
        dAssert(i >= 0);
        return m_f[i];
    }

    D_INLINE const dFloat64& operator[] (dInt32 i) const
    {
        dAssert(i < 8);
        dAssert(i >= 0);
        return m_f[i];
    }

    D_INLINE dSpatialVector operator+ (const dSpatialVector& A) const
    {
        return dSpatialVector(_mm256_add_pd(m_d0, A.m_d0), _mm256_add_pd(m_d1, A.m_d1));
    }

    D_INLINE dSpatialVector operator* (const dSpatialVector& A) const
    {
        return dSpatialVector(_mm256_mul_pd(m_d0, A.m_d0), _mm256_mul_pd(m_d1, A.m_d1));
    }

    D_INLINE dFloat64 DotProduct(const dSpatialVector& v) const
    {
        dSpatialVector tmp(*this * v);
        __m256d tmp0(_mm256_add_pd(tmp.m_d0, tmp.m_d1));
        __m256d tmp1(_mm256_hadd_pd(tmp0, tmp0));
        __m256d tmp2(_mm256_permute2f128_pd(tmp1, tmp1, 1));
        __m256d tmp3(_mm256_add_pd(tmp1, tmp2));
        return *((dFloat64*)&tmp3);
    }

    D_INLINE dSpatialVector Scale(dFloat64 s) const
    {
        __m256d tmp(_mm256_set1_pd(s));
        return dSpatialVector(_mm256_mul_pd(m_d0, tmp), _mm256_mul_pd(m_d1, tmp));
    }
    union
    {
        dFloat64 m_f[8];
        struct
        {
            __m256d m_d0;
            __m256d m_d1;
        };
    };
} D_GCC_NEWTON_ALIGN_32;
#else

D_MSV_NEWTON_ALIGN_32
class dBigVector
{
    #define PERMUT_MASK_DOUBLE(y, x) _MM_SHUFFLE2(y, x)
    public:
    D_INLINE dBigVector()
    {
    }

    D_INLINE dBigVector(const dBigVector& copy)
        :m_typeLow(copy.m_typeLow)
        ,m_typeHigh(copy.m_typeHigh)
    {
    }

    D_INLINE dBigVector(const __m128d typeLow, const __m128d typeHigh)
        :m_typeLow(typeLow)
        ,m_typeHigh(typeHigh)
    {
    }
    D_INLINE dBigVector(const __m128i typeLow, const __m128i typeHigh)
        :m_typeIntLow(typeLow)
        ,m_typeIntHigh(typeHigh)
    {
    }

    D_INLINE dBigVector(dFloat64 a)
        :m_typeLow(_mm_set1_pd(a))
        ,m_typeHigh(_mm_set1_pd(a))
    {
    }
#ifdef D_NEWTON_USE_DOUBLE
    D_INLINE dBigVector(const dFloat32* const ptr)
        :m_typeLow(_mm_loadu_pd(ptr))
        ,m_typeHigh(_mm_loadu_pd(&ptr[2]))
    {
    }
#else
    D_INLINE dBigVector(const dVector& v)
        :m_typeLow(_mm_cvtps_pd(v.m_type))
        ,m_typeHigh(_mm_cvtps_pd(_mm_shuffle_ps(v.m_type, v.m_type, PERMUTE_MASK(3, 2, 3, 2))))
    {
        dAssert(dCheckVector((*this)));
    }
#endif
    D_INLINE dBigVector(const dFloat64* const ptr)
        :m_typeLow(_mm_loadu_pd(ptr))
        ,m_typeHigh(_mm_loadu_pd(&ptr[2]))
    {
    }

    D_INLINE dBigVector(dFloat64 x, dFloat64 y, dFloat64 z, dFloat64 w)
        :m_typeLow(_mm_set_pd(y, x))
        ,m_typeHigh(_mm_set_pd(w, z))
    {
    }

    D_INLINE dBigVector(dInt32 ix, dInt32 iy, dInt32 iz, dInt32 iw)
        :m_ix(dInt64(ix)), m_iy(dInt64(iy)), m_iz(dInt64(iz)), m_iw(dInt64(iw))
    {
    }

    D_INLINE dBigVector(dInt64 ix, dInt64 iy, dInt64 iz, dInt64 iw)
        :m_ix(ix), m_iy(iy), m_iz(iz), m_iw(iw)
    {
    }
    D_INLINE dFloat64& operator[] (dInt32 i)
    {
        dAssert(i < 4);
        dAssert(i >= 0);
        return m_f[i];
    }

    D_INLINE const dFloat64& operator[] (dInt32 i) const
    {
        dAssert(i < 4);
        dAssert(i >= 0);
        return m_f[i];
    }

    D_INLINE dFloat64 GetScalar() const
    {
        return _mm_cvtsd_f64(m_typeLow);
    }
    D_INLINE dBigVector operator+ (const dBigVector& A) const
    {
        return dBigVector(_mm_add_pd(m_typeLow, A.m_typeLow), _mm_add_pd(m_typeHigh, A.m_typeHigh));
    }

    D_INLINE dBigVector operator- (const dBigVector& A) const
    {
        return dBigVector(_mm_sub_pd(m_typeLow, A.m_typeLow), _mm_sub_pd(m_typeHigh, A.m_typeHigh));
    }

    D_INLINE dBigVector operator* (const dBigVector& A) const
    {
        return dBigVector(_mm_mul_pd(m_typeLow, A.m_typeLow), _mm_mul_pd(m_typeHigh, A.m_typeHigh));
    }

    D_INLINE dBigVector& operator+= (const dBigVector& A)
    {
        m_typeLow = _mm_add_pd(m_typeLow, A.m_typeLow);
        m_typeHigh = _mm_add_pd(m_typeHigh, A.m_typeHigh);
        return *this;
    }

    D_INLINE dBigVector& operator-= (const dBigVector& A)
    {
        m_typeLow = _mm_sub_pd(m_typeLow, A.m_typeLow);
        m_typeHigh = _mm_sub_pd(m_typeHigh, A.m_typeHigh);
        return *this;
    }

    D_INLINE dBigVector& operator*= (const dBigVector& A)
    {
        m_typeLow = _mm_mul_pd(m_typeLow, A.m_typeLow);
        m_typeHigh = _mm_mul_pd(m_typeHigh, A.m_typeHigh);
        return *this;
    }

    D_INLINE dBigVector MulAdd(const dBigVector& A, const dBigVector& B) const
    {
        return *this + A * B;
    }

    D_INLINE dBigVector MulSub(const dBigVector& A, const dBigVector& B) const
    {
        return *this - A * B;
    }

    D_INLINE dBigVector CrossProduct(const dBigVector& B) const
    {
        return dBigVector(m_y * B.m_z - m_z * B.m_y, m_z * B.m_x - m_x * B.m_z, m_x * B.m_y - m_y * B.m_x, m_w);
    }
    D_INLINE dBigVector AddHorizontal() const
    {
        __m128d tmp0(_mm_add_pd(m_typeHigh, m_typeLow));
        __m128d tmp1(_mm_hadd_pd(tmp0, tmp0));
        return dBigVector(tmp1, tmp1);
    }

    D_INLINE dBigVector Scale(dFloat64 s) const
    {
        __m128d tmp0(_mm_set1_pd(s));
        return dBigVector(_mm_mul_pd(m_typeLow, tmp0), _mm_mul_pd(m_typeHigh, tmp0));
    }
    D_INLINE dBigVector Abs() const
    {
        return dBigVector(_mm_and_pd(m_typeLow, m_signMask.m_typeLow), _mm_and_pd(m_typeHigh, m_signMask.m_typeLow));
    }

    D_INLINE dBigVector Reciproc() const
    {
        return dBigVector(_mm_div_pd(m_one.m_typeLow, m_typeLow), _mm_div_pd(m_one.m_typeHigh, m_typeHigh));
    }

    D_INLINE dBigVector Sqrt() const
    {
        return dBigVector(_mm_sqrt_pd(m_typeLow), _mm_sqrt_pd(m_typeHigh));
    }

    D_INLINE dBigVector InvSqrt() const
    {
        return Sqrt().Reciproc();
    }

    D_INLINE dBigVector Normalize() const
    {
        dAssert(m_w == dFloat32(0.0f));
        dFloat64 mag2 = DotProduct(*this).GetScalar();
        return Scale(dFloat64(1.0f) / sqrt(mag2));
    }
    D_INLINE dFloat64 GetMax() const
    {
        __m128d tmp(_mm_max_pd(m_typeLow, m_typeHigh));
        return dBigVector(_mm_max_pd(tmp, _mm_shuffle_pd(tmp, tmp, PERMUT_MASK_DOUBLE(0, 1))), tmp).GetScalar();
    }

    D_INLINE dBigVector GetMax(const dBigVector& data) const
    {
        return dBigVector(_mm_max_pd(m_typeLow, data.m_typeLow), _mm_max_pd(m_typeHigh, data.m_typeHigh));
    }

    D_INLINE dBigVector GetMin(const dBigVector& data) const
    {
        return dBigVector(_mm_min_pd(m_typeLow, data.m_typeLow), _mm_min_pd(m_typeHigh, data.m_typeHigh));
    }
    D_INLINE dBigVector GetInt() const
    {
        dBigVector temp(Floor());
        dInt64 x = _mm_cvtsd_si32(temp.m_typeLow);
        dInt64 y = _mm_cvtsd_si32(_mm_shuffle_pd(temp.m_typeLow, temp.m_typeLow, PERMUT_MASK_DOUBLE(1, 1)));
        dInt64 z = _mm_cvtsd_si32(temp.m_typeHigh);
        dInt64 w = _mm_cvtsd_si32(_mm_shuffle_pd(temp.m_typeHigh, temp.m_typeHigh, PERMUT_MASK_DOUBLE(1, 1)));
        // store the results as 64-bit integer lanes; reinterpreting the int bits
        // as floats (as the damaged source did) would corrupt the values
        return dBigVector(x, y, z, w);
    }
    D_INLINE dBigVector operator> (const dBigVector& data) const
    {
        return dBigVector(_mm_cmpgt_pd(m_typeLow, data.m_typeLow), _mm_cmpgt_pd(m_typeHigh, data.m_typeHigh));
    }

    D_INLINE dBigVector operator== (const dBigVector& data) const
    {
        return dBigVector(_mm_cmpeq_pd(m_typeLow, data.m_typeLow), _mm_cmpeq_pd(m_typeHigh, data.m_typeHigh));
    }

    D_INLINE dBigVector operator< (const dBigVector& data) const
    {
        return dBigVector(_mm_cmplt_pd(m_typeLow, data.m_typeLow), _mm_cmplt_pd(m_typeHigh, data.m_typeHigh));
    }

    D_INLINE dBigVector operator>= (const dBigVector& data) const
    {
        return dBigVector(_mm_cmpge_pd(m_typeLow, data.m_typeLow), _mm_cmpge_pd(m_typeHigh, data.m_typeHigh));
    }

    D_INLINE dBigVector operator<= (const dBigVector& data) const
    {
        return dBigVector(_mm_cmple_pd(m_typeLow, data.m_typeLow), _mm_cmple_pd(m_typeHigh, data.m_typeHigh));
    }

    D_INLINE dBigVector operator& (const dBigVector& data) const
    {
        return dBigVector(_mm_and_pd(m_typeLow, data.m_typeLow), _mm_and_pd(m_typeHigh, data.m_typeHigh));
    }

    D_INLINE dBigVector operator| (const dBigVector& data) const
    {
        return dBigVector(_mm_or_pd(m_typeLow, data.m_typeLow), _mm_or_pd(m_typeHigh, data.m_typeHigh));
    }

    D_INLINE dBigVector operator^ (const dBigVector& data) const
    {
        return dBigVector(_mm_xor_pd(m_typeLow, data.m_typeLow), _mm_xor_pd(m_typeHigh, data.m_typeHigh));
    }

    D_INLINE dBigVector AndNot(const dBigVector& data) const
    {
        return dBigVector(_mm_andnot_pd(data.m_typeLow, m_typeLow), _mm_andnot_pd(data.m_typeHigh, m_typeHigh));
    }

    D_INLINE dBigVector Select(const dBigVector& data, const dBigVector& mask) const
    {
        // (((data ^ *this) & mask) ^ *this): data where mask bits are set, *this elsewhere
        return dBigVector(_mm_xor_pd(m_typeLow, _mm_and_pd(mask.m_typeLow, _mm_xor_pd(m_typeLow, data.m_typeLow))),
                          _mm_xor_pd(m_typeHigh, _mm_and_pd(mask.m_typeHigh, _mm_xor_pd(m_typeHigh, data.m_typeHigh))));
    }
    D_INLINE dBigVector ShiftRight() const
    {
        return dBigVector(_mm_shuffle_pd(m_typeHigh, m_typeLow, PERMUT_MASK_DOUBLE(0, 1)), _mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(0, 1)));
    }

    D_INLINE dBigVector ShiftTripleRight() const
    {
        return dBigVector(_mm_shuffle_pd(m_typeHigh, m_typeLow, PERMUT_MASK_DOUBLE(0, 0)), _mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(1, 1)));
    }

    D_INLINE dBigVector ShiftTripleLeft() const
    {
        return dBigVector(_mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(0, 1)), _mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(1, 0)));
    }
    D_INLINE dBigVector ShiftRightLogical(dInt32 bits) const
    {
        return dBigVector(_mm_srli_epi64(m_typeIntLow, bits), _mm_srli_epi64(m_typeIntHigh, bits));
    }

    D_INLINE dInt32 GetSignMask() const
    {
        // combine the two 2-bit half masks into one 4-bit lane mask
        return _mm_movemask_pd(m_typeLow) | (_mm_movemask_pd(m_typeHigh) << 2);
    }
    D_INLINE dBigVector Floor() const
    {
        return dBigVector(floor(m_x), floor(m_y), floor(m_z), floor(m_w));
    }

    D_INLINE dBigVector TestZero() const
    {
        return m_negOne & (*this == m_zero);
    }
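    // the transpose below copies all four sources to temporaries first,
    // so the destination vectors are allowed to alias the sources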
    D_INLINE static void Transpose4x4(dBigVector& dst0, dBigVector& dst1, dBigVector& dst2, dBigVector& dst3, const dBigVector& src0, const dBigVector& src1, const dBigVector& src2, const dBigVector& src3)
    {
        dBigVector tmp0(src0);
        dBigVector tmp1(src1);
        dBigVector tmp2(src2);
        dBigVector tmp3(src3);

        dst0 = dBigVector(tmp0.m_x, tmp1.m_x, tmp2.m_x, tmp3.m_x);
        dst1 = dBigVector(tmp0.m_y, tmp1.m_y, tmp2.m_y, tmp3.m_y);
        dst2 = dBigVector(tmp0.m_z, tmp1.m_z, tmp2.m_z, tmp3.m_z);
        dst3 = dBigVector(tmp0.m_w, tmp1.m_w, tmp2.m_w, tmp3.m_w);
    }
    D_INLINE dBigVector DotProduct(const dBigVector& A) const
    {
        dBigVector tmp(_mm_mul_pd(m_typeLow, A.m_typeLow), _mm_mul_pd(m_typeHigh, A.m_typeHigh));
        return tmp.AddHorizontal();
    }

    // 4d cross product by cofactor expansion, same construction as the dVector version above
    D_INLINE dBigVector CrossProduct(const dBigVector& A, const dBigVector& B) const
    {
        dFloat64 cofactor[3][3];
        dFloat64 array[4][4];

        const dBigVector& me = *this;
        for (dInt32 i = 0; i < 4; i++)
        {
            array[0][i] = me[i];
            array[1][i] = A[i];
            array[2][i] = B[i];
            array[3][i] = dFloat64(1.0f);
        }

        dBigVector normal;
        dFloat64 sign = dFloat64(-1.0f);
        for (dInt32 i = 0; i < 4; i++)
        {
            for (dInt32 j = 0; j < 3; j++)
            {
                dInt32 k0 = 0;
                for (dInt32 k = 0; k < 4; k++)
                {
                    if (k != i)
                    {
                        cofactor[j][k0] = array[j][k];
                        k0++;
                    }
                }
            }
            dFloat64 x = cofactor[0][0] * (cofactor[1][1] * cofactor[2][2] - cofactor[1][2] * cofactor[2][1]);
            dFloat64 y = cofactor[0][1] * (cofactor[1][2] * cofactor[2][0] - cofactor[1][0] * cofactor[2][2]);
            dFloat64 z = cofactor[0][2] * (cofactor[1][0] * cofactor[2][1] - cofactor[1][1] * cofactor[2][0]);
            dFloat64 det = x + y + z;

            normal[i] = sign * det;
            sign *= dFloat64(-1.0f);
        }
        return normal;
    }
    union
    {
        dFloat64 m_f[4];
        struct
        {
            __m128d m_typeLow;
            __m128d m_typeHigh;
        };
        struct
        {
            __m128i m_typeIntLow;
            __m128i m_typeIntHigh;
        };
        struct
        {
            dFloat64 m_x;
            dFloat64 m_y;
            dFloat64 m_z;
            dFloat64 m_w;
        };
        struct
        {
            dInt64 m_ix;
            dInt64 m_iy;
            dInt64 m_iz;
            dInt64 m_iw;
        };
    };

    D_CORE_API static dBigVector m_zero;
    D_CORE_API static dBigVector m_one;
    D_CORE_API static dBigVector m_negOne;
    D_CORE_API static dBigVector m_signMask;
} D_GCC_NEWTON_ALIGN_32;
D_MSV_NEWTON_ALIGN_32
class dSpatialVector
{
    public:
    D_INLINE dSpatialVector()
    {
    }

    D_INLINE dSpatialVector(dFloat64 a)
        :m_d0(_mm_set1_pd(a))
        ,m_d1(_mm_set1_pd(a))
        ,m_d2(_mm_set1_pd(a))
    {
    }

#ifdef D_NEWTON_USE_DOUBLE
    #define PURMUT_MASK2(y, x) _MM_SHUFFLE2(x, y)
    D_INLINE dSpatialVector(const dVector& low, const dVector& high)
        :m_d0(low.m_typeLow)
        ,m_d1(_mm_shuffle_pd(low.m_typeHigh, high.m_typeLow, PURMUT_MASK2(0, 0)))
        ,m_d2(_mm_shuffle_pd(high.m_typeLow, high.m_typeHigh, PURMUT_MASK2(1, 0)))
    {
    }
#else
    D_INLINE dSpatialVector(const dVector& low, const dVector& high)
        :m_d0(_mm_cvtps_pd(low.m_type))
        ,m_d1(_mm_cvtps_pd(_mm_unpackhi_ps(low.m_type, _mm_shuffle_ps(low.m_type, high.m_type, PERMUTE_MASK(0, 0, 0, 2)))))
        ,m_d2(_mm_cvtps_pd(_mm_shuffle_ps(high.m_type, high.m_type, PERMUTE_MASK(3, 3, 2, 1))))
    {
    }
#endif
    D_INLINE dSpatialVector(const __m128d d0, const __m128d d1, const __m128d d2)
        :m_d0(d0)
        ,m_d1(d1)
        ,m_d2(d2)
    {
    }

    D_INLINE dFloat64& operator[] (dInt32 i)
    {
        dAssert(i < 6);
        dAssert(i >= 0);
        return ((dFloat64*)&m_d0)[i];
    }

    D_INLINE const dFloat64& operator[] (dInt32 i) const
    {
        dAssert(i < 6);
        dAssert(i >= 0);
        return ((dFloat64*)&m_d0)[i];
    }
    D_INLINE dSpatialVector operator+ (const dSpatialVector& A) const
    {
        return dSpatialVector(_mm_add_pd(m_d0, A.m_d0), _mm_add_pd(m_d1, A.m_d1), _mm_add_pd(m_d2, A.m_d2));
    }

    D_INLINE dSpatialVector operator* (const dSpatialVector& A) const
    {
        return dSpatialVector(_mm_mul_pd(m_d0, A.m_d0), _mm_mul_pd(m_d1, A.m_d1), _mm_mul_pd(m_d2, A.m_d2));
    }

    D_INLINE dFloat64 DotProduct(const dSpatialVector& v) const
    {
        dSpatialVector tmp(*this * v);
        __m128d tmp2(_mm_add_pd(tmp.m_d0, _mm_add_pd(tmp.m_d1, tmp.m_d2)));
        return _mm_cvtsd_f64(_mm_hadd_pd(tmp2, tmp2));
    }

    D_INLINE dSpatialVector Scale(dFloat64 s) const
    {
        __m128d tmp(_mm_set1_pd(s));
        return dSpatialVector(_mm_mul_pd(m_d0, tmp), _mm_mul_pd(m_d1, tmp), _mm_mul_pd(m_d2, tmp));
    }
    union
    {
        dFloat64 m_f[6];
        struct
        {
            __m128d m_d0;
            __m128d m_d1;
            __m128d m_d2;
        };
    };
} D_GCC_NEWTON_ALIGN_32;

#endif // D_USE_VECTOR_AVX

#endif // !DG_SCALAR_VECTOR_CLASS

#endif // __D_VECTOR_X86_SIMD_H__