// Header guard and build configuration for the x86 SSE SIMD vector classes.
// NOTE(review): every line in this chunk carries its original file line number
// as a leading token and many intervening lines are elided (extraction
// artifact); comments below annotate only the code that is visible.
22 #ifndef __ND_VECTOR_X86_SIMD_H__
23 #define __ND_VECTOR_X86_SIMD_H__
// Compiled only when the scalar (non-SIMD) vector implementation is not requested.
25 #ifndef D_SCALAR_VECTOR_CLASS
// In double-precision builds ndVector aliases the double-based ndBigVector below.
27 #ifdef D_NEWTON_USE_DOUBLE
28 #define ndVector ndBigVector
// Wrapper over _MM_SHUFFLE; arguments ordered w,z,y,x (highest lane first).
40 #define PERMUTE_MASK(w, z, y, x) _MM_SHUFFLE (w, z, y, x)
// Custom operator new/delete macro (defined elsewhere) — presumably provides
// aligned allocation for the __m128 member; verify against the core headers.
42 D_OPERATOR_NEW_AND_DELETE
// --- ndVector (float4, SSE) constructors; bodies partially elided in this view ---
// Unaligned load of four consecutive floats.
66 inline ndVector (
const ndFloat32*
const ptr)
67 :m_type(_mm_loadu_ps (ptr))
// Gather constructor: fetch four floats from baseAddr at the given 32-bit indices.
72 inline ndVector(
const ndFloat32*
const baseAddr,
const ndInt32*
const index)
73 :m_x(baseAddr[index[0]])
74 ,m_y(baseAddr[index[1]])
75 ,m_z(baseAddr[index[2]])
76 ,m_w(baseAddr[index[3]])
// Single-precision builds: narrow four doubles to floats component-wise.
80 #ifndef D_NEWTON_USE_DOUBLE
81 inline ndVector(
const ndFloat64*
const ptr)
82 :m_x(ndFloat32(ptr[0]))
83 ,m_y(ndFloat32(ptr[1]))
84 ,m_z(ndFloat32(ptr[2]))
85 ,m_w(ndFloat32(ptr[3]))
// Convert double4 -> float4: cvtpd_ps each __m128d half, then interleave the
// two low lane pairs with PERMUTE_MASK(1,0,1,0).
// NOTE(review): the '©' glyph below is almost certainly a mis-encoded '&copy'
// (address of the source-vector parameter named 'copy') — confirm upstream.
96 :m_type(_mm_shuffle_ps (_mm_cvtpd_ps (((__m128d*)©)[0]), _mm_cvtpd_ps (((__m128d*)©)[1]), PERMUTE_MASK(1, 0, 1, 0)))
// Debug sanity check that all lanes are finite (body braces elided).
98 ndAssert (ndCheckVector ((*
this)));
// Component-wise float and integer constructors (initializer lists elided).
101 inline ndVector (ndFloat32 x, ndFloat32 y, ndFloat32 z, ndFloat32 w)
109 inline ndVector (ndInt32 ix, ndInt32 iy, ndInt32 iz, ndInt32 iw)
// --- Per-component accessors, scalar extraction, store, broadcasts, scale ---
// Getters/setters for the four lanes (bodies elided in this view).
117 inline ndFloat32 GetX()
const
122 inline ndFloat32 GetY()
const
127 inline ndFloat32 GetZ()
const
132 inline ndFloat32 GetW()
const
137 inline void SetX(ndFloat32 x)
142 inline void SetY(ndFloat32 x)
147 inline void SetZ(ndFloat32 x)
152 inline void SetW(ndFloat32 x)
// Lane 0 (x) as a scalar, without a memory round trip.
158 inline ndFloat32 GetScalar ()
const
160 return _mm_cvtss_f32 (m_type);
// Unaligned store of all four lanes to dst.
163 inline void Store (ndFloat32*
const dst)
const
165 _mm_storeu_ps(dst, m_type);
// Broadcast a single lane to all four (x, y, z, w respectively).
170 return _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(0, 0, 0, 0));
175 return _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(1, 1, 1, 1));
180 return _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(2, 2, 2, 2));
185 return _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(3, 3, 3, 3));
// Uniform scale: multiply every lane by s.
188 inline ndVector Scale (ndFloat32 s)
const
190 return _mm_mul_ps (m_type, _mm_set_ps1(s));
// --- Indexing, lane-wise arithmetic, dot and cross products ---
// Mutable and const lane access by index (bodies elided).
193 inline ndFloat32& operator[] (ndInt32 i)
200 inline const ndFloat32& operator[] (ndInt32 i)
const
// Lane-wise add / subtract / multiply.
209 return _mm_add_ps (m_type, A.m_type);
214 return _mm_sub_ps (m_type, A.m_type);
219 return _mm_mul_ps(m_type, A.m_type);
// Compound-assignment forms of the same three operations.
224 return (*
this = _mm_add_ps (m_type, A.m_type));
229 return (*
this = _mm_sub_ps (m_type, A.m_type));
234 return (*
this = _mm_mul_ps(m_type, A.m_type));
// 4-lane dot product: lane-wise multiply, then horizontal sum (replicated to all lanes).
242 return (*
this * A).AddHorizontal();
// Cross product using the one-shuffle-saving SSE idiom:
// tmp3 = a.yzx * b.zxy, tmp4 = (a.yzx * b) rotated = a.zxy * b.yzx; result = tmp3 - tmp4.
248 __m128 tmp0 = _mm_shuffle_ps( m_type, m_type, _MM_SHUFFLE(3, 0, 2, 1));
249 __m128 tmp1 = _mm_shuffle_ps(B.m_type, B.m_type, _MM_SHUFFLE(3, 1, 0, 2));
250 __m128 tmp2 = _mm_mul_ps(tmp0, B.m_type);
251 __m128 tmp3 = _mm_mul_ps(tmp0, tmp1);
252 __m128 tmp4 = _mm_shuffle_ps(tmp2, tmp2, _MM_SHUFFLE(3, 0, 2, 1));
253 return _mm_sub_ps(tmp3, tmp4);
// --- 4x4 cofactor expansion fragment (loop interiors elided in this view) ---
// Builds a 4x4 matrix whose last row is all ones, then for each column i computes
// the signed determinant of the 3x3 minor that skips column i.
259 ndFloat32 array[4][4];
260 ndFloat32 cofactor[3][3];
263 for (ndInt32 i = 0; i < 4; ++i)
268 array[3][i] = ndFloat32 (1.0f);
// Alternating cofactor sign (-1, +1, -1, +1).
272 ndFloat32 sign = ndFloat32 (-1.0f);
273 for (ndInt32 i = 0; i < 4; ++i)
275 for (ndInt32 j = 0; j < 3; ++j)
// Copy the 3x3 minor that excludes column i (k0 bookkeeping elided).
278 for (ndInt32 k = 0; k < 4; ++k)
282 cofactor[j][k0] = array[j][k];
// Determinant of the minor by expansion along its first row.
287 ndFloat32 x = cofactor[0][0] * (cofactor[1][1] * cofactor[2][2] - cofactor[1][2] * cofactor[2][1]);
288 ndFloat32 y = cofactor[0][1] * (cofactor[1][2] * cofactor[2][0] - cofactor[1][0] * cofactor[2][2]);
289 ndFloat32 z = cofactor[0][2] * (cofactor[1][0] * cofactor[2][1] - cofactor[1][1] * cofactor[2][0]);
290 ndFloat32 det = x + y + z;
292 normal[i] = sign * det;
293 sign *= ndFloat32 (-1.0f);
// --- Reciprocal, fused helpers, horizontal ops, floor, sqrt, normalize ---
// Lane-wise reciprocal via a full-precision divide (1.0 / lane).
301 return _mm_div_ps (m_one.m_type, m_type);
// MulAdd: this + A*B; MulSub: this - A*B (lane-wise, composed mul+add, no FMA).
306 return _mm_add_ps(m_type, _mm_mul_ps(A.m_type, B.m_type));
311 return _mm_sub_ps(m_type, _mm_mul_ps(A.m_type, B.m_type));
// Horizontal sum of all four lanes, replicated to every lane.
// Two variants are visible: an SSE3 hadd pair and a portable shuffle+add pair
// (presumably selected by a preprocessor branch elided from this view).
314 inline ndVector AddHorizontal ()
const
317 __m128 tmp (_mm_hadd_ps (m_type, m_type));
318 return _mm_hadd_ps (tmp, tmp);
320 __m128 tmp (_mm_add_ps (m_type, _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(1, 0, 3, 2))));
321 return _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, PERMUTE_MASK(2, 3, 0, 1)));
// Absolute value by masking off each lane's sign bit.
327 return _mm_and_ps (m_type, m_signMask.m_type);
// Horizontal max of the four lanes, replicated to every lane.
332 __m128 tmp(_mm_max_ps(m_type, _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(1, 0, 3, 2))));
333 return _mm_max_ps(tmp, _mm_shuffle_ps(tmp, tmp, PERMUTE_MASK(2, 3, 0, 1)));
// Lane-wise max/min against another vector.
338 return _mm_max_ps (m_type, data.m_type);
343 return _mm_min_ps (m_type, data.m_type);
// Floor, then convert each lane to a packed 32-bit integer.
348 return ndVector(_mm_cvtps_epi32(Floor().m_type));
// TestZero: all-ones mask (from m_negOne) in lanes that compare equal to zero.
354 return m_negOne & (*
this == m_zero);
// Floor: truncate toward zero, then subtract 1 from lanes where the truncation
// rounded upward (negative non-integers); asserted lane-by-lane against ndFloor.
359 ndVector truncated (_mm_cvtepi32_ps (_mm_cvttps_epi32 (m_type)));
360 ndVector ret (truncated - (ndVector::m_one & (*
this < truncated)));
361 ndAssert (ret.m_f[0] == ndFloor(m_f[0]));
362 ndAssert (ret.m_f[1] == ndFloor(m_f[1]));
363 ndAssert (ret.m_f[2] == ndFloor(m_f[2]));
364 ndAssert (ret.m_f[3] == ndFloor(m_f[3]));
// Lane-wise full-precision square root.
370 return _mm_sqrt_ps(m_type);
// InvSqrt: rsqrt estimate refined by one Newton-Raphson step:
// y1 = 0.5 * y0 * (3 - x * y0 * y0).
375 ndVector tmp0 (_mm_rsqrt_ps(m_type));
376 return m_half * tmp0 * (m_three - *
this * tmp0 * tmp0);
// 1/|v| via 4-lane dot product and InvSqrt.
381 return DotProduct(*this).InvSqrt();
// Normalize using a full-precision scalar sqrt of the squared magnitude.
386 return Scale(ndFloat32(1.0f) / ndSqrt(DotProduct(*this).GetScalar()));
// --- Lane-wise comparisons: each lane becomes all-ones or all-zeros ---
392 return _mm_cmpgt_ps (m_type, data.m_type);
397 return _mm_cmpeq_ps (m_type, data.m_type);
402 return _mm_cmplt_ps (m_type, data.m_type);
407 return _mm_cmpge_ps (m_type, data.m_type);
412 return _mm_cmple_ps (m_type, data.m_type);
// --- Bitwise logic on the raw lane bits ---
418 return _mm_and_ps (m_type, data.m_type);
423 return _mm_or_ps (m_type, data.m_type);
428 return _mm_xor_ps (m_type, data.m_type);
// AndNot: this & ~data (_mm_andnot_ps negates its FIRST operand, hence data first).
433 return _mm_andnot_ps(data.m_type, m_type);
// Branchless per-lane blend: picks data's bits where mask is set, this's elsewhere.
440 return _mm_xor_ps(m_type, _mm_and_ps (mask.m_type, _mm_xor_ps(m_type, data.m_type)));
// Packs the sign bit of each of the four lanes into the low 4 bits of an int.
443 inline ndInt32 GetSignMask()
const
445 return _mm_movemask_ps(m_type);
// Lane-rotation helpers implemented as shuffles; each variant permutes the
// triple/quad differently (w-lane treatment differs per function).
450 return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(2, 1, 0, 3));
453 inline ndVector ShiftTripleRight ()
const
455 return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(3, 1, 0, 2));
458 inline ndVector ShiftTripleLeft ()
const
460 return _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(3, 0, 2, 1));
// Logical right shift of each 32-bit lane, via the integer view of the register.
463 inline ndVector ShiftRightLogical (ndInt32 bits)
const
465 return ndVector (_mm_srli_epi32(m_typeInt, bits));
// --- 4x4 transpose via the classic SSE unpack + movelh/movehl idiom ---
// unpacklo/hi interleave row pairs; movelh/movehl then assemble the columns.
470 __m128 tmp0 (_mm_unpacklo_ps (src0.m_type, src1.m_type));
471 __m128 tmp1 (_mm_unpacklo_ps (src2.m_type, src3.m_type));
472 __m128 tmp2 (_mm_unpackhi_ps (src0.m_type, src1.m_type));
473 __m128 tmp3 (_mm_unpackhi_ps (src2.m_type, src3.m_type));
475 dst0 =
ndVector (_mm_movelh_ps (tmp0, tmp1));
476 dst1 =
ndVector (_mm_movehl_ps (tmp1, tmp0));
477 dst2 =
ndVector (_mm_movelh_ps (tmp2, tmp3));
478 dst3 =
ndVector (_mm_movehl_ps (tmp3, tmp2));
// Debug trace hooks; the second overload is the empty release/no-op variant.
483 inline void Trace(
char*
const)
const
489 inline void Trace(
char*
const)
const {}
// Shared constant vectors, defined once in the core library (D_CORE_API export).
520 D_CORE_API
static ndVector m_negOne;
525 D_CORE_API
static ndVector m_xyzwMask;
526 D_CORE_API
static ndVector m_epsilon;
527 D_CORE_API
static ndVector m_signMask;
528 D_CORE_API
static ndVector m_triplexMask;
// End of the 16-byte-aligned ndVector class.
529 } D_GCC_NEWTON_ALIGN_16 ;
// ======= ndBigVector: double-precision 4-vector held as two __m128d halves =======
538 D_MSV_NEWTON_ALIGN_32
// Shuffle-mask helper for the two-lane double registers.
541 #define PERMUT_MASK_DOUBLE(y, x) _MM_SHUFFLE2 (y, x)
544 D_OPERATOR_NEW_AND_DELETE
// Copy constructor: copy both register halves.
551 :m_typeLow(copy.m_typeLow)
552 ,m_typeHigh(copy.m_typeHigh)
// Construct directly from raw __m128d (float) or __m128i (integer) register pairs.
556 inline ndBigVector(
const __m128d typeLow,
const __m128d typeHigh)
558 ,m_typeHigh(typeHigh)
562 inline ndBigVector(
const __m128i typeLow,
const __m128i typeHigh)
563 :m_typeIntLow(typeLow)
564 ,m_typeIntHigh(typeHigh)
// Broadcast a single scalar to all four lanes.
569 :m_typeLow(_mm_set1_pd(a))
570 ,m_typeHigh(_mm_set1_pd(a))
// Gather constructor: four doubles from baseAddr at the given 64-bit indices.
574 inline ndBigVector(
const ndFloat64*
const baseAddr,
const ndInt64*
const index)
575 :m_x(baseAddr[index[0]])
576 ,m_y(baseAddr[index[1]])
577 ,m_z(baseAddr[index[2]])
578 ,m_w(baseAddr[index[3]])
// Double-precision build path: unaligned load of four consecutive doubles.
582 #ifdef D_NEWTON_USE_DOUBLE
584 :m_typeLow(_mm_loadu_pd(ptr))
585 ,m_typeHigh(_mm_loadu_pd(&ptr[2]))
// Widen a float4 ndVector: low half from lanes 0..1, high half from lanes 2..3.
591 :m_typeLow(_mm_cvtps_pd (v.m_type))
592 ,m_typeHigh(_mm_cvtps_pd (_mm_shuffle_ps (v.m_type, v.m_type, PERMUTE_MASK(3, 2, 3, 2))))
// Debug sanity check that all lanes are finite.
594 ndAssert(ndCheckVector((*
this)));
// Alternate load constructor (same unaligned two-half load).
598 :m_typeLow(_mm_loadu_pd(ptr))
599 ,m_typeHigh(_mm_loadu_pd(&ptr[2]))
// Component constructors; _mm_set_pd takes the HIGH lane first, hence (y, x).
604 inline ndBigVector(ndFloat64 x, ndFloat64 y, ndFloat64 z, ndFloat64 w)
605 :m_typeLow(_mm_set_pd(y, x))
606 ,m_typeHigh(_mm_set_pd(w, z))
// Integer constructors storing widened 64-bit lane values.
610 inline ndBigVector(ndInt32 ix, ndInt32 iy, ndInt32 iz, ndInt32 iw)
611 :m_ix(ndInt64(ix)), m_iy(ndInt64(iy)), m_iz(ndInt64(iz)), m_iw(ndInt64(iw))
615 inline ndBigVector(ndInt64 ix, ndInt64 iy, ndInt64 iz, ndInt64 iw)
616 :m_ix(ix), m_iy(iy), m_iz(iz), m_iw(iw)
// --- ndBigVector accessors and lane-wise arithmetic over both halves ---
620 inline ndFloat64 GetX()
const
625 inline ndFloat64 GetY()
const
630 inline ndFloat64 GetZ()
const
635 inline ndFloat64 GetW()
const
640 inline void SetX(ndFloat64 x)
645 inline void SetY(ndFloat64 x)
650 inline void SetZ(ndFloat64 x)
655 inline void SetW(ndFloat64 x)
// Lane 0 (x) as a scalar, no memory round trip.
660 inline ndFloat64 GetScalar()
const
663 return _mm_cvtsd_f64(m_typeLow);
// Mutable and const lane access by index (bodies elided).
666 inline ndFloat64& operator[] (ndInt32 i)
673 inline const ndFloat64& operator[] (ndInt32 i)
const
// Lane-wise add/sub/mul applied independently to the low and high halves.
682 return ndBigVector(_mm_add_pd(m_typeLow, A.m_typeLow), _mm_add_pd(m_typeHigh, A.m_typeHigh));
687 return ndBigVector(_mm_sub_pd(m_typeLow, A.m_typeLow), _mm_sub_pd(m_typeHigh, A.m_typeHigh));
692 return ndBigVector(_mm_mul_pd(m_typeLow, A.m_typeLow), _mm_mul_pd(m_typeHigh, A.m_typeHigh));
// Compound-assignment forms of the same three operations.
697 m_typeLow = _mm_add_pd(m_typeLow, A.m_typeLow);
698 m_typeHigh = _mm_add_pd(m_typeHigh, A.m_typeHigh);
704 m_typeLow = _mm_sub_pd(m_typeLow, A.m_typeLow);
705 m_typeHigh = _mm_sub_pd(m_typeHigh, A.m_typeHigh);
711 m_typeLow = _mm_mul_pd(m_typeLow, A.m_typeLow);
712 m_typeHigh = _mm_mul_pd(m_typeHigh, A.m_typeHigh);
// MulAdd (this + A*B) and MulSub (this - A*B) built from the operators above.
718 return *
this + A * B;
723 return *
this - A * B;
// Horizontal sum: add the two halves, then combine the remaining two lanes
// (SSE3 hadd variant and portable shuffle+add variant both visible).
728 __m128d tmp0(_mm_add_pd(m_typeHigh, m_typeLow));
730 __m128d tmp1(_mm_hadd_pd(tmp0, tmp0));
732 __m128d tmp1(_mm_add_pd(tmp0, _mm_shuffle_pd(tmp0, tmp0, PERMUT_MASK_DOUBLE(0, 1))));
// Uniform scale of all four lanes by s.
759 __m128d tmp0(_mm_set1_pd(s));
760 return ndBigVector(_mm_mul_pd(m_typeLow, tmp0), _mm_mul_pd(m_typeHigh, tmp0));
// Abs: clear sign bits; m_signMask.m_typeLow is reused for the high half too
// (valid because the sign mask is the same in every lane).
765 return ndBigVector(_mm_and_pd(m_typeLow, m_signMask.m_typeLow), _mm_and_pd(m_typeHigh, m_signMask.m_typeLow));
// --- Reciprocal, sqrt, normalize, min/max, conversions, comparisons, bitwise ---
// Lane-wise reciprocal via full-precision divides.
770 return ndBigVector(_mm_div_pd(m_one.m_typeLow, m_typeLow), _mm_div_pd(m_one.m_typeHigh, m_typeHigh));
// Lane-wise full-precision square root.
775 return ndBigVector(_mm_sqrt_pd(m_typeLow), _mm_sqrt_pd(m_typeHigh));
// InvSqrt: full sqrt then reciprocal (no estimate/refine path in the double version).
780 return Sqrt().Reciproc();
// 1/|v| via dot product and InvSqrt.
785 return DotProduct(*this).InvSqrt();
// Normalize via the scalar squared magnitude; note the C-library sqrt() call.
790 ndFloat64 mag2 = DotProduct(*this).GetScalar();
791 return Scale(ndFloat64 (1.0f) / sqrt (mag2));
// Horizontal max across all four lanes.
796 __m128d tmp(_mm_max_pd(m_typeLow, m_typeHigh));
797 tmp = _mm_max_pd(tmp, _mm_shuffle_pd(tmp, tmp, PERMUT_MASK_DOUBLE(0, 1)));
// Lane-wise max/min against another vector.
803 return ndBigVector(_mm_max_pd(m_typeLow, data.m_typeLow), _mm_max_pd(m_typeHigh, data.m_typeHigh));
808 return ndBigVector(_mm_min_pd(m_typeLow, data.m_typeLow), _mm_min_pd(m_typeHigh, data.m_typeHigh));
// Convert lanes to integers, repacked as two 64-bit lanes per half.
// NOTE(review): _mm_cvtsd_si32 yields a 32-bit result despite the ndInt64
// destinations, so values outside int32 range are not representable — confirm
// the intended input range upstream.
814 ndInt64 x = _mm_cvtsd_si32(temp.m_typeLow);
815 ndInt64 y = _mm_cvtsd_si32(_mm_shuffle_pd(temp.m_typeLow, temp.m_typeLow, PERMUT_MASK_DOUBLE(1, 1)));
816 ndInt64 z = _mm_cvtsd_si32(temp.m_typeHigh);
817 ndInt64 w = _mm_cvtsd_si32(_mm_shuffle_pd(temp.m_typeHigh, temp.m_typeHigh, PERMUT_MASK_DOUBLE(1, 1)));
818 return ndBigVector(_mm_set_epi64x(y, x), _mm_set_epi64x(w, z));
// Lane-wise comparisons: each lane becomes all-ones or all-zeros.
824 return ndBigVector(_mm_cmpgt_pd(m_typeLow, data.m_typeLow), _mm_cmpgt_pd(m_typeHigh, data.m_typeHigh));
829 return ndBigVector(_mm_cmpeq_pd(m_typeLow, data.m_typeLow), _mm_cmpeq_pd(m_typeHigh, data.m_typeHigh));
834 return ndBigVector(_mm_cmplt_pd(m_typeLow, data.m_typeLow), _mm_cmplt_pd(m_typeHigh, data.m_typeHigh));
839 return ndBigVector(_mm_cmpge_pd(m_typeLow, data.m_typeLow), _mm_cmpge_pd(m_typeHigh, data.m_typeHigh));
844 return ndBigVector(_mm_cmple_pd(m_typeLow, data.m_typeLow), _mm_cmple_pd(m_typeHigh, data.m_typeHigh));
// Bitwise logic on the raw lane bits (AndNot keeps this's bits where data is clear).
850 return ndBigVector(_mm_and_pd(m_typeLow, data.m_typeLow), _mm_and_pd(m_typeHigh, data.m_typeHigh));
855 return ndBigVector(_mm_or_pd(m_typeLow, data.m_typeLow), _mm_or_pd(m_typeHigh, data.m_typeHigh));
860 return ndBigVector(_mm_xor_pd(m_typeLow, data.m_typeLow), _mm_xor_pd(m_typeHigh, data.m_typeHigh));
865 return ndBigVector(_mm_andnot_pd(data.m_typeLow, m_typeLow), _mm_andnot_pd(data.m_typeHigh, m_typeHigh));
// Branchless per-lane blend (xor/and/xor) across both halves.
871 return ndBigVector(_mm_xor_pd(m_typeLow, _mm_and_pd(mask.m_typeLow, _mm_xor_pd(m_typeLow, data.m_typeLow))),
872 _mm_xor_pd(m_typeHigh, _mm_and_pd(mask.m_typeHigh, _mm_xor_pd(m_typeHigh, data.m_typeHigh))));
// Lane-rotation helpers spanning the two register halves.
878 return ndBigVector(_mm_shuffle_pd(m_typeHigh, m_typeLow, PERMUT_MASK_DOUBLE(0, 1)), _mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(0, 1)));
883 return ndBigVector(_mm_shuffle_pd(m_typeHigh, m_typeLow, PERMUT_MASK_DOUBLE(0, 0)), _mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(1, 1)));
888 return ndBigVector(_mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(0, 1)), _mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(1, 0)));
// Logical right shift of each 64-bit lane via the integer views.
891 inline ndBigVector ShiftRightLogical(ndInt32 bits)
const
893 return ndBigVector(_mm_srli_epi64(m_typeIntLow, bits), _mm_srli_epi64(m_typeIntHigh, bits));
// Sign bits: two from the low half in bits 0..1, two from the high half in bits 2..3.
896 inline ndInt32 GetSignMask()
const
898 return _mm_movemask_pd(m_typeLow) | (_mm_movemask_pd(m_typeHigh) << 2);
// --- Floor, zero test, transpose, dot/cross, cofactor helper, data members ---
// Floor each lane with the scalar libm floor (no SSE floor used in this path).
903 return ndBigVector(floor(m_x), floor(m_y), floor(m_z), floor(m_w));
// TestZero: all-ones mask (from m_negOne) in lanes comparing equal to zero.
908 return m_negOne & (*
this == m_zero);
// 4x4 transpose assembled component-wise (tmp0..tmp3 setup elided in this view).
919 dst0 =
ndBigVector(tmp0.m_x, tmp1.m_x, tmp2.m_x, tmp3.m_x);
920 dst1 =
ndBigVector(tmp0.m_y, tmp1.m_y, tmp2.m_y, tmp3.m_y);
921 dst2 =
ndBigVector(tmp0.m_z, tmp1.m_z, tmp2.m_z, tmp3.m_z);
922 dst3 =
ndBigVector(tmp0.m_w, tmp1.m_w, tmp2.m_w, tmp3.m_w);
// 4-lane dot product: lane-wise multiply then horizontal sum.
930 return (*
this * A).AddHorizontal();
// 3D cross product done in scalar doubles; the w lane passes through from *this.
936 return ndBigVector(m_y * B.m_z - m_z * B.m_y, m_z * B.m_x - m_x * B.m_z, m_x * B.m_y - m_y * B.m_x, m_w);
// 4x4 cofactor expansion, mirroring the float version earlier in the file
// (loop interiors elided in this view).
942 ndFloat64 cofactor[3][3];
943 ndFloat64 array[4][4];
946 for (ndInt32 i = 0; i < 4; ++i)
951 array[3][i] = ndFloat64(1.0f);
955 ndFloat64 sign = ndFloat64(-1.0f);
956 for (ndInt32 i = 0; i < 4; ++i)
958 for (ndInt32 j = 0; j < 3; ++j)
961 for (ndInt32 k = 0; k < 4; ++k)
965 cofactor[j][k0] = array[j][k];
// First-row expansion of the 3x3 minor's determinant.
970 ndFloat64 x = cofactor[0][0] * (cofactor[1][1] * cofactor[2][2] - cofactor[1][2] * cofactor[2][1]);
971 ndFloat64 y = cofactor[0][1] * (cofactor[1][2] * cofactor[2][0] - cofactor[1][0] * cofactor[2][2]);
972 ndFloat64 z = cofactor[0][2] * (cofactor[1][0] * cofactor[2][1] - cofactor[1][1] * cofactor[2][0]);
973 ndFloat64 det = x + y + z;
975 normal[i] = sign * det;
976 sign *= ndFloat64(-1.0f);
// Integer views of the two register halves — presumably aliased with the
// __m128d members via a union (declaration elided); verify upstream.
993 __m128i m_typeIntLow;
994 __m128i m_typeIntHigh;
// End of the 32-byte-aligned ndBigVector class.
1027 } D_GCC_NEWTON_ALIGN_32;