Newton Dynamics 4.00
dVectorSimd.h
1 /* Copyright (c) <2003-2019> <Julio Jerez, Newton Game Dynamics>
2 *
3 * This software is provided 'as-is', without any express or implied
4 * warranty. In no event will the authors be held liable for any damages
5 * arising from the use of this software.
6 *
7 * Permission is granted to anyone to use this software for any purpose,
8 * including commercial applications, and to alter it and redistribute it
9 * freely, subject to the following restrictions:
10 *
11 * 1. The origin of this software must not be misrepresented; you must not
12 * claim that you wrote the original software. If you use this software
13 * in a product, an acknowledgment in the product documentation would be
14 * appreciated but is not required.
15 *
16 * 2. Altered source versions must be plainly marked as such, and must not be
17 * misrepresented as being the original software.
18 *
19 * 3. This notice may not be removed or altered from any source distribution.
20 */
21 
22 #ifndef __D_VECTOR_X86_SIMD_H__
23 #define __D_VECTOR_X86_SIMD_H__
24 
25 #ifndef DG_SCALAR_VECTOR_CLASS
26 
27 #ifdef D_NEWTON_USE_DOUBLE
28  #define dVector dBigVector
29 #else
30 
31 class dBigVector;
32 // *****************************************************************************************
33 //
34 // 4 x 1 single precision SSE vector class declaration
35 //
36 // *****************************************************************************************
37 D_MSV_NEWTON_ALIGN_16
38 class dVector
39 {
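 // PERMUTE_MASK reorders the (w, z, y, x) lane selectors into the immediate expected by _MM_SHUFFLE / _mm_shuffle_ps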
40  #define PERMUTE_MASK(w, z, y, x) _MM_SHUFFLE (w, z, y, x)
41  public:
42  D_INLINE dVector()
43  {
44  }
45 
46  D_INLINE dVector(const __m128i type)
47  :m_typeInt (type)
48  {
49  }
50 
51  D_INLINE dVector(const __m128 type)
52  : m_type(type)
53  {
54  }
55 
56  D_INLINE dVector (const dFloat32 a)
57  :m_type(_mm_set_ps1(a))
58  {
59  }
60 
61  D_INLINE dVector (const dFloat32* const ptr)
62  :m_type(_mm_loadu_ps (ptr))
63  {
64  }
65 
66 #ifndef D_NEWTON_USE_DOUBLE
67  D_INLINE dVector(const dFloat64* const ptr)
68  :m_type(_mm_set_ps(dFloat32(ptr[3]), dFloat32(ptr[2]), dFloat32(ptr[1]), dFloat32(ptr[0])))
69  {
70  }
71 #endif
72 
73  D_INLINE dVector (const dVector& copy)
74  :m_type(copy.m_type)
75  {
76  }
77 
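 // narrowing constructor: convert each __m128d half of the double-precision vector to floats and pack the two low pairs into one __m128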
78  D_INLINE dVector (const dBigVector& copy)
79  :m_type(_mm_shuffle_ps (_mm_cvtpd_ps (((__m128d*)&copy)[0]), _mm_cvtpd_ps (((__m128d*)&copy)[1]), PERMUTE_MASK(1, 0, 1, 0)))
80  {
81  dAssert (dCheckVector ((*this)));
82  }
83 
84  D_INLINE dVector (dFloat32 x, dFloat32 y, dFloat32 z, dFloat32 w)
85  :m_type(_mm_set_ps(w, z, y, x))
86  {
87  }
88 
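 // integer constructor: the four 32-bit integer bit patterns are stored verbatim in the float lanes (type pun), so integer data can travel in a dVector and be read back through m_i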
89  D_INLINE dVector (dInt32 ix, dInt32 iy, dInt32 iz, dInt32 iw)
90  :m_type(_mm_set_ps(*(dFloat32*)&iw, *(dFloat32*)&iz, *(dFloat32*)&iy, *(dFloat32*)&ix))
91  {
92  }
93 
94  D_INLINE void *operator new[](size_t size)
95  {
96  return dMemory::Malloc(size);
97  }
98 
99  D_INLINE void *operator new (size_t size)
100  {
101  return dMemory::Malloc(size);
102  }
103 
104  D_INLINE void operator delete[] (void* ptr)
105  {
106  dMemory::Free(ptr);
107  }
108 
109  D_INLINE void operator delete (void* ptr)
110  {
111  dMemory::Free(ptr);
112  }
113 
114 
115  D_INLINE dFloat32 GetScalar () const
116  {
117  //return m_x;
118  return _mm_cvtss_f32 (m_type);
119  }
120 
121  D_INLINE void Store (dFloat32* const dst) const
122  {
123  _mm_storeu_ps(dst, m_type);
124  }
125 
126  D_INLINE dVector BroadcastX () const
127  {
128  return _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(0, 0, 0, 0));
129  }
130 
131  D_INLINE dVector BroadcastY () const
132  {
133  return _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(1, 1, 1, 1));
134  }
135 
136  D_INLINE dVector BroadcastZ () const
137  {
138  return _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(2, 2, 2, 2));
139  }
140 
141  D_INLINE dVector BroadcastW () const
142  {
143  return _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(3, 3, 3, 3));
144  }
145 
146  D_INLINE dVector Scale (dFloat32 s) const
147  {
148  return _mm_mul_ps (m_type, _mm_set_ps1(s));
149  }
150 
151  D_INLINE dFloat32& operator[] (dInt32 i)
152  {
153  dAssert (i < 4);
154  dAssert (i >= 0);
155  return m_f[i];
156  }
157 
158  D_INLINE const dFloat32& operator[] (dInt32 i) const
159  {
160  dAssert (i < 4);
161  dAssert (i >= 0);
162  return m_f[i];
163  }
164 
165  D_INLINE dVector operator+ (const dVector& A) const
166  {
167  return _mm_add_ps (m_type, A.m_type);
168  }
169 
170  D_INLINE dVector operator- (const dVector& A) const
171  {
172  return _mm_sub_ps (m_type, A.m_type);
173  }
174 
175  D_INLINE dVector operator* (const dVector& A) const
176  {
177  return _mm_mul_ps(m_type, A.m_type);
178  }
179 
180  D_INLINE dVector& operator+= (const dVector& A)
181  {
182  return (*this = _mm_add_ps (m_type, A.m_type));
183  }
184 
185  D_INLINE dVector& operator-= (const dVector& A)
186  {
187  return (*this = _mm_sub_ps (m_type, A.m_type));
188  }
189 
190  D_INLINE dVector& operator*= (const dVector& A)
191  {
192  return (*this = _mm_mul_ps(m_type, A.m_type));
193  }
194 
195  // return cross product
196  D_INLINE dVector CrossProduct (const dVector& B) const
197  {
198  return _mm_sub_ps (_mm_mul_ps (_mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(3, 0, 2, 1)), _mm_shuffle_ps (B.m_type, B.m_type, PERMUTE_MASK(3, 1, 0, 2))),
199  _mm_mul_ps (_mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(3, 1, 0, 2)), _mm_shuffle_ps (B.m_type, B.m_type, PERMUTE_MASK(3, 0, 2, 1))));
200  }
201 
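 // 4d dot product: lane-wise multiply followed by AddHorizontal, so the sum is replicated in every lane (use GetScalar() to read it)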
202  D_INLINE dVector DotProduct(const dVector& A) const
203  {
204  dVector tmp(_mm_mul_ps(m_type, A.m_type));
205  return tmp.AddHorizontal();
206  }
207 
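 // 4d cross product: each component is a signed 3x3 cofactor of the matrix formed by *this, A and B, the 4-space generalization of the cross product (a vector orthogonal to all three operands)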
208  D_INLINE dVector CrossProduct (const dVector& A, const dVector& B) const
209  {
210  dFloat32 cofactor[3][3];
211  dFloat32 array[4][4];
212 
213  const dVector& me = *this;
214  for (dInt32 i = 0; i < 4; i ++)
215  {
216  array[0][i] = me[i];
217  array[1][i] = A[i];
218  array[2][i] = B[i];
219  array[3][i] = dFloat32 (1.0f);
220  }
221 
222  dVector normal;
223  dFloat32 sign = dFloat32 (-1.0f);
224  for (dInt32 i = 0; i < 4; i ++)
225  {
226  for (dInt32 j = 0; j < 3; j ++)
227  {
228  dInt32 k0 = 0;
229  for (dInt32 k = 0; k < 4; k ++)
230  {
231  if (k != i)
232  {
233  cofactor[j][k0] = array[j][k];
234  k0 ++;
235  }
236  }
237  }
238  dFloat32 x = cofactor[0][0] * (cofactor[1][1] * cofactor[2][2] - cofactor[1][2] * cofactor[2][1]);
239  dFloat32 y = cofactor[0][1] * (cofactor[1][2] * cofactor[2][0] - cofactor[1][0] * cofactor[2][2]);
240  dFloat32 z = cofactor[0][2] * (cofactor[1][0] * cofactor[2][1] - cofactor[1][1] * cofactor[2][0]);
241  dFloat32 det = x + y + z;
242 
243  normal[i] = sign * det;
244  sign *= dFloat32 (-1.0f);
245  }
246 
247  return normal;
248  }
249 
250  D_INLINE dVector Reciproc () const
251  {
252  return _mm_div_ps (m_one.m_type, m_type);
253  }
254 
255  D_INLINE dVector MulAdd(const dVector& A, const dVector& B) const
256  {
257  return _mm_add_ps(m_type, _mm_mul_ps(A.m_type, B.m_type));
258  }
259 
260  D_INLINE dVector MulSub(const dVector& A, const dVector& B) const
261  {
262  return _mm_sub_ps(m_type, _mm_mul_ps(A.m_type, B.m_type));
263  }
264 
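 // two horizontal adds leave x + y + z + w replicated in all four lanes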
265  D_INLINE dVector AddHorizontal () const
266  {
267  __m128 tmp (_mm_hadd_ps (m_type, m_type));
268  return _mm_hadd_ps (tmp, tmp);
269  }
270 
271  D_INLINE dVector Abs () const
272  {
273  return _mm_and_ps (m_type, m_signMask.m_type);
274  }
275 
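 // horizontal max: fold the (z, w) pair onto (x, y), then fold lane 1 onto lane 0 and extract the scalar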
276  dFloat32 GetMax () const
277  {
278  __m128 tmp (_mm_max_ps (m_type, _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(3, 2, 3, 2))));
279  //return dVector (_mm_max_ps (tmp, _mm_shuffle_ps (tmp, tmp, PERMUTE_MASK(3, 2, 0, 1)))).GetScalar();
280  return _mm_cvtss_f32(_mm_max_ss (tmp, _mm_shuffle_ps(tmp, tmp, PERMUTE_MASK(3, 2, 0, 1))));
281  }
282 
283  dVector GetMax (const dVector& data) const
284  {
285  return _mm_max_ps (m_type, data.m_type);
286  }
287 
288  dVector GetMin (const dVector& data) const
289  {
290  return _mm_min_ps (m_type, data.m_type);
291  }
292 
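 // floor each lane and convert to 32-bit integers; the __m128i constructor stores them in m_typeInt, readable through m_i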
293  D_INLINE dVector GetInt () const
294  {
295  return dVector(_mm_cvtps_epi32(Floor().m_type));
296  }
297 
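 // per-lane zero test: lanes that compare equal to zero keep the bits of m_negOne, all other lanes become zero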
298  D_INLINE dVector TestZero() const
299  {
300  //return dVector (_mm_cmpeq_epi32 (m_typeInt, m_zero.m_typeInt)) & m_negOne;
301  return m_negOne & (*this == m_zero);
302  }
303 
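 // floor without SSE4: truncate toward zero, then subtract one from lanes where truncation rounded up (negative non-integer values)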
304  D_INLINE dVector Floor () const
305  {
306  dVector truncated (_mm_cvtepi32_ps (_mm_cvttps_epi32 (m_type)));
307  dVector ret (truncated - (dVector::m_one & (*this < truncated)));
308  dAssert (ret.m_f[0] == dFloor(m_f[0]));
309  dAssert (ret.m_f[1] == dFloor(m_f[1]));
310  dAssert (ret.m_f[2] == dFloor(m_f[2]));
311  dAssert (ret.m_f[3] == dFloor(m_f[3]));
312  return ret;
313  }
314 
315  D_INLINE dVector Sqrt () const
316  {
317  return _mm_sqrt_ps(m_type);
318  }
319 
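 // reciprocal square root: hardware estimate refined by one Newton-Raphson step, y = y0 * (3 - x * y0^2) / 2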
320  D_INLINE dVector InvSqrt () const
321  {
322  dVector tmp0 (_mm_rsqrt_ps(m_type));
323  return m_half * tmp0 * (m_three - *this * tmp0 * tmp0);
324  }
325 
326  D_INLINE dVector InvMagSqrt () const
327  {
328  return DotProduct(*this).InvSqrt();
329  }
330 
331  D_INLINE dVector Normalize () const
332  {
333  dAssert (m_w == dFloat32 (0.0f));
334  //return *this * InvMagSqrt ();
335  return Scale(dFloat32(1.0f) / dSqrt(DotProduct(*this).GetScalar()));
336  }
337 
338  // relational operators
339  D_INLINE dVector operator> (const dVector& data) const
340  {
341  return _mm_cmpgt_ps (m_type, data.m_type);
342  }
343 
344  D_INLINE dVector operator== (const dVector& data) const
345  {
346  return _mm_cmpeq_ps (m_type, data.m_type);
347  }
348 
349  D_INLINE dVector operator< (const dVector& data) const
350  {
351  return _mm_cmplt_ps (m_type, data.m_type);
352  }
353 
354  D_INLINE dVector operator>= (const dVector& data) const
355  {
356  return _mm_cmpge_ps (m_type, data.m_type);
357  }
358 
359  D_INLINE dVector operator<= (const dVector& data) const
360  {
361  return _mm_cmple_ps (m_type, data.m_type);
362  }
363 
364  // logical operations
365  D_INLINE dVector operator& (const dVector& data) const
366  {
367  return _mm_and_ps (m_type, data.m_type);
368  }
369 
370  D_INLINE dVector operator| (const dVector& data) const
371  {
372  return _mm_or_ps (m_type, data.m_type);
373  }
374 
375  D_INLINE dVector operator^ (const dVector& data) const
376  {
377  return _mm_xor_ps (m_type, data.m_type);
378  }
379 
380  D_INLINE dVector AndNot(const dVector& data) const
381  {
382  return _mm_andnot_ps(data.m_type, m_type);
383  }
384 
385  D_INLINE dVector Select(const dVector& data, const dVector& mask) const
386  {
387  // (((b ^ a) & mask)^a)
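 // xor-blend: lanes where mask bits are set take data, lanes where they are clear keep *this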
388  //return _mm_or_ps (_mm_and_ps (mask.m_type, data.m_type), _mm_andnot_ps(mask.m_type, m_type));
389  return _mm_xor_ps(m_type, _mm_and_ps (mask.m_type, _mm_xor_ps(m_type, data.m_type)));
390  }
391 
392  D_INLINE dInt32 GetSignMask() const
393  {
394  return _mm_movemask_ps(m_type);
395  }
396 
397  D_INLINE dVector ShiftRight() const
398  {
399  return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(2, 1, 0, 3));
400  }
401 
402  D_INLINE dVector ShiftTripleRight () const
403  {
404  return _mm_shuffle_ps(m_type, m_type, PERMUTE_MASK(3, 1, 0, 2));
405  }
406 
407  D_INLINE dVector ShiftTripleLeft () const
408  {
409  return _mm_shuffle_ps (m_type, m_type, PERMUTE_MASK(3, 0, 2, 1));
410  }
411 
412  D_INLINE dVector ShiftRightLogical (dInt32 bits) const
413  {
414  return dVector (_mm_srli_epi32(m_typeInt, bits));
415  }
416 
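 // classic SSE 4x4 transpose: interleave row pairs with unpacklo/unpackhi, then recombine the halves with movelh/movehl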
417  D_INLINE static void Transpose4x4 (dVector& dst0, dVector& dst1, dVector& dst2, dVector& dst3, const dVector& src0, const dVector& src1, const dVector& src2, const dVector& src3)
418  {
419  __m128 tmp0 (_mm_unpacklo_ps (src0.m_type, src1.m_type));
420  __m128 tmp1 (_mm_unpacklo_ps (src2.m_type, src3.m_type));
421  __m128 tmp2 (_mm_unpackhi_ps (src0.m_type, src1.m_type));
422  __m128 tmp3 (_mm_unpackhi_ps (src2.m_type, src3.m_type));
423 
424  dst0 = dVector (_mm_movelh_ps (tmp0, tmp1));
425  dst1 = dVector (_mm_movehl_ps (tmp1, tmp0));
426  dst2 = dVector (_mm_movelh_ps (tmp2, tmp3));
427  dst3 = dVector (_mm_movehl_ps (tmp3, tmp2));
428  }
429 
430 #ifdef _DEBUG
431  D_INLINE void Trace(char* const name) const
432  {
433  dAssert(0);
434  //dTrace(("%s %f %f %f %f\n", name, m_x, m_y, m_z, m_w));
435  }
436 #else
437  D_INLINE void Trace(char* const name) const {}
438 #endif
439 
440  union
441  {
442  dFloat32 m_f[4];
443  dInt32 m_i[4];
444  __m128 m_type;
445  __m128i m_typeInt;
446  struct
447  {
448  dFloat32 m_x;
449  dFloat32 m_y;
450  dFloat32 m_z;
451  dFloat32 m_w;
452  };
453  struct
454  {
455  dInt32 m_ix;
456  dInt32 m_iy;
457  dInt32 m_iz;
458  dInt32 m_iw;
459  };
460  };
461 
462  D_CORE_API static dVector m_zero;
463  D_CORE_API static dVector m_one;
464  D_CORE_API static dVector m_wOne;
465  D_CORE_API static dVector m_two;
466  D_CORE_API static dVector m_half;
467  D_CORE_API static dVector m_three;
468  D_CORE_API static dVector m_negOne;
469  D_CORE_API static dVector m_xMask;
470  D_CORE_API static dVector m_yMask;
471  D_CORE_API static dVector m_zMask;
472  D_CORE_API static dVector m_wMask;
473  D_CORE_API static dVector m_epsilon;
474  D_CORE_API static dVector m_signMask;
475  D_CORE_API static dVector m_triplexMask;
476 } D_GCC_NEWTON_ALIGN_16 ;
477 #endif
478 
479 
480 // *****************************************************************************************
481 //
482 // 4 x 1 double precision SSE2 vector class declaration
483 //
484 // *****************************************************************************************
485 #ifdef D_USE_VECTOR_AVX
486  D_MSV_NEWTON_ALIGN_32
487  class dBigVector
488  {
489  public:
490  D_INLINE dBigVector()
491  {
492  }
493 
494  D_INLINE dBigVector(const dBigVector& copy)
495  :m_type(copy.m_type)
496  {
497  }
498 
499  D_INLINE dBigVector(const __m256d type)
500  :m_type(type)
501  {
502  }
503 
504  D_INLINE dBigVector(const __m256i type)
505  : m_typeInt(type)
506  {
507  }
508 
509  //D_INLINE dBigVector(const __m128 typeLow, const __m128 typeHigh)
510  // : m_typeGen(_mm256_setr_m128(typeLow, typeHigh))
511  //{
512  //}
513 
514  D_INLINE dBigVector(const dFloat64 a)
515  :m_type(_mm256_set1_pd(a))
516  {
517  }
518 
519 #ifdef D_NEWTON_USE_DOUBLE
520  D_INLINE dBigVector(const dFloat32* const ptr)
521  :m_type(_mm256_set_pd(ptr[3], ptr[2], ptr[1], ptr[0]))
522  {
523  }
524 #else
525 
526  D_INLINE dBigVector(const dVector& v)
527  :m_type(_mm256_cvtps_pd(v.m_type))
528  {
529  dAssert(dCheckVector((*this)));
530  }
531 
532  D_INLINE dBigVector(const dFloat64* const ptr)
533  :m_type(_mm256_loadu_pd(ptr))
534  {
535  }
536 #endif
537 
538  D_INLINE dBigVector(dFloat64 x, dFloat64 y, dFloat64 z, dFloat64 w)
539  :m_type(_mm256_set_pd(w, z, y, x))
540  {
541  }
542 
543  D_INLINE dBigVector(dInt32 ix, dInt32 iy, dInt32 iz, dInt32 iw)
544  :m_ix(dInt64(ix)), m_iy(dInt64(iy)), m_iz(dInt64(iz)), m_iw(dInt64(iw))
545  {
546  }
547 
548  D_INLINE dBigVector(dInt64 ix, dInt64 iy, dInt64 iz, dInt64 iw)
549  :m_ix(ix), m_iy(iy), m_iz(iz), m_iw(iw)
550  {
551  }
552 
553  D_INLINE dFloat64& operator[] (dInt32 i)
554  {
555  dAssert(i < 4);
556  dAssert(i >= 0);
557  return m_f[i];
558  }
559 
560  D_INLINE const dFloat64& operator[] (dInt32 i) const
561  {
562  dAssert(i < 4);
563  dAssert(i >= 0);
564  return m_f[i];
565  }
566 
567  D_INLINE dFloat64 GetScalar() const
568  {
569  //return _mm256_cvtsd_f64(m_type);
570  return m_x;
571  }
572 
573  D_INLINE dBigVector operator+ (const dBigVector& A) const
574  {
575  return _mm256_add_pd(m_type, A.m_type);
576  }
577 
578  D_INLINE dBigVector operator- (const dBigVector& A) const
579  {
580  return _mm256_sub_pd(m_type, A.m_type);
581  }
582 
583  D_INLINE dBigVector operator* (const dBigVector& A) const
584  {
585  return _mm256_mul_pd(m_type, A.m_type);
586  }
587 
588  D_INLINE dBigVector& operator+= (const dBigVector& A)
589  {
590  m_type = _mm256_add_pd(m_type, A.m_type);
591  return *this;
592  }
593 
594  D_INLINE dBigVector& operator-= (const dBigVector& A)
595  {
596  m_type = _mm256_sub_pd(m_type, A.m_type);
597  return *this;
598  }
599 
600  D_INLINE dBigVector& operator*= (const dBigVector& A)
601  {
602  m_type = _mm256_mul_pd(m_type, A.m_type);
603  return *this;
604  }
605 
606  D_INLINE dBigVector MulAdd(const dBigVector& A, const dBigVector& B) const
607  {
608  return *this + A * B;
609  }
610 
611  D_INLINE dBigVector MulSub(const dBigVector& A, const dBigVector& B) const
612  {
613  return *this - A * B;
614  }
615 
616  // return cross product
617  D_INLINE dBigVector CrossProduct(const dBigVector& B) const
618  {
619  return dBigVector(m_y * B.m_z - m_z * B.m_y, m_z * B.m_x - m_x * B.m_z, m_x * B.m_y - m_y * B.m_x, m_w);
620  }
621 
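 // horizontal add within each 128-bit half, then swap the halves and add so every lane holds the 4-lane total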
622  D_INLINE dBigVector AddHorizontal() const
623  {
624  __m256d tmp0(_mm256_hadd_pd(m_type, m_type));
625  __m256d tmp1(_mm256_permute2f128_pd(tmp0, tmp0, 3));
626  return _mm256_add_pd(tmp0, tmp1);
627  }
628 
629  D_INLINE dBigVector BroadcastX() const
630  {
631  return dBigVector(m_x);
632  }
633 
634  D_INLINE dBigVector BroadcastY() const
635  {
636  return dBigVector(m_y);
637  }
638 
639  D_INLINE dBigVector BroadcastZ() const
640  {
641  return dBigVector(m_z);
642  }
643 
644  D_INLINE dBigVector BroadcastW() const
645  {
646  return dBigVector(m_w);
647  }
648 
649  D_INLINE dBigVector Scale(dFloat64 s) const
650  {
651  __m256d tmp0(_mm256_set1_pd(s));
652  return _mm256_mul_pd(m_type, tmp0);
653  }
654 
655  D_INLINE dBigVector Abs() const
656  {
657  return _mm256_and_pd(m_type, m_signMask.m_type);
658  }
659 
660  D_INLINE dBigVector Reciproc() const
661  {
662  return _mm256_div_pd(m_one.m_type, m_type);
663  }
664 
665  D_INLINE dBigVector Sqrt() const
666  {
667  return _mm256_sqrt_pd(m_type);
668  }
669 
670  D_INLINE dBigVector InvSqrt() const
671  {
672  return Sqrt().Reciproc();
673  }
674 
675  D_INLINE dBigVector Normalize() const
676  {
677  dAssert(m_w == dFloat32(0.0f));
678  dFloat64 mag2 = DotProduct(*this).GetScalar();
679  return Scale(dFloat64(1.0f) / sqrt(mag2));
680  }
681 
682  dFloat64 GetMax() const
683  {
684  __m256d tmp0(_mm256_permute2f128_pd(m_type, m_type, 5));
685  __m256d tmp1(_mm256_max_pd(m_type, tmp0));
686  __m256d tmp2(_mm256_unpackhi_pd(tmp1, tmp1));
687  __m256d tmp3(_mm256_max_pd(tmp1, tmp2));
688  dBigVector tmp4(tmp3);
689  return tmp4.GetScalar();
690  }
691 
692  dBigVector GetMax(const dBigVector& data) const
693  {
694  return _mm256_max_pd(m_type, data.m_type);
695  }
696 
697  dBigVector GetMin(const dBigVector& data) const
698  {
699  return _mm256_min_pd(m_type, data.m_type);
700  }
701 
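 // floor, truncate to four 32-bit integers, then rebuild through the dInt32 constructor so the 64-bit integer lanes hold the values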
702  D_INLINE dBigVector GetInt() const
703  {
704  dBigVector temp(Floor());
705  union
706  {
707  __m128i tmp;
708  struct
709  {
710  dInt32 m_x;
711  dInt32 m_y;
712  dInt32 m_z;
713  dInt32 m_w;
714  };
715  };
716  tmp = _mm256_cvttpd_epi32(temp.m_type);
717  return dBigVector(m_x, m_y, m_z, m_w);
718  }
719 
720  // relational operators
721  D_INLINE dBigVector operator> (const dBigVector& data) const
722  {
723  return _mm256_cmp_pd(m_type, data.m_type, _CMP_GT_OQ);
724  }
725 
726  D_INLINE dBigVector operator== (const dBigVector& data) const
727  {
728  return _mm256_cmp_pd(m_type, data.m_type, _CMP_EQ_OQ);
729  }
730 
731  D_INLINE dBigVector operator< (const dBigVector& data) const
732  {
733  return _mm256_cmp_pd(m_type, data.m_type, _CMP_LT_OQ);
734  }
735 
736  D_INLINE dBigVector operator>= (const dBigVector& data) const
737  {
738  return _mm256_cmp_pd(m_type, data.m_type, _CMP_GE_OQ);
739  }
740 
741  D_INLINE dBigVector operator<= (const dBigVector& data) const
742  {
743  return _mm256_cmp_pd(m_type, data.m_type, _CMP_LE_OQ);
744  }
745 
746  // logical operations
747  D_INLINE dBigVector operator& (const dBigVector& data) const
748  {
749  return _mm256_and_pd(m_type, data.m_type);
750  }
751 
752  D_INLINE dBigVector operator| (const dBigVector& data) const
753  {
754  return _mm256_or_pd(m_type, data.m_type);
755  }
756 
757  D_INLINE dBigVector operator^ (const dBigVector& data) const
758  {
759  return _mm256_xor_pd(m_type, data.m_type);
760  }
761 
762  D_INLINE dBigVector AndNot(const dBigVector& data) const
763  {
764  return _mm256_andnot_pd(data.m_type, m_type);
765  }
766 
767  D_INLINE dBigVector Select(const dBigVector& data, const dBigVector& mask) const
768  {
769  // (((b ^ a) & mask)^a)
770  return _mm256_xor_pd(m_type, _mm256_and_pd(mask.m_type, _mm256_xor_pd(m_type, data.m_type)));
771  }
772 
773  D_INLINE dBigVector ShiftRight() const
774  {
775  //return dBigVector(m_w, m_x, m_y, m_z);
776  __m256d tmp0(_mm256_permute2f128_pd(m_type, m_type, 5));
777  __m256d tmp1(_mm256_blend_pd(m_type, tmp0, 10));
778  __m256d tmp2(_mm256_shuffle_pd(tmp1, tmp1, 5));
779  return tmp2;
780  }
781 
782  D_INLINE dBigVector ShiftTripleRight() const
783  {
784  //return dBigVector(m_z, m_x, m_y, m_w);
785  __m256d tmp0(_mm256_permute2f128_pd(m_type, m_type, 5));
786  __m256d tmp1(_mm256_shuffle_pd(m_type, m_type, 5));
787  __m256d tmp2(_mm256_blend_pd(tmp0, tmp1, 6));
788  __m256d tmp3(_mm256_shuffle_pd(tmp2, tmp2, 6));
789  return tmp3;
790  }
791 
792  D_INLINE dBigVector ShiftTripleLeft() const
793  {
794  //return dBigVector(m_y, m_z, m_x, m_w);
795  __m256d tmp0(_mm256_permute2f128_pd(m_type, m_type, 5));
796  __m256d tmp1(_mm256_blend_pd(m_type, tmp0, 10));
797  __m256d tmp2(_mm256_permute2f128_pd(tmp1, tmp1, 5));
798  __m256d tmp3(_mm256_shuffle_pd(tmp2, tmp2, 9));
799  return tmp3;
800  }
801 
802  D_INLINE dBigVector ShiftRightLogical(dInt32 bits) const
803  {
804  dUnsigned64 x = ((dUnsigned64)m_ix) >> bits;
805  dUnsigned64 y = ((dUnsigned64)m_iy) >> bits;
806  dUnsigned64 z = ((dUnsigned64)m_iz) >> bits;
807  dUnsigned64 w = ((dUnsigned64)m_iw) >> bits;
808  return dBigVector((dInt64)x, (dInt64)y, (dInt64)z, (dInt64)w);
809  }
810 
811  D_INLINE dInt32 GetSignMask() const
812  {
813  return _mm256_movemask_pd(m_type);
814  }
815 
816  D_INLINE dBigVector Floor() const
817  {
818  dBigVector ret(_mm256_floor_pd(m_type));
819  dAssert(ret.m_f[0] == floor(m_f[0]));
820  dAssert(ret.m_f[1] == floor(m_f[1]));
821  dAssert(ret.m_f[2] == floor(m_f[2]));
822  dAssert(ret.m_f[3] == floor(m_f[3]));
823  return ret;
824  }
825 
826  D_INLINE dBigVector TestZero() const
827  {
828  return m_negOne & (*this == dBigVector::m_zero);
829  }
830 
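 // AVX 4x4 transpose: unpack row pairs within 128-bit lanes, then recombine the halves with permute2f128/blend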
831  D_INLINE static void Transpose4x4(
832  dBigVector& dst0, dBigVector& dst1, dBigVector& dst2, dBigVector& dst3,
833  const dBigVector& src0, const dBigVector& src1,
834  const dBigVector& src2, const dBigVector& src3)
835  {
836  __m256d tmp0(_mm256_unpacklo_pd(src0.m_type, src1.m_type));
837  __m256d tmp1(_mm256_unpackhi_pd(src0.m_type, src1.m_type));
838  dst2 = _mm256_unpacklo_pd(src2.m_type, src3.m_type);
839  dst3 = _mm256_unpackhi_pd(src2.m_type, src3.m_type);
840 
841  dst0 = _mm256_permute2f128_pd(dst2.m_type, tmp0, 2);
842  dst1 = _mm256_permute2f128_pd(dst3.m_type, tmp1, 2);
843  tmp0 = _mm256_permute2f128_pd(tmp0, tmp0, 1);
844  tmp1 = _mm256_permute2f128_pd(tmp1, tmp1, 1);
845  dst2 = _mm256_blend_pd(tmp0, dst2.m_type, 12);
846  dst3 = _mm256_blend_pd(tmp1, dst3.m_type, 12);
847  }
848 
849  // return the 4d dot product
850  D_INLINE dBigVector DotProduct(const dBigVector &A) const
851  {
852  dBigVector tmp(_mm256_mul_pd(m_type, A.m_type));
853  return tmp.AddHorizontal();
854  }
855 
856  D_INLINE dBigVector CrossProduct(const dBigVector& A, const dBigVector& B) const
857  {
858  dFloat64 array[4][4];
859  dFloat64 cofactor[3][3];
860 
861  const dBigVector& me = *this;
862  for (dInt32 i = 0; i < 4; i++)
863  {
864  array[0][i] = me[i];
865  array[1][i] = A[i];
866  array[2][i] = B[i];
867  array[3][i] = dFloat64(1.0f);
868  }
869 
870  dBigVector normal;
871  dFloat64 sign = dFloat64(-1.0f);
872  for (dInt32 i = 0; i < 4; i++)
873  {
874  for (dInt32 j = 0; j < 3; j++)
875  {
876  dInt32 k0 = 0;
877  for (dInt32 k = 0; k < 4; k++)
878  {
879  if (k != i)
880  {
881  cofactor[j][k0] = array[j][k];
882  k0++;
883  }
884  }
885  }
886  dFloat64 x = cofactor[0][0] * (cofactor[1][1] * cofactor[2][2] - cofactor[1][2] * cofactor[2][1]);
887  dFloat64 y = cofactor[0][1] * (cofactor[1][2] * cofactor[2][0] - cofactor[1][0] * cofactor[2][2]);
888  dFloat64 z = cofactor[0][2] * (cofactor[1][0] * cofactor[2][1] - cofactor[1][1] * cofactor[2][0]);
889  dFloat64 det = x + y + z;
890 
891  normal[i] = sign * det;
892  sign *= dFloat64(-1.0f);
893  }
894 
895  return normal;
896  }
897 
898  union
899  {
900  struct
901  {
902  dFloat64 m_x;
903  dFloat64 m_y;
904  dFloat64 m_z;
905  dFloat64 m_w;
906  };
907  dFloat64 m_f[4];
908  dInt64 m_i[4];
909  __m256d m_type;
910  __m256i m_typeInt;
911  __m256 m_typeGen;
912  struct
913  {
914  dInt64 m_ix;
915  dInt64 m_iy;
916  dInt64 m_iz;
917  dInt64 m_iw;
918  };
919  };
920 
921  D_CORE_API static dBigVector m_zero;
922  D_CORE_API static dBigVector m_one;
923  D_CORE_API static dBigVector m_wOne;
924  D_CORE_API static dBigVector m_two;
925  D_CORE_API static dBigVector m_half;
926  D_CORE_API static dBigVector m_three;
927  D_CORE_API static dBigVector m_negOne;
928  D_CORE_API static dBigVector m_xMask;
929  D_CORE_API static dBigVector m_yMask;
930  D_CORE_API static dBigVector m_zMask;
931  D_CORE_API static dBigVector m_wMask;
932  D_CORE_API static dBigVector m_epsilon;
933  D_CORE_API static dBigVector m_signMask;
934  D_CORE_API static dBigVector m_triplexMask;
935  } D_GCC_NEWTON_ALIGN_32;
936 
937  D_MSV_NEWTON_ALIGN_32
938  class dSpatialVector
939  {
940  public:
941  D_INLINE dSpatialVector()
942  {
943  }
944 
945  D_INLINE dSpatialVector(const dFloat32 a)
946  :m_d0(_mm256_set1_pd(dFloat64(a)))
947  ,m_d1(m_d0)
948  {
949  }
950 
951 #ifdef D_NEWTON_USE_DOUBLE
952  //#define PURMUT_MASK2(y, x) _MM_SHUFFLE2(x, y)
953  D_INLINE dSpatialVector(const dVector& low, const dVector& high)
954  //:m_d0(low.m_typeLow)
955  //,m_d1(_mm_shuffle_pd(low.m_typeHigh, high.m_typeLow, PURMUT_MASK2(0, 0)))
956  //,m_d2(_mm_shuffle_pd(high.m_typeLow, high.m_typeHigh, PURMUT_MASK2(1, 0)))
957  {
958  dAssert(0);
959  }
960 #else
961  D_INLINE dSpatialVector(const dVector& low, const dVector& high)
962  :m_d0(_mm256_cvtps_pd(low.m_type))
963  ,m_d1(_mm256_cvtps_pd(high.ShiftTripleLeft().m_type))
964  {
965  m_f[3] = m_f[6];
966  m_f[6] = dFloat64(0.0f);
967  m_f[7] = dFloat64(0.0f);
968  }
969 #endif
970 
971  D_INLINE dSpatialVector(const dSpatialVector& copy)
972  :m_d0(copy.m_d0)
973  ,m_d1(copy.m_d1)
974  {
975  }
976 
977  D_INLINE dSpatialVector(const __m256d d0, const __m256d d1)
978  :m_d0(d0)
979  ,m_d1(d1)
980  {
981  }
982 
983  D_INLINE dFloat64& operator[] (dInt32 i)
984  {
985  dAssert(i < 6);
986  dAssert(i >= 0);
987  return m_f[i];
988  }
989 
990  D_INLINE const dFloat64& operator[] (dInt32 i) const
991  {
992  dAssert(i < 6);
993  dAssert(i >= 0);
994  return m_f[i];
995  }
996 
997  D_INLINE dSpatialVector operator+ (const dSpatialVector& A) const
998  {
999  return dSpatialVector(_mm256_add_pd(m_d0, A.m_d0), _mm256_add_pd(m_d1, A.m_d1));
1000  }
1001 
1002  D_INLINE dSpatialVector operator*(const dSpatialVector& A) const
1003  {
1004  return dSpatialVector(_mm256_mul_pd(m_d0, A.m_d0), _mm256_mul_pd(m_d1, A.m_d1));
1005  }
1006 
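 // 6-component dot product: multiply lane-wise, add the two __m256d halves and collapse horizontally; the two padding lanes are expected to be zero so they do not affect the sum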
1007  D_INLINE dFloat64 DotProduct(const dSpatialVector& v) const
1008  {
1009  dSpatialVector tmp(*this * v);
1010  __m256d tmp0(_mm256_add_pd(tmp.m_d0, tmp.m_d1));
1011  __m256d tmp1(_mm256_hadd_pd(tmp0, tmp0));
1012  __m256d tmp2(_mm256_permute2f128_pd(tmp1, tmp1, 1));
1013  __m256d tmp3(_mm256_add_pd(tmp1, tmp2));
1014  return *((dFloat64*)&tmp3);
1015  }
1016 
1017  D_INLINE dSpatialVector Scale(dFloat64 s) const
1018  {
1019  __m256d tmp(_mm256_set1_pd(s));
1020  return dSpatialVector(_mm256_mul_pd(m_d0, tmp), _mm256_mul_pd(m_d1, tmp));
1021  }
1022 
1023  union
1024  {
1025  dFloat64 m_f[8];
1026  struct
1027  {
1028  __m256d m_d0;
1029  __m256d m_d1;
1030  };
1031  };
1032  static dSpatialVector m_zero;
1033  } D_GCC_NEWTON_ALIGN_32;
1034 
1035 #else
1036  D_MSV_NEWTON_ALIGN_32
1037  class dBigVector
1038  {
1039  #define PERMUT_MASK_DOUBLE(y, x) _MM_SHUFFLE2 (y, x)
1040 
1041  public:
1042  D_INLINE dBigVector()
1043  {
1044  }
1045 
1046  D_INLINE dBigVector(const dBigVector& copy)
1047  :m_typeLow(copy.m_typeLow)
1048  ,m_typeHigh(copy.m_typeHigh)
1049  {
1050  }
1051 
1052  D_INLINE dBigVector(const __m128d typeLow, const __m128d typeHigh)
1053  :m_typeLow(typeLow)
1054  ,m_typeHigh(typeHigh)
1055  {
1056  }
1057 
1058  D_INLINE dBigVector(const __m128i typeLow, const __m128i typeHigh)
1059  :m_typeIntLow(typeLow)
1060  ,m_typeIntHigh(typeHigh)
1061  {
1062  }
1063 
1064  D_INLINE dBigVector(const dFloat64 a)
1065  :m_typeLow(_mm_set1_pd(a))
1066  ,m_typeHigh(_mm_set1_pd(a))
1067  {
1068  }
1069 
1070  #ifdef D_NEWTON_USE_DOUBLE
1071  D_INLINE dBigVector (const dFloat32* const ptr)
1072  :m_typeLow(_mm_loadu_pd(ptr))
1073  ,m_typeHigh(_mm_loadu_pd(&ptr[2]))
1074  {
1075  }
1076  #else
1077 
1078  D_INLINE dBigVector(const dVector& v)
1079  :m_typeLow(_mm_cvtps_pd (v.m_type))
1080  ,m_typeHigh(_mm_cvtps_pd (_mm_shuffle_ps (v.m_type, v.m_type, PERMUTE_MASK(3, 2, 3, 2))))
1081  {
1082  dAssert(dCheckVector((*this)));
1083  }
1084 
1085  D_INLINE dBigVector(const dFloat64* const ptr)
1086  :m_typeLow(_mm_loadu_pd(ptr))
1087  ,m_typeHigh(_mm_loadu_pd(&ptr[2]))
1088  {
1089  }
1090  #endif
1091 
1092  D_INLINE dBigVector(dFloat64 x, dFloat64 y, dFloat64 z, dFloat64 w)
1093  :m_typeLow(_mm_set_pd(y, x))
1094  ,m_typeHigh(_mm_set_pd(w, z))
1095  {
1096  }
1097 
1098  D_INLINE dBigVector(dInt32 ix, dInt32 iy, dInt32 iz, dInt32 iw)
1099  :m_ix(dInt64(ix)), m_iy(dInt64(iy)), m_iz(dInt64(iz)), m_iw(dInt64(iw))
1100  {
1101  }
1102 
1103  D_INLINE dBigVector(dInt64 ix, dInt64 iy, dInt64 iz, dInt64 iw)
1104  :m_ix(ix), m_iy(iy), m_iz(iz), m_iw(iw)
1105  {
1106  }
1107 
1108  D_INLINE dFloat64& operator[] (dInt32 i)
1109  {
1110  dAssert(i < 4);
1111  dAssert(i >= 0);
1112  return m_f[i];
1113  }
1114 
1115  D_INLINE const dFloat64& operator[] (dInt32 i) const
1116  {
1117  dAssert(i < 4);
1118  dAssert(i >= 0);
1119  return m_f[i];
1120  }
1121 
1122  D_INLINE dFloat64 GetScalar() const
1123  {
1124  //return m_x;
1125  return _mm_cvtsd_f64(m_typeLow);
1126  }
1127 
1128  D_INLINE dBigVector operator+ (const dBigVector& A) const
1129  {
1130  return dBigVector(_mm_add_pd(m_typeLow, A.m_typeLow), _mm_add_pd(m_typeHigh, A.m_typeHigh));
1131  }
1132 
1133  D_INLINE dBigVector operator- (const dBigVector& A) const
1134  {
1135  return dBigVector(_mm_sub_pd(m_typeLow, A.m_typeLow), _mm_sub_pd(m_typeHigh, A.m_typeHigh));
1136  }
1137 
1138  D_INLINE dBigVector operator* (const dBigVector& A) const
1139  {
1140  return dBigVector(_mm_mul_pd(m_typeLow, A.m_typeLow), _mm_mul_pd(m_typeHigh, A.m_typeHigh));
1141  }
1142 
1143  D_INLINE dBigVector& operator+= (const dBigVector& A)
1144  {
1145  m_typeLow = _mm_add_pd(m_typeLow, A.m_typeLow);
1146  m_typeHigh = _mm_add_pd(m_typeHigh, A.m_typeHigh);
1147  return *this;
1148  }
1149 
1150  D_INLINE dBigVector& operator-= (const dBigVector& A)
1151  {
1152  m_typeLow = _mm_sub_pd(m_typeLow, A.m_typeLow);
1153  m_typeHigh = _mm_sub_pd(m_typeHigh, A.m_typeHigh);
1154  return *this;
1155  }
1156 
1157  D_INLINE dBigVector& operator*= (const dBigVector& A)
1158  {
1159  m_typeLow = _mm_mul_pd(m_typeLow, A.m_typeLow);
1160  m_typeHigh = _mm_mul_pd(m_typeHigh, A.m_typeHigh);
1161  return *this;
1162  }
1163 
1164  D_INLINE dBigVector MulAdd(const dBigVector& A, const dBigVector& B) const
1165  {
1166  return *this + A * B;
1167  }
1168 
1169  D_INLINE dBigVector MulSub(const dBigVector& A, const dBigVector& B) const
1170  {
1171  return *this - A * B;
1172  }
1173 
1174  // return cross product
1175  D_INLINE dBigVector CrossProduct(const dBigVector& B) const
1176  {
1177  return dBigVector(m_y * B.m_z - m_z * B.m_y, m_z * B.m_x - m_x * B.m_z, m_x * B.m_y - m_y * B.m_x, m_w);
1178  }
1179 
1180  D_INLINE dBigVector AddHorizontal() const
1181  {
1182  __m128d tmp0(_mm_add_pd(m_typeHigh, m_typeLow));
1183  __m128d tmp1(_mm_hadd_pd(tmp0, tmp0));
1184  return dBigVector(tmp1, tmp1);
1185  }
1186 
1187  D_INLINE dBigVector BroadcastX() const
1188  {
1189  return dBigVector(m_x);
1190  }
1191 
1192  D_INLINE dBigVector BroadcastY() const
1193  {
1194  return dBigVector(m_y);
1195  }
1196 
1197  D_INLINE dBigVector BroadcastZ() const
1198  {
1199  return dBigVector(m_z);
1200  }
1201 
1202  D_INLINE dBigVector BroadcastW() const
1203  {
1204  return dBigVector(m_w);
1205  }
1206 
1207  D_INLINE dBigVector Scale(dFloat64 s) const
1208  {
1209  __m128d tmp0(_mm_set1_pd(s));
1210  return dBigVector(_mm_mul_pd(m_typeLow, tmp0), _mm_mul_pd(m_typeHigh, tmp0));
1211  }
1212 
1213  D_INLINE dBigVector Abs() const
1214  {
1215  return dBigVector(_mm_and_pd(m_typeLow, m_signMask.m_typeLow), _mm_and_pd(m_typeHigh, m_signMask.m_typeLow));
1216  }
1217 
1218  D_INLINE dBigVector Reciproc() const
1219  {
1220  return dBigVector(_mm_div_pd(m_one.m_typeLow, m_typeLow), _mm_div_pd(m_one.m_typeHigh, m_typeHigh));
1221  }
1222 
1223  D_INLINE dBigVector Sqrt() const
1224  {
1225  return dBigVector(_mm_sqrt_pd(m_typeLow), _mm_sqrt_pd(m_typeHigh));
1226  }
1227 
1228  D_INLINE dBigVector InvSqrt() const
1229  {
1230  return Sqrt().Reciproc();
1231  }
1232 
1233  D_INLINE dBigVector Normalize() const
1234  {
1235  dAssert (m_w == dFloat32 (0.0f));
1236  dFloat64 mag2 = DotProduct(*this).GetScalar();
1237  return Scale(dFloat64 (1.0f) / sqrt (mag2));
1238  }
1239 
1240  dFloat64 GetMax() const
1241  {
1242  __m128d tmp(_mm_max_pd(m_typeLow, m_typeHigh));
1243  return dBigVector(_mm_max_pd(tmp, _mm_shuffle_pd(tmp, tmp, PERMUT_MASK_DOUBLE(0, 1))), tmp).GetScalar();
1244  }
1245 
1246  dBigVector GetMax(const dBigVector& data) const
1247  {
1248  return dBigVector(_mm_max_pd(m_typeLow, data.m_typeLow), _mm_max_pd(m_typeHigh, data.m_typeHigh));
1249  }
1250 
1251  dBigVector GetMin(const dBigVector& data) const
1252  {
1253  return dBigVector(_mm_min_pd(m_typeLow, data.m_typeLow), _mm_min_pd(m_typeHigh, data.m_typeHigh));
1254  }
1255 
1256  D_INLINE dBigVector GetInt() const
1257  {
1258  dBigVector temp(Floor());
1259  dInt64 x = _mm_cvtsd_si32(temp.m_typeLow);
1260  dInt64 y = _mm_cvtsd_si32(_mm_shuffle_pd(temp.m_typeLow, temp.m_typeLow, PERMUT_MASK_DOUBLE(1, 1)));
1261  dInt64 z = _mm_cvtsd_si32(temp.m_typeHigh);
1262  dInt64 w = _mm_cvtsd_si32(_mm_shuffle_pd(temp.m_typeHigh, temp.m_typeHigh, PERMUT_MASK_DOUBLE(1, 1)));
1263  return dBigVector(_mm_set_pd(*(dFloat32*)&y, *(dFloat32*)&x), _mm_set_pd(*(dFloat32*)&w, *(dFloat32*)&z));
1264  }
1265 
1266  // relational operators
1267  D_INLINE dBigVector operator> (const dBigVector& data) const
1268  {
1269  return dBigVector(_mm_cmpgt_pd(m_typeLow, data.m_typeLow), _mm_cmpgt_pd(m_typeHigh, data.m_typeHigh));
1270  }
1271 
1272  D_INLINE dBigVector operator== (const dBigVector& data) const
1273  {
1274  return dBigVector(_mm_cmpeq_pd(m_typeLow, data.m_typeLow), _mm_cmpeq_pd(m_typeHigh, data.m_typeHigh));
1275  }
1276 
1277  D_INLINE dBigVector operator< (const dBigVector& data) const
1278  {
1279  return dBigVector(_mm_cmplt_pd(m_typeLow, data.m_typeLow), _mm_cmplt_pd(m_typeHigh, data.m_typeHigh));
1280  }
1281 
1282  D_INLINE dBigVector operator>= (const dBigVector& data) const
1283  {
1284  return dBigVector(_mm_cmpge_pd(m_typeLow, data.m_typeLow), _mm_cmpge_pd(m_typeHigh, data.m_typeHigh));
1285  }
1286 
1287  D_INLINE dBigVector operator<= (const dBigVector& data) const
1288  {
1289  return dBigVector(_mm_cmple_pd(m_typeLow, data.m_typeLow), _mm_cmple_pd(m_typeHigh, data.m_typeHigh));
1290  }
1291 
1292  // logical operations
1293  D_INLINE dBigVector operator& (const dBigVector& data) const
1294  {
1295  return dBigVector(_mm_and_pd(m_typeLow, data.m_typeLow), _mm_and_pd(m_typeHigh, data.m_typeHigh));
1296  }
1297 
1298  D_INLINE dBigVector operator| (const dBigVector& data) const
1299  {
1300  return dBigVector(_mm_or_pd(m_typeLow, data.m_typeLow), _mm_or_pd(m_typeHigh, data.m_typeHigh));
1301  }
1302 
1303  D_INLINE dBigVector operator^ (const dBigVector& data) const
1304  {
1305  return dBigVector(_mm_xor_pd(m_typeLow, data.m_typeLow), _mm_xor_pd(m_typeHigh, data.m_typeHigh));
1306  }
1307 
1308  D_INLINE dBigVector AndNot(const dBigVector& data) const
1309  {
1310  return dBigVector(_mm_andnot_pd(data.m_typeLow, m_typeLow), _mm_andnot_pd(data.m_typeHigh, m_typeHigh));
1311  }
1312 
1313  D_INLINE dBigVector Select(const dBigVector& data, const dBigVector& mask) const
1314  {
1315  // (((b ^ a) & mask)^a)
1316  return dBigVector(_mm_xor_pd(m_typeLow, _mm_and_pd(mask.m_typeLow, _mm_xor_pd(m_typeLow, data.m_typeLow))),
1317  _mm_xor_pd(m_typeHigh, _mm_and_pd(mask.m_typeHigh, _mm_xor_pd(m_typeHigh, data.m_typeHigh))));
1318  }
1319 
1320  D_INLINE dBigVector ShiftRight() const
1321  {
1322  //return dBigVector (m_w, m_x, m_y, m_z);
1323  return dBigVector(_mm_shuffle_pd(m_typeHigh, m_typeLow, PERMUT_MASK_DOUBLE(0, 1)), _mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(0, 1)));
1324  }
1325 
1326  D_INLINE dBigVector ShiftTripleRight() const
1327  {
1328  return dBigVector(_mm_shuffle_pd(m_typeHigh, m_typeLow, PERMUT_MASK_DOUBLE(0, 0)), _mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(1, 1)));
1329  }
1330 
1331  D_INLINE dBigVector ShiftTripleLeft() const
1332  {
1333  return dBigVector(_mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(0, 1)), _mm_shuffle_pd(m_typeLow, m_typeHigh, PERMUT_MASK_DOUBLE(1, 0)));
1334  }
1335 
1336  D_INLINE dBigVector ShiftRightLogical(dInt32 bits) const
1337  {
1338  //return dBigVector(dInt64(dUnsigned64(m_ix) >> bits), dInt64(dUnsigned64(m_iy) >> bits), dInt64(dUnsigned64(m_iz) >> bits), dInt64(dUnsigned64(m_iw) >> bits));
1339  return dBigVector(_mm_srli_epi64(m_typeIntLow, bits), _mm_srli_epi64(m_typeIntHigh, bits));
1340  }
1341 
1342  D_INLINE dInt32 GetSignMask() const
1343  {
1344  return _mm_movemask_pd(m_typeLow) | (_mm_movemask_pd(m_typeHigh) << 2);
1345  }
1346 
1347  D_INLINE dBigVector Floor() const
1348  {
1349  return dBigVector(floor(m_x), floor(m_y), floor(m_z), floor(m_w));
1350  }
1351 
1352  D_INLINE dBigVector TestZero() const
1353  {
1354  return m_negOne & (*this == m_zero);
1355  }
1356 
1357  D_INLINE static void Transpose4x4(dBigVector& dst0, dBigVector& dst1, dBigVector& dst2, dBigVector& dst3,
1358  const dBigVector& src0, const dBigVector& src1, const dBigVector& src2, const dBigVector& src3)
1359  {
1360  dBigVector tmp0(src0);
1361  dBigVector tmp1(src1);
1362  dBigVector tmp2(src2);
1363  dBigVector tmp3(src3);
1364 
1365  dst0 = dBigVector(tmp0.m_x, tmp1.m_x, tmp2.m_x, tmp3.m_x);
1366  dst1 = dBigVector(tmp0.m_y, tmp1.m_y, tmp2.m_y, tmp3.m_y);
1367  dst2 = dBigVector(tmp0.m_z, tmp1.m_z, tmp2.m_z, tmp3.m_z);
1368  dst3 = dBigVector(tmp0.m_w, tmp1.m_w, tmp2.m_w, tmp3.m_w);
1369  }
1370 
1371  // return the 4d dot product
1372  D_INLINE dBigVector DotProduct(const dBigVector &A) const
1373  {
1374  dBigVector tmp(_mm_mul_pd(m_typeLow, A.m_typeLow), _mm_mul_pd(m_typeHigh, A.m_typeHigh));
1375  return tmp.AddHorizontal();
1376  }
1377 
1378  D_INLINE dBigVector CrossProduct(const dBigVector& A, const dBigVector& B) const
1379  {
1380  dFloat64 cofactor[3][3];
1381  dFloat64 array[4][4];
1382 
1383  const dBigVector& me = *this;
1384  for (dInt32 i = 0; i < 4; i++) {
1385  array[0][i] = me[i];
1386  array[1][i] = A[i];
1387  array[2][i] = B[i];
1388  array[3][i] = dFloat64(1.0f);
1389  }
1390 
1391  dBigVector normal;
1392  dFloat64 sign = dFloat64(-1.0f);
1393  for (dInt32 i = 0; i < 4; i++)
1394  {
1395  for (dInt32 j = 0; j < 3; j++)
1396  {
1397  dInt32 k0 = 0;
1398  for (dInt32 k = 0; k < 4; k++)
1399  {
1400  if (k != i)
1401  {
1402  cofactor[j][k0] = array[j][k];
1403  k0++;
1404  }
1405  }
1406  }
1407  dFloat64 x = cofactor[0][0] * (cofactor[1][1] * cofactor[2][2] - cofactor[1][2] * cofactor[2][1]);
1408  dFloat64 y = cofactor[0][1] * (cofactor[1][2] * cofactor[2][0] - cofactor[1][0] * cofactor[2][2]);
1409  dFloat64 z = cofactor[0][2] * (cofactor[1][0] * cofactor[2][1] - cofactor[1][1] * cofactor[2][0]);
1410  dFloat64 det = x + y + z;
1411 
1412  normal[i] = sign * det;
1413  sign *= dFloat64(-1.0f);
1414  }
1415 
1416  return normal;
1417  }
1418 
1419  union
1420  {
1421  dFloat64 m_f[4];
1422  dInt64 m_i[4];
1423  struct
1424  {
1425  __m128d m_typeLow;
1426  __m128d m_typeHigh;
1427  };
1428  struct
1429  {
1430  __m128i m_typeIntLow;
1431  __m128i m_typeIntHigh;
1432  };
1433  struct
1434  {
1435  dFloat64 m_x;
1436  dFloat64 m_y;
1437  dFloat64 m_z;
1438  dFloat64 m_w;
1439  };
1440  struct
1441  {
1442  dInt64 m_ix;
1443  dInt64 m_iy;
1444  dInt64 m_iz;
1445  dInt64 m_iw;
1446  };
1447  };
1448 
1449  D_CORE_API static dBigVector m_zero;
1450  D_CORE_API static dBigVector m_one;
1451  D_CORE_API static dBigVector m_wOne;
1452  D_CORE_API static dBigVector m_two;
1453  D_CORE_API static dBigVector m_half;
1454  D_CORE_API static dBigVector m_three;
1455  D_CORE_API static dBigVector m_negOne;
1456  D_CORE_API static dBigVector m_xMask;
1457  D_CORE_API static dBigVector m_yMask;
1458  D_CORE_API static dBigVector m_zMask;
1459  D_CORE_API static dBigVector m_wMask;
1460  D_CORE_API static dBigVector m_epsilon;
1461  D_CORE_API static dBigVector m_signMask;
1462  D_CORE_API static dBigVector m_triplexMask;
1463  } D_GCC_NEWTON_ALIGN_32 ;
1464 
1465  D_MSV_NEWTON_ALIGN_32
1466  class dSpatialVector
1467  {
1468  public:
1469  D_INLINE dSpatialVector()
1470  {
1471  }
1472 
1473  D_INLINE dSpatialVector(const dFloat32 a)
1474  :m_d0(_mm_set1_pd(a))
1475  ,m_d1(_mm_set1_pd(a))
1476  ,m_d2(_mm_set1_pd(a))
1477  {
1478  }
1479 
1480  #ifdef D_NEWTON_USE_DOUBLE
1481  #define PURMUT_MASK2(y, x) _MM_SHUFFLE2(x, y)
1482  D_INLINE dSpatialVector(const dVector& low, const dVector& high)
1483  :m_d0(low.m_typeLow)
1484  ,m_d1(_mm_shuffle_pd(low.m_typeHigh, high.m_typeLow, PURMUT_MASK2(0, 0)))
1485  ,m_d2(_mm_shuffle_pd(high.m_typeLow, high.m_typeHigh, PURMUT_MASK2(1, 0)))
1486  {
1487  }
1488  #else
1489  D_INLINE dSpatialVector(const dVector& low, const dVector& high)
1490  :m_d0(_mm_cvtps_pd(low.m_type))
1491  ,m_d1(_mm_cvtps_pd(_mm_unpackhi_ps(low.m_type, _mm_shuffle_ps(low.m_type, high.m_type, PERMUTE_MASK(0, 0, 0, 2)))))
1492  ,m_d2(_mm_cvtps_pd(_mm_shuffle_ps(high.m_type, high.m_type, PERMUTE_MASK(3, 3, 2, 1))))
1493  {
1494  }
1495  #endif
1496 
1497  D_INLINE dSpatialVector(const dSpatialVector& copy)
1498  :m_d0(copy.m_d0)
1499  ,m_d1(copy.m_d1)
1500  ,m_d2(copy.m_d2)
1501  {
1502  }
1503 
1504  D_INLINE dSpatialVector(const __m128d d0, const __m128d d1, const __m128d d2)
1505  :m_d0(d0)
1506  ,m_d1(d1)
1507  ,m_d2(d2)
1508  {
1509  }
1510 
1511  D_INLINE dFloat64& operator[] (dInt32 i)
1512  {
1513  dAssert(i < 6);
1514  dAssert(i >= 0);
1515  return ((dFloat64*)&m_d0)[i];
1516  }
1517 
1518  D_INLINE const dFloat64& operator[] (dInt32 i) const
1519  {
1520  dAssert(i < 6);
1521  dAssert(i >= 0);
1522  return ((dFloat64*)&m_d0)[i];
1523  }
1524 
1525  D_INLINE dSpatialVector operator+ (const dSpatialVector& A) const
1526  {
1527  return dSpatialVector(_mm_add_pd(m_d0, A.m_d0), _mm_add_pd(m_d1, A.m_d1), _mm_add_pd(m_d2, A.m_d2));
1528  }
1529 
1530  D_INLINE dSpatialVector operator*(const dSpatialVector& A) const
1531  {
1532  return dSpatialVector(_mm_mul_pd(m_d0, A.m_d0), _mm_mul_pd(m_d1, A.m_d1), _mm_mul_pd(m_d2, A.m_d2));
1533  }
1534 
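 // 6-component dot product: multiply lane-wise, add the three __m128d pairs, then a horizontal add yields the scalar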
1535  D_INLINE dFloat64 DotProduct(const dSpatialVector& v) const
1536  {
1537  dSpatialVector tmp(*this * v);
1538  __m128d tmp2(_mm_add_pd(tmp.m_d0, _mm_add_pd(tmp.m_d1, tmp.m_d2)));
1539  return _mm_cvtsd_f64(_mm_hadd_pd(tmp2, tmp2));
1540  }
1541 
1542  D_INLINE dSpatialVector Scale(dFloat64 s) const
1543  {
1544  __m128d tmp(_mm_set1_pd(s));
1545  return dSpatialVector(_mm_mul_pd(m_d0, tmp), _mm_mul_pd(m_d1, tmp), _mm_mul_pd(m_d2, tmp));
1546  }
1547 
1548  union
1549  {
1550  dFloat64 m_f[6];
1551  struct
1552  {
1553  __m128d m_d0;
1554  __m128d m_d1;
1555  __m128d m_d2;
1556  };
1557  };
1558  D_CORE_API static dSpatialVector m_zero;
1559  } D_GCC_NEWTON_ALIGN_32 ;
1560 #endif
1561 
1562 #endif
1563 #endif