Newton Dynamics  4.00
ndGeneralMatrix.h
1 
2 /* Copyright (c) <2003-2022> <Julio Jerez, Newton Game Dynamics>
3 *
4 * This software is provided 'as-is', without any express or implied
5 * warranty. In no event will the authors be held liable for any damages
6 * arising from the use of this software.
7 *
8 * Permission is granted to anyone to use this software for any purpose,
9 * including commercial applications, and to alter it and redistribute it
10 * freely, subject to the following restrictions:
11 *
12 * 1. The origin of this software must not be misrepresented; you must not
13 * claim that you wrote the original software. If you use this software
14 * in a product, an acknowledgment in the product documentation would be
15 * appreciated but is not required.
16 *
17 * 2. Altered source versions must be plainly marked as such, and must not be
18 * misrepresented as being the original software.
19 *
20 * 3. This notice may not be removed or altered from any source distribution.
21 */
22 
23 #ifndef __ND_GENERAL_MATRIX_H__
24 #define __ND_GENERAL_MATRIX_H__
25 
26 #include "ndCoreStdafx.h"
27 #include "ndTypes.h"
28 #include "ndUtils.h"
29 #include "ndGeneralVector.h"
30 
31 #define D_LCP_MAX_VALUE ndFloat32 (1.0e15f)
32 
33 //*************************************************************
34 //
35 // generic linear algebra functions
36 //
37 //*************************************************************
38 template<class T>
39 void ndMatrixTimeVector(ndInt32 size, const T* const matrix, const T* const v, T* const out)
40 {
41  ndInt32 stride = 0;
42  for (ndInt32 i = 0; i < size; ++i)
43  {
44  const T* const row = &matrix[stride];
45  out[i] = ndDotProduct(size, row, v);
46  stride += size;
47  }
48 }
49 
50 template<class T>
51 void ndMatrixTimeMatrix(ndInt32 size, const T* const matrixA, const T* const matrixB, T* const out)
52 {
53  for (ndInt32 i = 0; i < size; ++i)
54  {
55  const T* const rowA = &matrixA[i * size];
56  T* const rowOut = &out[i * size];
57  for (ndInt32 j = 0; j < size; ++j)
58  {
59  T acc = T(0.0f);
60  for (ndInt32 k = 0; k < size; ++k)
61  {
62  acc += rowA[k] * matrixB[k * size + j];
63  }
64  rowOut[j] = acc;
65  }
66  }
67 }
68 
69 template<class T>
70 void ndCovarianceMatrix(ndInt32 size, T* const matrix, const T* const vectorA, const T* const vectorB)
71 {
72  ndInt32 stride = 0;
73  for (ndInt32 i = 0; i < size; ++i)
74  {
75  T scale(vectorA[i]);
76  T* const row = &matrix[stride];
77  for (ndInt32 j = 0; j < size; ++j)
78  {
79  row[j] = scale * vectorB[j];
80  }
81  stride += size;
82  }
83 }
84 
85 template<class T>
86 bool ndCholeskyFactorizationAddRow(ndInt32, ndInt32 stride, ndInt32 n, T* const matrix, T* const invDiagonalOut)
87 {
88  T* const rowN = &matrix[stride * n];
89 
90  ndInt32 base = 0;
91  for (ndInt32 j = 0; j <= n; ++j)
92  {
93  T s(0.0f);
94  T* const rowJ = &matrix[base];
95  for (ndInt32 k = 0; k < j; ++k)
96  {
97  s += rowN[k] * rowJ[k];
98  }
99 
100  if (n == j)
101  {
102  T diag = rowN[n] - s;
103  if (diag < T(1.0e-6f))
104  {
105  return false;
106  }
107 
108  rowN[n] = T(sqrt(diag));
109  invDiagonalOut[n] = T(1.0f) / rowN[n];
110  }
111  else
112  {
113  rowJ[n] = T(0.0f);
114  //rowN[j] = (rowN[j] - s) / rowJ[j];
115  rowN[j] = invDiagonalOut[j] * (rowN[j] - s);
116  }
117 
118  base += stride;
119  }
120 
121  return true;
122 }
123 
124 template<class T>
125 bool ndCholeskyFactorization(ndInt32 size, ndInt32 stride, T* const psdMatrix)
126 {
127  bool state = true;
128  T* const invDiagonal = ndAlloca(T, size);
129  for (ndInt32 i = 0; (i < size) && state; ++i)
130  {
131  state = state && ndCholeskyFactorizationAddRow(size, stride, i, psdMatrix, invDiagonal);
132  }
133  return state;
134 }
135 
136 template<class T>
137 bool ndTestPSDmatrix(ndInt32 size, ndInt32 stride, T* const matrix)
138 {
139  T* const copy = ndAlloca(T, size * size);
140  ndInt32 row = 0;
141  for (ndInt32 i = 0; i < size; ++i)
142  {
143  memcpy(&copy[i * size], &matrix[row], size * sizeof (T));
144  row += stride;
145  }
146  return ndCholeskyFactorization(size, size, copy);
147 }
148 
149 template<class T>
150 void ndCholeskyApplyRegularizer (ndInt32 size, ndInt32 stride, T* const psdMatrix, T* const regularizer)
151 {
152  bool isPsdMatrix = false;
153  ndFloat32* const lowerTriangule = ndAlloca(ndFloat32, stride * stride);
154  do
155  {
156  memcpy(lowerTriangule, psdMatrix, sizeof(ndFloat32) * stride * stride);
157  isPsdMatrix = ndCholeskyFactorization(size, stride, lowerTriangule);
158  if (!isPsdMatrix)
159  {
160  for (ndInt32 i = 0; i < size; ++i)
161  {
162  regularizer[i] *= ndFloat32(4.0f);
163  psdMatrix[i * stride + i] += regularizer[i];
164  }
165  }
166  } while (!isPsdMatrix);
167 }
168 
169 template<class T>
170 void ndSolveCholesky(ndInt32 size, ndInt32 stride, const T* const choleskyMatrix, T* const x, const T* const b)
171 {
172  ndInt32 rowStart = 0;
173  for (ndInt32 i = 0; i < size; ++i)
174  {
175  T acc(0.0f);
176  const T* const row = &choleskyMatrix[rowStart];
177  for (ndInt32 j = 0; j < i; ++j)
178  {
179  acc = acc + row[j] * x[j];
180  }
181  x[i] = (b[i] - acc) / row[i];
182  rowStart += stride;
183  }
184 
185  for (ndInt32 i = size - 1; i >= 0; i--)
186  {
187  T acc = 0.0f;
188  for (ndInt32 j = i + 1; j < size; ++j)
189  {
190  acc = acc + choleskyMatrix[stride * j + i] * x[j];
191  }
192  x[i] = (x[i] - acc) / choleskyMatrix[stride * i + i];
193  }
194 }
195 
196 template<class T>
197 void ndSolveCholesky(ndInt32 size, T* const choleskyMatrix, T* const x)
198 {
199  ndSolveCholesky(size, size, choleskyMatrix, x);
200 }
201 
202 template<class T>
203 bool ndSolveGaussian(ndInt32 size, T* const matrix, T* const b)
204 {
205  for (ndInt32 i = 0; i < size - 1; ++i)
206  {
207  const T* const rowI = &matrix[i * size];
208  ndInt32 m = i;
209  T maxVal (ndAbs(rowI[i]));
210  for (ndInt32 j = i + 1; j < size - 1; ++j)
211  {
212  T val (ndAbs(matrix[size * j + i]));
213  if (val > maxVal)
214  {
215  m = j;
216  maxVal = val;
217  }
218  }
219 
220  if (maxVal < T(1.0e-12f))
221  {
222  return false;
223  }
224 
225  if (m != i)
226  {
227  T* const rowK = &matrix[m * size];
228  T* const rowJ = &matrix[i * size];
229  for (ndInt32 j = 0; j < size; ++j)
230  {
231  ndSwap(rowK[j], rowJ[j]);
232  }
233  ndSwap(b[i], b[m]);
234  }
235 
236  T den = T(1.0f) / rowI[i];
237  for (ndInt32 k = i + 1; k < size; ++k)
238  {
239  T* const rowK = &matrix[size * k];
240  T factor(-rowK[i] * den);
241  for (ndInt32 j = i + 1; j < size; ++j)
242  {
243  rowK[j] += rowI[j] * factor;
244  }
245  rowK[i] = T(0.0f);
246  b[k] += b[i] * factor;
247  }
248  }
249 
250  for (ndInt32 i = size - 1; i >= 0; i--)
251  {
252  T acc(0);
253  T* const rowI = &matrix[i * size];
254  for (ndInt32 j = i + 1; j < size; ++j)
255  {
256  acc = acc + rowI[j] * b[j];
257  }
258  b[i] = (b[i] - acc) / rowI[i];
259  }
260  return true;
261 }
262 
263 template <class T>
264 void ndEigenValues(const ndInt32 size, const ndInt32 stride, const T* const symmetricMatrix, T* const eigenValues)
265 {
266  T* const offDiag = ndAlloca(T, size);
267  T* const matrix = ndAlloca(T, size * stride);
268 
269  memcpy(matrix, symmetricMatrix, sizeof(T) * size * stride);
270  for (ndInt32 i = size - 1; i > 0; i--)
271  {
272  T h(0.0f);
273  T* const rowI = &matrix[i * stride];
274 
275  if (i > 1)
276  {
277  T scale(0.0f);
278  for (ndInt32 k = 0; k < i; ++k)
279  {
280  scale += ndAbs(rowI[k]);
281  }
282 
283  if (scale == T(0.0f))
284  {
285  offDiag[i] = rowI[i - 1];
286  }
287  else
288  {
289  for (ndInt32 k = 0; k < i; ++k)
290  {
291  rowI[k] /= scale;
292  h += rowI[k] * rowI[k];
293  }
294 
295  T f(rowI[i - 1]);
296  T g((f >= T(0.0f) ? -T(sqrt(h)) : T(sqrt(h))));
297  offDiag[i] = scale * g;
298  h -= f * g;
299  rowI[i - 1] = f - g;
300  f = T(0.0f);
301 
302  for (ndInt32 j = 0; j < i; ++j)
303  {
304  g = T(0.0f);
305  const T* const rowJ = &matrix[j * stride];
306  for (ndInt32 k = 0; k <= j; ++k)
307  {
308  g += rowJ[k] * rowI[k];
309  }
310  for (ndInt32 k = j + 1; k < i; ++k)
311  {
312  g += matrix[k * stride + j] * rowI[k];
313  }
314  offDiag[j] = g / h;
315  f += offDiag[j] * rowI[j];
316  }
317 
318  T hh(f / (h + h));
319  for (ndInt32 j = 0; j < i; ++j)
320  {
321  T f1 (rowI[j]);
322  T g1(offDiag[j] - hh * f1);
323  offDiag[j] = g1;
324  T* const rowJ = &matrix[j * stride];
325  for (ndInt32 k = 0; k <= j; ++k)
326  {
327  rowJ[k] -= (f1 * offDiag[k] + g1 * rowI[k]);
328  }
329  }
330  }
331  }
332  else
333  {
334  offDiag[i] = rowI[i - 1];
335  }
336  eigenValues[i] = h;
337  }
338 
339  ndInt32 index = stride;
340  eigenValues[0] = matrix[0];
341  for (ndInt32 i = 1; i < size; ++i)
342  {
343  eigenValues[i] = matrix[index + i];
344  offDiag[i - 1] = offDiag[i];
345  index += stride;
346  }
347 
348  for (ndInt32 i = 0; i < size; ++i)
349  {
350  ndInt32 j;
351  ndInt32 iter = 0;
352  do
353  {
354  for (j = i; j < size - 1; ++j)
355  {
356  T dd(ndAbs(eigenValues[j]) + ndAbs(eigenValues[j + 1]));
357  if (ndAbs(offDiag[j]) <= (T(1.e-6f) * dd))
358  {
359  break;
360  }
361  }
362 
363  if (j != i)
364  {
365  iter++;
366  if (iter == 10)
367  {
368  ndAssert(0);
369  return;
370  }
371 
372  T g((eigenValues[i + 1] - eigenValues[i]) / (T(2.0f) * offDiag[i]));
373  T r(ndPythag(g, T(1.0f)));
374  g = eigenValues[j] - eigenValues[i] + offDiag[i] / (g + ndSign(r, g));
375  T s(1.0f);
376  T c(1.0f);
377  T p(0.0f);
378 
379  ndInt32 k;
380  for (k = j - 1; k >= i; k--)
381  {
382  T f(s * offDiag[k]);
383  T b(c * offDiag[k]);
384  T d(ndPythag(f, g));
385  offDiag[k + 1] = d;
386  if (d == T(0.0f))
387  {
388  eigenValues[k + 1] -= p;
389  offDiag[j] = T(0.0f);
390  break;
391  }
392  s = f / d;
393  c = g / d;
394  g = eigenValues[k + 1] - p;
395  d = (eigenValues[k] - g) * s + T(2.0f) * c * b;
396  p = s * d;
397  eigenValues[k + 1] = g + p;
398  g = c * d - b;
399  }
400 
401  if (r == T(0.0f) && k >= i)
402  {
403  continue;
404  }
405  eigenValues[i] -= p;
406  offDiag[i] = g;
407  offDiag[j] = T(0.0f);
408  }
409  } while (j != i);
410  }
411 }
412 
413 template <class T>
414 T ndConditionNumber(const ndInt32 size, const ndInt32 stride, const T* const choleskyMatrix)
415 {
416  T* const eigenValues = ndAlloca(T, size);
417  ndEigenValues(size, stride, choleskyMatrix, eigenValues);
418 
419  T minVal = T(1.0e20f);
420  T maxVal = T(-1.0e20f);
421  for (ndInt32 i = 0; i < size; ++i)
422  {
423  minVal = ndMin(minVal, eigenValues[i]);
424  maxVal = ndMax(maxVal, eigenValues[i]);
425  }
426  T condition = T(ndAbs(maxVal) / ndAbs(minVal));
427  return condition;
428 }
429 
430 // solve a general Linear complementary program (LCP)
431 // A * x = b + r
432 // subjected to constraints
433 // x(i) = low(i), if r(i) >= 0
434 // x(i) = high(i), if r(i) <= 0
435 // low(i) <= x(i) <= high(i), if r(i) == 0
436 //
437 // return true is the system has a solution.
438 // in return
439 // x is the solution,
440 // r is return in vector b
441 // note: although the system is called LCP, the solver is far more general than a strict LCP
442 // to solve a strict LCP, set the following
443 // low(i) = 0
444 // high(i) = infinity.
445 // this the same as enforcing the constraint: x(i) * r(i) = 0
446 template <class T>
447 void ndGaussSeidelLcpSor(const ndInt32 size, const T* const matrix, T* const x, const T* const b, const T* const low, const T* const high, T tol2, ndInt32 maxIterCount, ndInt16* const clipped, T sor)
448 {
449  const T* const me = matrix;
450  T* const invDiag1 = ndAlloca(T, size);
451 
452  ndInt32 stride = 0;
453  for (ndInt32 i = 0; i < size; ++i)
454  {
455  x[i] = ndClamp(T(0.0f), low[i], high[i]);
456  invDiag1[i] = T(1.0f) / me[stride + i];
457  stride += size;
458  }
459 
460  T tolerance(tol2 * 2.0f);
461  const T* const invDiag = invDiag1;
462 #ifdef _DEBUG
463  ndInt32 passes = 0;
464 #endif
465  for (ndInt32 i = 0; (i < maxIterCount) && (tolerance > tol2); ++i)
466  {
467  ndInt32 base = 0;
468  tolerance = T(0.0f);
469 #ifdef _DEBUG
470  passes++;
471 #endif
472  for (ndInt32 j = 0; j < size; ++j)
473  {
474  const T* const row = &me[base];
475  T r(b[j] - ndDotProduct(size, row, x));
476  T f((r + row[j] * x[j]) * invDiag[j]);
477  if (f > high[j])
478  {
479  x[j] = high[j];
480  clipped[j] = 1;
481  }
482  else if (f < low[j])
483  {
484  x[j] = low[j];
485  clipped[j] = 1;
486  }
487  else
488  {
489  clipped[j] = 0;
490  tolerance += r * r;
491  x[j] = x[j] + (f - x[j]) * sor;
492  }
493  base += size;
494  }
495  }
496 }
497 
498 template <class T>
499 void ndGaussSeidelLcpSor(const ndInt32 size, const ndInt32 stride, const T* const matrix, T* const x, const T* const b, const ndInt32* const normalIndex, const T* const low, const T* const high, T tol2, ndInt32 maxIterCount, T sor)
500 {
501  const T* const me = matrix;
502  T* const invDiag1 = ndAlloca(T, size);
503  T* const u = ndAlloca(T, size + 1);
504  ndInt32* const index = ndAlloca(ndInt32, size);
505 
506  u[size] = T(1.0f);
507  ndInt32 rowStart = 0;
508  for (ndInt32 j = 0; j < size; ++j)
509  {
510  u[j] = x[j];
511  index[j] = normalIndex[j] ? j + normalIndex[j] : size;
512  }
513 
514  for (ndInt32 j = 0; j < size; ++j)
515  {
516  const T val = u[index[j]];
517  const T l = low[j] * val;
518  const T h = high[j] * val;
519  u[j] = ndClamp(u[j], l, h);
520  invDiag1[j] = T(1.0f) / me[rowStart + j];
521  rowStart += stride;
522  }
523 
524  T tolerance(tol2 * 2.0f);
525  const T* const invDiag = invDiag1;
526  const ndInt32 maxCount = ndMax(8, size);
527  for (ndInt32 i = 0; (i < maxCount) && (tolerance > tol2); ++i)
528  {
529  ndInt32 base = 0;
530  tolerance = T(0.0f);
531  for (ndInt32 j = 0; j < size; ++j)
532  {
533  const T* const row = &me[base];
534  T r(b[j] - ndDotProduct(size, row, u));
535  T f((r + row[j] * u[j]) * invDiag[j]);
536 
537  const T val = u[index[j]];
538  const T l = low[j] * val;
539  const T h = high[j] * val;
540  if (f > h)
541  {
542  u[j] = h;
543  }
544  else if (f < l)
545  {
546  u[j] = l;
547  }
548  else
549  {
550  tolerance += r * r;
551  u[j] = f;
552  }
553  base += stride;
554  }
555  }
556 
557 #ifdef _DEBUG
558  ndInt32 passes = 0;
559 #endif
560  for (ndInt32 i = 0; (i < maxIterCount) && (tolerance > tol2); ++i)
561  {
562  ndInt32 base = 0;
563  tolerance = T(0.0f);
564 #ifdef _DEBUG
565  passes++;
566 #endif
567  for (ndInt32 j = 0; j < size; ++j)
568  {
569  const T* const row = &me[base];
570  T r(b[j] - ndDotProduct(size, row, u));
571  T f((r + row[j] * u[j]) * invDiag[j]);
572  f = u[j] + (f - u[j]) * sor;
573 
574  const T val = u[index[j]];
575  const T l = low[j] * val;
576  const T h = high[j] * val;
577  if (f > h)
578  {
579  u[j] = h;
580  }
581  else if (f < l)
582  {
583  u[j] = l;
584  }
585  else
586  {
587  tolerance += r * r;
588  u[j] = f;
589  }
590  base += stride;
591  }
592  }
593 
594  for (ndInt32 j = 0; j < size; ++j)
595  {
596  x[j] = u[j];
597  }
598 }
599 
600 // solve a general Linear complementary program (LCP)
601 // A * x = b + r
602 // subjected to constraints
603 // x(i) = low(i), if r(i) >= 0
604 // x(i) = high(i), if r(i) <= 0
605 // low(i) <= x(i) <= high(i), if r(i) == 0
606 //
607 // return true is the system has a solution.
608 // in return
609 // x is the solution,
610 // r is return in vector b
611 // note: although the system is called LCP, the solver is far more general than a strict LCP
612 // to solve a strict LCP, set the following
613 // low(i) = 0
614 // high(i) = infinity.
615 // this the same as enforcing the constraint: x(i) * r(i) = 0
616 template <class T>
617 void ndGaussSeidelLCP(const ndInt32 size, const T* const matrix, T* const x, const T* const b, const T* const low, const T* const high, T sor = T(1.2f))
618 {
619  ndInt16* const clipped = ndAlloca(ndInt16, size);
620  ndGaussSeidelLcpSor(size, matrix, x, b, low, high, T(1.0e-3f), size * size, clipped, sor);
621 }
622 
623 template<class T>
624 void ndPermuteRows(ndInt32 size, ndInt32 i, ndInt32 j, T* const matrix, T* const choleskyMatrix, T* const x, T* const r, T* const low, T* const high, ndInt16* const permute)
625 {
626  if (i != j)
627  {
628  T* const A = &matrix[size * i];
629  T* const B = &matrix[size * j];
630  T* const invA = &choleskyMatrix[size * i];
631  T* const invB = &choleskyMatrix[size * j];
632  for (ndInt32 k = 0; k < size; ++k)
633  {
634  ndSwap(A[k], B[k]);
635  ndSwap(invA[k], invB[k]);
636  }
637 
638  ndInt32 stride = 0;
639  for (ndInt32 k = 0; k < size; ++k)
640  {
641  ndSwap(matrix[stride + i], matrix[stride + j]);
642  stride += size;
643  }
644 
645  ndSwap(x[i], x[j]);
646  ndSwap(r[i], r[j]);
647  ndSwap(low[i], low[j]);
648  ndSwap(high[i], high[j]);
649  ndSwap(permute[i], permute[j]);
650  }
651 }
652 
653 template<class T>
654 void ndCalculateDelta_x(ndInt32 size, ndInt32 n, const T* const matrix, const T* const choleskyMatrix, T* const delta_x)
655 {
656  const T* const row = &matrix[size * n];
657  for (ndInt32 i = 0; i < n; ++i)
658  {
659  delta_x[i] = -row[i];
660  }
661  ndSolveCholesky(size, n, choleskyMatrix, delta_x, delta_x);
662  delta_x[n] = T(1.0f);
663 }
664 
665 // calculate delta_r = A * delta_x
666 template<class T>
667 void ndCalculateDelta_r(ndInt32 size, ndInt32 n, const T* const matrix, const T* const delta_x, T* const delta_r)
668 {
669  ndInt32 stride = n * size;
670  const ndInt32 size1 = n + 1;
671  for (ndInt32 i = n; i < size; ++i)
672  {
673  delta_r[i] = ndDotProduct(size1, &matrix[stride], delta_x);
674  stride += size;
675  }
676 }
677 
678 template<class T>
679 void ndHouseholderReflection(ndInt32 size, ndInt32 row, ndInt32 colum, T* const choleskyMatrix, T* const tmp, T* const reflection)
680 {
681  ndAssert(row <= colum);
682  if (row < colum)
683  {
684  for (ndInt32 i = row; i <= colum; ++i)
685  {
686  T* const rowI = &choleskyMatrix[size * i];
687  T mag2(0.0f);
688  for (ndInt32 j = i + 1; j <= colum; ++j)
689  {
690  mag2 += rowI[j] * rowI[j];
691  reflection[j] = rowI[j];
692  }
693  if (mag2 > T(1.0e-14f))
694  {
695  reflection[i] = rowI[i] + ndSign(rowI[i]) * T(sqrt(mag2 + rowI[i] * rowI[i]));
696 
697  const T vMag2(mag2 + reflection[i] * reflection[i]);
698  const T den = T(2.0f) / vMag2;
699  for (ndInt32 j = i; j < size; ++j)
700  {
701  T acc(0.0f);
702  T* const rowJ = &choleskyMatrix[size * j];
703  for (ndInt32 k = i; k <= colum; ++k) {
704  acc += rowJ[k] * reflection[k];
705  }
706  tmp[j] = acc;
707  }
708 
709  for (ndInt32 j = i + 1; j < size; ++j)
710  {
711  rowI[j] = T(0.0f);
712  T* const rowJ = &choleskyMatrix[size * j];
713  const T a = tmp[j] * den;
714  for (ndInt32 k = i; k <= colum; ++k)
715  {
716  rowJ[k] -= a * reflection[k];
717  }
718  }
719  rowI[i] -= tmp[i] * reflection[i] * den;
720  }
721 
722  if (rowI[i] < T(0.0f))
723  {
724  for (ndInt32 k = i; k < size; ++k)
725  {
726  choleskyMatrix[size * k + i] = -choleskyMatrix[size * k + i];
727  }
728  }
729  }
730 
731  for (ndInt32 i = row; i < size; ++i)
732  {
733  choleskyMatrix[size * i + i] = ndMax(choleskyMatrix[size * i + i], T(1.0e-6f));
734  }
735  }
736 }
737 
738 template<class T>
739 void ndCholeskyUpdate(ndInt32 size, ndInt32 row, ndInt32 colum, T* const choleskyMatrix, T* const tmp, T* const reflexion, const T* const psdMatrix)
740 {
741  const ndInt32 n0 = colum - row;
742  const ndInt32 n1 = n0 + 1;
743  const ndInt32 choleskyCost = size * size * size / 3;
744  const ndInt32 householdCost = n0 * (n0 + 1) / 2 + n1 * (n1 + 1) * (2 * (2 * n1 + 1) - 3 + 3 * (size - colum - 1)) / 6 - 1;
745 
746  if (householdCost < choleskyCost)
747  {
748  ndHouseholderReflection(size, row, colum, choleskyMatrix, tmp, reflexion);
749  }
750  else
751  {
752  memcpy (choleskyMatrix, psdMatrix, sizeof (T) * size * size);
753  ndCholeskyFactorization(size, choleskyMatrix);
754  }
755 
756 //#if _DEBUG
757 #if 0
758  T* const psdMatrixCopy = dAlloca(T, size * size);
759  memcpy(psdMatrixCopy, psdMatrix, sizeof(T) * size * size);
760  dCholeskyFactorization(size, psdMatrixCopy);
761 
762  for (dInt32 i = 0; i < size; ++i)
763  {
764  for (dInt32 j = 0; j < size; ++j)
765  {
766  T err = psdMatrixCopy[i*size + j] - choleskyMatrix[i*size + j];
767  dAssert(dAbs(err) < T(1.0e-4f));
768  }
769  }
770 #endif
771 }
772 
773 // solve a general Linear complementary program (LCP)
774 // A * x = b + r
775 // subjected to constraints
776 // x(i) = low(i), if r(i) >= 0
777 // x(i) = high(i), if r(i) <= 0
778 // low(i) <= x(i) <= high(i), if r(i) == 0
779 //
780 // return true is the system has a solution.
781 // in return
782 // x is the solution,
783 // r is return in vector b
784 // note: although the system is called LCP, the solver is far more general than a strict LCP
785 // to solve a strict LCP, set the following
786 // low(i) = 0
787 // high(i) = infinity.
788 // this the same as enforcing the constraint: x(i) * r(i) = 0
789 template <class T>
790 void ndSolveDantzigLcpLow(ndInt32 size, T* const symmetricMatrixPSD, T* const x, T* const b, T* const low, T* const high)
791 {
792  T* const x0 = ndAlloca(T, size);
793  T* const r0 = ndAlloca(T, size);
794  T* const tmp0 = ndAlloca(T, size);
795  T* const tmp1 = ndAlloca(T, size);
796  T* const delta_r = ndAlloca(T, size);
797  T* const delta_x = ndAlloca(T, size);
798  T* const lowerTriangularMatrix = ndAlloca(T, size * size);
799  ndInt16* const permute = ndAlloca(ndInt16, size);
800 
801  for (ndInt32 i = 0; i < size; ++i)
802  {
803  permute[i] = ndInt16(i);
804  x0[i] = T(0.0f);
805  x[i] = ndMax (b[i] * b[i], T (1.0f));
806  }
807 
808  for (ndInt32 n = size - 1, i = size - 1; i >= 0; i--)
809  {
810  if (x[i] > T(1.0))
811  {
812  ndPermuteRows(size, n, i, symmetricMatrixPSD, lowerTriangularMatrix, x, b, low, high, permute);
813  n --;
814  }
815  }
816 
817  for (ndInt32 i = size - 1; (i >= 0) && (x[i] > T(1.0f)) ; i--)
818  {
819  ndInt32 min = i;
820  for (ndInt32 j = i - 1; (j >= 0) && (x[j] > T(1.0f)); j--)
821  {
822  if (x[j] > x[min])
823  {
824  min = j;
825  }
826  }
827  if (min != i)
828  {
829  ndPermuteRows(size, i, min, symmetricMatrixPSD, lowerTriangularMatrix, x, b, low, high, permute);
830  }
831  }
832 
833  ndInt32 initialGuessCount = size;
834  while (x[initialGuessCount - 1] >= T(16.0f))
835  {
836  initialGuessCount --;
837  }
838 
839  memcpy(lowerTriangularMatrix, symmetricMatrixPSD, sizeof(T) * size * size);
840 #ifdef _DEBUG
841  bool valid = ndCholeskyFactorization(size, lowerTriangularMatrix);
842  ndAssert(valid);
843 #else
844  ndCholeskyFactorization(size, lowerTriangularMatrix);
845 #endif
846  for (ndInt32 j = 0; (j != -1) && initialGuessCount;)
847  {
848  ndSolveCholesky(size, initialGuessCount, lowerTriangularMatrix, x0, b);
849 
850  j = -1;
851  T alpha(1.0f);
852  T value(0.0f);
853  for (ndInt32 i = initialGuessCount - 1; i >= 0; i--)
854  {
855  T x1 = alpha * x0[i];
856  if (x1 < low[i])
857  {
858  j = i;
859  value = low[i];
860  alpha = low[i] / x0[i];
861  }
862  else if (x1 > high[i])
863  {
864  j = i;
865  value = high[i];
866  alpha = high[i] / x0[i];
867  }
868  }
869 
870  if (j != -1)
871  {
872  x0[j] = value;
873  initialGuessCount--;
874  ndPermuteRows(size, j, initialGuessCount, symmetricMatrixPSD, lowerTriangularMatrix, x0, b, low, high, permute);
875  ndCholeskyUpdate(size, j, initialGuessCount, lowerTriangularMatrix, tmp0, tmp1, symmetricMatrixPSD);
876  }
877  }
878 
879  if (initialGuessCount == size)
880  {
881  for (ndInt32 i = 0; i < size; ++i)
882  {
883  ndInt32 j = permute[i];
884  x[j] = x0[i];
885  b[i] = T(0.0f);
886  }
887  return;
888  }
889 
890  ndInt32 clampedIndex = size;
891  ndInt32 index = initialGuessCount;
892  ndInt32 count = size - initialGuessCount;
893  ndInt32 stride = index * size;
894 
895  for (ndInt32 i = 0; i < size; ++i)
896  {
897  r0[i] = T(0.0f);
898  delta_x[i] = T(0.0f);
899  delta_r[i] = T(0.0f);
900  }
901 
902  for (ndInt32 i = index; i < size; ++i)
903  {
904  r0[i] = ndDotProduct(size, &symmetricMatrixPSD[stride], x0) - b[i];
905  stride += size;
906  }
907 
908 
909  while (count)
910  {
911  bool loop = true;
912 
913  while (loop)
914  {
915  loop = false;
916  T clamp_x(0.0f);
917  ndInt32 swapIndex = -1;
918 
919  if (ndAbs(r0[index]) > T(1.0e-12f))
920  {
921  ndCalculateDelta_x(size, index, symmetricMatrixPSD, lowerTriangularMatrix, delta_x);
922  ndCalculateDelta_r(size, index, symmetricMatrixPSD, delta_x, delta_r);
923 
924  ndAssert(delta_r[index] != T(0.0f));
925  ndAssert(ndAbs(delta_x[index]) == T(1.0f));
926  delta_r[index] = (delta_r[index] == T(0.0f)) ? T(1.0e-12f) : delta_r[index];
927 
928  T scale = -r0[index] / delta_r[index];
929  ndAssert(ndAbs(scale) >= T(0.0f));
930 
931  for (ndInt32 i = 0; i <= index; ++i)
932  {
933  T x1 = x0[i] + scale * delta_x[i];
934  if (x1 > high[i])
935  {
936  swapIndex = i;
937  clamp_x = high[i];
938  scale = (high[i] - x0[i]) / delta_x[i];
939  }
940  else if (x1 < low[i])
941  {
942  swapIndex = i;
943  clamp_x = low[i];
944  scale = (low[i] - x0[i]) / delta_x[i];
945  }
946  }
947  ndAssert(ndAbs(scale) >= T(0.0f));
948 
949  for (ndInt32 i = clampedIndex; (i < size) && (scale > T(1.0e-12f)); ++i)
950  {
951  T r1 = r0[i] + scale * delta_r[i];
952  if ((r1 * r0[i]) < T(0.0f))
953  {
954  ndAssert(ndAbs(delta_r[i]) > T(0.0f));
955  T s1 = -r0[i] / delta_r[i];
956  ndAssert(ndAbs(s1) >= T(0.0f));
957  ndAssert(ndAbs(s1) <= ndAbs(scale));
958  if (ndAbs(s1) < ndAbs(scale))
959  {
960  scale = s1;
961  swapIndex = i;
962  }
963  }
964  }
965 
966  if (ndAbs(scale) > T(1.0e-12f))
967  {
968  for (ndInt32 i = 0; i < size; ++i)
969  {
970  x0[i] += scale * delta_x[i];
971  r0[i] += scale * delta_r[i];
972  }
973  }
974  }
975 
976  if (swapIndex == -1)
977  {
978  r0[index] = T(0.0f);
979  delta_r[index] = T(0.0f);
980  index++;
981  count--;
982  loop = false;
983  }
984  else if (swapIndex == index)
985  {
986  count--;
987  clampedIndex--;
988  x0[index] = clamp_x;
989  ndPermuteRows(size, index, clampedIndex, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
990  ndCholeskyUpdate(size, index, clampedIndex, lowerTriangularMatrix, tmp0, tmp1, symmetricMatrixPSD);
991  loop = count ? true : false;
992  }
993  else if (swapIndex > index)
994  {
995  loop = true;
996  r0[swapIndex] = T(0.0f);
997  ndAssert(swapIndex < size);
998  ndAssert(clampedIndex <= size);
999  if (swapIndex < clampedIndex)
1000  {
1001  count--;
1002  clampedIndex--;
1003  ndPermuteRows(size, clampedIndex, swapIndex, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1004  ndCholeskyUpdate(size, swapIndex, clampedIndex, lowerTriangularMatrix, tmp0, tmp1, symmetricMatrixPSD);
1005  ndAssert(clampedIndex >= index);
1006  }
1007  else
1008  {
1009  count++;
1010  ndAssert(clampedIndex < size);
1011  ndPermuteRows(size, clampedIndex, swapIndex, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1012  ndCholeskyUpdate(size, clampedIndex, swapIndex, lowerTriangularMatrix, tmp0, tmp1, symmetricMatrixPSD);
1013  clampedIndex++;
1014  ndAssert(clampedIndex <= size);
1015  ndAssert(clampedIndex >= index);
1016  }
1017  }
1018  else
1019  {
1020  ndAssert(index > 0);
1021  x0[swapIndex] = clamp_x;
1022  delta_x[index] = T(0.0f);
1023 
1024  ndAssert(swapIndex < index);
1025  ndPermuteRows(size, swapIndex, index - 1, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1026  ndPermuteRows(size, index - 1, index, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1027  ndPermuteRows(size, clampedIndex - 1, index, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1028  ndCholeskyUpdate (size, swapIndex, clampedIndex - 1, lowerTriangularMatrix, tmp0, tmp1, symmetricMatrixPSD);
1029 
1030  clampedIndex--;
1031  index--;
1032  loop = true;
1033  }
1034  }
1035  }
1036 
1037  for (ndInt32 i = 0; i < size; ++i)
1038  {
1039  ndInt32 j = permute[i];
1040  x[j] = x0[i];
1041  b[j] = r0[i];
1042  }
1043 }
1044 
1045 /*
1046 // solve a general Linear complementary program (LCP)
1047 // A * x = b + r
1048 // subjected to constraints
1049 // x(i) = low(i), if r(i) >= 0
1050 // x(i) = high(i), if r(i) <= 0
1051 // low(i) <= x(i) <= high(i), if r(i) == 0
1052 //
1053 // return true is the system has a solution.
1054 // in return
1055 // x is the solution,
1056 // r is return in vector b
1057 // note: although the system is called LCP, the solver is far more general than a strict LCP
1058 // to solve a strict LCP, set the following
1059 // low(i) = 0
1060 // high(i) = infinity.
1061 // this the same as enforcing the constraint: x(i) * r(i) = 0
1062 template <class T>
1063 bool ndSolveDantzigLCP(ndInt32 size, T* const symetricMatrix, T* const x, T* const b, T* const low, T* const high)
1064 {
1065  T* const choleskyMatrix = ndAlloca(T, size * size);
1066  dCheckAligment(choleskyMatrix);
1067 
1068  memcpy (choleskyMatrix, symetricMatrix, sizeof (T) * size * size);
1069  ndCholeskyFactorization(size, choleskyMatrix);
1070  for (ndInt32 i = 0; i < size; ++i)
1071  {
1072  T* const row = &choleskyMatrix[i * size];
1073  for (ndInt32 j = i + 1; j < size; ++j)
1074  {
1075  row[j] = T(0.0f);
1076  }
1077  }
1078  return ndSolveDantzigLCP(size, symetricMatrix, choleskyMatrix, x, b, low, high);
1079 }
1080 */
1081 
1082 // solve a general Linear complementary program (LCP)
1083 // A * x = b + r
1084 // subjected to constraints
1085 // x(i) = low(i), if r(i) >= 0
1086 // x(i) = high(i), if r(i) <= 0
1087 // low(i) <= x(i) <= high(i), if r(i) == 0
1088 //
1089 // return true is the system has a solution.
1090 // in return
1091 // x is the solution,
1092 // b is zero
1093 // note: although the system is called LCP, the solver is far more general than a strict LCP
1094 // to solve a strict LCP, set the following
1095 // low(i) = 0
1096 // high(i) = infinity.
1097 // this is the same as enforcing the constraint: x(i) * r(i) = 0
1098 template <class T>
1099 bool ndSolvePartitionDantzigLCP(ndInt32 size, T* const symmetricMatrixPSD , T* const x, T* const b, T* const low, T* const high)
1100 {
1101  ndInt16* const permute = ndAlloca(ndInt16, size);
1102 
1103  for (ndInt32 i = 0; i < size; ++i)
1104  {
1105  x[i] = b[i];
1106  permute[i] = ndInt16(i);
1107  }
1108 
1109  ndInt32 unboundedSize = size;
1110  for (ndInt32 i = 0; i < unboundedSize; ++i)
1111  {
1112  if ((low[i] <= T(-D_LCP_MAX_VALUE)) && (high[i] >= T(D_LCP_MAX_VALUE)))
1113  {
1114  ndCholeskyFactorizationAddRow(size, i, symmetricMatrixPSD );
1115  }
1116  else
1117  {
1118  ndInt32 j = unboundedSize - 1;
1119  if (i != j)
1120  {
1121  T* const A = &symmetricMatrixPSD [size * i];
1122  T* const B = &symmetricMatrixPSD [size * j];
1123  for (ndInt32 k = 0; k < size; ++k)
1124  {
1125  ndSwap(A[k], B[k]);
1126  }
1127 
1128  ndInt32 stride = 0;
1129  for (ndInt32 k = 0; k < size; ++k)
1130  {
1131  ndSwap(symmetricMatrixPSD [stride + i], symmetricMatrixPSD [stride + j]);
1132  stride += size;
1133  }
1134  ndSwap(x[i], x[j]);
1135  ndSwap(b[i], b[j]);
1136  ndSwap(low[i], low[j]);
1137  ndSwap(high[i], high[j]);
1138  ndSwap(permute[i], permute[j]);
1139  }
1140 
1141  i--;
1142  unboundedSize--;
1143  }
1144  }
1145 
1146  bool ret = false;
1147  if (unboundedSize > 0)
1148  {
1149  ndSolveCholesky(size, unboundedSize, symmetricMatrixPSD , x);
1150  ndInt32 base = unboundedSize * size;
1151  for (ndInt32 i = unboundedSize; i < size; ++i)
1152  {
1153  b[i] -= ndDotProduct(unboundedSize, &symmetricMatrixPSD[base], x);
1154  base += size;
1155  }
1156 
1157  const ndInt32 boundedSize = size - unboundedSize;
1158  T* const l = ndAlloca(T, boundedSize);
1159  T* const h = ndAlloca(T, boundedSize);
1160  T* const c = ndAlloca(T, boundedSize);
1161  T* const u = ndAlloca(T, boundedSize);
1162  T* const a11 = ndAlloca(T, boundedSize * boundedSize);
1163  T* const a10 = ndAlloca(T, boundedSize * unboundedSize);
1164 
1165  for (ndInt32 i = 0; i < boundedSize; ++i)
1166  {
1167  T* const g = &a10[i * unboundedSize];
1168  const T* const row = &symmetricMatrixPSD [(unboundedSize + i) * size];
1169  for (ndInt32 j = 0; j < unboundedSize; ++j)
1170  {
1171  g[j] = -row[j];
1172  }
1173  ndSolveCholesky(size, unboundedSize, symmetricMatrixPSD, g);
1174 
1175  T* const arow = &a11[i * boundedSize];
1176  const T* const row2 = &symmetricMatrixPSD[(unboundedSize + i) * size];
1177  arow[i] = row2[unboundedSize + i] + ndDotProduct(unboundedSize, g, row2);
1178  for (ndInt32 j = i + 1; j < boundedSize; ++j)
1179  {
1180  const T* const row1 = &symmetricMatrixPSD [(unboundedSize + j) * size];
1181  T elem = row1[unboundedSize + i] + ndDotProduct(unboundedSize, g, row1);
1182  arow[j] = elem;
1183  a11[j * boundedSize + i] = elem;
1184  }
1185  u[i] = T(0.0f);
1186  c[i] = b[i + unboundedSize];
1187  l[i] = low[i + unboundedSize];
1188  h[i] = high[i + unboundedSize];
1189  }
1190 
1191  if (ndSolveDantzigLCP(boundedSize, a11, u, c, l, h))
1192  {
1193  for (ndInt32 i = 0; i < boundedSize; ++i)
1194  {
1195  const T s = u[i];
1196  x[unboundedSize + i] = s;
1197  const T* const g = &a10[i * unboundedSize];
1198  for (ndInt32 j = 0; j < unboundedSize; ++j)
1199  {
1200  x[j] += g[j] * s;
1201  }
1202  }
1203  ret = true;
1204  }
1205  }
1206  else
1207  {
1208  for (ndInt32 i = 0; i < size; ++i)
1209  {
1210  x[i] = T(0.0f);
1211  }
1212  ret = ndSolveDantzigLCP(size, symmetricMatrixPSD, x, b, low, high);
1213  }
1214 
1215  for (ndInt32 i = 0; i < size; ++i)
1216  {
1217  b[i] = x[i];
1218  }
1219  for (ndInt32 i = 0; i < size; ++i)
1220  {
1221  ndInt32 j = permute[i];
1222  x[j] = b[i];
1223  b[i] = T(0.0f);
1224  }
1225  return ret;
1226 }
1227 
1228 template <class T>
1229 void ndSolveDantzigLCP(ndInt32 size, T* const symmetricMatrixPSD, T* const x, T* const b, T* const low, T* const high)
1230 {
1231  T tol2 = T(0.25f * 0.25f);
1232  ndInt32 passes = ndClamp(size, 12, 20);
1233  T* const r = ndAlloca(T, size);
1234  ndInt16* const clipped = ndAlloca(ndInt16, size);
1235 
1236  // find an approximation to the solution
1237  ndGaussSeidelLcpSor(size, symmetricMatrixPSD, x, b, low, high, tol2, passes, clipped, T(1.3f));
1238 
1239  T err2(0.0f);
1240  ndInt32 stride = 0;
1241  ndInt32 clippeCount = 0;
1242  for (ndInt32 i = 0; i < size; ++i)
1243  {
1244  const T* const row = &symmetricMatrixPSD[stride];
1245  r[i] = b[i] - ndDotProduct(size, row, x);
1246  clippeCount += clipped[i];
1247  err2 += clipped[i] ? T(0.0f) : r[i] * r[i];
1248  stride += size;
1249  }
1250 
1251  if (err2 > tol2)
1252  {
1253  // check for small lcp
1254  if ((clippeCount < 16) && ((clippeCount < 32) && (err2 < T(16.0f))))
1255  {
1256  // small lcp can be solved with direct method
1257  T* const x0 = ndAlloca(T, size);
1258  for (ndInt32 i = 0; i < size; ++i)
1259  {
1260  low[i] -= x[i];
1261  high[i] -= x[i];
1262  }
1263  ndSolveDantzigLcpLow(size, symmetricMatrixPSD, x0, r, low, high);
1264  for (ndInt32 i = 0; i < size; ++i)
1265  {
1266  x[i] += x0[i];
1267  }
1268  }
1269  else
1270  {
1271  // larger lcp are too hard for direct method, see if we can get better approximation
1272  ndGaussSeidelLcpSor(size, symmetricMatrixPSD, x, b, low, high, tol2, 20, clipped, T(1.3f));
1273  }
1274  }
1275 }
1276 
1277 #endif