Newton Dynamics  4.00
dGeneralMatrix.h
1 
2 /* Copyright (c) <2003-2019> <Julio Jerez, Newton Game Dynamics>
3 *
4 * This software is provided 'as-is', without any express or implied
5 * warranty. In no event will the authors be held liable for any damages
6 * arising from the use of this software.
7 *
8 * Permission is granted to anyone to use this software for any purpose,
9 * including commercial applications, and to alter it and redistribute it
10 * freely, subject to the following restrictions:
11 *
12 * 1. The origin of this software must not be misrepresented; you must not
13 * claim that you wrote the original software. If you use this software
14 * in a product, an acknowledgment in the product documentation would be
15 * appreciated but is not required.
16 *
17 * 2. Altered source versions must be plainly marked as such, and must not be
18 * misrepresented as being the original software.
19 *
20 * 3. This notice may not be removed or altered from any source distribution.
21 */
22 
23 #ifndef __D_GENERAL_MATRIX_H__
24 #define __D_GENERAL_MATRIX_H__
25 
26 #include "dCoreStdafx.h"
27 #include "dTypes.h"
28 #include "dGeneralVector.h"
29 
30 #define D_LCP_MAX_VALUE dFloat32 (1.0e10f)
31 
32 
// Abstract conjugate-gradient style solver for a linear system.
// Derived classes provide the two pure virtual callbacks below: the product of the
// system matrix with a vector, and the product of the inverse preconditioner with a vector.
// The solver keeps four raw work-vector pointers (m_r0, m_z0, m_p0, m_q0); it does not free them.
// NOTE(review): the line naming the class (and two member declarations in the public
// section, presumably the default constructor and the destructor) appear to have been
// lost from this listing - confirm against the original header.
33 template<class T>
35 {
36  public:
// binds the four work vectors supplied by the caller
38  dSymmetricConjugateGradientSolver(T* const r0, T* const z0, T* const p0, T* const q0);
40 
// replaces the four work vectors
41  void SetBuffers(T* const r0, T* const z0, T* const p0, T* const q0);
// solves for x given b; x is read as the starting guess and overwritten in place
42  T Solve(dInt32 size, T tolerance, T* const x, const T* const b);
43 
44  protected:
// out = A * v, implemented by the derived class
45  virtual void MatrixTimeVector(T* const out, const T* const v) const = 0;
// out = (preconditioner inverse) * v, implemented by the derived class
46  virtual void InversePrecoditionerTimeVector(T* const out, const T* const v) const = 0;
47 
48  private:
// iteration loop; requires the four work vectors to be bound
49  T SolveInternal(dInt32 size, T tolerance, T* const x, const T* const b) const;
50  //T DotProduct(dInt32 size, const T* const b, const T* const c) const;
51  //void Sub(dInt32 size, T* const a, const T* const b, const T* const c) const;
52  //void ScaleAdd(dInt32 size, T* const a, const T* const b, T scale, const T* const c) const;
53 
// work vectors: residual, matrix-product scratch, search direction, preconditioned residual
54  T* m_r0;
55  T* m_z0;
56  T* m_p0;
57  T* m_q0;
58 };
59 
// Leaves all four work-vector pointers null.
// NOTE(review): the signature line of this definition is missing from the listing;
// presumably this is the default constructor of dSymmetricConjugateGradientSolver<T> - confirm.
60 template<class T>
62 {
63  SetBuffers(nullptr, nullptr, nullptr, nullptr);
64 }
65 
66 template<class T>
67 dSymmetricConjugateGradientSolver<T>::dSymmetricConjugateGradientSolver(T* const r0, T* const z0, T* const p0, T* const q0)
68 {
69  SetBuffers(r0, z0, p0, q0);
70 }
71 
// Empty body: nothing is released here.
// NOTE(review): the signature line of this definition is missing from the listing;
// presumably this is the destructor of dSymmetricConjugateGradientSolver<T> - confirm.
72 template<class T>
74 {
75 }
76 
77 template<class T>
78 void dSymmetricConjugateGradientSolver<T>::SetBuffers(T* const r0, T* const z0, T* const p0, T* const q0)
79 {
80  m_r0 = r0;
81  m_z0 = z0;
82  m_p0 = p0;
83  m_q0 = q0;
84 }
85 
86 template<class T>
87 T dSymmetricConjugateGradientSolver<T>::Solve(dInt32 size, T tolerance, T* const x, const T* const b)
88 {
89  if (m_r0)
90  {
91  return SolveInternal(size, tolerance, x, b);
92  }
93  else
94  {
95  T* const r0 = dAlloca(T, size);
96  T* const z0 = dAlloca(T, size);
97  T* const p0 = dAlloca(T, size);
98  T* const q0 = dAlloca(T, size);
99  SetBuffers(r0, z0, p0, q0);
100  T error = SolveInternal(size, tolerance, x, b);
101  SetBuffers(nullptr, nullptr, nullptr, nullptr);
102  return error;
103  }
104 }
105 
106 template<class T>
107 T dSymmetricConjugateGradientSolver<T>::SolveInternal(dInt32 size, T tolerance, T* const x, const T* const b) const
108 {
109  MatrixTimeVector(m_z0, x);
110  dSub(size, m_r0, b, m_z0);
111  InversePrecoditionerTimeVector(m_p0, m_r0);
112 
113  dInt32 iter = 0;
114  T num = dDotProduct(size, m_r0, m_p0);
115  T error2 = num;
116  for (dInt32 j = 0; (j < size) && (error2 > tolerance); j++)
117  {
118  MatrixTimeVector(m_z0, m_p0);
119  T den = dDotProduct(size, m_p0, m_z0);
120 
121  dAssert(fabs(den) > T(0.0f));
122  T alpha = num / den;
123 
124  dMulAdd(size, x, x, m_p0, alpha);
125  if ((j % 50) != 49)
126  {
127  dMulAdd(size, m_r0, m_r0, m_z0, -alpha);
128  } else
129  {
130  MatrixTimeVector(m_z0, x);
131  dSub(size, m_r0, b, m_z0);
132  }
133 
134  InversePrecoditionerTimeVector(m_q0, m_r0);
135 
136  T num1 = dDotProduct(size, m_r0, m_q0);
137  T beta = num1 / num;
138  dMulAdd(size, m_p0, m_q0, m_p0, beta);
139  num = dDotProduct(size, m_r0, m_q0);
140  iter++;
141  error2 = num;
142  if (j > 10)
143  {
144  error2 = T(0.0f);
145  for (dInt32 i = 0; i < size; i++)
146  {
147  error2 = dMax(error2, m_r0[i] * m_r0[i]);
148  }
149  }
150  }
151  dAssert(iter <= size);
152  return num;
153 }
154 
155 
156 //*************************************************************
157 //
158 // generic linear algebra functions
159 //
160 //*************************************************************
161 template<class T>
162 void dMatrixTimeVector(dInt32 size, const T* const matrix, const T* const v, T* const out)
163 {
164  dInt32 stride = 0;
165  for (dInt32 i = 0; i < size; i++)
166  {
167  const T* const row = &matrix[stride];
168  out[i] = dDotProduct(size, row, v);
169  stride += size;
170  }
171 }
172 
173 template<class T>
174 void dMatrixTimeMatrix(dInt32 size, const T* const matrixA, const T* const matrixB, T* const out)
175 {
176  for (dInt32 i = 0; i < size; i++)
177  {
178  const T* const rowA = &matrixA[i * size];
179  T* const rowOut = &out[i * size];
180  for (dInt32 j = 0; j < size; j++)
181  {
182  T acc = T(0.0f);
183  for (dInt32 k = 0; k < size; k++)
184  {
185  acc += rowA[k] * matrixB[k * size + j];
186  }
187  rowOut[j] = acc;
188  }
189  }
190 }
191 
192 template<class T>
193 void dCovarianceMatrix(dInt32 size, T* const matrix, const T* const vectorA, const T* const vectorB)
194 {
195  dInt32 stride = 0;
196  for (dInt32 i = 0; i < size; i++)
197  {
198  T* const row = &matrix[stride];
199  T scale (vectorA[i]);
200  for (dInt32 j = 0; j < size; j++)
201  {
202  row[j] = scale * vectorA[j];
203  }
204  stride += size;
205  }
206 }
207 
208 template<class T>
209 bool dCholeskyFactorizationAddRow(dInt32 size, dInt32 stride, dInt32 n, T* const matrix, T* const invDiagonalOut)
210 {
211  T* const rowN = &matrix[stride * n];
212 
213  dInt32 base = 0;
214  for (dInt32 j = 0; j <= n; j++)
215  {
216  T s(0.0f);
217  T* const rowJ = &matrix[base];
218  for (dInt32 k = 0; k < j; k++)
219  {
220  s += rowN[k] * rowJ[k];
221  }
222 
223  if (n == j)
224  {
225  T diag = rowN[n] - s;
226  if (diag < T(1.0e-6f))
227  {
228  return false;
229  }
230 
231  rowN[n] = T(sqrt(diag));
232  invDiagonalOut[n] = T(1.0f) / rowN[n];
233  }
234  else
235  {
236  rowJ[n] = T(0.0f);
237  //rowN[j] = (rowN[j] - s) / rowJ[j];
238  rowN[j] = invDiagonalOut[j] * (rowN[j] - s);
239  }
240 
241  base += stride;
242  }
243 
244  return true;
245 }
246 
247 template<class T>
248 bool dCholeskyFactorization(dInt32 size, dInt32 stride, T* const psdMatrix)
249 {
250  bool state = true;
251  T* const invDiagonal = dAlloca(T, size);
252  for (dInt32 i = 0; (i < size) && state; i++)
253  {
254  state = state && dCholeskyFactorizationAddRow(size, stride, i, psdMatrix, invDiagonal);
255  }
256  return state;
257 }
258 
259 template<class T>
260 bool dTestPSDmatrix(dInt32 size, dInt32 stride, T* const matrix)
261 {
262  T* const copy = dAlloca(T, size * size);
263  dInt32 row = 0;
264  for (dInt32 i = 0; i < size; i++)
265  {
266  memcpy(&copy[i * size], &matrix[row], size * sizeof (T));
267  row += stride;
268  }
269  return dCholeskyFactorization(size, size, copy);
270 }
271 
272 template<class T>
273 void dCholeskyApplyRegularizer (dInt32 size, dInt32 stride, T* const psdMatrix, T* const regularizer)
274 {
275  bool isPsdMatrix = false;
276  dFloat32* const lowerTriangule = dAlloca(dFloat32, stride * stride);
277  do
278  {
279  memcpy(lowerTriangule, psdMatrix, sizeof(dFloat32) * stride * stride);
280  isPsdMatrix = dCholeskyFactorization(size, stride, lowerTriangule);
281  if (!isPsdMatrix)
282  {
283  for (dInt32 i = 0; i < size; i++)
284  {
285  regularizer[i] *= dFloat32(4.0f);
286  psdMatrix[i * stride + i] += regularizer[i];
287  }
288  }
289  } while (!isPsdMatrix);
290 }
291 
292 template<class T>
293 void dSolveCholesky(dInt32 size, dInt32 stride, const T* const choleskyMatrix, T* const x, const T* const b)
294 {
295  dInt32 rowStart = 0;
296  for (dInt32 i = 0; i < size; i++)
297  {
298  T acc(0.0f);
299  const T* const row = &choleskyMatrix[rowStart];
300  for (dInt32 j = 0; j < i; j++)
301  {
302  acc = acc + row[j] * x[j];
303  }
304  x[i] = (b[i] - acc) / row[i];
305  rowStart += stride;
306  }
307 
308  for (dInt32 i = size - 1; i >= 0; i--)
309  {
310  T acc = 0.0f;
311  for (dInt32 j = i + 1; j < size; j++)
312  {
313  acc = acc + choleskyMatrix[stride * j + i] * x[j];
314  }
315  x[i] = (x[i] - acc) / choleskyMatrix[stride * i + i];
316  }
317 }
318 
319 template<class T>
320 void dSolveCholesky(dInt32 size, T* const choleskyMatrix, T* const x)
321 {
322  dSolveCholesky(size, size, choleskyMatrix, x);
323 }
324 
325 template<class T>
326 bool dSolveGaussian(dInt32 size, T* const matrix, T* const b)
327 {
328  for (dInt32 i = 0; i < size - 1; i++)
329  {
330  const T* const rowI = &matrix[i * size];
331  dInt32 m = i;
332  T maxVal (dAbs(rowI[i]));
333  for (dInt32 j = i + 1; j < size - 1; j++)
334  {
335  T val (dAbs(matrix[size * j + i]));
336  if (val > maxVal)
337  {
338  m = j;
339  maxVal = val;
340  }
341  }
342 
343  if (maxVal < T(1.0e-12f))
344  {
345  return false;
346  }
347 
348  if (m != i)
349  {
350  T* const rowK = &matrix[m * size];
351  T* const rowJ = &matrix[i * size];
352  for (dInt32 j = 0; j < size; j++)
353  {
354  dSwap(rowK[j], rowJ[j]);
355  }
356  dSwap(b[i], b[m]);
357  }
358 
359  T den = T(1.0f) / rowI[i];
360  for (dInt32 k = i + 1; k < size; k++)
361  {
362  T* const rowK = &matrix[size * k];
363  T factor(-rowK[i] * den);
364  for (dInt32 j = i + 1; j < size; j++)
365  {
366  rowK[j] += rowI[j] * factor;
367  }
368  rowK[i] = T(0.0f);
369  b[k] += b[i] * factor;
370  }
371  }
372 
373  for (dInt32 i = size - 1; i >= 0; i--)
374  {
375  T acc(0);
376  T* const rowI = &matrix[i * size];
377  for (dInt32 j = i + 1; j < size; j++)
378  {
379  acc = acc + rowI[j] * b[j];
380  }
381  b[i] = (b[i] - acc) / rowI[i];
382  }
383  return true;
384 }
385 
// Computes the eigenvalues of a symmetric matrix.
// size:            dimension of the matrix.
// stride:          distance in elements between consecutive rows.
// symmetricMatrix: input; not modified, the work is done on a dAlloca copy of size * stride elements.
// eigenValues:     receives 'size' values; no ordering is applied by this function.
// The first loop reduces the copy so that only a diagonal and one off-diagonal remain
// (this appears to follow the classic Householder tridiagonalization), the last loop
// iterates on that pair of arrays until each off-diagonal entry is negligible
// (apparently the QL iteration with implicit shifts - TODO confirm).
// If an eigenvalue needs a 10th iteration the function asserts and returns early,
// leaving eigenValues only partially converged.
386 template <class T>
387 void dEigenValues(const dInt32 size, const dInt32 stride, const T* const symmetricMatrix, T* const eigenValues)
388 {
389  T* const offDiag = dAlloca(T, size);
390  T* const matrix = dAlloca(T, size * stride);
391 
392  memcpy(matrix, symmetricMatrix, sizeof(T) * size * stride);
// reduction phase: rows are processed from the last one up to row 1
393  for (dInt32 i = size - 1; i > 0; i--)
394  {
395  T h(0.0f);
396  T* const rowI = &matrix[i * stride];
397 
398  if (i > 1)
399  {
400  T scale(0.0f);
401  for (dInt32 k = 0; k < i; k++)
402  {
403  scale += dAbs(rowI[k]);
404  }
405 
// a row that is already zero left of the diagonal needs no transformation
406  if (scale == T(0.0f))
407  {
408  offDiag[i] = rowI[i - 1];
409  }
410  else
411  {
// normalize the row by 'scale' before squaring its entries
412  for (dInt32 k = 0; k < i; k++)
413  {
414  rowI[k] /= scale;
415  h += rowI[k] * rowI[k];
416  }
417 
418  T f(rowI[i - 1]);
// g takes the sign opposite to f
419  T g((f >= T(0.0f) ? -T(sqrt(h)) : T(sqrt(h))));
420  offDiag[i] = scale * g;
421  h -= f * g;
422  rowI[i - 1] = f - g;
423  f = T(0.0f);
424 
425  for (dInt32 j = 0; j < i; j++)
426  {
427  g = T(0.0f);
428  const T* const rowJ = &matrix[j * stride];
// only the lower triangle is read: row j up to the diagonal, then column j below it
429  for (dInt32 k = 0; k <= j; k++)
430  {
431  g += rowJ[k] * rowI[k];
432  }
433  for (dInt32 k = j + 1; k < i; k++)
434  {
435  g += matrix[k * stride + j] * rowI[k];
436  }
437  offDiag[j] = g / h;
438  f += offDiag[j] * rowI[j];
439  }
440 
441  T hh(f / (h + h));
// update the lower triangle of the leading i x i block
442  for (dInt32 j = 0; j < i; j++)
443  {
444  T f1 (rowI[j]);
445  T g1(offDiag[j] - hh * f1);
446  offDiag[j] = g1;
447  T* const rowJ = &matrix[j * stride];
448  for (dInt32 k = 0; k <= j; k++)
449  {
450  rowJ[k] -= (f1 * offDiag[k] + g1 * rowI[k]);
451  }
452  }
453  }
454  }
455  else
456  {
457  offDiag[i] = rowI[i - 1];
458  }
// overwritten below, when the diagonal is copied out of the reduced matrix
459  eigenValues[i] = h;
460  }
461 
// copy the diagonal out and shift the off-diagonal down by one slot
462  dInt32 index = stride;
463  eigenValues[0] = matrix[0];
464  for (dInt32 i = 1; i < size; i++)
465  {
466  eigenValues[i] = matrix[index + i];
467  offDiag[i - 1] = offDiag[i];
468  index += stride;
469  }
470 
// iteration phase: one eigenvalue at a time, starting from index 0
471  for (dInt32 i = 0; i < size; i++)
472  {
473  dInt32 j;
474  dInt32 iter = 0;
475  do
476  {
// find the first off-diagonal entry, from i on, that is negligible relative to its neighbours
477  for (j = i; j < size - 1; j++)
478  {
479  T dd(dAbs(eigenValues[j]) + dAbs(eigenValues[j + 1]));
480  if (dAbs(offDiag[j]) <= (T(1.e-6f) * dd))
481  {
482  break;
483  }
484  }
485 
486  if (j != i)
487  {
488  iter++;
// give up on the 10th pass for the same eigenvalue
489  if (iter == 10)
490  {
491  dAssert(0);
492  return;
493  }
494 
495  T g((eigenValues[i + 1] - eigenValues[i]) / (T(2.0f) * offDiag[i]));
496  T r(dPythag(g, T(1.0f)));
497  g = eigenValues[j] - eigenValues[i] + offDiag[i] / (g + dSign(r, g));
498  T s(1.0f);
499  T c(1.0f);
500  T p(0.0f);
501 
502  dInt32 k;
503  for (k = j - 1; k >= i; k--)
504  {
505  T f(s * offDiag[k]);
506  T b(c * offDiag[k]);
507  T d(dPythag(f, g));
508  offDiag[k + 1] = d;
509  if (d == T(0.0f))
510  {
511  eigenValues[k + 1] -= p;
512  offDiag[j] = T(0.0f);
513  break;
514  }
515  s = f / d;
516  c = g / d;
517  g = eigenValues[k + 1] - p;
518  d = (eigenValues[k] - g) * s + T(2.0f) * c * b;
519  p = s * d;
520  eigenValues[k + 1] = g + p;
521  g = c * d - b;
522  }
523 
524  if (r == T(0.0f) && k >= i)
525  {
526  continue;
527  }
528  eigenValues[i] -= p;
529  offDiag[i] = g;
530  offDiag[j] = T(0.0f);
531  }
532  } while (j != i);
533  }
534 }
535 
536 template <class T>
537 T dConditionNumber(const dInt32 size, const dInt32 stride, const T* const choleskyMatrix)
538 {
539  T* const eigenValues = dAlloca(T, size);
540  dEigenValues(size, stride, choleskyMatrix, eigenValues);
541 
542  T minVal = T(1.0e20f);
543  T maxVal = T(-1.0e20f);
544  for (dInt32 i = 0; i < size; i++)
545  {
546  minVal = dMin(minVal, eigenValues[i]);
547  maxVal = dMax(maxVal, eigenValues[i]);
548  }
549  T condition = T(dAbs(maxVal) / dAbs(minVal));
550  return condition;
551 }
552 
553 
554 // solve a general Linear complementary program (LCP)
555 // A * x = b + r
556 // subjected to constraints
557 // x(i) = low(i), if r(i) >= 0
558 // x(i) = high(i), if r(i) <= 0
559 // low(i) <= x(i) <= high(i), if r(i) == 0
560 //
561 // return true is the system has a solution.
562 // in return
563 // x is the solution,
564 // r is return in vector b
565 // note: although the system is called LCP, the solver is far more general than a strict LCP
566 // to solve a strict LCP, set the following
567 // low(i) = 0
568 // high(i) = infinity.
569 // this the same as enforcing the constraint: x(i) * r(i) = 0
570 template <class T>
571 void dGaussSeidelLcpSor(const dInt32 size, const T* const matrix, T* const x, const T* const b, const T* const low, const T* const high, T tol2, dInt32 maxIterCount, dInt16* const clipped, T sor)
572 {
573  const T* const me = matrix;
574  T* const invDiag1 = dAlloca(T, size);
575 
576  dInt32 stride = 0;
577  for (dInt32 i = 0; i < size; i++)
578  {
579  x[i] = dClamp(T(0.0f), low[i], high[i]);
580  invDiag1[i] = T(1.0f) / me[stride + i];
581  stride += size;
582  }
583 
584  T tolerance(tol2 * 2.0f);
585  const T* const invDiag = invDiag1;
586 #ifdef _DEBUG
587  dInt32 passes = 0;
588 #endif
589  for (dInt32 i = 0; (i < maxIterCount) && (tolerance > tol2); i++)
590  {
591  dInt32 base = 0;
592  tolerance = T(0.0f);
593 #ifdef _DEBUG
594  passes++;
595 #endif
596  for (dInt32 j = 0; j < size; j++)
597  {
598  const T* const row = &me[base];
599  T r(b[j] - dDotProduct(size, row, x));
600  T f((r + row[j] * x[j]) * invDiag[j]);
601  if (f > high[j])
602  {
603  x[j] = high[j];
604  clipped[j] = 1;
605  }
606  else if (f < low[j])
607  {
608  x[j] = low[j];
609  clipped[j] = 1;
610  }
611  else
612  {
613  clipped[j] = 0;
614  tolerance += r * r;
615  x[j] = x[j] + (f - x[j]) * sor;
616  }
617  base += size;
618  }
619  }
620 }
621 
// Gauss-Seidel / SOR solver for a box-constrained system whose bounds can scale with
// another unknown.
// size, stride: dimension of the system and distance in elements between matrix rows.
// x:            starting guess on entry, solution on return.
// b:            right-hand side; not modified.
// normalIndex:  per-row relative offset; when non-zero, low(j) and high(j) are multiplied by
//               the current value of unknown j + normalIndex[j], otherwise by 1.
//               (presumably used for friction rows bounded by a normal force - TODO confirm)
// tol2:         stop threshold on the sum of squared residuals of the unclamped rows.
// maxIterCount: upper bound on the number of relaxed sweeps.
// sor:          relaxation factor used by the second group of sweeps.
// The work is done on a dAlloca copy 'u' with one extra slot fixed at 1.
622 template <class T>
623 void dGaussSeidelLcpSor(const dInt32 size, const dInt32 stride, const T* const matrix, T* const x, const T* const b, const dInt32* const normalIndex, const T* const low, const T* const high, T tol2, dInt32 maxIterCount, T sor)
624 {
625  const T* const me = matrix;
626  T* const invDiag1 = dAlloca(T, size);
627  T* const u = dAlloca(T, size + 1);
628  dInt32* const index = dAlloca(dInt32, size);
629 
// slot 'size' is the neutral multiplier for rows without a normalIndex
630  u[size] = T(1.0f);
631  dInt32 rowStart = 0;
632  for (dInt32 j = 0; j < size; j++)
633  {
634  u[j] = x[j];
635  index[j] = normalIndex[j] ? j + normalIndex[j] : size;
636  }
637 
// clamp the starting guess to the scaled bounds and cache the inverse diagonal
638  for (dInt32 j = 0; j < size; j++)
639  {
640  const T val = u[index[j]];
641  const T l = low[j] * val;
642  const T h = high[j] * val;
643  u[j] = dClamp(u[j], l, h);
644  invDiag1[j] = T(1.0f) / me[rowStart + j];
645  rowStart += stride;
646  }
647 
// start above the threshold so that the first sweep always runs
648  T tolerance(tol2 * 2.0f);
649  const T* const invDiag = invDiag1;
// first group of sweeps: plain Gauss-Seidel, at most max(8, size) passes
650  const dInt32 maxCount = dMax(8, size);
651  for (dInt32 i = 0; (i < maxCount) && (tolerance > tol2); i++)
652  {
653  dInt32 base = 0;
654  tolerance = T(0.0f);
655  for (dInt32 j = 0; j < size; j++)
656  {
657  const T* const row = &me[base];
658  T r(b[j] - dDotProduct(size, row, u));
659  T f((r + row[j] * u[j]) * invDiag[j]);
660 
661  const T val = u[index[j]];
662  const T l = low[j] * val;
663  const T h = high[j] * val;
664  if (f > h)
665  {
666  u[j] = h;
667  }
668  else if (f < l)
669  {
670  u[j] = l;
671  }
672  else
673  {
// only rows that are not clamped contribute to the error
674  tolerance += r * r;
675  u[j] = f;
676  }
677  base += stride;
678  }
679  }
680 
681 #ifdef _DEBUG
682  dInt32 passes = 0;
683 #endif
// second group of sweeps: same update with over-relaxation applied before clamping;
// it only runs if the first group ended with the error still above tol2
684  for (dInt32 i = 0; (i < maxIterCount) && (tolerance > tol2); i++)
685  {
686  dInt32 base = 0;
687  tolerance = T(0.0f);
688 #ifdef _DEBUG
689  passes++;
690 #endif
691  for (dInt32 j = 0; j < size; j++)
692  {
693  const T* const row = &me[base];
694  T r(b[j] - dDotProduct(size, row, u));
695  T f((r + row[j] * u[j]) * invDiag[j]);
696  f = u[j] + (f - u[j]) * sor;
697 
698  const T val = u[index[j]];
699  const T l = low[j] * val;
700  const T h = high[j] * val;
701  if (f > h)
702  {
703  u[j] = h;
704  }
705  else if (f < l)
706  {
707  u[j] = l;
708  }
709  else
710  {
711  tolerance += r * r;
712  u[j] = f;
713  }
714  base += stride;
715  }
716  }
717 
// publish the result
718  for (dInt32 j = 0; j < size; j++)
719  {
720  x[j] = u[j];
721  }
722 }
723 
724 // solve a general Linear complementary program (LCP)
725 // A * x = b + r
726 // subjected to constraints
727 // x(i) = low(i), if r(i) >= 0
728 // x(i) = high(i), if r(i) <= 0
729 // low(i) <= x(i) <= high(i), if r(i) == 0
730 //
731 // return true is the system has a solution.
732 // in return
733 // x is the solution,
734 // r is return in vector b
735 // note: although the system is called LCP, the solver is far more general than a strict LCP
736 // to solve a strict LCP, set the following
737 // low(i) = 0
738 // high(i) = infinity.
739 // this the same as enforcing the constraint: x(i) * r(i) = 0
740 template <class T>
741 void dGaussSeidelLCP(const dInt32 size, const T* const matrix, T* const x, const T* const b, const T* const low, const T* const high, T sor = T(1.2f))
742 {
743  dInt16* const clipped = dAlloca(dInt16, size);
744  dGaussSeidelLcpSor(size, matrix, x, b, low, high, T(1.0e-3f), size * size, clipped, sor);
745 }
746 
747 template<class T>
748 void dPermuteRows(dInt32 size, dInt32 i, dInt32 j, T* const matrix, T* const choleskyMatrix, T* const x, T* const r, T* const low, T* const high, dInt16* const permute)
749 {
750  if (i != j)
751  {
752  T* const A = &matrix[size * i];
753  T* const B = &matrix[size * j];
754  T* const invA = &choleskyMatrix[size * i];
755  T* const invB = &choleskyMatrix[size * j];
756  for (dInt32 k = 0; k < size; k++)
757  {
758  dSwap(A[k], B[k]);
759  dSwap(invA[k], invB[k]);
760  }
761 
762  dInt32 stride = 0;
763  for (dInt32 k = 0; k < size; k++)
764  {
765  dSwap(matrix[stride + i], matrix[stride + j]);
766  stride += size;
767  }
768 
769  dSwap(x[i], x[j]);
770  dSwap(r[i], r[j]);
771  dSwap(low[i], low[j]);
772  dSwap(high[i], high[j]);
773  dSwap(permute[i], permute[j]);
774  }
775 }
776 
777 template<class T>
778 void dCalculateDelta_x(dInt32 size, dInt32 n, const T* const matrix, const T* const choleskyMatrix, T* const delta_x)
779 {
780  const T* const row = &matrix[size * n];
781  for (dInt32 i = 0; i < n; i++)
782  {
783  delta_x[i] = -row[i];
784  }
785  dSolveCholesky(size, n, choleskyMatrix, delta_x, delta_x);
786  delta_x[n] = T(1.0f);
787 }
788 
789 // calculate delta_r = A * delta_x
790 template<class T>
791 void dCalculateDelta_r(dInt32 size, dInt32 n, const T* const matrix, const T* const delta_x, T* const delta_r)
792 {
793  dInt32 stride = n * size;
794  const dInt32 size1 = n + 1;
795  for (dInt32 i = n; i < size; i++)
796  {
797  delta_r[i] = dDotProduct(size1, &matrix[stride], delta_x);
798  stride += size;
799  }
800 }
801 
// Restores the lower-triangular shape of a packed size x size factor after rows/columns
// 'row' and 'colum' were exchanged, by applying a reflection per row in [row, colum].
// choleskyMatrix: modified in place.
// tmp:            scratch, at least 'size' elements.
// reflection:     scratch, indexed up to 'colum'.
// Nothing is done when row == colum. On exit every diagonal entry from 'row' to the
// last one is at least 1.0e-6.
802 template<class T>
803 void dHouseholderReflection(dInt32 size, dInt32 row, dInt32 colum, T* const choleskyMatrix, T* const tmp, T* const reflection)
804 {
805  dAssert(row <= colum);
806  if (row < colum)
807  {
808  for (dInt32 i = row; i <= colum; i++)
809  {
810  T* const rowI = &choleskyMatrix[size * i];
// squared length of the part of row i that lies right of the diagonal, up to 'colum'
811  T mag2(0.0f);
812  for (dInt32 j = i + 1; j <= colum; j++)
813  {
814  mag2 += rowI[j] * rowI[j];
815  reflection[j] = rowI[j];
816  }
// rows that are already (numerically) clean are skipped
817  if (mag2 > T(1.0e-14f))
818  {
819  reflection[i] = rowI[i] + dSign(rowI[i]) * T(sqrt(mag2 + rowI[i] * rowI[i]));
820 
821  const T vMag2(mag2 + reflection[i] * reflection[i]);
822  const T den = T(2.0f) / vMag2;
// tmp[j] = dot product of row j with the reflection vector over columns i .. colum
823  for (dInt32 j = i; j < size; j++)
824  {
825  T acc(0.0f);
826  T* const rowJ = &choleskyMatrix[size * j];
827  for (dInt32 k = i; k <= colum; k++) {
828  acc += rowJ[k] * reflection[k];
829  }
830  tmp[j] = acc;
831  }
832 
// zero row i right of the diagonal and apply the reflection to the rows below it
833  for (dInt32 j = i + 1; j < size; j++)
834  {
835  rowI[j] = T(0.0f);
836  T* const rowJ = &choleskyMatrix[size * j];
837  const T a = tmp[j] * den;
838  for (dInt32 k = i; k <= colum; k++)
839  {
840  rowJ[k] -= a * reflection[k];
841  }
842  }
843  rowI[i] -= tmp[i] * reflection[i] * den;
844  }
845 
// keep the diagonal positive by negating column i from row i down
846  if (rowI[i] < T(0.0f))
847  {
848  for (dInt32 k = i; k < size; k++)
849  {
850  choleskyMatrix[size * k + i] = -choleskyMatrix[size * k + i];
851  }
852  }
853  }
854 
// floor the diagonal so that later divisions stay finite
855  for (dInt32 i = row; i < size; i++)
856  {
857  choleskyMatrix[size * i + i] = dMax(choleskyMatrix[size * i + i], T(1.0e-6f));
858  }
859  }
860 }
861 
862 template<class T>
863 void dCholeskyUpdate(dInt32 size, dInt32 row, dInt32 colum, T* const choleskyMatrix, T* const tmp, T* const reflexion, const T* const psdMatrix)
864 {
865  const dInt32 n0 = colum - row;
866  const dInt32 n1 = n0 + 1;
867  const dInt32 choleskyCost = size * size * size / 3;
868  const dInt32 householdCost = n0 * (n0 + 1) / 2 + n1 * (n1 + 1) * (2 * (2 * n1 + 1) - 3 + 3 * (size - colum - 1)) / 6 - 1;
869 
870  if (householdCost < choleskyCost)
871  {
872  dHouseholderReflection(size, row, colum, choleskyMatrix, tmp, reflexion);
873  }
874  else
875  {
876  memcpy (choleskyMatrix, psdMatrix, sizeof (T) * size * size);
877  dCholeskyFactorization(size, choleskyMatrix);
878  }
879 
880 //#if _DEBUG
881 #if 0
882  T* const psdMatrixCopy = dAlloca(T, size * size);
883  memcpy(psdMatrixCopy, psdMatrix, sizeof(T) * size * size);
884  dCholeskyFactorization(size, psdMatrixCopy);
885 
886  for (dInt32 i = 0; i < size; i++)
887  {
888  for (dInt32 j = 0; j < size; j++)
889  {
890  T err = psdMatrixCopy[i*size + j] - choleskyMatrix[i*size + j];
891  dAssert(dAbs(err) < T(1.0e-4f));
892  }
893  }
894 #endif
895 }
896 
897 // solve a general Linear complementary program (LCP)
898 // A * x = b + r
899 // subjected to constraints
900 // x(i) = low(i), if r(i) >= 0
901 // x(i) = high(i), if r(i) <= 0
902 // low(i) <= x(i) <= high(i), if r(i) == 0
903 //
904 // return true is the system has a solution.
905 // in return
906 // x is the solution,
907 // r is return in vector b
908 // note: although the system is called LCP, the solver is far more general than a strict LCP
909 // to solve a strict LCP, set the following
910 // low(i) = 0
911 // high(i) = infinity.
912 // this the same as enforcing the constraint: x(i) * r(i) = 0
913 template <class T>
914 void dSolveDantzigLcpLow(dInt32 size, T* const symmetricMatrixPSD, T* const x, T* const b, T* const low, T* const high)
915 {
916  T* const x0 = dAlloca(T, size);
917  T* const r0 = dAlloca(T, size);
918  T* const tmp0 = dAlloca(T, size);
919  T* const tmp1 = dAlloca(T, size);
920  T* const delta_r = dAlloca(T, size);
921  T* const delta_x = dAlloca(T, size);
922  T* const lowerTriangularMatrix = dAlloca(T, size * size);
923  dInt16* const permute = dAlloca(dInt16, size);
924 
925  for (dInt32 i = 0; i < size; i++)
926  {
927  permute[i] = dInt16(i);
928  x0[i] = T(0.0f);
929  x[i] = dMax (b[i] * b[i], T (1.0f));
930  }
931 
932  for (dInt32 n = size - 1, i = size - 1; i >= 0; i--)
933  {
934  if (x[i] > T(1.0))
935  {
936  dPermuteRows(size, n, i, symmetricMatrixPSD, lowerTriangularMatrix, x, b, low, high, permute);
937  n --;
938  }
939  }
940 
941  for (dInt32 i = size - 1; (i >= 0) && (x[i] > T(1.0f)) ; i--)
942  {
943  dInt32 min = i;
944  for (dInt32 j = i - 1; (j >= 0) && (x[j] > T(1.0f)); j--)
945  {
946  if (x[j] > x[min])
947  {
948  min = j;
949  }
950  }
951  if (min != i)
952  {
953  dPermuteRows(size, i, min, symmetricMatrixPSD, lowerTriangularMatrix, x, b, low, high, permute);
954  }
955  }
956 
957  dInt32 initialGuessCount = size;
958  while (x[initialGuessCount - 1] >= T(16.0f))
959  {
960  initialGuessCount --;
961  }
962 
963  memcpy(lowerTriangularMatrix, symmetricMatrixPSD, sizeof(T) * size * size);
964 #ifdef _DEBUG
965  bool valid = dCholeskyFactorization(size, lowerTriangularMatrix);
966  dAssert(valid);
967 #else
968  dCholeskyFactorization(size, lowerTriangularMatrix);
969 #endif
970  for (dInt32 j = 0; (j != -1) && initialGuessCount;)
971  {
972  dSolveCholesky(size, initialGuessCount, lowerTriangularMatrix, x0, b);
973 
974  j = -1;
975  T alpha(1.0f);
976  T value(0.0f);
977  for (dInt32 i = initialGuessCount - 1; i >= 0; i--)
978  {
979  T x1 = alpha * x0[i];
980  if (x1 < low[i])
981  {
982  j = i;
983  value = low[i];
984  alpha = low[i] / x0[i];
985  }
986  else if (x1 > high[i])
987  {
988  j = i;
989  value = high[i];
990  alpha = high[i] / x0[i];
991  }
992  }
993 
994  if (j != -1)
995  {
996  x0[j] = value;
997  initialGuessCount--;
998  dPermuteRows(size, j, initialGuessCount, symmetricMatrixPSD, lowerTriangularMatrix, x0, b, low, high, permute);
999  dCholeskyUpdate(size, j, initialGuessCount, lowerTriangularMatrix, tmp0, tmp1, symmetricMatrixPSD);
1000  }
1001  }
1002 
1003  if (initialGuessCount == size)
1004  {
1005  for (dInt32 i = 0; i < size; i++)
1006  {
1007  dInt32 j = permute[i];
1008  x[j] = x0[i];
1009  b[i] = T(0.0f);
1010  }
1011  return;
1012  }
1013 
1014  dInt32 clampedIndex = size;
1015  dInt32 index = initialGuessCount;
1016  dInt32 count = size - initialGuessCount;
1017  dInt32 stride = index * size;
1018 
1019  for (dInt32 i = 0; i < size; i++)
1020  {
1021  r0[i] = T(0.0f);
1022  delta_x[i] = T(0.0f);
1023  delta_r[i] = T(0.0f);
1024  }
1025 
1026  for (dInt32 i = index; i < size; i++)
1027  {
1028  r0[i] = dDotProduct(size, &symmetricMatrixPSD[stride], x0) - b[i];
1029  stride += size;
1030  }
1031 
1032 
1033  while (count)
1034  {
1035  bool loop = true;
1036 
1037  while (loop)
1038  {
1039  loop = false;
1040  T clamp_x(0.0f);
1041  dInt32 swapIndex = -1;
1042 
1043  if (dAbs(r0[index]) > T(1.0e-12f))
1044  {
1045  dCalculateDelta_x(size, index, symmetricMatrixPSD, lowerTriangularMatrix, delta_x);
1046  dCalculateDelta_r(size, index, symmetricMatrixPSD, delta_x, delta_r);
1047 
1048  dAssert(delta_r[index] != T(0.0f));
1049  dAssert(dAbs(delta_x[index]) == T(1.0f));
1050  delta_r[index] = (delta_r[index] == T(0.0f)) ? T(1.0e-12f) : delta_r[index];
1051 
1052  T scale = -r0[index] / delta_r[index];
1053  dAssert(dAbs(scale) >= T(0.0f));
1054 
1055  for (dInt32 i = 0; i <= index; i++)
1056  {
1057  T x1 = x0[i] + scale * delta_x[i];
1058  if (x1 > high[i])
1059  {
1060  swapIndex = i;
1061  clamp_x = high[i];
1062  scale = (high[i] - x0[i]) / delta_x[i];
1063  }
1064  else if (x1 < low[i])
1065  {
1066  swapIndex = i;
1067  clamp_x = low[i];
1068  scale = (low[i] - x0[i]) / delta_x[i];
1069  }
1070  }
1071  dAssert(dAbs(scale) >= T(0.0f));
1072 
1073  for (dInt32 i = clampedIndex; (i < size) && (scale > T(1.0e-12f)); i++)
1074  {
1075  T r1 = r0[i] + scale * delta_r[i];
1076  if ((r1 * r0[i]) < T(0.0f))
1077  {
1078  dAssert(dAbs(delta_r[i]) > T(0.0f));
1079  T s1 = -r0[i] / delta_r[i];
1080  dAssert(dAbs(s1) >= T(0.0f));
1081  dAssert(dAbs(s1) <= dAbs(scale));
1082  if (dAbs(s1) < dAbs(scale))
1083  {
1084  scale = s1;
1085  swapIndex = i;
1086  }
1087  }
1088  }
1089 
1090  if (dAbs(scale) > T(1.0e-12f))
1091  {
1092  for (dInt32 i = 0; i < size; i++)
1093  {
1094  x0[i] += scale * delta_x[i];
1095  r0[i] += scale * delta_r[i];
1096  }
1097  }
1098  }
1099 
1100  if (swapIndex == -1)
1101  {
1102  r0[index] = T(0.0f);
1103  delta_r[index] = T(0.0f);
1104  index++;
1105  count--;
1106  loop = false;
1107  }
1108  else if (swapIndex == index)
1109  {
1110  count--;
1111  clampedIndex--;
1112  x0[index] = clamp_x;
1113  dPermuteRows(size, index, clampedIndex, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1114  dCholeskyUpdate(size, index, clampedIndex, lowerTriangularMatrix, tmp0, tmp1, symmetricMatrixPSD);
1115  loop = count ? true : false;
1116  }
1117  else if (swapIndex > index)
1118  {
1119  loop = true;
1120  r0[swapIndex] = T(0.0f);
1121  dAssert(swapIndex < size);
1122  dAssert(clampedIndex <= size);
1123  if (swapIndex < clampedIndex)
1124  {
1125  count--;
1126  clampedIndex--;
1127  dPermuteRows(size, clampedIndex, swapIndex, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1128  dCholeskyUpdate(size, swapIndex, clampedIndex, lowerTriangularMatrix, tmp0, tmp1, symmetricMatrixPSD);
1129  dAssert(clampedIndex >= index);
1130  }
1131  else
1132  {
1133  count++;
1134  dAssert(clampedIndex < size);
1135  dPermuteRows(size, clampedIndex, swapIndex, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1136  dCholeskyUpdate(size, clampedIndex, swapIndex, lowerTriangularMatrix, tmp0, tmp1, symmetricMatrixPSD);
1137  clampedIndex++;
1138  dAssert(clampedIndex <= size);
1139  dAssert(clampedIndex >= index);
1140  }
1141  }
1142  else
1143  {
1144  dAssert(index > 0);
1145  x0[swapIndex] = clamp_x;
1146  delta_x[index] = T(0.0f);
1147 
1148  dAssert(swapIndex < index);
1149  dPermuteRows(size, swapIndex, index - 1, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1150  dPermuteRows(size, index - 1, index, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1151  dPermuteRows(size, clampedIndex - 1, index, symmetricMatrixPSD, lowerTriangularMatrix, x0, r0, low, high, permute);
1152  dCholeskyUpdate (size, swapIndex, clampedIndex - 1, lowerTriangularMatrix, tmp0, tmp1, symmetricMatrixPSD);
1153 
1154  clampedIndex--;
1155  index--;
1156  loop = true;
1157  }
1158  }
1159  }
1160 
1161  for (dInt32 i = 0; i < size; i++)
1162  {
1163  dInt32 j = permute[i];
1164  x[j] = x0[i];
1165  b[j] = r0[i];
1166  }
1167 }
1168 
1169 /*
1170 // solve a general Linear complementary program (LCP)
1171 // A * x = b + r
1172 // subjected to constraints
1173 // x(i) = low(i), if r(i) >= 0
1174 // x(i) = high(i), if r(i) <= 0
1175 // low(i) <= x(i) <= high(i), if r(i) == 0
1176 //
1177 // return true is the system has a solution.
1178 // in return
1179 // x is the solution,
1180 // r is return in vector b
1181 // note: although the system is called LCP, the solver is far more general than a strict LCP
1182 // to solve a strict LCP, set the following
1183 // low(i) = 0
1184 // high(i) = infinity.
1185 // this the same as enforcing the constraint: x(i) * r(i) = 0
1186 template <class T>
1187 bool dSolveDantzigLCP(dInt32 size, T* const symetricMatrix, T* const x, T* const b, T* const low, T* const high)
1188 {
1189  T* const choleskyMatrix = dAlloca(T, size * size);
1190  dCheckAligment(choleskyMatrix);
1191 
1192  memcpy (choleskyMatrix, symetricMatrix, sizeof (T) * size * size);
1193  dCholeskyFactorization(size, choleskyMatrix);
1194  for (dInt32 i = 0; i < size; i ++)
1195  {
1196  T* const row = &choleskyMatrix[i * size];
1197  for (dInt32 j = i + 1; j < size; j ++)
1198  {
1199  row[j] = T(0.0f);
1200  }
1201  }
1202  return dSolveDantzigLCP(size, symetricMatrix, choleskyMatrix, x, b, low, high);
1203 }
1204 */
1205 
1206 // solve a general Linear complementary program (LCP)
1207 // A * x = b + r
1208 // subjected to constraints
1209 // x(i) = low(i), if r(i) >= 0
1210 // x(i) = high(i), if r(i) <= 0
1211 // low(i) <= x(i) <= high(i), if r(i) == 0
1212 //
1213 // return true is the system has a solution.
1214 // in return
1215 // x is the solution,
1216 // b is zero
1217 // note: although the system is called LCP, the solver is far more general than a strict LCP
1218 // to solve a strict LCP, set the following
1219 // low(i) = 0
1220 // high(i) = infinity.
1221 // this is the same as enforcing the constraint: x(i) * r(i) = 0
1222 template <class T>
1223 bool dSolvePartitionDantzigLCP(dInt32 size, T* const symmetricMatrixPSD , T* const x, T* const b, T* const low, T* const high)
1224 {
1225  dInt16* const permute = dAlloca(dInt16, size);
1226 
1227  for (dInt32 i = 0; i < size; i++)
1228  {
1229  x[i] = b[i];
1230  permute[i] = dInt16(i);
1231  }
1232 
1233  dInt32 unboundedSize = size;
1234  for (dInt32 i = 0; i < unboundedSize; i++)
1235  {
1236  if ((low[i] <= T(-D_LCP_MAX_VALUE)) && (high[i] >= T(D_LCP_MAX_VALUE)))
1237  {
1238  dCholeskyFactorizationAddRow(size, i, symmetricMatrixPSD );
1239  }
1240  else
1241  {
1242  dInt32 j = unboundedSize - 1;
1243  if (i != j)
1244  {
1245  T* const A = &symmetricMatrixPSD [size * i];
1246  T* const B = &symmetricMatrixPSD [size * j];
1247  for (dInt32 k = 0; k < size; k++)
1248  {
1249  dSwap(A[k], B[k]);
1250  }
1251 
1252  dInt32 stride = 0;
1253  for (dInt32 k = 0; k < size; k++)
1254  {
1255  dSwap(symmetricMatrixPSD [stride + i], symmetricMatrixPSD [stride + j]);
1256  stride += size;
1257  }
1258  dSwap(x[i], x[j]);
1259  dSwap(b[i], b[j]);
1260  dSwap(low[i], low[j]);
1261  dSwap(high[i], high[j]);
1262  dSwap(permute[i], permute[j]);
1263  }
1264 
1265  i--;
1266  unboundedSize--;
1267  }
1268  }
1269 
1270  bool ret = false;
1271  if (unboundedSize > 0)
1272  {
1273  dSolveCholesky(size, unboundedSize, symmetricMatrixPSD , x);
1274  dInt32 base = unboundedSize * size;
1275  for (dInt32 i = unboundedSize; i < size; i++)
1276  {
1277  b[i] -= dDotProduct(unboundedSize, &symmetricMatrixPSD[base], x);
1278  base += size;
1279  }
1280 
1281  const dInt32 boundedSize = size - unboundedSize;
1282  T* const l = dAlloca(T, boundedSize);
1283  T* const h = dAlloca(T, boundedSize);
1284  T* const c = dAlloca(T, boundedSize);
1285  T* const u = dAlloca(T, boundedSize);
1286  T* const a11 = dAlloca(T, boundedSize * boundedSize);
1287  T* const a10 = dAlloca(T, boundedSize * unboundedSize);
1288 
1289  for (dInt32 i = 0; i < boundedSize; i++)
1290  {
1291  T* const g = &a10[i * unboundedSize];
1292  const T* const row = &symmetricMatrixPSD [(unboundedSize + i) * size];
1293  for (dInt32 j = 0; j < unboundedSize; j++)
1294  {
1295  g[j] = -row[j];
1296  }
1297  dSolveCholesky(size, unboundedSize, symmetricMatrixPSD, g);
1298 
1299  T* const arow = &a11[i * boundedSize];
1300  const T* const row2 = &symmetricMatrixPSD[(unboundedSize + i) * size];
1301  arow[i] = row2[unboundedSize + i] + dDotProduct(unboundedSize, g, row2);
1302  for (dInt32 j = i + 1; j < boundedSize; j++)
1303  {
1304  const T* const row1 = &symmetricMatrixPSD [(unboundedSize + j) * size];
1305  T elem = row1[unboundedSize + i] + dDotProduct(unboundedSize, g, row1);
1306  arow[j] = elem;
1307  a11[j * boundedSize + i] = elem;
1308  }
1309  u[i] = T(0.0f);
1310  c[i] = b[i + unboundedSize];
1311  l[i] = low[i + unboundedSize];
1312  h[i] = high[i + unboundedSize];
1313  }
1314 
1315  if (dSolveDantzigLCP(boundedSize, a11, u, c, l, h))
1316  {
1317  for (dInt32 i = 0; i < boundedSize; i++)
1318  {
1319  const T s = u[i];
1320  x[unboundedSize + i] = s;
1321  const T* const g = &a10[i * unboundedSize];
1322  for (dInt32 j = 0; j < unboundedSize; j++)
1323  {
1324  x[j] += g[j] * s;
1325  }
1326  }
1327  ret = true;
1328  }
1329  }
1330  else
1331  {
1332  for (dInt32 i = 0; i < size; i++)
1333  {
1334  x[i] = T(0.0f);
1335  }
1336  ret = dSolveDantzigLCP(size, symmetricMatrixPSD, x, b, low, high);
1337  }
1338 
1339  for (dInt32 i = 0; i < size; i++)
1340  {
1341  b[i] = x[i];
1342  }
1343  for (dInt32 i = 0; i < size; i++)
1344  {
1345  dInt32 j = permute[i];
1346  x[j] = b[i];
1347  b[i] = T(0.0f);
1348  }
1349  return ret;
1350 }
1351 
1352 template <class T>
1353 void dSolveDantzigLCP(dInt32 size, T* const symmetricMatrixPSD, T* const x, T* const b, T* const low, T* const high)
1354 {
1355  T tol2 = T(0.25f * 0.25f);
1356  dInt32 passes = dClamp(size, 12, 20);
1357  T* const r = dAlloca(T, size);
1358  dInt16* const clipped = dAlloca(dInt16, size);
1359 
1360  // find an approximation to the solution
1361  dGaussSeidelLcpSor(size, symmetricMatrixPSD, x, b, low, high, tol2, passes, clipped, T(1.3f));
1362 
1363  T err2(0.0f);
1364  dInt32 stride = 0;
1365  dInt32 clippeCount = 0;
1366  for (dInt32 i = 0; i < size; i++)
1367  {
1368  const T* const row = &symmetricMatrixPSD[stride];
1369  r[i] = b[i] - dDotProduct(size, row, x);
1370  clippeCount += clipped[i];
1371  err2 += clipped[i] ? T(0.0f) : r[i] * r[i];
1372  stride += size;
1373  }
1374 
1375  if (err2 > tol2)
1376  {
1377  // check for small lcp
1378  if ((clippeCount < 16) && ((clippeCount < 32) && (err2 < T(16.0f))))
1379  {
1380  // small lcp can be solved with direct method
1381  T* const x0 = dAlloca(T, size);
1382  for (dInt32 i = 0; i < size; i++)
1383  {
1384  low[i] -= x[i];
1385  high[i] -= x[i];
1386  }
1387  dSolveDantzigLcpLow(size, symmetricMatrixPSD, x0, r, low, high);
1388  for (dInt32 i = 0; i < size; i++)
1389  {
1390  x[i] += x0[i];
1391  }
1392  }
1393  else
1394  {
1395  // larger lcp are too hard for direct method, see if we can get better approximation
1396  dGaussSeidelLcpSor(size, symmetricMatrixPSD, x, b, low, high, tol2, 20, clipped, T(1.3f));
1397  }
1398  }
1399 }
1400 
1401 #endif
dSymmetricConjugateGradientSolver
Definition: dGeneralMatrix.h:35