[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / BulletDynamics / MLCPSolvers / btDantzigLCP.cpp
1 /*************************************************************************
2 *                                                                       *
3 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
4 * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
5 *                                                                       *
6 * This library is free software; you can redistribute it and/or         *
7 * modify it under the terms of EITHER:                                  *
8 *   (1) The GNU Lesser General Public License as published by the Free  *
9 *       Software Foundation; either version 2.1 of the License, or (at  *
10 *       your option) any later version. The text of the GNU Lesser      *
11 *       General Public License is included with this library in the     *
12 *       file LICENSE.TXT.                                               *
13 *   (2) The BSD-style license that is included with this library in     *
14 *       the file LICENSE-BSD.TXT.                                       *
15 *                                                                       *
16 * This library is distributed in the hope that it will be useful,       *
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
19 * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
20 *                                                                       *
21 *************************************************************************/
22
23 /*
24
25
26 THE ALGORITHM
27 -------------
28
29 solve A*x = b+w, with x and w subject to certain LCP conditions.
30 each x(i),w(i) must lie on one of the three line segments in the following
31 diagram. each line segment corresponds to one index set :
32
33      w(i)
34      /|\      |           :
35       |       |           :
36       |       |i in N     :
37   w>0 |       |state[i]=0 :
38       |       |           :
39       |       |           :  i in C
40   w=0 +       +-----------------------+
41       |                   :           |
42       |                   :           |
43   w<0 |                   :           |i in N
44       |                   :           |state[i]=1
45       |                   :           |
46       |                   :           |
47       +-------|-----------|-----------|----------> x(i)
48              lo           0           hi
49
50 the Dantzig algorithm proceeds as follows:
51   for i=1:n
52     * if (x(i),w(i)) is not on the line, push x(i) and w(i) positive or
53       negative towards the line. as this is done, the other (x(j),w(j))
54       for j<i are constrained to be on the line. if any (x,w) reaches the
55       end of a line segment then it is switched between index sets.
56     * i is added to the appropriate index set depending on what line segment
57       it hits.
58
59 we restrict lo(i) <= 0 and hi(i) >= 0. this makes the algorithm a bit
60 simpler, because the starting point for x(i),w(i) is always on the dotted
61 line x=0 and x will only ever increase in one direction, so it can only hit
62 two out of the three line segments.
63
64
65 NOTES
66 -----
67
68 this is an implementation of "lcp_dantzig2_ldlt.m" and "lcp_dantzig_lohi.m".
69 the implementation is split into an LCP problem object (btLCP) and an LCP
70 driver function. most optimization occurs in the btLCP object.
71
72 a naive implementation of the algorithm requires either a lot of data motion
73 or a lot of permutation-array lookup, because we are constantly re-ordering
74 rows and columns. to avoid this and make a more optimized algorithm, a
75 non-trivial data structure is used to represent the matrix A (this is
76 implemented in the fast version of the btLCP object).
77
78 during execution of this algorithm, some indexes in A are clamped (set C),
79 some are non-clamped (set N), and some are "don't care" (where x=0).
80 A,x,b,w (and other problem vectors) are permuted such that the clamped
81 indexes are first, the unclamped indexes are next, and the don't-care
82 indexes are last. this permutation is recorded in the array `p'.
83 initially p = 0..n-1, and as the rows and columns of A,x,b,w are swapped,
84 the corresponding elements of p are swapped.
85
86 because the C and N elements are grouped together in the rows of A, we can do
87 lots of work with a fast dot product function. if A,x,etc were not permuted
88 and we only had a permutation array, then those dot products would be much
89 slower as we would have a permutation array lookup in some inner loops.
90
91 A is accessed through an array of row pointers, so that element (i,j) of the
92 permuted matrix is A[i][j]. this makes row swapping fast. for column swapping
93 we still have to actually move the data.
94
95 during execution of this algorithm we maintain an L*D*L' factorization of
96 the clamped submatrix of A (call it `AC') which is the top left nC*nC
97 submatrix of A. there are two ways we could arrange the rows/columns in AC.
98
99 (1) AC is always permuted such that L*D*L' = AC. this causes a problem
100 when a row/column is removed from C, because then all the rows/columns of A
101 between the deleted index and the end of C need to be rotated downward.
102 this results in a lot of data motion and slows things down.
103 (2) L*D*L' is actually a factorization of a *permutation* of AC (which is
104 itself a permutation of the underlying A). this is what we do - the
105 permutation is recorded in the vector C. call this permutation A[C,C].
106 when a row/column is removed from C, all we have to do is swap two
107 rows/columns and manipulate C.
108
109 */
110
111 #include "btDantzigLCP.h"
112
113 #include <string.h>  //memcpy
114
115 bool s_error = false;
116
117 //***************************************************************************
118 // code generation parameters
119
120 #define btLCP_FAST  // use fast btLCP object
121
122 // option 1 : matrix row pointers (less data copying)
123 #define BTROWPTRS
124 #define BTATYPE btScalar **
125 #define BTAROW(i) (m_A[i])
126
127 // option 2 : no matrix row pointers (slightly faster inner loops)
128 //#define NOROWPTRS
129 //#define BTATYPE btScalar *
130 //#define BTAROW(i) (m_A+(i)*m_nskip)
131
132 #define BTNUB_OPTIMIZATIONS
133
134 /* solve L*X=B, with B containing 1 right hand sides.
135  * L is an n*n lower triangular matrix with ones on the diagonal.
136  * L is stored by rows and its leading dimension is lskip.
137  * B is an n*1 matrix that contains the right hand sides.
138  * B is stored by columns and its leading dimension is also lskip.
139  * B is overwritten with X.
140  * this processes blocks of 2*2.
141  * if this is in the factorizer source file, n must be a multiple of 2.
142  */
143
144 static void btSolveL1_1(const btScalar *L, btScalar *B, int n, int lskip1)
145 {
146         /* declare variables - Z matrix, p and q vectors, etc */
147         btScalar Z11, m11, Z21, m21, p1, q1, p2, *ex;
148         const btScalar *ell;
149         int i, j;
150         /* compute all 2 x 1 blocks of X */
151         for (i = 0; i < n; i += 2)
152         {
153                 /* compute all 2 x 1 block of X, from rows i..i+2-1 */
154                 /* set the Z matrix to 0 */
155                 Z11 = 0;
156                 Z21 = 0;
157                 ell = L + i * lskip1;
158                 ex = B;
159                 /* the inner loop that computes outer products and adds them to Z */
160                 for (j = i - 2; j >= 0; j -= 2)
161                 {
162                         /* compute outer product and add it to the Z matrix */
163                         p1 = ell[0];
164                         q1 = ex[0];
165                         m11 = p1 * q1;
166                         p2 = ell[lskip1];
167                         m21 = p2 * q1;
168                         Z11 += m11;
169                         Z21 += m21;
170                         /* compute outer product and add it to the Z matrix */
171                         p1 = ell[1];
172                         q1 = ex[1];
173                         m11 = p1 * q1;
174                         p2 = ell[1 + lskip1];
175                         m21 = p2 * q1;
176                         /* advance pointers */
177                         ell += 2;
178                         ex += 2;
179                         Z11 += m11;
180                         Z21 += m21;
181                         /* end of inner loop */
182                 }
183                 /* compute left-over iterations */
184                 j += 2;
185                 for (; j > 0; j--)
186                 {
187                         /* compute outer product and add it to the Z matrix */
188                         p1 = ell[0];
189                         q1 = ex[0];
190                         m11 = p1 * q1;
191                         p2 = ell[lskip1];
192                         m21 = p2 * q1;
193                         /* advance pointers */
194                         ell += 1;
195                         ex += 1;
196                         Z11 += m11;
197                         Z21 += m21;
198                 }
199                 /* finish computing the X(i) block */
200                 Z11 = ex[0] - Z11;
201                 ex[0] = Z11;
202                 p1 = ell[lskip1];
203                 Z21 = ex[1] - Z21 - p1 * Z11;
204                 ex[1] = Z21;
205                 /* end of outer loop */
206         }
207 }
208
209 /* solve L*X=B, with B containing 2 right hand sides.
210  * L is an n*n lower triangular matrix with ones on the diagonal.
211  * L is stored by rows and its leading dimension is lskip.
212  * B is an n*2 matrix that contains the right hand sides.
213  * B is stored by columns and its leading dimension is also lskip.
214  * B is overwritten with X.
215  * this processes blocks of 2*2.
216  * if this is in the factorizer source file, n must be a multiple of 2.
217  */
218
219 static void btSolveL1_2(const btScalar *L, btScalar *B, int n, int lskip1)
220 {
221         /* declare variables - Z matrix, p and q vectors, etc */
222         btScalar Z11, m11, Z12, m12, Z21, m21, Z22, m22, p1, q1, p2, q2, *ex;
223         const btScalar *ell;
224         int i, j;
225         /* compute all 2 x 2 blocks of X */
226         for (i = 0; i < n; i += 2)
227         {
228                 /* compute all 2 x 2 block of X, from rows i..i+2-1 */
229                 /* set the Z matrix to 0 */
230                 Z11 = 0;
231                 Z12 = 0;
232                 Z21 = 0;
233                 Z22 = 0;
234                 ell = L + i * lskip1;
235                 ex = B;
236                 /* the inner loop that computes outer products and adds them to Z */
237                 for (j = i - 2; j >= 0; j -= 2)
238                 {
239                         /* compute outer product and add it to the Z matrix */
240                         p1 = ell[0];
241                         q1 = ex[0];
242                         m11 = p1 * q1;
243                         q2 = ex[lskip1];
244                         m12 = p1 * q2;
245                         p2 = ell[lskip1];
246                         m21 = p2 * q1;
247                         m22 = p2 * q2;
248                         Z11 += m11;
249                         Z12 += m12;
250                         Z21 += m21;
251                         Z22 += m22;
252                         /* compute outer product and add it to the Z matrix */
253                         p1 = ell[1];
254                         q1 = ex[1];
255                         m11 = p1 * q1;
256                         q2 = ex[1 + lskip1];
257                         m12 = p1 * q2;
258                         p2 = ell[1 + lskip1];
259                         m21 = p2 * q1;
260                         m22 = p2 * q2;
261                         /* advance pointers */
262                         ell += 2;
263                         ex += 2;
264                         Z11 += m11;
265                         Z12 += m12;
266                         Z21 += m21;
267                         Z22 += m22;
268                         /* end of inner loop */
269                 }
270                 /* compute left-over iterations */
271                 j += 2;
272                 for (; j > 0; j--)
273                 {
274                         /* compute outer product and add it to the Z matrix */
275                         p1 = ell[0];
276                         q1 = ex[0];
277                         m11 = p1 * q1;
278                         q2 = ex[lskip1];
279                         m12 = p1 * q2;
280                         p2 = ell[lskip1];
281                         m21 = p2 * q1;
282                         m22 = p2 * q2;
283                         /* advance pointers */
284                         ell += 1;
285                         ex += 1;
286                         Z11 += m11;
287                         Z12 += m12;
288                         Z21 += m21;
289                         Z22 += m22;
290                 }
291                 /* finish computing the X(i) block */
292                 Z11 = ex[0] - Z11;
293                 ex[0] = Z11;
294                 Z12 = ex[lskip1] - Z12;
295                 ex[lskip1] = Z12;
296                 p1 = ell[lskip1];
297                 Z21 = ex[1] - Z21 - p1 * Z11;
298                 ex[1] = Z21;
299                 Z22 = ex[1 + lskip1] - Z22 - p1 * Z12;
300                 ex[1 + lskip1] = Z22;
301                 /* end of outer loop */
302         }
303 }
304
305 void btFactorLDLT(btScalar *A, btScalar *d, int n, int nskip1)
306 {
307         int i, j;
308         btScalar sum, *ell, *dee, dd, p1, p2, q1, q2, Z11, m11, Z21, m21, Z22, m22;
309         if (n < 1) return;
310
311         for (i = 0; i <= n - 2; i += 2)
312         {
313                 /* solve L*(D*l)=a, l is scaled elements in 2 x i block at A(i,0) */
314                 btSolveL1_2(A, A + i * nskip1, i, nskip1);
315                 /* scale the elements in a 2 x i block at A(i,0), and also */
316                 /* compute Z = the outer product matrix that we'll need. */
317                 Z11 = 0;
318                 Z21 = 0;
319                 Z22 = 0;
320                 ell = A + i * nskip1;
321                 dee = d;
322                 for (j = i - 6; j >= 0; j -= 6)
323                 {
324                         p1 = ell[0];
325                         p2 = ell[nskip1];
326                         dd = dee[0];
327                         q1 = p1 * dd;
328                         q2 = p2 * dd;
329                         ell[0] = q1;
330                         ell[nskip1] = q2;
331                         m11 = p1 * q1;
332                         m21 = p2 * q1;
333                         m22 = p2 * q2;
334                         Z11 += m11;
335                         Z21 += m21;
336                         Z22 += m22;
337                         p1 = ell[1];
338                         p2 = ell[1 + nskip1];
339                         dd = dee[1];
340                         q1 = p1 * dd;
341                         q2 = p2 * dd;
342                         ell[1] = q1;
343                         ell[1 + nskip1] = q2;
344                         m11 = p1 * q1;
345                         m21 = p2 * q1;
346                         m22 = p2 * q2;
347                         Z11 += m11;
348                         Z21 += m21;
349                         Z22 += m22;
350                         p1 = ell[2];
351                         p2 = ell[2 + nskip1];
352                         dd = dee[2];
353                         q1 = p1 * dd;
354                         q2 = p2 * dd;
355                         ell[2] = q1;
356                         ell[2 + nskip1] = q2;
357                         m11 = p1 * q1;
358                         m21 = p2 * q1;
359                         m22 = p2 * q2;
360                         Z11 += m11;
361                         Z21 += m21;
362                         Z22 += m22;
363                         p1 = ell[3];
364                         p2 = ell[3 + nskip1];
365                         dd = dee[3];
366                         q1 = p1 * dd;
367                         q2 = p2 * dd;
368                         ell[3] = q1;
369                         ell[3 + nskip1] = q2;
370                         m11 = p1 * q1;
371                         m21 = p2 * q1;
372                         m22 = p2 * q2;
373                         Z11 += m11;
374                         Z21 += m21;
375                         Z22 += m22;
376                         p1 = ell[4];
377                         p2 = ell[4 + nskip1];
378                         dd = dee[4];
379                         q1 = p1 * dd;
380                         q2 = p2 * dd;
381                         ell[4] = q1;
382                         ell[4 + nskip1] = q2;
383                         m11 = p1 * q1;
384                         m21 = p2 * q1;
385                         m22 = p2 * q2;
386                         Z11 += m11;
387                         Z21 += m21;
388                         Z22 += m22;
389                         p1 = ell[5];
390                         p2 = ell[5 + nskip1];
391                         dd = dee[5];
392                         q1 = p1 * dd;
393                         q2 = p2 * dd;
394                         ell[5] = q1;
395                         ell[5 + nskip1] = q2;
396                         m11 = p1 * q1;
397                         m21 = p2 * q1;
398                         m22 = p2 * q2;
399                         Z11 += m11;
400                         Z21 += m21;
401                         Z22 += m22;
402                         ell += 6;
403                         dee += 6;
404                 }
405                 /* compute left-over iterations */
406                 j += 6;
407                 for (; j > 0; j--)
408                 {
409                         p1 = ell[0];
410                         p2 = ell[nskip1];
411                         dd = dee[0];
412                         q1 = p1 * dd;
413                         q2 = p2 * dd;
414                         ell[0] = q1;
415                         ell[nskip1] = q2;
416                         m11 = p1 * q1;
417                         m21 = p2 * q1;
418                         m22 = p2 * q2;
419                         Z11 += m11;
420                         Z21 += m21;
421                         Z22 += m22;
422                         ell++;
423                         dee++;
424                 }
425                 /* solve for diagonal 2 x 2 block at A(i,i) */
426                 Z11 = ell[0] - Z11;
427                 Z21 = ell[nskip1] - Z21;
428                 Z22 = ell[1 + nskip1] - Z22;
429                 dee = d + i;
430                 /* factorize 2 x 2 block Z,dee */
431                 /* factorize row 1 */
432                 dee[0] = btRecip(Z11);
433                 /* factorize row 2 */
434                 sum = 0;
435                 q1 = Z21;
436                 q2 = q1 * dee[0];
437                 Z21 = q2;
438                 sum += q1 * q2;
439                 dee[1] = btRecip(Z22 - sum);
440                 /* done factorizing 2 x 2 block */
441                 ell[nskip1] = Z21;
442         }
443         /* compute the (less than 2) rows at the bottom */
444         switch (n - i)
445         {
446                 case 0:
447                         break;
448
449                 case 1:
450                         btSolveL1_1(A, A + i * nskip1, i, nskip1);
451                         /* scale the elements in a 1 x i block at A(i,0), and also */
452                         /* compute Z = the outer product matrix that we'll need. */
453                         Z11 = 0;
454                         ell = A + i * nskip1;
455                         dee = d;
456                         for (j = i - 6; j >= 0; j -= 6)
457                         {
458                                 p1 = ell[0];
459                                 dd = dee[0];
460                                 q1 = p1 * dd;
461                                 ell[0] = q1;
462                                 m11 = p1 * q1;
463                                 Z11 += m11;
464                                 p1 = ell[1];
465                                 dd = dee[1];
466                                 q1 = p1 * dd;
467                                 ell[1] = q1;
468                                 m11 = p1 * q1;
469                                 Z11 += m11;
470                                 p1 = ell[2];
471                                 dd = dee[2];
472                                 q1 = p1 * dd;
473                                 ell[2] = q1;
474                                 m11 = p1 * q1;
475                                 Z11 += m11;
476                                 p1 = ell[3];
477                                 dd = dee[3];
478                                 q1 = p1 * dd;
479                                 ell[3] = q1;
480                                 m11 = p1 * q1;
481                                 Z11 += m11;
482                                 p1 = ell[4];
483                                 dd = dee[4];
484                                 q1 = p1 * dd;
485                                 ell[4] = q1;
486                                 m11 = p1 * q1;
487                                 Z11 += m11;
488                                 p1 = ell[5];
489                                 dd = dee[5];
490                                 q1 = p1 * dd;
491                                 ell[5] = q1;
492                                 m11 = p1 * q1;
493                                 Z11 += m11;
494                                 ell += 6;
495                                 dee += 6;
496                         }
497                         /* compute left-over iterations */
498                         j += 6;
499                         for (; j > 0; j--)
500                         {
501                                 p1 = ell[0];
502                                 dd = dee[0];
503                                 q1 = p1 * dd;
504                                 ell[0] = q1;
505                                 m11 = p1 * q1;
506                                 Z11 += m11;
507                                 ell++;
508                                 dee++;
509                         }
510                         /* solve for diagonal 1 x 1 block at A(i,i) */
511                         Z11 = ell[0] - Z11;
512                         dee = d + i;
513                         /* factorize 1 x 1 block Z,dee */
514                         /* factorize row 1 */
515                         dee[0] = btRecip(Z11);
516                         /* done factorizing 1 x 1 block */
517                         break;
518
519                         //default: *((char*)0)=0;  /* this should never happen! */
520         }
521 }
522
523 /* solve L*X=B, with B containing 1 right hand sides.
524  * L is an n*n lower triangular matrix with ones on the diagonal.
525  * L is stored by rows and its leading dimension is lskip.
526  * B is an n*1 matrix that contains the right hand sides.
527  * B is stored by columns and its leading dimension is also lskip.
528  * B is overwritten with X.
529  * this processes blocks of 4*4.
530  * if this is in the factorizer source file, n must be a multiple of 4.
531  */
532
533 void btSolveL1(const btScalar *L, btScalar *B, int n, int lskip1)
534 {
535         /* declare variables - Z matrix, p and q vectors, etc */
536         btScalar Z11, Z21, Z31, Z41, p1, q1, p2, p3, p4, *ex;
537         const btScalar *ell;
538         int lskip2, lskip3, i, j;
539         /* compute lskip values */
540         lskip2 = 2 * lskip1;
541         lskip3 = 3 * lskip1;
542         /* compute all 4 x 1 blocks of X */
543         for (i = 0; i <= n - 4; i += 4)
544         {
545                 /* compute all 4 x 1 block of X, from rows i..i+4-1 */
546                 /* set the Z matrix to 0 */
547                 Z11 = 0;
548                 Z21 = 0;
549                 Z31 = 0;
550                 Z41 = 0;
551                 ell = L + i * lskip1;
552                 ex = B;
553                 /* the inner loop that computes outer products and adds them to Z */
554                 for (j = i - 12; j >= 0; j -= 12)
555                 {
556                         /* load p and q values */
557                         p1 = ell[0];
558                         q1 = ex[0];
559                         p2 = ell[lskip1];
560                         p3 = ell[lskip2];
561                         p4 = ell[lskip3];
562                         /* compute outer product and add it to the Z matrix */
563                         Z11 += p1 * q1;
564                         Z21 += p2 * q1;
565                         Z31 += p3 * q1;
566                         Z41 += p4 * q1;
567                         /* load p and q values */
568                         p1 = ell[1];
569                         q1 = ex[1];
570                         p2 = ell[1 + lskip1];
571                         p3 = ell[1 + lskip2];
572                         p4 = ell[1 + lskip3];
573                         /* compute outer product and add it to the Z matrix */
574                         Z11 += p1 * q1;
575                         Z21 += p2 * q1;
576                         Z31 += p3 * q1;
577                         Z41 += p4 * q1;
578                         /* load p and q values */
579                         p1 = ell[2];
580                         q1 = ex[2];
581                         p2 = ell[2 + lskip1];
582                         p3 = ell[2 + lskip2];
583                         p4 = ell[2 + lskip3];
584                         /* compute outer product and add it to the Z matrix */
585                         Z11 += p1 * q1;
586                         Z21 += p2 * q1;
587                         Z31 += p3 * q1;
588                         Z41 += p4 * q1;
589                         /* load p and q values */
590                         p1 = ell[3];
591                         q1 = ex[3];
592                         p2 = ell[3 + lskip1];
593                         p3 = ell[3 + lskip2];
594                         p4 = ell[3 + lskip3];
595                         /* compute outer product and add it to the Z matrix */
596                         Z11 += p1 * q1;
597                         Z21 += p2 * q1;
598                         Z31 += p3 * q1;
599                         Z41 += p4 * q1;
600                         /* load p and q values */
601                         p1 = ell[4];
602                         q1 = ex[4];
603                         p2 = ell[4 + lskip1];
604                         p3 = ell[4 + lskip2];
605                         p4 = ell[4 + lskip3];
606                         /* compute outer product and add it to the Z matrix */
607                         Z11 += p1 * q1;
608                         Z21 += p2 * q1;
609                         Z31 += p3 * q1;
610                         Z41 += p4 * q1;
611                         /* load p and q values */
612                         p1 = ell[5];
613                         q1 = ex[5];
614                         p2 = ell[5 + lskip1];
615                         p3 = ell[5 + lskip2];
616                         p4 = ell[5 + lskip3];
617                         /* compute outer product and add it to the Z matrix */
618                         Z11 += p1 * q1;
619                         Z21 += p2 * q1;
620                         Z31 += p3 * q1;
621                         Z41 += p4 * q1;
622                         /* load p and q values */
623                         p1 = ell[6];
624                         q1 = ex[6];
625                         p2 = ell[6 + lskip1];
626                         p3 = ell[6 + lskip2];
627                         p4 = ell[6 + lskip3];
628                         /* compute outer product and add it to the Z matrix */
629                         Z11 += p1 * q1;
630                         Z21 += p2 * q1;
631                         Z31 += p3 * q1;
632                         Z41 += p4 * q1;
633                         /* load p and q values */
634                         p1 = ell[7];
635                         q1 = ex[7];
636                         p2 = ell[7 + lskip1];
637                         p3 = ell[7 + lskip2];
638                         p4 = ell[7 + lskip3];
639                         /* compute outer product and add it to the Z matrix */
640                         Z11 += p1 * q1;
641                         Z21 += p2 * q1;
642                         Z31 += p3 * q1;
643                         Z41 += p4 * q1;
644                         /* load p and q values */
645                         p1 = ell[8];
646                         q1 = ex[8];
647                         p2 = ell[8 + lskip1];
648                         p3 = ell[8 + lskip2];
649                         p4 = ell[8 + lskip3];
650                         /* compute outer product and add it to the Z matrix */
651                         Z11 += p1 * q1;
652                         Z21 += p2 * q1;
653                         Z31 += p3 * q1;
654                         Z41 += p4 * q1;
655                         /* load p and q values */
656                         p1 = ell[9];
657                         q1 = ex[9];
658                         p2 = ell[9 + lskip1];
659                         p3 = ell[9 + lskip2];
660                         p4 = ell[9 + lskip3];
661                         /* compute outer product and add it to the Z matrix */
662                         Z11 += p1 * q1;
663                         Z21 += p2 * q1;
664                         Z31 += p3 * q1;
665                         Z41 += p4 * q1;
666                         /* load p and q values */
667                         p1 = ell[10];
668                         q1 = ex[10];
669                         p2 = ell[10 + lskip1];
670                         p3 = ell[10 + lskip2];
671                         p4 = ell[10 + lskip3];
672                         /* compute outer product and add it to the Z matrix */
673                         Z11 += p1 * q1;
674                         Z21 += p2 * q1;
675                         Z31 += p3 * q1;
676                         Z41 += p4 * q1;
677                         /* load p and q values */
678                         p1 = ell[11];
679                         q1 = ex[11];
680                         p2 = ell[11 + lskip1];
681                         p3 = ell[11 + lskip2];
682                         p4 = ell[11 + lskip3];
683                         /* compute outer product and add it to the Z matrix */
684                         Z11 += p1 * q1;
685                         Z21 += p2 * q1;
686                         Z31 += p3 * q1;
687                         Z41 += p4 * q1;
688                         /* advance pointers */
689                         ell += 12;
690                         ex += 12;
691                         /* end of inner loop */
692                 }
693                 /* compute left-over iterations */
694                 j += 12;
695                 for (; j > 0; j--)
696                 {
697                         /* load p and q values */
698                         p1 = ell[0];
699                         q1 = ex[0];
700                         p2 = ell[lskip1];
701                         p3 = ell[lskip2];
702                         p4 = ell[lskip3];
703                         /* compute outer product and add it to the Z matrix */
704                         Z11 += p1 * q1;
705                         Z21 += p2 * q1;
706                         Z31 += p3 * q1;
707                         Z41 += p4 * q1;
708                         /* advance pointers */
709                         ell += 1;
710                         ex += 1;
711                 }
712                 /* finish computing the X(i) block */
713                 Z11 = ex[0] - Z11;
714                 ex[0] = Z11;
715                 p1 = ell[lskip1];
716                 Z21 = ex[1] - Z21 - p1 * Z11;
717                 ex[1] = Z21;
718                 p1 = ell[lskip2];
719                 p2 = ell[1 + lskip2];
720                 Z31 = ex[2] - Z31 - p1 * Z11 - p2 * Z21;
721                 ex[2] = Z31;
722                 p1 = ell[lskip3];
723                 p2 = ell[1 + lskip3];
724                 p3 = ell[2 + lskip3];
725                 Z41 = ex[3] - Z41 - p1 * Z11 - p2 * Z21 - p3 * Z31;
726                 ex[3] = Z41;
727                 /* end of outer loop */
728         }
729         /* compute rows at end that are not a multiple of block size */
730         for (; i < n; i++)
731         {
732                 /* compute all 1 x 1 block of X, from rows i..i+1-1 */
733                 /* set the Z matrix to 0 */
734                 Z11 = 0;
735                 ell = L + i * lskip1;
736                 ex = B;
737                 /* the inner loop that computes outer products and adds them to Z */
738                 for (j = i - 12; j >= 0; j -= 12)
739                 {
740                         /* load p and q values */
741                         p1 = ell[0];
742                         q1 = ex[0];
743                         /* compute outer product and add it to the Z matrix */
744                         Z11 += p1 * q1;
745                         /* load p and q values */
746                         p1 = ell[1];
747                         q1 = ex[1];
748                         /* compute outer product and add it to the Z matrix */
749                         Z11 += p1 * q1;
750                         /* load p and q values */
751                         p1 = ell[2];
752                         q1 = ex[2];
753                         /* compute outer product and add it to the Z matrix */
754                         Z11 += p1 * q1;
755                         /* load p and q values */
756                         p1 = ell[3];
757                         q1 = ex[3];
758                         /* compute outer product and add it to the Z matrix */
759                         Z11 += p1 * q1;
760                         /* load p and q values */
761                         p1 = ell[4];
762                         q1 = ex[4];
763                         /* compute outer product and add it to the Z matrix */
764                         Z11 += p1 * q1;
765                         /* load p and q values */
766                         p1 = ell[5];
767                         q1 = ex[5];
768                         /* compute outer product and add it to the Z matrix */
769                         Z11 += p1 * q1;
770                         /* load p and q values */
771                         p1 = ell[6];
772                         q1 = ex[6];
773                         /* compute outer product and add it to the Z matrix */
774                         Z11 += p1 * q1;
775                         /* load p and q values */
776                         p1 = ell[7];
777                         q1 = ex[7];
778                         /* compute outer product and add it to the Z matrix */
779                         Z11 += p1 * q1;
780                         /* load p and q values */
781                         p1 = ell[8];
782                         q1 = ex[8];
783                         /* compute outer product and add it to the Z matrix */
784                         Z11 += p1 * q1;
785                         /* load p and q values */
786                         p1 = ell[9];
787                         q1 = ex[9];
788                         /* compute outer product and add it to the Z matrix */
789                         Z11 += p1 * q1;
790                         /* load p and q values */
791                         p1 = ell[10];
792                         q1 = ex[10];
793                         /* compute outer product and add it to the Z matrix */
794                         Z11 += p1 * q1;
795                         /* load p and q values */
796                         p1 = ell[11];
797                         q1 = ex[11];
798                         /* compute outer product and add it to the Z matrix */
799                         Z11 += p1 * q1;
800                         /* advance pointers */
801                         ell += 12;
802                         ex += 12;
803                         /* end of inner loop */
804                 }
805                 /* compute left-over iterations */
806                 j += 12;
807                 for (; j > 0; j--)
808                 {
809                         /* load p and q values */
810                         p1 = ell[0];
811                         q1 = ex[0];
812                         /* compute outer product and add it to the Z matrix */
813                         Z11 += p1 * q1;
814                         /* advance pointers */
815                         ell += 1;
816                         ex += 1;
817                 }
818                 /* finish computing the X(i) block */
819                 Z11 = ex[0] - Z11;
820                 ex[0] = Z11;
821         }
822 }
823
824 /* solve L^T * x=b, with b containing 1 right hand side.
825  * L is an n*n lower triangular matrix with ones on the diagonal.
826  * L is stored by rows and its leading dimension is lskip.
827  * b is an n*1 matrix that contains the right hand side.
828  * b is overwritten with x.
829  * this processes blocks of 4.
830  */
831
832 void btSolveL1T(const btScalar *L, btScalar *B, int n, int lskip1)
833 {
834         /* declare variables - Z matrix, p and q vectors, etc */
835         btScalar Z11, m11, Z21, m21, Z31, m31, Z41, m41, p1, q1, p2, p3, p4, *ex;
836         const btScalar *ell;
837         int lskip2, i, j;
838         //  int lskip3;
839         /* special handling for L and B because we're solving L1 *transpose* */
840         L = L + (n - 1) * (lskip1 + 1);
841         B = B + n - 1;
842         lskip1 = -lskip1;
843         /* compute lskip values */
844         lskip2 = 2 * lskip1;
845         //lskip3 = 3*lskip1;
846         /* compute all 4 x 1 blocks of X */
847         for (i = 0; i <= n - 4; i += 4)
848         {
849                 /* compute all 4 x 1 block of X, from rows i..i+4-1 */
850                 /* set the Z matrix to 0 */
851                 Z11 = 0;
852                 Z21 = 0;
853                 Z31 = 0;
854                 Z41 = 0;
855                 ell = L - i;
856                 ex = B;
857                 /* the inner loop that computes outer products and adds them to Z */
858                 for (j = i - 4; j >= 0; j -= 4)
859                 {
860                         /* load p and q values */
861                         p1 = ell[0];
862                         q1 = ex[0];
863                         p2 = ell[-1];
864                         p3 = ell[-2];
865                         p4 = ell[-3];
866                         /* compute outer product and add it to the Z matrix */
867                         m11 = p1 * q1;
868                         m21 = p2 * q1;
869                         m31 = p3 * q1;
870                         m41 = p4 * q1;
871                         ell += lskip1;
872                         Z11 += m11;
873                         Z21 += m21;
874                         Z31 += m31;
875                         Z41 += m41;
876                         /* load p and q values */
877                         p1 = ell[0];
878                         q1 = ex[-1];
879                         p2 = ell[-1];
880                         p3 = ell[-2];
881                         p4 = ell[-3];
882                         /* compute outer product and add it to the Z matrix */
883                         m11 = p1 * q1;
884                         m21 = p2 * q1;
885                         m31 = p3 * q1;
886                         m41 = p4 * q1;
887                         ell += lskip1;
888                         Z11 += m11;
889                         Z21 += m21;
890                         Z31 += m31;
891                         Z41 += m41;
892                         /* load p and q values */
893                         p1 = ell[0];
894                         q1 = ex[-2];
895                         p2 = ell[-1];
896                         p3 = ell[-2];
897                         p4 = ell[-3];
898                         /* compute outer product and add it to the Z matrix */
899                         m11 = p1 * q1;
900                         m21 = p2 * q1;
901                         m31 = p3 * q1;
902                         m41 = p4 * q1;
903                         ell += lskip1;
904                         Z11 += m11;
905                         Z21 += m21;
906                         Z31 += m31;
907                         Z41 += m41;
908                         /* load p and q values */
909                         p1 = ell[0];
910                         q1 = ex[-3];
911                         p2 = ell[-1];
912                         p3 = ell[-2];
913                         p4 = ell[-3];
914                         /* compute outer product and add it to the Z matrix */
915                         m11 = p1 * q1;
916                         m21 = p2 * q1;
917                         m31 = p3 * q1;
918                         m41 = p4 * q1;
919                         ell += lskip1;
920                         ex -= 4;
921                         Z11 += m11;
922                         Z21 += m21;
923                         Z31 += m31;
924                         Z41 += m41;
925                         /* end of inner loop */
926                 }
927                 /* compute left-over iterations */
928                 j += 4;
929                 for (; j > 0; j--)
930                 {
931                         /* load p and q values */
932                         p1 = ell[0];
933                         q1 = ex[0];
934                         p2 = ell[-1];
935                         p3 = ell[-2];
936                         p4 = ell[-3];
937                         /* compute outer product and add it to the Z matrix */
938                         m11 = p1 * q1;
939                         m21 = p2 * q1;
940                         m31 = p3 * q1;
941                         m41 = p4 * q1;
942                         ell += lskip1;
943                         ex -= 1;
944                         Z11 += m11;
945                         Z21 += m21;
946                         Z31 += m31;
947                         Z41 += m41;
948                 }
949                 /* finish computing the X(i) block */
950                 Z11 = ex[0] - Z11;
951                 ex[0] = Z11;
952                 p1 = ell[-1];
953                 Z21 = ex[-1] - Z21 - p1 * Z11;
954                 ex[-1] = Z21;
955                 p1 = ell[-2];
956                 p2 = ell[-2 + lskip1];
957                 Z31 = ex[-2] - Z31 - p1 * Z11 - p2 * Z21;
958                 ex[-2] = Z31;
959                 p1 = ell[-3];
960                 p2 = ell[-3 + lskip1];
961                 p3 = ell[-3 + lskip2];
962                 Z41 = ex[-3] - Z41 - p1 * Z11 - p2 * Z21 - p3 * Z31;
963                 ex[-3] = Z41;
964                 /* end of outer loop */
965         }
966         /* compute rows at end that are not a multiple of block size */
967         for (; i < n; i++)
968         {
969                 /* compute all 1 x 1 block of X, from rows i..i+1-1 */
970                 /* set the Z matrix to 0 */
971                 Z11 = 0;
972                 ell = L - i;
973                 ex = B;
974                 /* the inner loop that computes outer products and adds them to Z */
975                 for (j = i - 4; j >= 0; j -= 4)
976                 {
977                         /* load p and q values */
978                         p1 = ell[0];
979                         q1 = ex[0];
980                         /* compute outer product and add it to the Z matrix */
981                         m11 = p1 * q1;
982                         ell += lskip1;
983                         Z11 += m11;
984                         /* load p and q values */
985                         p1 = ell[0];
986                         q1 = ex[-1];
987                         /* compute outer product and add it to the Z matrix */
988                         m11 = p1 * q1;
989                         ell += lskip1;
990                         Z11 += m11;
991                         /* load p and q values */
992                         p1 = ell[0];
993                         q1 = ex[-2];
994                         /* compute outer product and add it to the Z matrix */
995                         m11 = p1 * q1;
996                         ell += lskip1;
997                         Z11 += m11;
998                         /* load p and q values */
999                         p1 = ell[0];
1000                         q1 = ex[-3];
1001                         /* compute outer product and add it to the Z matrix */
1002                         m11 = p1 * q1;
1003                         ell += lskip1;
1004                         ex -= 4;
1005                         Z11 += m11;
1006                         /* end of inner loop */
1007                 }
1008                 /* compute left-over iterations */
1009                 j += 4;
1010                 for (; j > 0; j--)
1011                 {
1012                         /* load p and q values */
1013                         p1 = ell[0];
1014                         q1 = ex[0];
1015                         /* compute outer product and add it to the Z matrix */
1016                         m11 = p1 * q1;
1017                         ell += lskip1;
1018                         ex -= 1;
1019                         Z11 += m11;
1020                 }
1021                 /* finish computing the X(i) block */
1022                 Z11 = ex[0] - Z11;
1023                 ex[0] = Z11;
1024         }
1025 }
1026
1027 void btVectorScale(btScalar *a, const btScalar *d, int n)
1028 {
1029         btAssert(a && d && n >= 0);
1030         for (int i = 0; i < n; i++)
1031         {
1032                 a[i] *= d[i];
1033         }
1034 }
1035
1036 void btSolveLDLT(const btScalar *L, const btScalar *d, btScalar *b, int n, int nskip)
1037 {
1038         btAssert(L && d && b && n > 0 && nskip >= n);
1039         btSolveL1(L, b, n, nskip);
1040         btVectorScale(b, d, n);
1041         btSolveL1T(L, b, n, nskip);
1042 }
1043
1044 //***************************************************************************
1045
1046 // swap row/column i1 with i2 in the n*n matrix A. the leading dimension of
1047 // A is nskip. this only references and swaps the lower triangle.
1048 // if `do_fast_row_swaps' is nonzero and row pointers are being used, then
1049 // rows will be swapped by exchanging row pointers. otherwise the data will
1050 // be copied.
1051
1052 static void btSwapRowsAndCols(BTATYPE A, int n, int i1, int i2, int nskip,
1053                                                           int do_fast_row_swaps)
1054 {
1055         btAssert(A && n > 0 && i1 >= 0 && i2 >= 0 && i1 < n && i2 < n &&
1056                          nskip >= n && i1 < i2);
1057
1058 #ifdef BTROWPTRS
1059         btScalar *A_i1 = A[i1];
1060         btScalar *A_i2 = A[i2];
1061         for (int i = i1 + 1; i < i2; ++i)
1062         {
1063                 btScalar *A_i_i1 = A[i] + i1;
1064                 A_i1[i] = *A_i_i1;
1065                 *A_i_i1 = A_i2[i];
1066         }
1067         A_i1[i2] = A_i1[i1];
1068         A_i1[i1] = A_i2[i1];
1069         A_i2[i1] = A_i2[i2];
1070         // swap rows, by swapping row pointers
1071         if (do_fast_row_swaps)
1072         {
1073                 A[i1] = A_i2;
1074                 A[i2] = A_i1;
1075         }
1076         else
1077         {
1078                 // Only swap till i2 column to match A plain storage variant.
1079                 for (int k = 0; k <= i2; ++k)
1080                 {
1081                         btScalar tmp = A_i1[k];
1082                         A_i1[k] = A_i2[k];
1083                         A_i2[k] = tmp;
1084                 }
1085         }
1086         // swap columns the hard way
1087         for (int j = i2 + 1; j < n; ++j)
1088         {
1089                 btScalar *A_j = A[j];
1090                 btScalar tmp = A_j[i1];
1091                 A_j[i1] = A_j[i2];
1092                 A_j[i2] = tmp;
1093         }
1094 #else
1095         btScalar *A_i1 = A + i1 * nskip;
1096         btScalar *A_i2 = A + i2 * nskip;
1097         for (int k = 0; k < i1; ++k)
1098         {
1099                 btScalar tmp = A_i1[k];
1100                 A_i1[k] = A_i2[k];
1101                 A_i2[k] = tmp;
1102         }
1103         btScalar *A_i = A_i1 + nskip;
1104         for (int i = i1 + 1; i < i2; A_i += nskip, ++i)
1105         {
1106                 btScalar tmp = A_i2[i];
1107                 A_i2[i] = A_i[i1];
1108                 A_i[i1] = tmp;
1109         }
1110         {
1111                 btScalar tmp = A_i1[i1];
1112                 A_i1[i1] = A_i2[i2];
1113                 A_i2[i2] = tmp;
1114         }
1115         btScalar *A_j = A_i2 + nskip;
1116         for (int j = i2 + 1; j < n; A_j += nskip, ++j)
1117         {
1118                 btScalar tmp = A_j[i1];
1119                 A_j[i1] = A_j[i2];
1120                 A_j[i2] = tmp;
1121         }
1122 #endif
1123 }
1124
1125 // swap two indexes in the n*n LCP problem. i1 must be <= i2.
1126
1127 static void btSwapProblem(BTATYPE A, btScalar *x, btScalar *b, btScalar *w, btScalar *lo,
1128                                                   btScalar *hi, int *p, bool *state, int *findex,
1129                                                   int n, int i1, int i2, int nskip,
1130                                                   int do_fast_row_swaps)
1131 {
1132         btScalar tmpr;
1133         int tmpi;
1134         bool tmpb;
1135         btAssert(n > 0 && i1 >= 0 && i2 >= 0 && i1 < n && i2 < n && nskip >= n && i1 <= i2);
1136         if (i1 == i2) return;
1137
1138         btSwapRowsAndCols(A, n, i1, i2, nskip, do_fast_row_swaps);
1139
1140         tmpr = x[i1];
1141         x[i1] = x[i2];
1142         x[i2] = tmpr;
1143
1144         tmpr = b[i1];
1145         b[i1] = b[i2];
1146         b[i2] = tmpr;
1147
1148         tmpr = w[i1];
1149         w[i1] = w[i2];
1150         w[i2] = tmpr;
1151
1152         tmpr = lo[i1];
1153         lo[i1] = lo[i2];
1154         lo[i2] = tmpr;
1155
1156         tmpr = hi[i1];
1157         hi[i1] = hi[i2];
1158         hi[i2] = tmpr;
1159
1160         tmpi = p[i1];
1161         p[i1] = p[i2];
1162         p[i2] = tmpi;
1163
1164         tmpb = state[i1];
1165         state[i1] = state[i2];
1166         state[i2] = tmpb;
1167
1168         if (findex)
1169         {
1170                 tmpi = findex[i1];
1171                 findex[i1] = findex[i2];
1172                 findex[i2] = tmpi;
1173         }
1174 }
1175
1176 //***************************************************************************
1177 // btLCP manipulator object. this represents an n*n LCP problem.
1178 //
1179 // two index sets C and N are kept. each set holds a subset of
1180 // the variable indexes 0..n-1. an index can only be in one set.
1181 // initially both sets are empty.
1182 //
1183 // the index set C is special: solutions to A(C,C)\A(C,i) can be generated.
1184
1185 //***************************************************************************
1186 // fast implementation of btLCP. see the above definition of btLCP for
1187 // interface comments.
1188 //
1189 // `p' records the permutation of A,x,b,w,etc. p is initially 1:n and is
1190 // permuted as the other vectors/matrices are permuted.
1191 //
1192 // A,x,b,w,lo,hi,state,findex,p,c are permuted such that sets C,N have
1193 // contiguous indexes. the don't-care indexes follow N.
1194 //
1195 // an L*D*L' factorization is maintained of A(C,C), and whenever indexes are
1196 // added or removed from the set C the factorization is updated.
1197 // thus L*D*L'=A[C,C], i.e. a permuted top left nC*nC submatrix of A.
1198 // the leading dimension of the matrix L is always `nskip'.
1199 //
1200 // at the start there may be other indexes that are unbounded but are not
1201 // included in `nub'. btLCP will permute the matrix so that absolutely all
1202 // unbounded vectors are at the start. thus there may be some initial
1203 // permutation.
1204 //
1205 // the algorithms here assume certain patterns, particularly with respect to
1206 // index transfer.
1207
1208 #ifdef btLCP_FAST
1209
1210 struct btLCP
1211 {
1212         const int m_n;
1213         const int m_nskip;
1214         int m_nub;
1215         int m_nC, m_nN;                                                         // size of each index set
1216         BTATYPE const m_A;                                                      // A rows
1217         btScalar *const m_x, *const m_b, *const m_w, *const m_lo, *const m_hi;  // permuted LCP problem data
1218         btScalar *const m_L, *const m_d;                                        // L*D*L' factorization of set C
1219         btScalar *const m_Dell, *const m_ell, *const m_tmp;
1220         bool *const m_state;
1221         int *const m_findex, *const m_p, *const m_C;
1222
1223         btLCP(int _n, int _nskip, int _nub, btScalar *_Adata, btScalar *_x, btScalar *_b, btScalar *_w,
1224                   btScalar *_lo, btScalar *_hi, btScalar *l, btScalar *_d,
1225                   btScalar *_Dell, btScalar *_ell, btScalar *_tmp,
1226                   bool *_state, int *_findex, int *p, int *c, btScalar **Arows);
1227         int getNub() const { return m_nub; }
1228         void transfer_i_to_C(int i);
1229         void transfer_i_to_N(int i) { m_nN++; }  // because we can assume C and N span 1:i-1
1230         void transfer_i_from_N_to_C(int i);
1231         void transfer_i_from_C_to_N(int i, btAlignedObjectArray<btScalar> &scratch);
1232         int numC() const { return m_nC; }
1233         int numN() const { return m_nN; }
1234         int indexC(int i) const { return i; }
1235         int indexN(int i) const { return i + m_nC; }
1236         btScalar Aii(int i) const { return BTAROW(i)[i]; }
1237         btScalar AiC_times_qC(int i, btScalar *q) const { return btLargeDot(BTAROW(i), q, m_nC); }
1238         btScalar AiN_times_qN(int i, btScalar *q) const { return btLargeDot(BTAROW(i) + m_nC, q + m_nC, m_nN); }
1239         void pN_equals_ANC_times_qC(btScalar *p, btScalar *q);
1240         void pN_plusequals_ANi(btScalar *p, int i, int sign = 1);
1241         void pC_plusequals_s_times_qC(btScalar *p, btScalar s, btScalar *q);
1242         void pN_plusequals_s_times_qN(btScalar *p, btScalar s, btScalar *q);
1243         void solve1(btScalar *a, int i, int dir = 1, int only_transfer = 0);
1244         void unpermute();
1245 };
1246
1247 btLCP::btLCP(int _n, int _nskip, int _nub, btScalar *_Adata, btScalar *_x, btScalar *_b, btScalar *_w,
1248                          btScalar *_lo, btScalar *_hi, btScalar *l, btScalar *_d,
1249                          btScalar *_Dell, btScalar *_ell, btScalar *_tmp,
1250                          bool *_state, int *_findex, int *p, int *c, btScalar **Arows) : m_n(_n), m_nskip(_nskip), m_nub(_nub), m_nC(0), m_nN(0),
1251 #ifdef BTROWPTRS
1252                                                                                                                                                          m_A(Arows),
1253 #else
1254                                                                                                                                                          m_A(_Adata),
1255 #endif
1256                                                                                                                                                          m_x(_x),
1257                                                                                                                                                          m_b(_b),
1258                                                                                                                                                          m_w(_w),
1259                                                                                                                                                          m_lo(_lo),
1260                                                                                                                                                          m_hi(_hi),
1261                                                                                                                                                          m_L(l),
1262                                                                                                                                                          m_d(_d),
1263                                                                                                                                                          m_Dell(_Dell),
1264                                                                                                                                                          m_ell(_ell),
1265                                                                                                                                                          m_tmp(_tmp),
1266                                                                                                                                                          m_state(_state),
1267                                                                                                                                                          m_findex(_findex),
1268                                                                                                                                                          m_p(p),
1269                                                                                                                                                          m_C(c)
1270 {
1271         {
1272                 btSetZero(m_x, m_n);
1273         }
1274
1275         {
1276 #ifdef BTROWPTRS
1277                 // make matrix row pointers
1278                 btScalar *aptr = _Adata;
1279                 BTATYPE A = m_A;
1280                 const int n = m_n, nskip = m_nskip;
1281                 for (int k = 0; k < n; aptr += nskip, ++k) A[k] = aptr;
1282 #endif
1283         }
1284
1285         {
1286                 int *p = m_p;
1287                 const int n = m_n;
1288                 for (int k = 0; k < n; ++k) p[k] = k;  // initially unpermuted
1289         }
1290
1291         /*
1292   // for testing, we can do some random swaps in the area i > nub
1293   {
1294     const int n = m_n;
1295     const int nub = m_nub;
1296     if (nub < n) {
1297     for (int k=0; k<100; k++) {
1298       int i1,i2;
1299       do {
1300         i1 = dRandInt(n-nub)+nub;
1301         i2 = dRandInt(n-nub)+nub;
1302       }
1303       while (i1 > i2); 
1304       //printf ("--> %d %d\n",i1,i2);
1305       btSwapProblem (m_A,m_x,m_b,m_w,m_lo,m_hi,m_p,m_state,m_findex,n,i1,i2,m_nskip,0);
1306     }
1307   }
1308   */
1309
1310         // permute the problem so that *all* the unbounded variables are at the
1311         // start, i.e. look for unbounded variables not included in `nub'. we can
1312         // potentially push up `nub' this way and get a bigger initial factorization.
1313         // note that when we swap rows/cols here we must not just swap row pointers,
1314         // as the initial factorization relies on the data being all in one chunk.
1315         // variables that have findex >= 0 are *not* considered to be unbounded even
1316         // if lo=-inf and hi=inf - this is because these limits may change during the
1317         // solution process.
1318
1319         {
1320                 int *findex = m_findex;
1321                 btScalar *lo = m_lo, *hi = m_hi;
1322                 const int n = m_n;
1323                 for (int k = m_nub; k < n; ++k)
1324                 {
1325                         if (findex && findex[k] >= 0) continue;
1326                         if (lo[k] == -BT_INFINITY && hi[k] == BT_INFINITY)
1327                         {
1328                                 btSwapProblem(m_A, m_x, m_b, m_w, lo, hi, m_p, m_state, findex, n, m_nub, k, m_nskip, 0);
1329                                 m_nub++;
1330                         }
1331                 }
1332         }
1333
1334         // if there are unbounded variables at the start, factorize A up to that
1335         // point and solve for x. this puts all indexes 0..nub-1 into C.
1336         if (m_nub > 0)
1337         {
1338                 const int nub = m_nub;
1339                 {
1340                         btScalar *Lrow = m_L;
1341                         const int nskip = m_nskip;
1342                         for (int j = 0; j < nub; Lrow += nskip, ++j) memcpy(Lrow, BTAROW(j), (j + 1) * sizeof(btScalar));
1343                 }
1344                 btFactorLDLT(m_L, m_d, nub, m_nskip);
1345                 memcpy(m_x, m_b, nub * sizeof(btScalar));
1346                 btSolveLDLT(m_L, m_d, m_x, nub, m_nskip);
1347                 btSetZero(m_w, nub);
1348                 {
1349                         int *C = m_C;
1350                         for (int k = 0; k < nub; ++k) C[k] = k;
1351                 }
1352                 m_nC = nub;
1353         }
1354
1355         // permute the indexes > nub such that all findex variables are at the end
1356         if (m_findex)
1357         {
1358                 const int nub = m_nub;
1359                 int *findex = m_findex;
1360                 int num_at_end = 0;
1361                 for (int k = m_n - 1; k >= nub; k--)
1362                 {
1363                         if (findex[k] >= 0)
1364                         {
1365                                 btSwapProblem(m_A, m_x, m_b, m_w, m_lo, m_hi, m_p, m_state, findex, m_n, k, m_n - 1 - num_at_end, m_nskip, 1);
1366                                 num_at_end++;
1367                         }
1368                 }
1369         }
1370
1371         // print info about indexes
1372         /*
1373   {
1374     const int n = m_n;
1375     const int nub = m_nub;
1376     for (int k=0; k<n; k++) {
1377       if (k<nub) printf ("C");
1378       else if (m_lo[k]==-BT_INFINITY && m_hi[k]==BT_INFINITY) printf ("c");
1379       else printf (".");
1380     }
1381     printf ("\n");
1382   }
1383   */
1384 }
1385
1386 void btLCP::transfer_i_to_C(int i)
1387 {
1388         {
1389                 if (m_nC > 0)
1390                 {
1391                         // ell,Dell were computed by solve1(). note, ell = D \ L1solve (L,A(i,C))
1392                         {
1393                                 const int nC = m_nC;
1394                                 btScalar *const Ltgt = m_L + nC * m_nskip, *ell = m_ell;
1395                                 for (int j = 0; j < nC; ++j) Ltgt[j] = ell[j];
1396                         }
1397                         const int nC = m_nC;
1398                         m_d[nC] = btRecip(BTAROW(i)[i] - btLargeDot(m_ell, m_Dell, nC));
1399                 }
1400                 else
1401                 {
1402                         m_d[0] = btRecip(BTAROW(i)[i]);
1403                 }
1404
1405                 btSwapProblem(m_A, m_x, m_b, m_w, m_lo, m_hi, m_p, m_state, m_findex, m_n, m_nC, i, m_nskip, 1);
1406
1407                 const int nC = m_nC;
1408                 m_C[nC] = nC;
1409                 m_nC = nC + 1;  // nC value is outdated after this line
1410         }
1411 }
1412
1413 void btLCP::transfer_i_from_N_to_C(int i)
1414 {
1415         {
1416                 if (m_nC > 0)
1417                 {
1418                         {
1419                                 btScalar *const aptr = BTAROW(i);
1420                                 btScalar *Dell = m_Dell;
1421                                 const int *C = m_C;
1422 #ifdef BTNUB_OPTIMIZATIONS
1423                                 // if nub>0, initial part of aptr unpermuted
1424                                 const int nub = m_nub;
1425                                 int j = 0;
1426                                 for (; j < nub; ++j) Dell[j] = aptr[j];
1427                                 const int nC = m_nC;
1428                                 for (; j < nC; ++j) Dell[j] = aptr[C[j]];
1429 #else
1430                                 const int nC = m_nC;
1431                                 for (int j = 0; j < nC; ++j) Dell[j] = aptr[C[j]];
1432 #endif
1433                         }
1434                         btSolveL1(m_L, m_Dell, m_nC, m_nskip);
1435                         {
1436                                 const int nC = m_nC;
1437                                 btScalar *const Ltgt = m_L + nC * m_nskip;
1438                                 btScalar *ell = m_ell, *Dell = m_Dell, *d = m_d;
1439                                 for (int j = 0; j < nC; ++j) Ltgt[j] = ell[j] = Dell[j] * d[j];
1440                         }
1441                         const int nC = m_nC;
1442                         m_d[nC] = btRecip(BTAROW(i)[i] - btLargeDot(m_ell, m_Dell, nC));
1443                 }
1444                 else
1445                 {
1446                         m_d[0] = btRecip(BTAROW(i)[i]);
1447                 }
1448
1449                 btSwapProblem(m_A, m_x, m_b, m_w, m_lo, m_hi, m_p, m_state, m_findex, m_n, m_nC, i, m_nskip, 1);
1450
1451                 const int nC = m_nC;
1452                 m_C[nC] = nC;
1453                 m_nN--;
1454                 m_nC = nC + 1;  // nC value is outdated after this line
1455         }
1456
1457         // @@@ TO DO LATER
1458         // if we just finish here then we'll go back and re-solve for
1459         // delta_x. but actually we can be more efficient and incrementally
1460         // update delta_x here. but if we do this, we wont have ell and Dell
1461         // to use in updating the factorization later.
1462 }
1463
1464 void btRemoveRowCol(btScalar *A, int n, int nskip, int r)
1465 {
1466         btAssert(A && n > 0 && nskip >= n && r >= 0 && r < n);
1467         if (r >= n - 1) return;
1468         if (r > 0)
1469         {
1470                 {
1471                         const size_t move_size = (n - r - 1) * sizeof(btScalar);
1472                         btScalar *Adst = A + r;
1473                         for (int i = 0; i < r; Adst += nskip, ++i)
1474                         {
1475                                 btScalar *Asrc = Adst + 1;
1476                                 memmove(Adst, Asrc, move_size);
1477                         }
1478                 }
1479                 {
1480                         const size_t cpy_size = r * sizeof(btScalar);
1481                         btScalar *Adst = A + r * nskip;
1482                         for (int i = r; i < (n - 1); ++i)
1483                         {
1484                                 btScalar *Asrc = Adst + nskip;
1485                                 memcpy(Adst, Asrc, cpy_size);
1486                                 Adst = Asrc;
1487                         }
1488                 }
1489         }
1490         {
1491                 const size_t cpy_size = (n - r - 1) * sizeof(btScalar);
1492                 btScalar *Adst = A + r * (nskip + 1);
1493                 for (int i = r; i < (n - 1); ++i)
1494                 {
1495                         btScalar *Asrc = Adst + (nskip + 1);
1496                         memcpy(Adst, Asrc, cpy_size);
1497                         Adst = Asrc - 1;
1498                 }
1499         }
1500 }
1501
1502 void btLDLTAddTL(btScalar *L, btScalar *d, const btScalar *a, int n, int nskip, btAlignedObjectArray<btScalar> &scratch)
1503 {
1504         btAssert(L && d && a && n > 0 && nskip >= n);
1505
1506         if (n < 2) return;
1507         scratch.resize(2 * nskip);
1508         btScalar *W1 = &scratch[0];
1509
1510         btScalar *W2 = W1 + nskip;
1511
1512         W1[0] = btScalar(0.0);
1513         W2[0] = btScalar(0.0);
1514         for (int j = 1; j < n; ++j)
1515         {
1516                 W1[j] = W2[j] = (btScalar)(a[j] * SIMDSQRT12);
1517         }
1518         btScalar W11 = (btScalar)((btScalar(0.5) * a[0] + 1) * SIMDSQRT12);
1519         btScalar W21 = (btScalar)((btScalar(0.5) * a[0] - 1) * SIMDSQRT12);
1520
1521         btScalar alpha1 = btScalar(1.0);
1522         btScalar alpha2 = btScalar(1.0);
1523
1524         {
1525                 btScalar dee = d[0];
1526                 btScalar alphanew = alpha1 + (W11 * W11) * dee;
1527                 btAssert(alphanew != btScalar(0.0));
1528                 dee /= alphanew;
1529                 btScalar gamma1 = W11 * dee;
1530                 dee *= alpha1;
1531                 alpha1 = alphanew;
1532                 alphanew = alpha2 - (W21 * W21) * dee;
1533                 dee /= alphanew;
1534                 //btScalar gamma2 = W21 * dee;
1535                 alpha2 = alphanew;
1536                 btScalar k1 = btScalar(1.0) - W21 * gamma1;
1537                 btScalar k2 = W21 * gamma1 * W11 - W21;
1538                 btScalar *ll = L + nskip;
1539                 for (int p = 1; p < n; ll += nskip, ++p)
1540                 {
1541                         btScalar Wp = W1[p];
1542                         btScalar ell = *ll;
1543                         W1[p] = Wp - W11 * ell;
1544                         W2[p] = k1 * Wp + k2 * ell;
1545                 }
1546         }
1547
1548         btScalar *ll = L + (nskip + 1);
1549         for (int j = 1; j < n; ll += nskip + 1, ++j)
1550         {
1551                 btScalar k1 = W1[j];
1552                 btScalar k2 = W2[j];
1553
1554                 btScalar dee = d[j];
1555                 btScalar alphanew = alpha1 + (k1 * k1) * dee;
1556                 btAssert(alphanew != btScalar(0.0));
1557                 dee /= alphanew;
1558                 btScalar gamma1 = k1 * dee;
1559                 dee *= alpha1;
1560                 alpha1 = alphanew;
1561                 alphanew = alpha2 - (k2 * k2) * dee;
1562                 dee /= alphanew;
1563                 btScalar gamma2 = k2 * dee;
1564                 dee *= alpha2;
1565                 d[j] = dee;
1566                 alpha2 = alphanew;
1567
1568                 btScalar *l = ll + nskip;
1569                 for (int p = j + 1; p < n; l += nskip, ++p)
1570                 {
1571                         btScalar ell = *l;
1572                         btScalar Wp = W1[p] - k1 * ell;
1573                         ell += gamma1 * Wp;
1574                         W1[p] = Wp;
1575                         Wp = W2[p] - k2 * ell;
1576                         ell -= gamma2 * Wp;
1577                         W2[p] = Wp;
1578                         *l = ell;
1579                 }
1580         }
1581 }
1582
1583 #define _BTGETA(i, j) (A[i][j])
1584 //#define _GETA(i,j) (A[(i)*nskip+(j)])
1585 #define BTGETA(i, j) ((i > j) ? _BTGETA(i, j) : _BTGETA(j, i))
1586
1587 inline size_t btEstimateLDLTAddTLTmpbufSize(int nskip)
1588 {
1589         return nskip * 2 * sizeof(btScalar);
1590 }
1591
1592 void btLDLTRemove(btScalar **A, const int *p, btScalar *L, btScalar *d,
1593                                   int n1, int n2, int r, int nskip, btAlignedObjectArray<btScalar> &scratch)
1594 {
1595         btAssert(A && p && L && d && n1 > 0 && n2 > 0 && r >= 0 && r < n2 &&
1596                          n1 >= n2 && nskip >= n1);
1597 #ifdef BT_DEBUG
1598         for (int i = 0; i < n2; ++i)
1599                 btAssert(p[i] >= 0 && p[i] < n1);
1600 #endif
1601
1602         if (r == n2 - 1)
1603         {
1604                 return;  // deleting last row/col is easy
1605         }
1606         else
1607         {
1608                 size_t LDLTAddTL_size = btEstimateLDLTAddTLTmpbufSize(nskip);
1609                 btAssert(LDLTAddTL_size % sizeof(btScalar) == 0);
1610                 scratch.resize(nskip * 2 + n2);
1611                 btScalar *tmp = &scratch[0];
1612                 if (r == 0)
1613                 {
1614                         btScalar *a = (btScalar *)((char *)tmp + LDLTAddTL_size);
1615                         const int p_0 = p[0];
1616                         for (int i = 0; i < n2; ++i)
1617                         {
1618                                 a[i] = -BTGETA(p[i], p_0);
1619                         }
1620                         a[0] += btScalar(1.0);
1621                         btLDLTAddTL(L, d, a, n2, nskip, scratch);
1622                 }
1623                 else
1624                 {
1625                         btScalar *t = (btScalar *)((char *)tmp + LDLTAddTL_size);
1626                         {
1627                                 btScalar *Lcurr = L + r * nskip;
1628                                 for (int i = 0; i < r; ++Lcurr, ++i)
1629                                 {
1630                                         btAssert(d[i] != btScalar(0.0));
1631                                         t[i] = *Lcurr / d[i];
1632                                 }
1633                         }
1634                         btScalar *a = t + r;
1635                         {
1636                                 btScalar *Lcurr = L + r * nskip;
1637                                 const int *pp_r = p + r, p_r = *pp_r;
1638                                 const int n2_minus_r = n2 - r;
1639                                 for (int i = 0; i < n2_minus_r; Lcurr += nskip, ++i)
1640                                 {
1641                                         a[i] = btLargeDot(Lcurr, t, r) - BTGETA(pp_r[i], p_r);
1642                                 }
1643                         }
1644                         a[0] += btScalar(1.0);
1645                         btLDLTAddTL(L + r * nskip + r, d + r, a, n2 - r, nskip, scratch);
1646                 }
1647         }
1648
1649         // snip out row/column r from L and d
1650         btRemoveRowCol(L, n2, nskip, r);
1651         if (r < (n2 - 1)) memmove(d + r, d + r + 1, (n2 - r - 1) * sizeof(btScalar));
1652 }
1653
1654 void btLCP::transfer_i_from_C_to_N(int i, btAlignedObjectArray<btScalar> &scratch)
1655 {
1656         {
1657                 int *C = m_C;
1658                 // remove a row/column from the factorization, and adjust the
1659                 // indexes (black magic!)
1660                 int last_idx = -1;
1661                 const int nC = m_nC;
1662                 int j = 0;
1663                 for (; j < nC; ++j)
1664                 {
1665                         if (C[j] == nC - 1)
1666                         {
1667                                 last_idx = j;
1668                         }
1669                         if (C[j] == i)
1670                         {
1671                                 btLDLTRemove(m_A, C, m_L, m_d, m_n, nC, j, m_nskip, scratch);
1672                                 int k;
1673                                 if (last_idx == -1)
1674                                 {
1675                                         for (k = j + 1; k < nC; ++k)
1676                                         {
1677                                                 if (C[k] == nC - 1)
1678                                                 {
1679                                                         break;
1680                                                 }
1681                                         }
1682                                         btAssert(k < nC);
1683                                 }
1684                                 else
1685                                 {
1686                                         k = last_idx;
1687                                 }
1688                                 C[k] = C[j];
1689                                 if (j < (nC - 1)) memmove(C + j, C + j + 1, (nC - j - 1) * sizeof(int));
1690                                 break;
1691                         }
1692                 }
1693                 btAssert(j < nC);
1694
1695                 btSwapProblem(m_A, m_x, m_b, m_w, m_lo, m_hi, m_p, m_state, m_findex, m_n, i, nC - 1, m_nskip, 1);
1696
1697                 m_nN++;
1698                 m_nC = nC - 1;  // nC value is outdated after this line
1699         }
1700 }
1701
1702 void btLCP::pN_equals_ANC_times_qC(btScalar *p, btScalar *q)
1703 {
1704         // we could try to make this matrix-vector multiplication faster using
1705         // outer product matrix tricks, e.g. with the dMultidotX() functions.
1706         // but i tried it and it actually made things slower on random 100x100
1707         // problems because of the overhead involved. so we'll stick with the
1708         // simple method for now.
1709         const int nC = m_nC;
1710         btScalar *ptgt = p + nC;
1711         const int nN = m_nN;
1712         for (int i = 0; i < nN; ++i)
1713         {
1714                 ptgt[i] = btLargeDot(BTAROW(i + nC), q, nC);
1715         }
1716 }
1717
1718 void btLCP::pN_plusequals_ANi(btScalar *p, int i, int sign)
1719 {
1720         const int nC = m_nC;
1721         btScalar *aptr = BTAROW(i) + nC;
1722         btScalar *ptgt = p + nC;
1723         if (sign > 0)
1724         {
1725                 const int nN = m_nN;
1726                 for (int j = 0; j < nN; ++j) ptgt[j] += aptr[j];
1727         }
1728         else
1729         {
1730                 const int nN = m_nN;
1731                 for (int j = 0; j < nN; ++j) ptgt[j] -= aptr[j];
1732         }
1733 }
1734
1735 void btLCP::pC_plusequals_s_times_qC(btScalar *p, btScalar s, btScalar *q)
1736 {
1737         const int nC = m_nC;
1738         for (int i = 0; i < nC; ++i)
1739         {
1740                 p[i] += s * q[i];
1741         }
1742 }
1743
1744 void btLCP::pN_plusequals_s_times_qN(btScalar *p, btScalar s, btScalar *q)
1745 {
1746         const int nC = m_nC;
1747         btScalar *ptgt = p + nC, *qsrc = q + nC;
1748         const int nN = m_nN;
1749         for (int i = 0; i < nN; ++i)
1750         {
1751                 ptgt[i] += s * qsrc[i];
1752         }
1753 }
1754
1755 void btLCP::solve1(btScalar *a, int i, int dir, int only_transfer)
1756 {
1757         // the `Dell' and `ell' that are computed here are saved. if index i is
1758         // later added to the factorization then they can be reused.
1759         //
1760         // @@@ question: do we need to solve for entire delta_x??? yes, but
1761         //     only if an x goes below 0 during the step.
1762
1763         if (m_nC > 0)
1764         {
1765                 {
1766                         btScalar *Dell = m_Dell;
1767                         int *C = m_C;
1768                         btScalar *aptr = BTAROW(i);
1769 #ifdef BTNUB_OPTIMIZATIONS
1770                         // if nub>0, initial part of aptr[] is guaranteed unpermuted
1771                         const int nub = m_nub;
1772                         int j = 0;
1773                         for (; j < nub; ++j) Dell[j] = aptr[j];
1774                         const int nC = m_nC;
1775                         for (; j < nC; ++j) Dell[j] = aptr[C[j]];
1776 #else
1777                         const int nC = m_nC;
1778                         for (int j = 0; j < nC; ++j) Dell[j] = aptr[C[j]];
1779 #endif
1780                 }
1781                 btSolveL1(m_L, m_Dell, m_nC, m_nskip);
1782                 {
1783                         btScalar *ell = m_ell, *Dell = m_Dell, *d = m_d;
1784                         const int nC = m_nC;
1785                         for (int j = 0; j < nC; ++j) ell[j] = Dell[j] * d[j];
1786                 }
1787
1788                 if (!only_transfer)
1789                 {
1790                         btScalar *tmp = m_tmp, *ell = m_ell;
1791                         {
1792                                 const int nC = m_nC;
1793                                 for (int j = 0; j < nC; ++j) tmp[j] = ell[j];
1794                         }
1795                         btSolveL1T(m_L, tmp, m_nC, m_nskip);
1796                         if (dir > 0)
1797                         {
1798                                 int *C = m_C;
1799                                 btScalar *tmp = m_tmp;
1800                                 const int nC = m_nC;
1801                                 for (int j = 0; j < nC; ++j) a[C[j]] = -tmp[j];
1802                         }
1803                         else
1804                         {
1805                                 int *C = m_C;
1806                                 btScalar *tmp = m_tmp;
1807                                 const int nC = m_nC;
1808                                 for (int j = 0; j < nC; ++j) a[C[j]] = tmp[j];
1809                         }
1810                 }
1811         }
1812 }
1813
1814 void btLCP::unpermute()
1815 {
1816         // now we have to un-permute x and w
1817         {
1818                 memcpy(m_tmp, m_x, m_n * sizeof(btScalar));
1819                 btScalar *x = m_x, *tmp = m_tmp;
1820                 const int *p = m_p;
1821                 const int n = m_n;
1822                 for (int j = 0; j < n; ++j) x[p[j]] = tmp[j];
1823         }
1824         {
1825                 memcpy(m_tmp, m_w, m_n * sizeof(btScalar));
1826                 btScalar *w = m_w, *tmp = m_tmp;
1827                 const int *p = m_p;
1828                 const int n = m_n;
1829                 for (int j = 0; j < n; ++j) w[p[j]] = tmp[j];
1830         }
1831 }
1832
1833 #endif  // btLCP_FAST
1834
1835 //***************************************************************************
1836 // an optimized Dantzig LCP driver routine for the lo-hi LCP problem.
1837
1838 bool btSolveDantzigLCP(int n, btScalar *A, btScalar *x, btScalar *b,
1839                                            btScalar *outer_w, int nub, btScalar *lo, btScalar *hi, int *findex, btDantzigScratchMemory &scratchMem)
1840 {
1841         s_error = false;
1842
1843         //      printf("btSolveDantzigLCP n=%d\n",n);
1844         btAssert(n > 0 && A && x && b && lo && hi && nub >= 0 && nub <= n);
1845         btAssert(outer_w);
1846
1847 #ifdef BT_DEBUG
1848         {
1849                 // check restrictions on lo and hi
1850                 for (int k = 0; k < n; ++k)
1851                         btAssert(lo[k] <= 0 && hi[k] >= 0);
1852         }
1853 #endif
1854
1855         // if all the variables are unbounded then we can just factor, solve,
1856         // and return
1857         if (nub >= n)
1858         {
1859                 int nskip = (n);
1860                 btFactorLDLT(A, outer_w, n, nskip);
1861                 btSolveLDLT(A, outer_w, b, n, nskip);
1862                 memcpy(x, b, n * sizeof(btScalar));
1863
1864                 return !s_error;
1865         }
1866
1867         const int nskip = (n);
1868         scratchMem.L.resize(n * nskip);
1869
1870         scratchMem.d.resize(n);
1871
1872         btScalar *w = outer_w;
1873         scratchMem.delta_w.resize(n);
1874         scratchMem.delta_x.resize(n);
1875         scratchMem.Dell.resize(n);
1876         scratchMem.ell.resize(n);
1877         scratchMem.Arows.resize(n);
1878         scratchMem.p.resize(n);
1879         scratchMem.C.resize(n);
1880
1881         // for i in N, state[i] is 0 if x(i)==lo(i) or 1 if x(i)==hi(i)
1882         scratchMem.state.resize(n);
1883
1884         // create LCP object. note that tmp is set to delta_w to save space, this
1885         // optimization relies on knowledge of how tmp is used, so be careful!
1886         btLCP lcp(n, nskip, nub, A, x, b, w, lo, hi, &scratchMem.L[0], &scratchMem.d[0], &scratchMem.Dell[0], &scratchMem.ell[0], &scratchMem.delta_w[0], &scratchMem.state[0], findex, &scratchMem.p[0], &scratchMem.C[0], &scratchMem.Arows[0]);
1887         int adj_nub = lcp.getNub();
1888
1889         // loop over all indexes adj_nub..n-1. for index i, if x(i),w(i) satisfy the
1890         // LCP conditions then i is added to the appropriate index set. otherwise
1891         // x(i),w(i) is driven either +ve or -ve to force it to the valid region.
1892         // as we drive x(i), x(C) is also adjusted to keep w(C) at zero.
1893         // while driving x(i) we maintain the LCP conditions on the other variables
1894         // 0..i-1. we do this by watching out for other x(i),w(i) values going
1895         // outside the valid region, and then switching them between index sets
1896         // when that happens.
1897
1898         bool hit_first_friction_index = false;
1899         for (int i = adj_nub; i < n; ++i)
1900         {
1901                 s_error = false;
1902                 // the index i is the driving index and indexes i+1..n-1 are "dont care",
1903                 // i.e. when we make changes to the system those x's will be zero and we
1904                 // don't care what happens to those w's. in other words, we only consider
1905                 // an (i+1)*(i+1) sub-problem of A*x=b+w.
1906
1907                 // if we've hit the first friction index, we have to compute the lo and
1908                 // hi values based on the values of x already computed. we have been
1909                 // permuting the indexes, so the values stored in the findex vector are
1910                 // no longer valid. thus we have to temporarily unpermute the x vector.
1911                 // for the purposes of this computation, 0*infinity = 0 ... so if the
1912                 // contact constraint's normal force is 0, there should be no tangential
1913                 // force applied.
1914
1915                 if (!hit_first_friction_index && findex && findex[i] >= 0)
1916                 {
1917                         // un-permute x into delta_w, which is not being used at the moment
1918                         for (int j = 0; j < n; ++j) scratchMem.delta_w[scratchMem.p[j]] = x[j];
1919
1920                         // set lo and hi values
1921                         for (int k = i; k < n; ++k)
1922                         {
1923                                 btScalar wfk = scratchMem.delta_w[findex[k]];
1924                                 if (wfk == 0)
1925                                 {
1926                                         hi[k] = 0;
1927                                         lo[k] = 0;
1928                                 }
1929                                 else
1930                                 {
1931                                         hi[k] = btFabs(hi[k] * wfk);
1932                                         lo[k] = -hi[k];
1933                                 }
1934                         }
1935                         hit_first_friction_index = true;
1936                 }
1937
1938                 // thus far we have not even been computing the w values for indexes
1939                 // greater than i, so compute w[i] now.
1940                 w[i] = lcp.AiC_times_qC(i, x) + lcp.AiN_times_qN(i, x) - b[i];
1941
1942                 // if lo=hi=0 (which can happen for tangential friction when normals are
1943                 // 0) then the index will be assigned to set N with some state. however,
1944                 // set C's line has zero size, so the index will always remain in set N.
1945                 // with the "normal" switching logic, if w changed sign then the index
1946                 // would have to switch to set C and then back to set N with an inverted
1947                 // state. this is pointless, and also computationally expensive. to
1948                 // prevent this from happening, we use the rule that indexes with lo=hi=0
1949                 // will never be checked for set changes. this means that the state for
1950                 // these indexes may be incorrect, but that doesn't matter.
1951
1952                 // see if x(i),w(i) is in a valid region
1953                 if (lo[i] == 0 && w[i] >= 0)
1954                 {
1955                         lcp.transfer_i_to_N(i);
1956                         scratchMem.state[i] = false;
1957                 }
1958                 else if (hi[i] == 0 && w[i] <= 0)
1959                 {
1960                         lcp.transfer_i_to_N(i);
1961                         scratchMem.state[i] = true;
1962                 }
1963                 else if (w[i] == 0)
1964                 {
1965                         // this is a degenerate case. by the time we get to this test we know
1966                         // that lo != 0, which means that lo < 0 as lo is not allowed to be +ve,
1967                         // and similarly that hi > 0. this means that the line segment
1968                         // corresponding to set C is at least finite in extent, and we are on it.
1969                         // NOTE: we must call lcp.solve1() before lcp.transfer_i_to_C()
1970                         lcp.solve1(&scratchMem.delta_x[0], i, 0, 1);
1971
1972                         lcp.transfer_i_to_C(i);
1973                 }
1974                 else
1975                 {
1976                         // we must push x(i) and w(i)
1977                         for (;;)
1978                         {
1979                                 int dir;
1980                                 btScalar dirf;
1981                                 // find direction to push on x(i)
1982                                 if (w[i] <= 0)
1983                                 {
1984                                         dir = 1;
1985                                         dirf = btScalar(1.0);
1986                                 }
1987                                 else
1988                                 {
1989                                         dir = -1;
1990                                         dirf = btScalar(-1.0);
1991                                 }
1992
1993                                 // compute: delta_x(C) = -dir*A(C,C)\A(C,i)
1994                                 lcp.solve1(&scratchMem.delta_x[0], i, dir);
1995
1996                                 // note that delta_x[i] = dirf, but we wont bother to set it
1997
1998                                 // compute: delta_w = A*delta_x ... note we only care about
1999                                 // delta_w(N) and delta_w(i), the rest is ignored
2000                                 lcp.pN_equals_ANC_times_qC(&scratchMem.delta_w[0], &scratchMem.delta_x[0]);
2001                                 lcp.pN_plusequals_ANi(&scratchMem.delta_w[0], i, dir);
2002                                 scratchMem.delta_w[i] = lcp.AiC_times_qC(i, &scratchMem.delta_x[0]) + lcp.Aii(i) * dirf;
2003
2004                                 // find largest step we can take (size=s), either to drive x(i),w(i)
2005                                 // to the valid LCP region or to drive an already-valid variable
2006                                 // outside the valid region.
2007
2008                                 int cmd = 1;  // index switching command
2009                                 int si = 0;   // si = index to switch if cmd>3
2010                                 btScalar s = -w[i] / scratchMem.delta_w[i];
2011                                 if (dir > 0)
2012                                 {
2013                                         if (hi[i] < BT_INFINITY)
2014                                         {
2015                                                 btScalar s2 = (hi[i] - x[i]) * dirf;  // was (hi[i]-x[i])/dirf  // step to x(i)=hi(i)
2016                                                 if (s2 < s)
2017                                                 {
2018                                                         s = s2;
2019                                                         cmd = 3;
2020                                                 }
2021                                         }
2022                                 }
2023                                 else
2024                                 {
2025                                         if (lo[i] > -BT_INFINITY)
2026                                         {
2027                                                 btScalar s2 = (lo[i] - x[i]) * dirf;  // was (lo[i]-x[i])/dirf  // step to x(i)=lo(i)
2028                                                 if (s2 < s)
2029                                                 {
2030                                                         s = s2;
2031                                                         cmd = 2;
2032                                                 }
2033                                         }
2034                                 }
2035
2036                                 {
2037                                         const int numN = lcp.numN();
2038                                         for (int k = 0; k < numN; ++k)
2039                                         {
2040                                                 const int indexN_k = lcp.indexN(k);
2041                                                 if (!scratchMem.state[indexN_k] ? scratchMem.delta_w[indexN_k] < 0 : scratchMem.delta_w[indexN_k] > 0)
2042                                                 {
2043                                                         // don't bother checking if lo=hi=0
2044                                                         if (lo[indexN_k] == 0 && hi[indexN_k] == 0) continue;
2045                                                         btScalar s2 = -w[indexN_k] / scratchMem.delta_w[indexN_k];
2046                                                         if (s2 < s)
2047                                                         {
2048                                                                 s = s2;
2049                                                                 cmd = 4;
2050                                                                 si = indexN_k;
2051                                                         }
2052                                                 }
2053                                         }
2054                                 }
2055
2056                                 {
2057                                         const int numC = lcp.numC();
2058                                         for (int k = adj_nub; k < numC; ++k)
2059                                         {
2060                                                 const int indexC_k = lcp.indexC(k);
2061                                                 if (scratchMem.delta_x[indexC_k] < 0 && lo[indexC_k] > -BT_INFINITY)
2062                                                 {
2063                                                         btScalar s2 = (lo[indexC_k] - x[indexC_k]) / scratchMem.delta_x[indexC_k];
2064                                                         if (s2 < s)
2065                                                         {
2066                                                                 s = s2;
2067                                                                 cmd = 5;
2068                                                                 si = indexC_k;
2069                                                         }
2070                                                 }
2071                                                 if (scratchMem.delta_x[indexC_k] > 0 && hi[indexC_k] < BT_INFINITY)
2072                                                 {
2073                                                         btScalar s2 = (hi[indexC_k] - x[indexC_k]) / scratchMem.delta_x[indexC_k];
2074                                                         if (s2 < s)
2075                                                         {
2076                                                                 s = s2;
2077                                                                 cmd = 6;
2078                                                                 si = indexC_k;
2079                                                         }
2080                                                 }
2081                                         }
2082                                 }
2083
2084                                 //static char* cmdstring[8] = {0,"->C","->NL","->NH","N->C",
2085                                 //                           "C->NL","C->NH"};
2086                                 //printf ("cmd=%d (%s), si=%d\n",cmd,cmdstring[cmd],(cmd>3) ? si : i);
2087
2088                                 // if s <= 0 then we've got a problem. if we just keep going then
2089                                 // we're going to get stuck in an infinite loop. instead, just cross
2090                                 // our fingers and exit with the current solution.
2091                                 if (s <= btScalar(0.0))
2092                                 {
2093                                         //          printf("LCP internal error, s <= 0 (s=%.4e)",(double)s);
2094                                         if (i < n)
2095                                         {
2096                                                 btSetZero(x + i, n - i);
2097                                                 btSetZero(w + i, n - i);
2098                                         }
2099                                         s_error = true;
2100                                         break;
2101                                 }
2102
2103                                 // apply x = x + s * delta_x
2104                                 lcp.pC_plusequals_s_times_qC(x, s, &scratchMem.delta_x[0]);
2105                                 x[i] += s * dirf;
2106
2107                                 // apply w = w + s * delta_w
2108                                 lcp.pN_plusequals_s_times_qN(w, s, &scratchMem.delta_w[0]);
2109                                 w[i] += s * scratchMem.delta_w[i];
2110
2111                                 //        void *tmpbuf;
2112                                 // switch indexes between sets if necessary
2113                                 switch (cmd)
2114                                 {
2115                                         case 1:  // done
2116                                                 w[i] = 0;
2117                                                 lcp.transfer_i_to_C(i);
2118                                                 break;
2119                                         case 2:  // done
2120                                                 x[i] = lo[i];
2121                                                 scratchMem.state[i] = false;
2122                                                 lcp.transfer_i_to_N(i);
2123                                                 break;
2124                                         case 3:  // done
2125                                                 x[i] = hi[i];
2126                                                 scratchMem.state[i] = true;
2127                                                 lcp.transfer_i_to_N(i);
2128                                                 break;
2129                                         case 4:  // keep going
2130                                                 w[si] = 0;
2131                                                 lcp.transfer_i_from_N_to_C(si);
2132                                                 break;
2133                                         case 5:  // keep going
2134                                                 x[si] = lo[si];
2135                                                 scratchMem.state[si] = false;
2136                                                 lcp.transfer_i_from_C_to_N(si, scratchMem.m_scratch);
2137                                                 break;
2138                                         case 6:  // keep going
2139                                                 x[si] = hi[si];
2140                                                 scratchMem.state[si] = true;
2141                                                 lcp.transfer_i_from_C_to_N(si, scratchMem.m_scratch);
2142                                                 break;
2143                                 }
2144
2145                                 if (cmd <= 3) break;
2146                         }  // for (;;)
2147                 }      // else
2148
2149                 if (s_error)
2150                 {
2151                         break;
2152                 }
2153         }  // for (int i=adj_nub; i<n; ++i)
2154
2155         lcp.unpermute();
2156
2157         return !s_error;
2158 }