dali-physics/third-party/bullet3/src/BulletDynamics/MLCPSolvers/btDantzigLCP.cpp

   1 /*************************************************************************
   2 *                                                                       *
   3 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
   4 * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
   5 *                                                                       *
   6 * This library is free software; you can redistribute it and/or         *
   7 * modify it under the terms of EITHER:                                  *
   8 *   (1) The GNU Lesser General Public License as published by the Free  *
   9 *       Software Foundation; either version 2.1 of the License, or (at  *
  10 *       your option) any later version. The text of the GNU Lesser      *
  11 *       General Public License is included with this library in the     *
  12 *       file LICENSE.TXT.                                               *
  13 *   (2) The BSD-style license that is included with this library in     *
  14 *       the file LICENSE-BSD.TXT.                                       *
  15 *                                                                       *
  16 * This library is distributed in the hope that it will be useful,       *
  17 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
  19 * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
  20 *                                                                       *
  21 *************************************************************************/
  22
  23 /*
  24
  25
  26 THE ALGORITHM
  27 -------------
  28
  29 solve A*x = b+w, with x and w subject to certain LCP conditions.
  30 each x(i),w(i) must lie on one of the three line segments in the following
  31 diagram. each line segment corresponds to one index set :
  32
  33      w(i)
  34      /|\      |           :
  35       |       |           :
  36       |       |i in N     :
  37   w>0 |       |state[i]=0 :
  38       |       |           :
  39       |       |           :  i in C
  40   w=0 +       +-----------------------+
  41       |                   :           |
  42       |                   :           |
  43   w<0 |                   :           |i in N
  44       |                   :           |state[i]=1
  45       |                   :           |
  46       |                   :           |
  47       +-------|-----------|-----------|----------> x(i)
  48              lo           0           hi
  49
  50 the Dantzig algorithm proceeds as follows:
  51   for i=1:n
  52     * if (x(i),w(i)) is not on the line, push x(i) and w(i) positive or
  53       negative towards the line. as this is done, the other (x(j),w(j))
  54       for j<i are constrained to be on the line. if any (x,w) reaches the
  55       end of a line segment then it is switched between index sets.
  56     * i is added to the appropriate index set depending on what line segment
  57       it hits.
  58
  59 we restrict lo(i) <= 0 and hi(i) >= 0. this makes the algorithm a bit
  60 simpler, because the starting point for x(i),w(i) is always on the dotted
  61 line x=0 and x will only ever increase in one direction, so it can only hit
  62 two out of the three line segments.
  63
  64
  65 NOTES
  66 -----
  67
  68 this is an implementation of "lcp_dantzig2_ldlt.m" and "lcp_dantzig_lohi.m".
  69 the implementation is split into an LCP problem object (btLCP) and an LCP
  70 driver function. most optimization occurs in the btLCP object.
  71
  72 a naive implementation of the algorithm requires either a lot of data motion
  73 or a lot of permutation-array lookup, because we are constantly re-ordering
  74 rows and columns. to avoid this and make a more optimized algorithm, a
  75 non-trivial data structure is used to represent the matrix A (this is
  76 implemented in the fast version of the btLCP object).
  77
  78 during execution of this algorithm, some indexes in A are clamped (set C),
  79 some are non-clamped (set N), and some are "don't care" (where x=0).
  80 A,x,b,w (and other problem vectors) are permuted such that the clamped
  81 indexes are first, the unclamped indexes are next, and the don't-care
  82 indexes are last. this permutation is recorded in the array `p'.
  83 initially p = 0..n-1, and as the rows and columns of A,x,b,w are swapped,
  84 the corresponding elements of p are swapped.
  85
  86 because the C and N elements are grouped together in the rows of A, we can do
  87 lots of work with a fast dot product function. if A,x,etc were not permuted
  88 and we only had a permutation array, then those dot products would be much
  89 slower as we would have a permutation array lookup in some inner loops.
  90
  91 A is accessed through an array of row pointers, so that element (i,j) of the
  92 permuted matrix is A[i][j]. this makes row swapping fast. for column swapping
  93 we still have to actually move the data.
  94
  95 during execution of this algorithm we maintain an L*D*L' factorization of
  96 the clamped submatrix of A (call it `AC') which is the top left nC*nC
  97 submatrix of A. there are two ways we could arrange the rows/columns in AC.
  98
  99 (1) AC is always permuted such that L*D*L' = AC. this causes a problem
 100 when a row/column is removed from C, because then all the rows/columns of A
 101 between the deleted index and the end of C need to be rotated downward.
 102 this results in a lot of data motion and slows things down.
 103 (2) L*D*L' is actually a factorization of a *permutation* of AC (which is
 104 itself a permutation of the underlying A). this is what we do - the
 105 permutation is recorded in the vector C. call this permutation A[C,C].
 106 when a row/column is removed from C, all we have to do is swap two
 107 rows/columns and manipulate C.
 108
 109 */
 110
 111 #include "btDantzigLCP.h"
 112
 113 #include <string.h>  //memcpy
 114
 115 bool s_error = false;
 116
 117 //***************************************************************************
 118 // code generation parameters
 119
 120 #define btLCP_FAST  // use fast btLCP object
 121
 122 // option 1 : matrix row pointers (less data copying)
 123 #define BTROWPTRS
 124 #define BTATYPE btScalar **
 125 #define BTAROW(i) (m_A[i])
 126
 127 // option 2 : no matrix row pointers (slightly faster inner loops)
 128 //#define NOROWPTRS
 129 //#define BTATYPE btScalar *
 130 //#define BTAROW(i) (m_A+(i)*m_nskip)
 131
 132 #define BTNUB_OPTIMIZATIONS
 133
 134 /* solve L*X=B, with B containing 1 right hand sides.
 135  * L is an n*n lower triangular matrix with ones on the diagonal.
 136  * L is stored by rows and its leading dimension is lskip.
 137  * B is an n*1 matrix that contains the right hand sides.
 138  * B is stored by columns and its leading dimension is also lskip.
 139  * B is overwritten with X.
 140  * this processes blocks of 2*2.
 141  * if this is in the factorizer source file, n must be a multiple of 2.
 142  */
 143
 144 static void btSolveL1_1(const btScalar *L, btScalar *B, int n, int lskip1)
 145 {
 146         /* declare variables - Z matrix, p and q vectors, etc */
 147         btScalar Z11, m11, Z21, m21, p1, q1, p2, *ex;
 148         const btScalar *ell;
 149         int i, j;
 150         /* compute all 2 x 1 blocks of X */
 151         for (i = 0; i < n; i += 2)
 152         {
 153                 /* compute all 2 x 1 block of X, from rows i..i+2-1 */
 154                 /* set the Z matrix to 0 */
 155                 Z11 = 0;
 156                 Z21 = 0;
 157                 ell = L + i * lskip1;
 158                 ex = B;
 159                 /* the inner loop that computes outer products and adds them to Z */
 160                 for (j = i - 2; j >= 0; j -= 2)
 161                 {
 162                         /* compute outer product and add it to the Z matrix */
 163                         p1 = ell[0];
 164                         q1 = ex[0];
 165                         m11 = p1 * q1;
 166                         p2 = ell[lskip1];
 167                         m21 = p2 * q1;
 168                         Z11 += m11;
 169                         Z21 += m21;
 170                         /* compute outer product and add it to the Z matrix */
 171                         p1 = ell[1];
 172                         q1 = ex[1];
 173                         m11 = p1 * q1;
 174                         p2 = ell[1 + lskip1];
 175                         m21 = p2 * q1;
 176                         /* advance pointers */
 177                         ell += 2;
 178                         ex += 2;
 179                         Z11 += m11;
 180                         Z21 += m21;
 181                         /* end of inner loop */
 182                 }
 183                 /* compute left-over iterations */
 184                 j += 2;
 185                 for (; j > 0; j--)
 186                 {
 187                         /* compute outer product and add it to the Z matrix */
 188                         p1 = ell[0];
 189                         q1 = ex[0];
 190                         m11 = p1 * q1;
 191                         p2 = ell[lskip1];
 192                         m21 = p2 * q1;
 193                         /* advance pointers */
 194                         ell += 1;
 195                         ex += 1;
 196                         Z11 += m11;
 197                         Z21 += m21;
 198                 }
 199                 /* finish computing the X(i) block */
 200                 Z11 = ex[0] - Z11;
 201                 ex[0] = Z11;
 202                 p1 = ell[lskip1];
 203                 Z21 = ex[1] - Z21 - p1 * Z11;
 204                 ex[1] = Z21;
 205                 /* end of outer loop */
 206         }
 207 }
 208
 209 /* solve L*X=B, with B containing 2 right hand sides.
 210  * L is an n*n lower triangular matrix with ones on the diagonal.
 211  * L is stored by rows and its leading dimension is lskip.
 212  * B is an n*2 matrix that contains the right hand sides.
 213  * B is stored by columns and its leading dimension is also lskip.
 214  * B is overwritten with X.
 215  * this processes blocks of 2*2.
 216  * if this is in the factorizer source file, n must be a multiple of 2.
 217  */
 218
 219 static void btSolveL1_2(const btScalar *L, btScalar *B, int n, int lskip1)
 220 {
 221         /* declare variables - Z matrix, p and q vectors, etc */
 222         btScalar Z11, m11, Z12, m12, Z21, m21, Z22, m22, p1, q1, p2, q2, *ex;
 223         const btScalar *ell;
 224         int i, j;
 225         /* compute all 2 x 2 blocks of X */
 226         for (i = 0; i < n; i += 2)
 227         {
 228                 /* compute all 2 x 2 block of X, from rows i..i+2-1 */
 229                 /* set the Z matrix to 0 */
 230                 Z11 = 0;
 231                 Z12 = 0;
 232                 Z21 = 0;
 233                 Z22 = 0;
 234                 ell = L + i * lskip1;
 235                 ex = B;
 236                 /* the inner loop that computes outer products and adds them to Z */
 237                 for (j = i - 2; j >= 0; j -= 2)
 238                 {
 239                         /* compute outer product and add it to the Z matrix */
 240                         p1 = ell[0];
 241                         q1 = ex[0];
 242                         m11 = p1 * q1;
 243                         q2 = ex[lskip1];
 244                         m12 = p1 * q2;
 245                         p2 = ell[lskip1];
 246                         m21 = p2 * q1;
 247                         m22 = p2 * q2;
 248                         Z11 += m11;
 249                         Z12 += m12;
 250                         Z21 += m21;
 251                         Z22 += m22;
 252                         /* compute outer product and add it to the Z matrix */
 253                         p1 = ell[1];
 254                         q1 = ex[1];
 255                         m11 = p1 * q1;
 256                         q2 = ex[1 + lskip1];
 257                         m12 = p1 * q2;
 258                         p2 = ell[1 + lskip1];
 259                         m21 = p2 * q1;
 260                         m22 = p2 * q2;
 261                         /* advance pointers */
 262                         ell += 2;
 263                         ex += 2;
 264                         Z11 += m11;
 265                         Z12 += m12;
 266                         Z21 += m21;
 267                         Z22 += m22;
 268                         /* end of inner loop */
 269                 }
 270                 /* compute left-over iterations */
 271                 j += 2;
 272                 for (; j > 0; j--)
 273                 {
 274                         /* compute outer product and add it to the Z matrix */
 275                         p1 = ell[0];
 276                         q1 = ex[0];
 277                         m11 = p1 * q1;
 278                         q2 = ex[lskip1];
 279                         m12 = p1 * q2;
 280                         p2 = ell[lskip1];
 281                         m21 = p2 * q1;
 282                         m22 = p2 * q2;
 283                         /* advance pointers */
 284                         ell += 1;
 285                         ex += 1;
 286                         Z11 += m11;
 287                         Z12 += m12;
 288                         Z21 += m21;
 289                         Z22 += m22;
 290                 }
 291                 /* finish computing the X(i) block */
 292                 Z11 = ex[0] - Z11;
 293                 ex[0] = Z11;
 294                 Z12 = ex[lskip1] - Z12;
 295                 ex[lskip1] = Z12;
 296                 p1 = ell[lskip1];
 297                 Z21 = ex[1] - Z21 - p1 * Z11;
 298                 ex[1] = Z21;
 299                 Z22 = ex[1 + lskip1] - Z22 - p1 * Z12;
 300                 ex[1 + lskip1] = Z22;
 301                 /* end of outer loop */
 302         }
 303 }
 304
 305 void btFactorLDLT(btScalar *A, btScalar *d, int n, int nskip1)
 306 {
 307         int i, j;
 308         btScalar sum, *ell, *dee, dd, p1, p2, q1, q2, Z11, m11, Z21, m21, Z22, m22;
 309         if (n < 1) return;
 310
 311         for (i = 0; i <= n - 2; i += 2)
 312         {
 313                 /* solve L*(D*l)=a, l is scaled elements in 2 x i block at A(i,0) */
 314                 btSolveL1_2(A, A + i * nskip1, i, nskip1);
 315                 /* scale the elements in a 2 x i block at A(i,0), and also */
 316                 /* compute Z = the outer product matrix that we'll need. */
 317                 Z11 = 0;
 318                 Z21 = 0;
 319                 Z22 = 0;
 320                 ell = A + i * nskip1;
 321                 dee = d;
 322                 for (j = i - 6; j >= 0; j -= 6)
 323                 {
 324                         p1 = ell[0];
 325                         p2 = ell[nskip1];
 326                         dd = dee[0];
 327                         q1 = p1 * dd;
 328                         q2 = p2 * dd;
 329                         ell[0] = q1;
 330                         ell[nskip1] = q2;
 331                         m11 = p1 * q1;
 332                         m21 = p2 * q1;
 333                         m22 = p2 * q2;
 334                         Z11 += m11;
 335                         Z21 += m21;
 336                         Z22 += m22;
 337                         p1 = ell[1];
 338                         p2 = ell[1 + nskip1];
 339                         dd = dee[1];
 340                         q1 = p1 * dd;
 341                         q2 = p2 * dd;
 342                         ell[1] = q1;
 343                         ell[1 + nskip1] = q2;
 344                         m11 = p1 * q1;
 345                         m21 = p2 * q1;
 346                         m22 = p2 * q2;
 347                         Z11 += m11;
 348                         Z21 += m21;
 349                         Z22 += m22;
 350                         p1 = ell[2];
 351                         p2 = ell[2 + nskip1];
 352                         dd = dee[2];
 353                         q1 = p1 * dd;
 354                         q2 = p2 * dd;
 355                         ell[2] = q1;
 356                         ell[2 + nskip1] = q2;
 357                         m11 = p1 * q1;
 358                         m21 = p2 * q1;
 359                         m22 = p2 * q2;
 360                         Z11 += m11;
 361                         Z21 += m21;
 362                         Z22 += m22;
 363                         p1 = ell[3];
 364                         p2 = ell[3 + nskip1];
 365                         dd = dee[3];
 366                         q1 = p1 * dd;
 367                         q2 = p2 * dd;
 368                         ell[3] = q1;
 369                         ell[3 + nskip1] = q2;
 370                         m11 = p1 * q1;
 371                         m21 = p2 * q1;
 372                         m22 = p2 * q2;
 373                         Z11 += m11;
 374                         Z21 += m21;
 375                         Z22 += m22;
 376                         p1 = ell[4];
 377                         p2 = ell[4 + nskip1];
 378                         dd = dee[4];
 379                         q1 = p1 * dd;
 380                         q2 = p2 * dd;
 381                         ell[4] = q1;
 382                         ell[4 + nskip1] = q2;
 383                         m11 = p1 * q1;
 384                         m21 = p2 * q1;
 385                         m22 = p2 * q2;
 386                         Z11 += m11;
 387                         Z21 += m21;
 388                         Z22 += m22;
 389                         p1 = ell[5];
 390                         p2 = ell[5 + nskip1];
 391                         dd = dee[5];
 392                         q1 = p1 * dd;
 393                         q2 = p2 * dd;
 394                         ell[5] = q1;
 395                         ell[5 + nskip1] = q2;
 396                         m11 = p1 * q1;
 397                         m21 = p2 * q1;
 398                         m22 = p2 * q2;
 399                         Z11 += m11;
 400                         Z21 += m21;
 401                         Z22 += m22;
 402                         ell += 6;
 403                         dee += 6;
 404                 }
 405                 /* compute left-over iterations */
 406                 j += 6;
 407                 for (; j > 0; j--)
 408                 {
 409                         p1 = ell[0];
 410                         p2 = ell[nskip1];
 411                         dd = dee[0];
 412                         q1 = p1 * dd;
 413                         q2 = p2 * dd;
 414                         ell[0] = q1;
 415                         ell[nskip1] = q2;
 416                         m11 = p1 * q1;
 417                         m21 = p2 * q1;
 418                         m22 = p2 * q2;
 419                         Z11 += m11;
 420                         Z21 += m21;
 421                         Z22 += m22;
 422                         ell++;
 423                         dee++;
 424                 }
 425                 /* solve for diagonal 2 x 2 block at A(i,i) */
 426                 Z11 = ell[0] - Z11;
 427                 Z21 = ell[nskip1] - Z21;
 428                 Z22 = ell[1 + nskip1] - Z22;
 429                 dee = d + i;
 430                 /* factorize 2 x 2 block Z,dee */
 431                 /* factorize row 1 */
 432                 dee[0] = btRecip(Z11);
 433                 /* factorize row 2 */
 434                 sum = 0;
 435                 q1 = Z21;
 436                 q2 = q1 * dee[0];
 437                 Z21 = q2;
 438                 sum += q1 * q2;
 439                 dee[1] = btRecip(Z22 - sum);
 440                 /* done factorizing 2 x 2 block */
 441                 ell[nskip1] = Z21;
 442         }
 443         /* compute the (less than 2) rows at the bottom */
 444         switch (n - i)
 445         {
 446                 case 0:
 447                         break;
 448
 449                 case 1:
 450                         btSolveL1_1(A, A + i * nskip1, i, nskip1);
 451                         /* scale the elements in a 1 x i block at A(i,0), and also */
 452                         /* compute Z = the outer product matrix that we'll need. */
 453                         Z11 = 0;
 454                         ell = A + i * nskip1;
 455                         dee = d;
 456                         for (j = i - 6; j >= 0; j -= 6)
 457                         {
 458                                 p1 = ell[0];
 459                                 dd = dee[0];
 460                                 q1 = p1 * dd;
 461                                 ell[0] = q1;
 462                                 m11 = p1 * q1;
 463                                 Z11 += m11;
 464                                 p1 = ell[1];
 465                                 dd = dee[1];
 466                                 q1 = p1 * dd;
 467                                 ell[1] = q1;
 468                                 m11 = p1 * q1;
 469                                 Z11 += m11;
 470                                 p1 = ell[2];
 471                                 dd = dee[2];
 472                                 q1 = p1 * dd;
 473                                 ell[2] = q1;
 474                                 m11 = p1 * q1;
 475                                 Z11 += m11;
 476                                 p1 = ell[3];
 477                                 dd = dee[3];
 478                                 q1 = p1 * dd;
 479                                 ell[3] = q1;
 480                                 m11 = p1 * q1;
 481                                 Z11 += m11;
 482                                 p1 = ell[4];
 483                                 dd = dee[4];
 484                                 q1 = p1 * dd;
 485                                 ell[4] = q1;
 486                                 m11 = p1 * q1;
 487                                 Z11 += m11;
 488                                 p1 = ell[5];
 489                                 dd = dee[5];
 490                                 q1 = p1 * dd;
 491                                 ell[5] = q1;
 492                                 m11 = p1 * q1;
 493                                 Z11 += m11;
 494                                 ell += 6;
 495                                 dee += 6;
 496                         }
 497                         /* compute left-over iterations */
 498                         j += 6;
 499                         for (; j > 0; j--)
 500                         {
 501                                 p1 = ell[0];
 502                                 dd = dee[0];
 503                                 q1 = p1 * dd;
 504                                 ell[0] = q1;
 505                                 m11 = p1 * q1;
 506                                 Z11 += m11;
 507                                 ell++;
 508                                 dee++;
 509                         }
 510                         /* solve for diagonal 1 x 1 block at A(i,i) */
 511                         Z11 = ell[0] - Z11;
 512                         dee = d + i;
 513                         /* factorize 1 x 1 block Z,dee */
 514                         /* factorize row 1 */
 515                         dee[0] = btRecip(Z11);
 516                         /* done factorizing 1 x 1 block */
 517                         break;
 518
 519                         //default: *((char*)0)=0;  /* this should never happen! */
 520         }
 521 }
 522
 523 /* solve L*X=B, with B containing 1 right hand sides.
 524  * L is an n*n lower triangular matrix with ones on the diagonal.
 525  * L is stored by rows and its leading dimension is lskip.
 526  * B is an n*1 matrix that contains the right hand sides.
 527  * B is stored by columns and its leading dimension is also lskip.
 528  * B is overwritten with X.
 529  * this processes blocks of 4*4.
 530  * if this is in the factorizer source file, n must be a multiple of 4.
 531  */
 532
 533 void btSolveL1(const btScalar *L, btScalar *B, int n, int lskip1)
 534 {
 535         /* declare variables - Z matrix, p and q vectors, etc */
 536         btScalar Z11, Z21, Z31, Z41, p1, q1, p2, p3, p4, *ex;
 537         const btScalar *ell;
 538         int lskip2, lskip3, i, j;
 539         /* compute lskip values */
 540         lskip2 = 2 * lskip1;
 541         lskip3 = 3 * lskip1;
 542         /* compute all 4 x 1 blocks of X */
 543         for (i = 0; i <= n - 4; i += 4)
 544         {
 545                 /* compute all 4 x 1 block of X, from rows i..i+4-1 */
 546                 /* set the Z matrix to 0 */
 547                 Z11 = 0;
 548                 Z21 = 0;
 549                 Z31 = 0;
 550                 Z41 = 0;
 551                 ell = L + i * lskip1;
 552                 ex = B;
 553                 /* the inner loop that computes outer products and adds them to Z */
 554                 for (j = i - 12; j >= 0; j -= 12)
 555                 {
 556                         /* load p and q values */
 557                         p1 = ell[0];
 558                         q1 = ex[0];
 559                         p2 = ell[lskip1];
 560                         p3 = ell[lskip2];
 561                         p4 = ell[lskip3];
 562                         /* compute outer product and add it to the Z matrix */
 563                         Z11 += p1 * q1;
 564                         Z21 += p2 * q1;
 565                         Z31 += p3 * q1;
 566                         Z41 += p4 * q1;
 567                         /* load p and q values */
 568                         p1 = ell[1];
 569                         q1 = ex[1];
 570                         p2 = ell[1 + lskip1];
 571                         p3 = ell[1 + lskip2];
 572                         p4 = ell[1 + lskip3];
 573                         /* compute outer product and add it to the Z matrix */
 574                         Z11 += p1 * q1;
 575                         Z21 += p2 * q1;
 576                         Z31 += p3 * q1;
 577                         Z41 += p4 * q1;
 578                         /* load p and q values */
 579                         p1 = ell[2];
 580                         q1 = ex[2];
 581                         p2 = ell[2 + lskip1];
 582                         p3 = ell[2 + lskip2];
 583                         p4 = ell[2 + lskip3];
 584                         /* compute outer product and add it to the Z matrix */
 585                         Z11 += p1 * q1;
 586                         Z21 += p2 * q1;
 587                         Z31 += p3 * q1;
 588                         Z41 += p4 * q1;
 589                         /* load p and q values */
 590                         p1 = ell[3];
 591                         q1 = ex[3];
 592                         p2 = ell[3 + lskip1];
 593                         p3 = ell[3 + lskip2];
 594                         p4 = ell[3 + lskip3];
 595                         /* compute outer product and add it to the Z matrix */
 596                         Z11 += p1 * q1;
 597                         Z21 += p2 * q1;
 598                         Z31 += p3 * q1;
 599                         Z41 += p4 * q1;
 600                         /* load p and q values */
 601                         p1 = ell[4];
 602                         q1 = ex[4];
 603                         p2 = ell[4 + lskip1];
 604                         p3 = ell[4 + lskip2];
 605                         p4 = ell[4 + lskip3];
 606                         /* compute outer product and add it to the Z matrix */
 607                         Z11 += p1 * q1;
 608                         Z21 += p2 * q1;
 609                         Z31 += p3 * q1;
 610                         Z41 += p4 * q1;
 611                         /* load p and q values */
 612                         p1 = ell[5];
 613                         q1 = ex[5];
 614                         p2 = ell[5 + lskip1];
 615                         p3 = ell[5 + lskip2];
 616                         p4 = ell[5 + lskip3];
 617                         /* compute outer product and add it to the Z matrix */
 618                         Z11 += p1 * q1;
 619                         Z21 += p2 * q1;
 620                         Z31 += p3 * q1;
 621                         Z41 += p4 * q1;
 622                         /* load p and q values */
 623                         p1 = ell[6];
 624                         q1 = ex[6];
 625                         p2 = ell[6 + lskip1];
 626                         p3 = ell[6 + lskip2];
 627                         p4 = ell[6 + lskip3];
 628                         /* compute outer product and add it to the Z matrix */
 629                         Z11 += p1 * q1;
 630                         Z21 += p2 * q1;
 631                         Z31 += p3 * q1;
 632                         Z41 += p4 * q1;
 633                         /* load p and q values */
 634                         p1 = ell[7];
 635                         q1 = ex[7];
 636                         p2 = ell[7 + lskip1];
 637                         p3 = ell[7 + lskip2];
 638                         p4 = ell[7 + lskip3];
 639                         /* compute outer product and add it to the Z matrix */
 640                         Z11 += p1 * q1;
 641                         Z21 += p2 * q1;
 642                         Z31 += p3 * q1;
 643                         Z41 += p4 * q1;
 644                         /* load p and q values */
 645                         p1 = ell[8];
 646                         q1 = ex[8];
 647                         p2 = ell[8 + lskip1];
 648                         p3 = ell[8 + lskip2];
 649                         p4 = ell[8 + lskip3];
 650                         /* compute outer product and add it to the Z matrix */
 651                         Z11 += p1 * q1;
 652                         Z21 += p2 * q1;
 653                         Z31 += p3 * q1;
 654                         Z41 += p4 * q1;
 655                         /* load p and q values */
 656                         p1 = ell[9];
 657                         q1 = ex[9];
 658                         p2 = ell[9 + lskip1];
 659                         p3 = ell[9 + lskip2];
 660                         p4 = ell[9 + lskip3];
 661                         /* compute outer product and add it to the Z matrix */
 662                         Z11 += p1 * q1;
 663                         Z21 += p2 * q1;
 664                         Z31 += p3 * q1;
 665                         Z41 += p4 * q1;
 666                         /* load p and q values */
 667                         p1 = ell[10];
 668                         q1 = ex[10];
 669                         p2 = ell[10 + lskip1];
 670                         p3 = ell[10 + lskip2];
 671                         p4 = ell[10 + lskip3];
 672                         /* compute outer product and add it to the Z matrix */
 673                         Z11 += p1 * q1;
 674                         Z21 += p2 * q1;
 675                         Z31 += p3 * q1;
 676                         Z41 += p4 * q1;
 677                         /* load p and q values */
 678                         p1 = ell[11];
 679                         q1 = ex[11];
 680                         p2 = ell[11 + lskip1];
 681                         p3 = ell[11 + lskip2];
 682                         p4 = ell[11 + lskip3];
 683                         /* compute outer product and add it to the Z matrix */
 684                         Z11 += p1 * q1;
 685                         Z21 += p2 * q1;
 686                         Z31 += p3 * q1;
 687                         Z41 += p4 * q1;
 688                         /* advance pointers */
 689                         ell += 12;
 690                         ex += 12;
 691                         /* end of inner loop */
 692                 }
 693                 /* compute left-over iterations */
 694                 j += 12;
 695                 for (; j > 0; j--)
 696                 {
 697                         /* load p and q values */
 698                         p1 = ell[0];
 699                         q1 = ex[0];
 700                         p2 = ell[lskip1];
 701                         p3 = ell[lskip2];
 702                         p4 = ell[lskip3];
 703                         /* compute outer product and add it to the Z matrix */
 704                         Z11 += p1 * q1;
 705                         Z21 += p2 * q1;
 706                         Z31 += p3 * q1;
 707                         Z41 += p4 * q1;
 708                         /* advance pointers */
 709                         ell += 1;
 710                         ex += 1;
 711                 }
 712                 /* finish computing the X(i) block */
 713                 Z11 = ex[0] - Z11;
 714                 ex[0] = Z11;
 715                 p1 = ell[lskip1];
 716                 Z21 = ex[1] - Z21 - p1 * Z11;
 717                 ex[1] = Z21;
 718                 p1 = ell[lskip2];
 719                 p2 = ell[1 + lskip2];
 720                 Z31 = ex[2] - Z31 - p1 * Z11 - p2 * Z21;
 721                 ex[2] = Z31;
 722                 p1 = ell[lskip3];
 723                 p2 = ell[1 + lskip3];
 724                 p3 = ell[2 + lskip3];
 725                 Z41 = ex[3] - Z41 - p1 * Z11 - p2 * Z21 - p3 * Z31;
 726                 ex[3] = Z41;
 727                 /* end of outer loop */
 728         }
 729         /* compute rows at end that are not a multiple of block size */
 730         for (; i < n; i++)
 731         {
 732                 /* compute all 1 x 1 block of X, from rows i..i+1-1 */
 733                 /* set the Z matrix to 0 */
 734                 Z11 = 0;
 735                 ell = L + i * lskip1;
 736                 ex = B;
 737                 /* the inner loop that computes outer products and adds them to Z */
 738                 for (j = i - 12; j >= 0; j -= 12)
 739                 {
 740                         /* load p and q values */
 741                         p1 = ell[0];
 742                         q1 = ex[0];
 743                         /* compute outer product and add it to the Z matrix */
 744                         Z11 += p1 * q1;
 745                         /* load p and q values */
 746                         p1 = ell[1];
 747                         q1 = ex[1];
 748                         /* compute outer product and add it to the Z matrix */
 749                         Z11 += p1 * q1;
 750                         /* load p and q values */
 751                         p1 = ell[2];
 752                         q1 = ex[2];
 753                         /* compute outer product and add it to the Z matrix */
 754                         Z11 += p1 * q1;
 755                         /* load p and q values */
 756                         p1 = ell[3];
 757                         q1 = ex[3];
 758                         /* compute outer product and add it to the Z matrix */
 759                         Z11 += p1 * q1;
 760                         /* load p and q values */
 761                         p1 = ell[4];
 762                         q1 = ex[4];
 763                         /* compute outer product and add it to the Z matrix */
 764                         Z11 += p1 * q1;
 765                         /* load p and q values */
 766                         p1 = ell[5];
 767                         q1 = ex[5];
 768                         /* compute outer product and add it to the Z matrix */
 769                         Z11 += p1 * q1;
 770                         /* load p and q values */
 771                         p1 = ell[6];
 772                         q1 = ex[6];
 773                         /* compute outer product and add it to the Z matrix */
 774                         Z11 += p1 * q1;
 775                         /* load p and q values */
 776                         p1 = ell[7];
 777                         q1 = ex[7];
 778                         /* compute outer product and add it to the Z matrix */
 779                         Z11 += p1 * q1;
 780                         /* load p and q values */
 781                         p1 = ell[8];
 782                         q1 = ex[8];
 783                         /* compute outer product and add it to the Z matrix */
 784                         Z11 += p1 * q1;
 785                         /* load p and q values */
 786                         p1 = ell[9];
 787                         q1 = ex[9];
 788                         /* compute outer product and add it to the Z matrix */
 789                         Z11 += p1 * q1;
 790                         /* load p and q values */
 791                         p1 = ell[10];
 792                         q1 = ex[10];
 793                         /* compute outer product and add it to the Z matrix */
 794                         Z11 += p1 * q1;
 795                         /* load p and q values */
 796                         p1 = ell[11];
 797                         q1 = ex[11];
 798                         /* compute outer product and add it to the Z matrix */
 799                         Z11 += p1 * q1;
 800                         /* advance pointers */
 801                         ell += 12;
 802                         ex += 12;
 803                         /* end of inner loop */
 804                 }
 805                 /* compute left-over iterations */
 806                 j += 12;
 807                 for (; j > 0; j--)
 808                 {
 809                         /* load p and q values */
 810                         p1 = ell[0];
 811                         q1 = ex[0];
 812                         /* compute outer product and add it to the Z matrix */
 813                         Z11 += p1 * q1;
 814                         /* advance pointers */
 815                         ell += 1;
 816                         ex += 1;
 817                 }
 818                 /* finish computing the X(i) block */
 819                 Z11 = ex[0] - Z11;
 820                 ex[0] = Z11;
 821         }
 822 }
 823
 824 /* solve L^T * x=b, with b containing 1 right hand side.
 825  * L is an n*n lower triangular matrix with ones on the diagonal.
 826  * L is stored by rows and its leading dimension is lskip.
 827  * b is an n*1 matrix that contains the right hand side.
 828  * b is overwritten with x.
 829  * this processes blocks of 4.
 830  */
 831
 832 void btSolveL1T(const btScalar *L, btScalar *B, int n, int lskip1)
 833 {
 834         /* declare variables - Z matrix, p and q vectors, etc */
 835         btScalar Z11, m11, Z21, m21, Z31, m31, Z41, m41, p1, q1, p2, p3, p4, *ex;
 836         const btScalar *ell;
 837         int lskip2, i, j;
 838         //  int lskip3;
 839         /* special handling for L and B because we're solving L1 *transpose* */
 840         L = L + (n - 1) * (lskip1 + 1);
 841         B = B + n - 1;
 842         lskip1 = -lskip1;
 843         /* compute lskip values */
 844         lskip2 = 2 * lskip1;
 845         //lskip3 = 3*lskip1;
 846         /* compute all 4 x 1 blocks of X */
 847         for (i = 0; i <= n - 4; i += 4)
 848         {
 849                 /* compute all 4 x 1 block of X, from rows i..i+4-1 */
 850                 /* set the Z matrix to 0 */
 851                 Z11 = 0;
 852                 Z21 = 0;
 853                 Z31 = 0;
 854                 Z41 = 0;
 855                 ell = L - i;
 856                 ex = B;
 857                 /* the inner loop that computes outer products and adds them to Z */
 858                 for (j = i - 4; j >= 0; j -= 4)
 859                 {
 860                         /* load p and q values */
 861                         p1 = ell[0];
 862                         q1 = ex[0];
 863                         p2 = ell[-1];
 864                         p3 = ell[-2];
 865                         p4 = ell[-3];
 866                         /* compute outer product and add it to the Z matrix */
 867                         m11 = p1 * q1;
 868                         m21 = p2 * q1;
 869                         m31 = p3 * q1;
 870                         m41 = p4 * q1;
 871                         ell += lskip1;
 872                         Z11 += m11;
 873                         Z21 += m21;
 874                         Z31 += m31;
 875                         Z41 += m41;
 876                         /* load p and q values */
 877                         p1 = ell[0];
 878                         q1 = ex[-1];
 879                         p2 = ell[-1];
 880                         p3 = ell[-2];
 881                         p4 = ell[-3];
 882                         /* compute outer product and add it to the Z matrix */
 883                         m11 = p1 * q1;
 884                         m21 = p2 * q1;
 885                         m31 = p3 * q1;
 886                         m41 = p4 * q1;
 887                         ell += lskip1;
 888                         Z11 += m11;
 889                         Z21 += m21;
 890                         Z31 += m31;
 891                         Z41 += m41;
 892                         /* load p and q values */
 893                         p1 = ell[0];
 894                         q1 = ex[-2];
 895                         p2 = ell[-1];
 896                         p3 = ell[-2];
 897                         p4 = ell[-3];
 898                         /* compute outer product and add it to the Z matrix */
 899                         m11 = p1 * q1;
 900                         m21 = p2 * q1;
 901                         m31 = p3 * q1;
 902                         m41 = p4 * q1;
 903                         ell += lskip1;
 904                         Z11 += m11;
 905                         Z21 += m21;
 906                         Z31 += m31;
 907                         Z41 += m41;
 908                         /* load p and q values */
 909                         p1 = ell[0];
 910                         q1 = ex[-3];
 911                         p2 = ell[-1];
 912                         p3 = ell[-2];
 913                         p4 = ell[-3];
 914                         /* compute outer product and add it to the Z matrix */
 915                         m11 = p1 * q1;
 916                         m21 = p2 * q1;
 917                         m31 = p3 * q1;
 918                         m41 = p4 * q1;
 919                         ell += lskip1;
 920                         ex -= 4;
 921                         Z11 += m11;
 922                         Z21 += m21;
 923                         Z31 += m31;
 924                         Z41 += m41;
 925                         /* end of inner loop */
 926                 }
 927                 /* compute left-over iterations */
 928                 j += 4;
 929                 for (; j > 0; j--)
 930                 {
 931                         /* load p and q values */
 932                         p1 = ell[0];
 933                         q1 = ex[0];
 934                         p2 = ell[-1];
 935                         p3 = ell[-2];
 936                         p4 = ell[-3];
 937                         /* compute outer product and add it to the Z matrix */
 938                         m11 = p1 * q1;
 939                         m21 = p2 * q1;
 940                         m31 = p3 * q1;
 941                         m41 = p4 * q1;
 942                         ell += lskip1;
 943                         ex -= 1;
 944                         Z11 += m11;
 945                         Z21 += m21;
 946                         Z31 += m31;
 947                         Z41 += m41;
 948                 }
 949                 /* finish computing the X(i) block */
 950                 Z11 = ex[0] - Z11;
 951                 ex[0] = Z11;
 952                 p1 = ell[-1];
 953                 Z21 = ex[-1] - Z21 - p1 * Z11;
 954                 ex[-1] = Z21;
 955                 p1 = ell[-2];
 956                 p2 = ell[-2 + lskip1];
 957                 Z31 = ex[-2] - Z31 - p1 * Z11 - p2 * Z21;
 958                 ex[-2] = Z31;
 959                 p1 = ell[-3];
 960                 p2 = ell[-3 + lskip1];
 961                 p3 = ell[-3 + lskip2];
 962                 Z41 = ex[-3] - Z41 - p1 * Z11 - p2 * Z21 - p3 * Z31;
 963                 ex[-3] = Z41;
 964                 /* end of outer loop */
 965         }
 966         /* compute rows at end that are not a multiple of block size */
 967         for (; i < n; i++)
 968         {
 969                 /* compute all 1 x 1 block of X, from rows i..i+1-1 */
 970                 /* set the Z matrix to 0 */
 971                 Z11 = 0;
 972                 ell = L - i;
 973                 ex = B;
 974                 /* the inner loop that computes outer products and adds them to Z */
 975                 for (j = i - 4; j >= 0; j -= 4)
 976                 {
 977                         /* load p and q values */
 978                         p1 = ell[0];
 979                         q1 = ex[0];
 980                         /* compute outer product and add it to the Z matrix */
 981                         m11 = p1 * q1;
 982                         ell += lskip1;
 983                         Z11 += m11;
 984                         /* load p and q values */
 985                         p1 = ell[0];
 986                         q1 = ex[-1];
 987                         /* compute outer product and add it to the Z matrix */
 988                         m11 = p1 * q1;
 989                         ell += lskip1;
 990                         Z11 += m11;
 991                         /* load p and q values */
 992                         p1 = ell[0];
 993                         q1 = ex[-2];
 994                         /* compute outer product and add it to the Z matrix */
 995                         m11 = p1 * q1;
 996                         ell += lskip1;
 997                         Z11 += m11;
 998                         /* load p and q values */
 999                         p1 = ell[0];
1000                         q1 = ex[-3];
1001                         /* compute outer product and add it to the Z matrix */
1002                         m11 = p1 * q1;
1003                         ell += lskip1;
1004                         ex -= 4;
1005                         Z11 += m11;
1006                         /* end of inner loop */
1007                 }
1008                 /* compute left-over iterations */
1009                 j += 4;
1010                 for (; j > 0; j--)
1011                 {
1012                         /* load p and q values */
1013                         p1 = ell[0];
1014                         q1 = ex[0];
1015                         /* compute outer product and add it to the Z matrix */
1016                         m11 = p1 * q1;
1017                         ell += lskip1;
1018                         ex -= 1;
1019                         Z11 += m11;
1020                 }
1021                 /* finish computing the X(i) block */
1022                 Z11 = ex[0] - Z11;
1023                 ex[0] = Z11;
1024         }
1025 }
1026
1027 void btVectorScale(btScalar *a, const btScalar *d, int n)
1028 {
1029         btAssert(a && d && n >= 0);
1030         for (int i = 0; i < n; i++)
1031         {
1032                 a[i] *= d[i];
1033         }
1034 }
1035
1036 void btSolveLDLT(const btScalar *L, const btScalar *d, btScalar *b, int n, int nskip)
1037 {
1038         btAssert(L && d && b && n > 0 && nskip >= n);
1039         btSolveL1(L, b, n, nskip);
1040         btVectorScale(b, d, n);
1041         btSolveL1T(L, b, n, nskip);
1042 }
1043
1044 //***************************************************************************
1045
1046 // swap row/column i1 with i2 in the n*n matrix A. the leading dimension of
1047 // A is nskip. this only references and swaps the lower triangle.
1048 // if `do_fast_row_swaps' is nonzero and row pointers are being used, then
1049 // rows will be swapped by exchanging row pointers. otherwise the data will
1050 // be copied.
1051
1052 static void btSwapRowsAndCols(BTATYPE A, int n, int i1, int i2, int nskip,
1053                                                           int do_fast_row_swaps)
1054 {
1055         btAssert(A && n > 0 && i1 >= 0 && i2 >= 0 && i1 < n && i2 < n &&
1056                          nskip >= n && i1 < i2);
1057
1058 #ifdef BTROWPTRS
1059         btScalar *A_i1 = A[i1];
1060         btScalar *A_i2 = A[i2];
1061         for (int i = i1 + 1; i < i2; ++i)
1062         {
1063                 btScalar *A_i_i1 = A[i] + i1;
1064                 A_i1[i] = *A_i_i1;
1065                 *A_i_i1 = A_i2[i];
1066         }
1067         A_i1[i2] = A_i1[i1];
1068         A_i1[i1] = A_i2[i1];
1069         A_i2[i1] = A_i2[i2];
1070         // swap rows, by swapping row pointers
1071         if (do_fast_row_swaps)
1072         {
1073                 A[i1] = A_i2;
1074                 A[i2] = A_i1;
1075         }
1076         else
1077         {
1078                 // Only swap till i2 column to match A plain storage variant.
1079                 for (int k = 0; k <= i2; ++k)
1080                 {
1081                         btScalar tmp = A_i1[k];
1082                         A_i1[k] = A_i2[k];
1083                         A_i2[k] = tmp;
1084                 }
1085         }
1086         // swap columns the hard way
1087         for (int j = i2 + 1; j < n; ++j)
1088         {
1089                 btScalar *A_j = A[j];
1090                 btScalar tmp = A_j[i1];
1091                 A_j[i1] = A_j[i2];
1092                 A_j[i2] = tmp;
1093         }
1094 #else
1095         btScalar *A_i1 = A + i1 * nskip;
1096         btScalar *A_i2 = A + i2 * nskip;
1097         for (int k = 0; k < i1; ++k)
1098         {
1099                 btScalar tmp = A_i1[k];
1100                 A_i1[k] = A_i2[k];
1101                 A_i2[k] = tmp;
1102         }
1103         btScalar *A_i = A_i1 + nskip;
1104         for (int i = i1 + 1; i < i2; A_i += nskip, ++i)
1105         {
1106                 btScalar tmp = A_i2[i];
1107                 A_i2[i] = A_i[i1];
1108                 A_i[i1] = tmp;
1109         }
1110         {
1111                 btScalar tmp = A_i1[i1];
1112                 A_i1[i1] = A_i2[i2];
1113                 A_i2[i2] = tmp;
1114         }
1115         btScalar *A_j = A_i2 + nskip;
1116         for (int j = i2 + 1; j < n; A_j += nskip, ++j)
1117         {
1118                 btScalar tmp = A_j[i1];
1119                 A_j[i1] = A_j[i2];
1120                 A_j[i2] = tmp;
1121         }
1122 #endif
1123 }
1124
1125 // swap two indexes in the n*n LCP problem. i1 must be <= i2.
1126
1127 static void btSwapProblem(BTATYPE A, btScalar *x, btScalar *b, btScalar *w, btScalar *lo,
1128                                                   btScalar *hi, int *p, bool *state, int *findex,
1129                                                   int n, int i1, int i2, int nskip,
1130                                                   int do_fast_row_swaps)
1131 {
1132         btScalar tmpr;
1133         int tmpi;
1134         bool tmpb;
1135         btAssert(n > 0 && i1 >= 0 && i2 >= 0 && i1 < n && i2 < n && nskip >= n && i1 <= i2);
1136         if (i1 == i2) return;
1137
1138         btSwapRowsAndCols(A, n, i1, i2, nskip, do_fast_row_swaps);
1139
1140         tmpr = x[i1];
1141         x[i1] = x[i2];
1142         x[i2] = tmpr;
1143
1144         tmpr = b[i1];
1145         b[i1] = b[i2];
1146         b[i2] = tmpr;
1147
1148         tmpr = w[i1];
1149         w[i1] = w[i2];
1150         w[i2] = tmpr;
1151
1152         tmpr = lo[i1];
1153         lo[i1] = lo[i2];
1154         lo[i2] = tmpr;
1155
1156         tmpr = hi[i1];
1157         hi[i1] = hi[i2];
1158         hi[i2] = tmpr;
1159
1160         tmpi = p[i1];
1161         p[i1] = p[i2];
1162         p[i2] = tmpi;
1163
1164         tmpb = state[i1];
1165         state[i1] = state[i2];
1166         state[i2] = tmpb;
1167
1168         if (findex)
1169         {
1170                 tmpi = findex[i1];
1171                 findex[i1] = findex[i2];
1172                 findex[i2] = tmpi;
1173         }
1174 }
1175
1176 //***************************************************************************
1177 // btLCP manipulator object. this represents an n*n LCP problem.
1178 //
1179 // two index sets C and N are kept. each set holds a subset of
1180 // the variable indexes 0..n-1. an index can only be in one set.
1181 // initially both sets are empty.
1182 //
1183 // the index set C is special: solutions to A(C,C)\A(C,i) can be generated.
1184
1185 //***************************************************************************
1186 // fast implementation of btLCP. see the above definition of btLCP for
1187 // interface comments.
1188 //
1189 // `p' records the permutation of A,x,b,w,etc. p is initially 1:n and is
1190 // permuted as the other vectors/matrices are permuted.
1191 //
1192 // A,x,b,w,lo,hi,state,findex,p,c are permuted such that sets C,N have
1193 // contiguous indexes. the don't-care indexes follow N.
1194 //
1195 // an L*D*L' factorization is maintained of A(C,C), and whenever indexes are
1196 // added or removed from the set C the factorization is updated.
1197 // thus L*D*L'=A[C,C], i.e. a permuted top left nC*nC submatrix of A.
1198 // the leading dimension of the matrix L is always `nskip'.
1199 //
1200 // at the start there may be other indexes that are unbounded but are not
1201 // included in `nub'. btLCP will permute the matrix so that absolutely all
1202 // unbounded vectors are at the start. thus there may be some initial
1203 // permutation.
1204 //
1205 // the algorithms here assume certain patterns, particularly with respect to
1206 // index transfer.
1207
1208 #ifdef btLCP_FAST
1209
1210 struct btLCP
1211 {
1212         const int m_n;
1213         const int m_nskip;
1214         int m_nub;
1215         int m_nC, m_nN;                                                         // size of each index set
1216         BTATYPE const m_A;                                                      // A rows
1217         btScalar *const m_x, *const m_b, *const m_w, *const m_lo, *const m_hi;  // permuted LCP problem data
1218         btScalar *const m_L, *const m_d;                                        // L*D*L' factorization of set C
1219         btScalar *const m_Dell, *const m_ell, *const m_tmp;
1220         bool *const m_state;
1221         int *const m_findex, *const m_p, *const m_C;
1222
1223         btLCP(int _n, int _nskip, int _nub, btScalar *_Adata, btScalar *_x, btScalar *_b, btScalar *_w,
1224                   btScalar *_lo, btScalar *_hi, btScalar *l, btScalar *_d,
1225                   btScalar *_Dell, btScalar *_ell, btScalar *_tmp,
1226                   bool *_state, int *_findex, int *p, int *c, btScalar **Arows);
1227         int getNub() const { return m_nub; }
1228         void transfer_i_to_C(int i);
1229         void transfer_i_to_N(int i) { m_nN++; }  // because we can assume C and N span 1:i-1
1230         void transfer_i_from_N_to_C(int i);
1231         void transfer_i_from_C_to_N(int i, btAlignedObjectArray<btScalar> &scratch);
1232         int numC() const { return m_nC; }
1233         int numN() const { return m_nN; }
1234         int indexC(int i) const { return i; }
1235         int indexN(int i) const { return i + m_nC; }
1236         btScalar Aii(int i) const { return BTAROW(i)[i]; }
1237         btScalar AiC_times_qC(int i, btScalar *q) const { return btLargeDot(BTAROW(i), q, m_nC); }
1238         btScalar AiN_times_qN(int i, btScalar *q) const { return btLargeDot(BTAROW(i) + m_nC, q + m_nC, m_nN); }
1239         void pN_equals_ANC_times_qC(btScalar *p, btScalar *q);
1240         void pN_plusequals_ANi(btScalar *p, int i, int sign = 1);
1241         void pC_plusequals_s_times_qC(btScalar *p, btScalar s, btScalar *q);
1242         void pN_plusequals_s_times_qN(btScalar *p, btScalar s, btScalar *q);
1243         void solve1(btScalar *a, int i, int dir = 1, int only_transfer = 0);
1244         void unpermute();
1245 };
1246
1247 btLCP::btLCP(int _n, int _nskip, int _nub, btScalar *_Adata, btScalar *_x, btScalar *_b, btScalar *_w,
1248                          btScalar *_lo, btScalar *_hi, btScalar *l, btScalar *_d,
1249                          btScalar *_Dell, btScalar *_ell, btScalar *_tmp,
1250                          bool *_state, int *_findex, int *p, int *c, btScalar **Arows) : m_n(_n), m_nskip(_nskip), m_nub(_nub), m_nC(0), m_nN(0),
1251 #ifdef BTROWPTRS
1252                                                                                                                                                          m_A(Arows),
1253 #else
1254                                                                                                                                                          m_A(_Adata),
1255 #endif
1256                                                                                                                                                          m_x(_x),
1257                                                                                                                                                          m_b(_b),
1258                                                                                                                                                          m_w(_w),
1259                                                                                                                                                          m_lo(_lo),
1260                                                                                                                                                          m_hi(_hi),
1261                                                                                                                                                          m_L(l),
1262                                                                                                                                                          m_d(_d),
1263                                                                                                                                                          m_Dell(_Dell),
1264                                                                                                                                                          m_ell(_ell),
1265                                                                                                                                                          m_tmp(_tmp),
1266                                                                                                                                                          m_state(_state),
1267                                                                                                                                                          m_findex(_findex),
1268                                                                                                                                                          m_p(p),
1269                                                                                                                                                          m_C(c)
1270 {
1271         {
1272                 btSetZero(m_x, m_n);
1273         }
1274
1275         {
1276 #ifdef BTROWPTRS
1277                 // make matrix row pointers
1278                 btScalar *aptr = _Adata;
1279                 BTATYPE A = m_A;
1280                 const int n = m_n, nskip = m_nskip;
1281                 for (int k = 0; k < n; aptr += nskip, ++k) A[k] = aptr;
1282 #endif
1283         }
1284
1285         {
1286                 int *p = m_p;
1287                 const int n = m_n;
1288                 for (int k = 0; k < n; ++k) p[k] = k;  // initially unpermuted
1289         }
1290
1291         /*
1292   // for testing, we can do some random swaps in the area i > nub
1293   {
1294     const int n = m_n;
1295     const int nub = m_nub;
1296     if (nub < n) {
1297     for (int k=0; k<100; k++) {
1298       int i1,i2;
1299       do {
1300         i1 = dRandInt(n-nub)+nub;
1301         i2 = dRandInt(n-nub)+nub;
1302       }
1303       while (i1 > i2);
1304       //printf ("--> %d %d\n",i1,i2);
1305       btSwapProblem (m_A,m_x,m_b,m_w,m_lo,m_hi,m_p,m_state,m_findex,n,i1,i2,m_nskip,0);
1306     }
1307   }
1308   */
1309
1310         // permute the problem so that *all* the unbounded variables are at the
1311         // start, i.e. look for unbounded variables not included in `nub'. we can
1312         // potentially push up `nub' this way and get a bigger initial factorization.
1313         // note that when we swap rows/cols here we must not just swap row pointers,
1314         // as the initial factorization relies on the data being all in one chunk.
1315         // variables that have findex >= 0 are *not* considered to be unbounded even
1316         // if lo=-inf and hi=inf - this is because these limits may change during the
1317         // solution process.
1318
1319         {
1320                 int *findex = m_findex;
1321                 btScalar *lo = m_lo, *hi = m_hi;
1322                 const int n = m_n;
1323                 for (int k = m_nub; k < n; ++k)
1324                 {
1325                         if (findex && findex[k] >= 0) continue;
1326                         if (lo[k] == -BT_INFINITY && hi[k] == BT_INFINITY)
1327                         {
1328                                 btSwapProblem(m_A, m_x, m_b, m_w, lo, hi, m_p, m_state, findex, n, m_nub, k, m_nskip, 0);
1329                                 m_nub++;
1330                         }
1331                 }
1332         }
1333
1334         // if there are unbounded variables at the start, factorize A up to that
1335         // point and solve for x. this puts all indexes 0..nub-1 into C.
1336         if (m_nub > 0)
1337         {
1338                 const int nub = m_nub;
1339                 {
1340                         btScalar *Lrow = m_L;
1341                         const int nskip = m_nskip;
1342                         for (int j = 0; j < nub; Lrow += nskip, ++j) memcpy(Lrow, BTAROW(j), (j + 1) * sizeof(btScalar));
1343                 }
1344                 btFactorLDLT(m_L, m_d, nub, m_nskip);
1345                 memcpy(m_x, m_b, nub * sizeof(btScalar));
1346                 btSolveLDLT(m_L, m_d, m_x, nub, m_nskip);
1347                 btSetZero(m_w, nub);
1348                 {
1349                         int *C = m_C;
1350                         for (int k = 0; k < nub; ++k) C[k] = k;
1351                 }
1352                 m_nC = nub;
1353         }
1354
1355         // permute the indexes > nub such that all findex variables are at the end
1356         if (m_findex)
1357         {
1358                 const int nub = m_nub;
1359                 int *findex = m_findex;
1360                 int num_at_end = 0;
1361                 for (int k = m_n - 1; k >= nub; k--)
1362                 {
1363                         if (findex[k] >= 0)
1364                         {
1365                                 btSwapProblem(m_A, m_x, m_b, m_w, m_lo, m_hi, m_p, m_state, findex, m_n, k, m_n - 1 - num_at_end, m_nskip, 1);
1366                                 num_at_end++;
1367                         }
1368                 }
1369         }
1370
1371         // print info about indexes
1372         /*
1373   {
1374     const int n = m_n;
1375     const int nub = m_nub;
1376     for (int k=0; k<n; k++) {
1377       if (k<nub) printf ("C");
1378       else if (m_lo[k]==-BT_INFINITY && m_hi[k]==BT_INFINITY) printf ("c");
1379       else printf (".");
1380     }
1381     printf ("\n");
1382   }
1383   */
1384 }
1385
1386 void btLCP::transfer_i_to_C(int i)
1387 {
1388         {
1389                 if (m_nC > 0)
1390                 {
1391                         // ell,Dell were computed by solve1(). note, ell = D \ L1solve (L,A(i,C))
1392                         {
1393                                 const int nC = m_nC;
1394                                 btScalar *const Ltgt = m_L + nC * m_nskip, *ell = m_ell;
1395                                 for (int j = 0; j < nC; ++j) Ltgt[j] = ell[j];
1396                         }
1397                         const int nC = m_nC;
1398                         m_d[nC] = btRecip(BTAROW(i)[i] - btLargeDot(m_ell, m_Dell, nC));
1399                 }
1400                 else
1401                 {
1402                         m_d[0] = btRecip(BTAROW(i)[i]);
1403                 }
1404
1405                 btSwapProblem(m_A, m_x, m_b, m_w, m_lo, m_hi, m_p, m_state, m_findex, m_n, m_nC, i, m_nskip, 1);
1406
1407                 const int nC = m_nC;
1408                 m_C[nC] = nC;
1409                 m_nC = nC + 1;  // nC value is outdated after this line
1410         }
1411 }
1412
1413 void btLCP::transfer_i_from_N_to_C(int i)
1414 {
1415         {
1416                 if (m_nC > 0)
1417                 {
1418                         {
1419                                 btScalar *const aptr = BTAROW(i);
1420                                 btScalar *Dell = m_Dell;
1421                                 const int *C = m_C;
1422 #ifdef BTNUB_OPTIMIZATIONS
1423                                 // if nub>0, initial part of aptr unpermuted
1424                                 const int nub = m_nub;
1425                                 int j = 0;
1426                                 for (; j < nub; ++j) Dell[j] = aptr[j];
1427                                 const int nC = m_nC;
1428                                 for (; j < nC; ++j) Dell[j] = aptr[C[j]];
1429 #else
1430                                 const int nC = m_nC;
1431                                 for (int j = 0; j < nC; ++j) Dell[j] = aptr[C[j]];
1432 #endif
1433                         }
1434                         btSolveL1(m_L, m_Dell, m_nC, m_nskip);
1435                         {
1436                                 const int nC = m_nC;
1437                                 btScalar *const Ltgt = m_L + nC * m_nskip;
1438                                 btScalar *ell = m_ell, *Dell = m_Dell, *d = m_d;
1439                                 for (int j = 0; j < nC; ++j) Ltgt[j] = ell[j] = Dell[j] * d[j];
1440                         }
1441                         const int nC = m_nC;
1442                         m_d[nC] = btRecip(BTAROW(i)[i] - btLargeDot(m_ell, m_Dell, nC));
1443                 }
1444                 else
1445                 {
1446                         m_d[0] = btRecip(BTAROW(i)[i]);
1447                 }
1448
1449                 btSwapProblem(m_A, m_x, m_b, m_w, m_lo, m_hi, m_p, m_state, m_findex, m_n, m_nC, i, m_nskip, 1);
1450
1451                 const int nC = m_nC;
1452                 m_C[nC] = nC;
1453                 m_nN--;
1454                 m_nC = nC + 1;  // nC value is outdated after this line
1455         }
1456
1457         // @@@ TO DO LATER
1458         // if we just finish here then we'll go back and re-solve for
1459         // delta_x. but actually we can be more efficient and incrementally
1460         // update delta_x here. but if we do this, we wont have ell and Dell
1461         // to use in updating the factorization later.
1462 }
1463
1464 void btRemoveRowCol(btScalar *A, int n, int nskip, int r)
1465 {
1466         btAssert(A && n > 0 && nskip >= n && r >= 0 && r < n);
1467         if (r >= n - 1) return;
1468         if (r > 0)
1469         {
1470                 {
1471                         const size_t move_size = (n - r - 1) * sizeof(btScalar);
1472                         btScalar *Adst = A + r;
1473                         for (int i = 0; i < r; Adst += nskip, ++i)
1474                         {
1475                                 btScalar *Asrc = Adst + 1;
1476                                 memmove(Adst, Asrc, move_size);
1477                         }
1478                 }
1479                 {
1480                         const size_t cpy_size = r * sizeof(btScalar);
1481                         btScalar *Adst = A + r * nskip;
1482                         for (int i = r; i < (n - 1); ++i)
1483                         {
1484                                 btScalar *Asrc = Adst + nskip;
1485                                 memcpy(Adst, Asrc, cpy_size);
1486                                 Adst = Asrc;
1487                         }
1488                 }
1489         }
1490         {
1491                 const size_t cpy_size = (n - r - 1) * sizeof(btScalar);
1492                 btScalar *Adst = A + r * (nskip + 1);
1493                 for (int i = r; i < (n - 1); ++i)
1494                 {
1495                         btScalar *Asrc = Adst + (nskip + 1);
1496                         memcpy(Adst, Asrc, cpy_size);
1497                         Adst = Asrc - 1;
1498                 }
1499         }
1500 }
1501
1502 void btLDLTAddTL(btScalar *L, btScalar *d, const btScalar *a, int n, int nskip, btAlignedObjectArray<btScalar> &scratch)
1503 {
1504         btAssert(L && d && a && n > 0 && nskip >= n);
1505
1506         if (n < 2) return;
1507         scratch.resize(2 * nskip);
1508         btScalar *W1 = &scratch[0];
1509
1510         btScalar *W2 = W1 + nskip;
1511
1512         W1[0] = btScalar(0.0);
1513         W2[0] = btScalar(0.0);
1514         for (int j = 1; j < n; ++j)
1515         {
1516                 W1[j] = W2[j] = (btScalar)(a[j] * SIMDSQRT12);
1517         }
1518         btScalar W11 = (btScalar)((btScalar(0.5) * a[0] + 1) * SIMDSQRT12);
1519         btScalar W21 = (btScalar)((btScalar(0.5) * a[0] - 1) * SIMDSQRT12);
1520
1521         btScalar alpha1 = btScalar(1.0);
1522         btScalar alpha2 = btScalar(1.0);
1523
1524         {
1525                 btScalar dee = d[0];
1526                 btScalar alphanew = alpha1 + (W11 * W11) * dee;
1527                 btAssert(alphanew != btScalar(0.0));
1528                 dee /= alphanew;
1529                 btScalar gamma1 = W11 * dee;
1530                 dee *= alpha1;
1531                 alpha1 = alphanew;
1532                 alphanew = alpha2 - (W21 * W21) * dee;
1533                 dee /= alphanew;
1534                 //btScalar gamma2 = W21 * dee;
1535                 alpha2 = alphanew;
1536                 btScalar k1 = btScalar(1.0) - W21 * gamma1;
1537                 btScalar k2 = W21 * gamma1 * W11 - W21;
1538                 btScalar *ll = L + nskip;
1539                 for (int p = 1; p < n; ll += nskip, ++p)
1540                 {
1541                         btScalar Wp = W1[p];
1542                         btScalar ell = *ll;
1543                         W1[p] = Wp - W11 * ell;
1544                         W2[p] = k1 * Wp + k2 * ell;
1545                 }
1546         }
1547
1548         btScalar *ll = L + (nskip + 1);
1549         for (int j = 1; j < n; ll += nskip + 1, ++j)
1550         {
1551                 btScalar k1 = W1[j];
1552                 btScalar k2 = W2[j];
1553
1554                 btScalar dee = d[j];
1555                 btScalar alphanew = alpha1 + (k1 * k1) * dee;
1556                 btAssert(alphanew != btScalar(0.0));
1557                 dee /= alphanew;
1558                 btScalar gamma1 = k1 * dee;
1559                 dee *= alpha1;
1560                 alpha1 = alphanew;
1561                 alphanew = alpha2 - (k2 * k2) * dee;
1562                 dee /= alphanew;
1563                 btScalar gamma2 = k2 * dee;
1564                 dee *= alpha2;
1565                 d[j] = dee;
1566                 alpha2 = alphanew;
1567
1568                 btScalar *l = ll + nskip;
1569                 for (int p = j + 1; p < n; l += nskip, ++p)
1570                 {
1571                         btScalar ell = *l;
1572                         btScalar Wp = W1[p] - k1 * ell;
1573                         ell += gamma1 * Wp;
1574                         W1[p] = Wp;
1575                         Wp = W2[p] - k2 * ell;
1576                         ell -= gamma2 * Wp;
1577                         W2[p] = Wp;
1578                         *l = ell;
1579                 }
1580         }
1581 }
1582
1583 #define _BTGETA(i, j) (A[i][j])
1584 //#define _GETA(i,j) (A[(i)*nskip+(j)])
1585 #define BTGETA(i, j) ((i > j) ? _BTGETA(i, j) : _BTGETA(j, i))
1586
1587 inline size_t btEstimateLDLTAddTLTmpbufSize(int nskip)
1588 {
1589         return nskip * 2 * sizeof(btScalar);
1590 }
1591
1592 void btLDLTRemove(btScalar **A, const int *p, btScalar *L, btScalar *d,
1593                                   int n1, int n2, int r, int nskip, btAlignedObjectArray<btScalar> &scratch)
1594 {
1595         btAssert(A && p && L && d && n1 > 0 && n2 > 0 && r >= 0 && r < n2 &&
1596                          n1 >= n2 && nskip >= n1);
1597 #ifdef BT_DEBUG
1598         for (int i = 0; i < n2; ++i)
1599                 btAssert(p[i] >= 0 && p[i] < n1);
1600 #endif
1601
1602         if (r == n2 - 1)
1603         {
1604                 return;  // deleting last row/col is easy
1605         }
1606         else
1607         {
1608                 size_t LDLTAddTL_size = btEstimateLDLTAddTLTmpbufSize(nskip);
1609                 btAssert(LDLTAddTL_size % sizeof(btScalar) == 0);
1610                 scratch.resize(nskip * 2 + n2);
1611                 btScalar *tmp = &scratch[0];
1612                 if (r == 0)
1613                 {
1614                         btScalar *a = (btScalar *)((char *)tmp + LDLTAddTL_size);
1615                         const int p_0 = p[0];
1616                         for (int i = 0; i < n2; ++i)
1617                         {
1618                                 a[i] = -BTGETA(p[i], p_0);
1619                         }
1620                         a[0] += btScalar(1.0);
1621                         btLDLTAddTL(L, d, a, n2, nskip, scratch);
1622                 }
1623                 else
1624                 {
1625                         btScalar *t = (btScalar *)((char *)tmp + LDLTAddTL_size);
1626                         {
1627                                 btScalar *Lcurr = L + r * nskip;
1628                                 for (int i = 0; i < r; ++Lcurr, ++i)
1629                                 {
1630                                         btAssert(d[i] != btScalar(0.0));
1631                                         t[i] = *Lcurr / d[i];
1632                                 }
1633                         }
1634                         btScalar *a = t + r;
1635                         {
1636                                 btScalar *Lcurr = L + r * nskip;
1637                                 const int *pp_r = p + r, p_r = *pp_r;
1638                                 const int n2_minus_r = n2 - r;
1639                                 for (int i = 0; i < n2_minus_r; Lcurr += nskip, ++i)
1640                                 {
1641                                         a[i] = btLargeDot(Lcurr, t, r) - BTGETA(pp_r[i], p_r);
1642                                 }
1643                         }
1644                         a[0] += btScalar(1.0);
1645                         btLDLTAddTL(L + r * nskip + r, d + r, a, n2 - r, nskip, scratch);
1646                 }
1647         }
1648
1649         // snip out row/column r from L and d
1650         btRemoveRowCol(L, n2, nskip, r);
1651         if (r < (n2 - 1)) memmove(d + r, d + r + 1, (n2 - r - 1) * sizeof(btScalar));
1652 }
1653
1654 void btLCP::transfer_i_from_C_to_N(int i, btAlignedObjectArray<btScalar> &scratch)
1655 {
1656         {
1657                 int *C = m_C;
1658                 // remove a row/column from the factorization, and adjust the
1659                 // indexes (black magic!)
1660                 int last_idx = -1;
1661                 const int nC = m_nC;
1662                 int j = 0;
1663                 for (; j < nC; ++j)
1664                 {
1665                         if (C[j] == nC - 1)
1666                         {
1667                                 last_idx = j;
1668                         }
1669                         if (C[j] == i)
1670                         {
1671                                 btLDLTRemove(m_A, C, m_L, m_d, m_n, nC, j, m_nskip, scratch);
1672                                 int k;
1673                                 if (last_idx == -1)
1674                                 {
1675                                         for (k = j + 1; k < nC; ++k)
1676                                         {
1677                                                 if (C[k] == nC - 1)
1678                                                 {
1679                                                         break;
1680                                                 }
1681                                         }
1682                                         btAssert(k < nC);
1683                                 }
1684                                 else
1685                                 {
1686                                         k = last_idx;
1687                                 }
1688                                 C[k] = C[j];
1689                                 if (j < (nC - 1)) memmove(C + j, C + j + 1, (nC - j - 1) * sizeof(int));
1690                                 break;
1691                         }
1692                 }
1693                 btAssert(j < nC);
1694
1695                 btSwapProblem(m_A, m_x, m_b, m_w, m_lo, m_hi, m_p, m_state, m_findex, m_n, i, nC - 1, m_nskip, 1);
1696
1697                 m_nN++;
1698                 m_nC = nC - 1;  // nC value is outdated after this line
1699         }
1700 }
1701
1702 void btLCP::pN_equals_ANC_times_qC(btScalar *p, btScalar *q)
1703 {
1704         // we could try to make this matrix-vector multiplication faster using
1705         // outer product matrix tricks, e.g. with the dMultidotX() functions.
1706         // but i tried it and it actually made things slower on random 100x100
1707         // problems because of the overhead involved. so we'll stick with the
1708         // simple method for now.
1709         const int nC = m_nC;
1710         btScalar *ptgt = p + nC;
1711         const int nN = m_nN;
1712         for (int i = 0; i < nN; ++i)
1713         {
1714                 ptgt[i] = btLargeDot(BTAROW(i + nC), q, nC);
1715         }
1716 }
1717
1718 void btLCP::pN_plusequals_ANi(btScalar *p, int i, int sign)
1719 {
1720         const int nC = m_nC;
1721         btScalar *aptr = BTAROW(i) + nC;
1722         btScalar *ptgt = p + nC;
1723         if (sign > 0)
1724         {
1725                 const int nN = m_nN;
1726                 for (int j = 0; j < nN; ++j) ptgt[j] += aptr[j];
1727         }
1728         else
1729         {
1730                 const int nN = m_nN;
1731                 for (int j = 0; j < nN; ++j) ptgt[j] -= aptr[j];
1732         }
1733 }
1734
1735 void btLCP::pC_plusequals_s_times_qC(btScalar *p, btScalar s, btScalar *q)
1736 {
1737         const int nC = m_nC;
1738         for (int i = 0; i < nC; ++i)
1739         {
1740                 p[i] += s * q[i];
1741         }
1742 }
1743
1744 void btLCP::pN_plusequals_s_times_qN(btScalar *p, btScalar s, btScalar *q)
1745 {
1746         const int nC = m_nC;
1747         btScalar *ptgt = p + nC, *qsrc = q + nC;
1748         const int nN = m_nN;
1749         for (int i = 0; i < nN; ++i)
1750         {
1751                 ptgt[i] += s * qsrc[i];
1752         }
1753 }
1754
1755 void btLCP::solve1(btScalar *a, int i, int dir, int only_transfer)
1756 {
1757         // the `Dell' and `ell' that are computed here are saved. if index i is
1758         // later added to the factorization then they can be reused.
1759         //
1760         // @@@ question: do we need to solve for entire delta_x??? yes, but
1761         //     only if an x goes below 0 during the step.
1762
1763         if (m_nC > 0)
1764         {
1765                 {
1766                         btScalar *Dell = m_Dell;
1767                         int *C = m_C;
1768                         btScalar *aptr = BTAROW(i);
1769 #ifdef BTNUB_OPTIMIZATIONS
1770                         // if nub>0, initial part of aptr[] is guaranteed unpermuted
1771                         const int nub = m_nub;
1772                         int j = 0;
1773                         for (; j < nub; ++j) Dell[j] = aptr[j];
1774                         const int nC = m_nC;
1775                         for (; j < nC; ++j) Dell[j] = aptr[C[j]];
1776 #else
1777                         const int nC = m_nC;
1778                         for (int j = 0; j < nC; ++j) Dell[j] = aptr[C[j]];
1779 #endif
1780                 }
1781                 btSolveL1(m_L, m_Dell, m_nC, m_nskip);
1782                 {
1783                         btScalar *ell = m_ell, *Dell = m_Dell, *d = m_d;
1784                         const int nC = m_nC;
1785                         for (int j = 0; j < nC; ++j) ell[j] = Dell[j] * d[j];
1786                 }
1787
1788                 if (!only_transfer)
1789                 {
1790                         btScalar *tmp = m_tmp, *ell = m_ell;
1791                         {
1792                                 const int nC = m_nC;
1793                                 for (int j = 0; j < nC; ++j) tmp[j] = ell[j];
1794                         }
1795                         btSolveL1T(m_L, tmp, m_nC, m_nskip);
1796                         if (dir > 0)
1797                         {
1798                                 int *C = m_C;
1799                                 btScalar *tmp = m_tmp;
1800                                 const int nC = m_nC;
1801                                 for (int j = 0; j < nC; ++j) a[C[j]] = -tmp[j];
1802                         }
1803                         else
1804                         {
1805                                 int *C = m_C;
1806                                 btScalar *tmp = m_tmp;
1807                                 const int nC = m_nC;
1808                                 for (int j = 0; j < nC; ++j) a[C[j]] = tmp[j];
1809                         }
1810                 }
1811         }
1812 }
1813
1814 void btLCP::unpermute()
1815 {
1816         // now we have to un-permute x and w
1817         {
1818                 memcpy(m_tmp, m_x, m_n * sizeof(btScalar));
1819                 btScalar *x = m_x, *tmp = m_tmp;
1820                 const int *p = m_p;
1821                 const int n = m_n;
1822                 for (int j = 0; j < n; ++j) x[p[j]] = tmp[j];
1823         }
1824         {
1825                 memcpy(m_tmp, m_w, m_n * sizeof(btScalar));
1826                 btScalar *w = m_w, *tmp = m_tmp;
1827                 const int *p = m_p;
1828                 const int n = m_n;
1829                 for (int j = 0; j < n; ++j) w[p[j]] = tmp[j];
1830         }
1831 }
1832
1833 #endif  // btLCP_FAST
1834
1835 //***************************************************************************
1836 // an optimized Dantzig LCP driver routine for the lo-hi LCP problem.
1837
1838 bool btSolveDantzigLCP(int n, btScalar *A, btScalar *x, btScalar *b,
1839                                            btScalar *outer_w, int nub, btScalar *lo, btScalar *hi, int *findex, btDantzigScratchMemory &scratchMem)
1840 {
1841         s_error = false;
1842
1843         //      printf("btSolveDantzigLCP n=%d\n",n);
1844         btAssert(n > 0 && A && x && b && lo && hi && nub >= 0 && nub <= n);
1845         btAssert(outer_w);
1846
1847 #ifdef BT_DEBUG
1848         {
1849                 // check restrictions on lo and hi
1850                 for (int k = 0; k < n; ++k)
1851                         btAssert(lo[k] <= 0 && hi[k] >= 0);
1852         }
1853 #endif
1854
1855         // if all the variables are unbounded then we can just factor, solve,
1856         // and return
1857         if (nub >= n)
1858         {
1859                 int nskip = (n);
1860                 btFactorLDLT(A, outer_w, n, nskip);
1861                 btSolveLDLT(A, outer_w, b, n, nskip);
1862                 memcpy(x, b, n * sizeof(btScalar));
1863
1864                 return !s_error;
1865         }
1866
1867         const int nskip = (n);
1868         scratchMem.L.resize(n * nskip);
1869
1870         scratchMem.d.resize(n);
1871
1872         btScalar *w = outer_w;
1873         scratchMem.delta_w.resize(n);
1874         scratchMem.delta_x.resize(n);
1875         scratchMem.Dell.resize(n);
1876         scratchMem.ell.resize(n);
1877         scratchMem.Arows.resize(n);
1878         scratchMem.p.resize(n);
1879         scratchMem.C.resize(n);
1880
1881         // for i in N, state[i] is 0 if x(i)==lo(i) or 1 if x(i)==hi(i)
1882         scratchMem.state.resize(n);
1883
1884         // create LCP object. note that tmp is set to delta_w to save space, this
1885         // optimization relies on knowledge of how tmp is used, so be careful!
1886         btLCP lcp(n, nskip, nub, A, x, b, w, lo, hi, &scratchMem.L[0], &scratchMem.d[0], &scratchMem.Dell[0], &scratchMem.ell[0], &scratchMem.delta_w[0], &scratchMem.state[0], findex, &scratchMem.p[0], &scratchMem.C[0], &scratchMem.Arows[0]);
1887         int adj_nub = lcp.getNub();
1888
1889         // loop over all indexes adj_nub..n-1. for index i, if x(i),w(i) satisfy the
1890         // LCP conditions then i is added to the appropriate index set. otherwise
1891         // x(i),w(i) is driven either +ve or -ve to force it to the valid region.
1892         // as we drive x(i), x(C) is also adjusted to keep w(C) at zero.
1893         // while driving x(i) we maintain the LCP conditions on the other variables
1894         // 0..i-1. we do this by watching out for other x(i),w(i) values going
1895         // outside the valid region, and then switching them between index sets
1896         // when that happens.
1897
1898         bool hit_first_friction_index = false;
1899         for (int i = adj_nub; i < n; ++i)
1900         {
1901                 s_error = false;
1902                 // the index i is the driving index and indexes i+1..n-1 are "dont care",
1903                 // i.e. when we make changes to the system those x's will be zero and we
1904                 // don't care what happens to those w's. in other words, we only consider
1905                 // an (i+1)*(i+1) sub-problem of A*x=b+w.
1906
1907                 // if we've hit the first friction index, we have to compute the lo and
1908                 // hi values based on the values of x already computed. we have been
1909                 // permuting the indexes, so the values stored in the findex vector are
1910                 // no longer valid. thus we have to temporarily unpermute the x vector.
1911                 // for the purposes of this computation, 0*infinity = 0 ... so if the
1912                 // contact constraint's normal force is 0, there should be no tangential
1913                 // force applied.
1914
1915                 if (!hit_first_friction_index && findex && findex[i] >= 0)
1916                 {
1917                         // un-permute x into delta_w, which is not being used at the moment
1918                         for (int j = 0; j < n; ++j) scratchMem.delta_w[scratchMem.p[j]] = x[j];
1919
1920                         // set lo and hi values
1921                         for (int k = i; k < n; ++k)
1922                         {
1923                                 btScalar wfk = scratchMem.delta_w[findex[k]];
1924                                 if (wfk == 0)
1925                                 {
1926                                         hi[k] = 0;
1927                                         lo[k] = 0;
1928                                 }
1929                                 else
1930                                 {
1931                                         hi[k] = btFabs(hi[k] * wfk);
1932                                         lo[k] = -hi[k];
1933                                 }
1934                         }
1935                         hit_first_friction_index = true;
1936                 }
1937
1938                 // thus far we have not even been computing the w values for indexes
1939                 // greater than i, so compute w[i] now.
1940                 w[i] = lcp.AiC_times_qC(i, x) + lcp.AiN_times_qN(i, x) - b[i];
1941
1942                 // if lo=hi=0 (which can happen for tangential friction when normals are
1943                 // 0) then the index will be assigned to set N with some state. however,
1944                 // set C's line has zero size, so the index will always remain in set N.
1945                 // with the "normal" switching logic, if w changed sign then the index
1946                 // would have to switch to set C and then back to set N with an inverted
1947                 // state. this is pointless, and also computationally expensive. to
1948                 // prevent this from happening, we use the rule that indexes with lo=hi=0
1949                 // will never be checked for set changes. this means that the state for
1950                 // these indexes may be incorrect, but that doesn't matter.
1951
1952                 // see if x(i),w(i) is in a valid region
1953                 if (lo[i] == 0 && w[i] >= 0)
1954                 {
1955                         lcp.transfer_i_to_N(i);
1956                         scratchMem.state[i] = false;
1957                 }
1958                 else if (hi[i] == 0 && w[i] <= 0)
1959                 {
1960                         lcp.transfer_i_to_N(i);
1961                         scratchMem.state[i] = true;
1962                 }
1963                 else if (w[i] == 0)
1964                 {
1965                         // this is a degenerate case. by the time we get to this test we know
1966                         // that lo != 0, which means that lo < 0 as lo is not allowed to be +ve,
1967                         // and similarly that hi > 0. this means that the line segment
1968                         // corresponding to set C is at least finite in extent, and we are on it.
1969                         // NOTE: we must call lcp.solve1() before lcp.transfer_i_to_C()
1970                         lcp.solve1(&scratchMem.delta_x[0], i, 0, 1);
1971
1972                         lcp.transfer_i_to_C(i);
1973                 }
1974                 else
1975                 {
1976                         // we must push x(i) and w(i)
1977                         for (;;)
1978                         {
1979                                 int dir;
1980                                 btScalar dirf;
1981                                 // find direction to push on x(i)
1982                                 if (w[i] <= 0)
1983                                 {
1984                                         dir = 1;
1985                                         dirf = btScalar(1.0);
1986                                 }
1987                                 else
1988                                 {
1989                                         dir = -1;
1990                                         dirf = btScalar(-1.0);
1991                                 }
1992
1993                                 // compute: delta_x(C) = -dir*A(C,C)\A(C,i)
1994                                 lcp.solve1(&scratchMem.delta_x[0], i, dir);
1995
1996                                 // note that delta_x[i] = dirf, but we wont bother to set it
1997
1998                                 // compute: delta_w = A*delta_x ... note we only care about
1999                                 // delta_w(N) and delta_w(i), the rest is ignored
2000                                 lcp.pN_equals_ANC_times_qC(&scratchMem.delta_w[0], &scratchMem.delta_x[0]);
2001                                 lcp.pN_plusequals_ANi(&scratchMem.delta_w[0], i, dir);
2002                                 scratchMem.delta_w[i] = lcp.AiC_times_qC(i, &scratchMem.delta_x[0]) + lcp.Aii(i) * dirf;
2003
2004                                 // find largest step we can take (size=s), either to drive x(i),w(i)
2005                                 // to the valid LCP region or to drive an already-valid variable
2006                                 // outside the valid region.
2007
2008                                 int cmd = 1;  // index switching command
2009                                 int si = 0;   // si = index to switch if cmd>3
2010                                 btScalar s = -w[i] / scratchMem.delta_w[i];
2011                                 if (dir > 0)
2012                                 {
2013                                         if (hi[i] < BT_INFINITY)
2014                                         {
2015                                                 btScalar s2 = (hi[i] - x[i]) * dirf;  // was (hi[i]-x[i])/dirf  // step to x(i)=hi(i)
2016                                                 if (s2 < s)
2017                                                 {
2018                                                         s = s2;
2019                                                         cmd = 3;
2020                                                 }
2021                                         }
2022                                 }
2023                                 else
2024                                 {
2025                                         if (lo[i] > -BT_INFINITY)
2026                                         {
2027                                                 btScalar s2 = (lo[i] - x[i]) * dirf;  // was (lo[i]-x[i])/dirf  // step to x(i)=lo(i)
2028                                                 if (s2 < s)
2029                                                 {
2030                                                         s = s2;
2031                                                         cmd = 2;
2032                                                 }
2033                                         }
2034                                 }
2035
2036                                 {
2037                                         const int numN = lcp.numN();
2038                                         for (int k = 0; k < numN; ++k)
2039                                         {
2040                                                 const int indexN_k = lcp.indexN(k);
2041                                                 if (!scratchMem.state[indexN_k] ? scratchMem.delta_w[indexN_k] < 0 : scratchMem.delta_w[indexN_k] > 0)
2042                                                 {
2043                                                         // don't bother checking if lo=hi=0
2044                                                         if (lo[indexN_k] == 0 && hi[indexN_k] == 0) continue;
2045                                                         btScalar s2 = -w[indexN_k] / scratchMem.delta_w[indexN_k];
2046                                                         if (s2 < s)
2047                                                         {
2048                                                                 s = s2;
2049                                                                 cmd = 4;
2050                                                                 si = indexN_k;
2051                                                         }
2052                                                 }
2053                                         }
2054                                 }
2055
2056                                 {
2057                                         const int numC = lcp.numC();
2058                                         for (int k = adj_nub; k < numC; ++k)
2059                                         {
2060                                                 const int indexC_k = lcp.indexC(k);
2061                                                 if (scratchMem.delta_x[indexC_k] < 0 && lo[indexC_k] > -BT_INFINITY)
2062                                                 {
2063                                                         btScalar s2 = (lo[indexC_k] - x[indexC_k]) / scratchMem.delta_x[indexC_k];
2064                                                         if (s2 < s)
2065                                                         {
2066                                                                 s = s2;
2067                                                                 cmd = 5;
2068                                                                 si = indexC_k;
2069                                                         }
2070                                                 }
2071                                                 if (scratchMem.delta_x[indexC_k] > 0 && hi[indexC_k] < BT_INFINITY)
2072                                                 {
2073                                                         btScalar s2 = (hi[indexC_k] - x[indexC_k]) / scratchMem.delta_x[indexC_k];
2074                                                         if (s2 < s)
2075                                                         {
2076                                                                 s = s2;
2077                                                                 cmd = 6;
2078                                                                 si = indexC_k;
2079                                                         }
2080                                                 }
2081                                         }
2082                                 }
2083
2084                                 //static char* cmdstring[8] = {0,"->C","->NL","->NH","N->C",
2085                                 //                           "C->NL","C->NH"};
2086                                 //printf ("cmd=%d (%s), si=%d\n",cmd,cmdstring[cmd],(cmd>3) ? si : i);
2087
2088                                 // if s <= 0 then we've got a problem. if we just keep going then
2089                                 // we're going to get stuck in an infinite loop. instead, just cross
2090                                 // our fingers and exit with the current solution.
2091                                 if (s <= btScalar(0.0))
2092                                 {
2093                                         //          printf("LCP internal error, s <= 0 (s=%.4e)",(double)s);
2094                                         if (i < n)
2095                                         {
2096                                                 btSetZero(x + i, n - i);
2097                                                 btSetZero(w + i, n - i);
2098                                         }
2099                                         s_error = true;
2100                                         break;
2101                                 }
2102
2103                                 // apply x = x + s * delta_x
2104                                 lcp.pC_plusequals_s_times_qC(x, s, &scratchMem.delta_x[0]);
2105                                 x[i] += s * dirf;
2106
2107                                 // apply w = w + s * delta_w
2108                                 lcp.pN_plusequals_s_times_qN(w, s, &scratchMem.delta_w[0]);
2109                                 w[i] += s * scratchMem.delta_w[i];
2110
2111                                 //        void *tmpbuf;
2112                                 // switch indexes between sets if necessary
2113                                 switch (cmd)
2114                                 {
2115                                         case 1:  // done
2116                                                 w[i] = 0;
2117                                                 lcp.transfer_i_to_C(i);
2118                                                 break;
2119                                         case 2:  // done
2120                                                 x[i] = lo[i];
2121                                                 scratchMem.state[i] = false;
2122                                                 lcp.transfer_i_to_N(i);
2123                                                 break;
2124                                         case 3:  // done
2125                                                 x[i] = hi[i];
2126                                                 scratchMem.state[i] = true;
2127                                                 lcp.transfer_i_to_N(i);
2128                                                 break;
2129                                         case 4:  // keep going
2130                                                 w[si] = 0;
2131                                                 lcp.transfer_i_from_N_to_C(si);
2132                                                 break;
2133                                         case 5:  // keep going
2134                                                 x[si] = lo[si];
2135                                                 scratchMem.state[si] = false;
2136                                                 lcp.transfer_i_from_C_to_N(si, scratchMem.m_scratch);
2137                                                 break;
2138                                         case 6:  // keep going
2139                                                 x[si] = hi[si];
2140                                                 scratchMem.state[si] = true;
2141                                                 lcp.transfer_i_from_C_to_N(si, scratchMem.m_scratch);
2142                                                 break;
2143                                 }
2144
2145                                 if (cmd <= 3) break;
2146                         }  // for (;;)
2147                 }      // else
2148
2149                 if (s_error)
2150                 {
2151                         break;
2152                 }
2153         }  // for (int i=adj_nub; i<n; ++i)
2154
2155         lcp.unpermute();
2156
2157         return !s_error;
2158 }