internal/ceres/schur_eliminator.h

   1 // Ceres Solver - A fast non-linear least squares minimizer
   2 // Copyright 2015 Google Inc. All rights reserved.
   3 // http://ceres-solver.org/
   4 //
   5 // Redistribution and use in source and binary forms, with or without
   6 // modification, are permitted provided that the following conditions are met:
   7 //
   8 // * Redistributions of source code must retain the above copyright notice,
   9 //   this list of conditions and the following disclaimer.
  10 // * Redistributions in binary form must reproduce the above copyright notice,
  11 //   this list of conditions and the following disclaimer in the documentation
  12 //   and/or other materials provided with the distribution.
  13 // * Neither the name of Google Inc. nor the names of its contributors may be
  14 //   used to endorse or promote products derived from this software without
  15 //   specific prior written permission.
  16 //
  17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  22 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  23 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  24 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  25 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  26 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  27 // POSSIBILITY OF SUCH DAMAGE.
  28 //
  29 // Author: sameeragarwal@google.com (Sameer Agarwal)
  30
  31 #ifndef CERES_INTERNAL_SCHUR_ELIMINATOR_H_
  32 #define CERES_INTERNAL_SCHUR_ELIMINATOR_H_
  33
  34 #include <map>
  35 #include <vector>
  36 #include "ceres/mutex.h"
  37 #include "ceres/block_random_access_matrix.h"
  38 #include "ceres/block_sparse_matrix.h"
  39 #include "ceres/block_structure.h"
  40 #include "ceres/linear_solver.h"
  41 #include "ceres/internal/eigen.h"
  42 #include "ceres/internal/scoped_ptr.h"
  43
  44 namespace ceres {
  45 namespace internal {
  46
  47 // Classes implementing the SchurEliminatorBase interface implement
  48 // variable elimination for linear least squares problems. Assuming
  49 // that the input linear system Ax = b can be partitioned into
  50 //
  51 //  E y + F z = b
  52 //
   53 // Where x = [y;z] is a partition of the variables.  The partitioning
   54 // of the variables is such that, E'E is a block diagonal matrix. Or
   55 // in other words, the parameter blocks in E form an independent set
   56 // of the graph implied by the block matrix A'A. Then, this
  57 // class provides the functionality to compute the Schur complement
  58 // system
  59 //
  60 //   S z = r
  61 //
  62 // where
  63 //
   64 //   S = F'F - F'E (E'E)^(-1) E'F and r = F'b - F'E(E'E)^(-1) E'b
  65 //
  66 // This is the Eliminate operation, i.e., construct the linear system
  67 // obtained by eliminating the variables in E.
  68 //
  69 // The eliminator also provides the reverse functionality, i.e. given
  70 // values for z it can back substitute for the values of y, by solving the
  71 // linear system
  72 //
  73 //  Ey = b - F z
  74 //
  75 // which is done by observing that
  76 //
  77 //  y = (E'E)^(-1) [E'b - E'F z]
  78 //
  79 // The eliminator has a number of requirements.
  80 //
  81 // The rows of A are ordered so that for every variable block in y,
  82 // all the rows containing that variable block occur as a vertically
   83 // contiguous block. i.e. the matrix A looks like
  84 //
  85 //              E                 F                   chunk
  86 //  A = [ y1   0   0   0 |  z1    0    0   0    z5]     1
  87 //      [ y1   0   0   0 |  z1   z2    0   0     0]     1
  88 //      [  0  y2   0   0 |   0    0   z3   0     0]     2
  89 //      [  0   0  y3   0 |  z1   z2   z3  z4    z5]     3
  90 //      [  0   0  y3   0 |  z1    0    0   0    z5]     3
  91 //      [  0   0   0  y4 |   0    0    0   0    z5]     4
  92 //      [  0   0   0  y4 |   0   z2    0   0     0]     4
  93 //      [  0   0   0  y4 |   0    0    0   0     0]     4
  94 //      [  0   0   0   0 |  z1    0    0   0     0] non chunk blocks
  95 //      [  0   0   0   0 |   0    0   z3  z4    z5] non chunk blocks
  96 //
  97 // This structure should be reflected in the corresponding
  98 // CompressedRowBlockStructure object associated with A. The linear
  99 // system Ax = b should either be well posed or the array D below
 100 // should be non-null and the diagonal matrix corresponding to it
 101 // should be non-singular. For simplicity of exposition only the case
 102 // with a null D is described.
 103 //
 104 // The usual way to do the elimination is as follows. Starting with
 105 //
 106 //  E y + F z = b
 107 //
 108 // we can form the normal equations,
 109 //
 110 //  E'E y + E'F z = E'b
 111 //  F'E y + F'F z = F'b
 112 //
 113 // multiplying both sides of the first equation by (E'E)^(-1) and then
 114 // by F'E we get
 115 //
 116 //  F'E y + F'E (E'E)^(-1) E'F z =  F'E (E'E)^(-1) E'b
 117 //  F'E y +                F'F z =  F'b
 118 //
 119 // now subtracting the two equations we get
 120 //
  121 // [F'F - F'E (E'E)^(-1) E'F] z = F'b - F'E(E'E)^(-1) E'b
 122 //
 123 // Instead of forming the normal equations and operating on them as
 124 // general sparse matrices, the algorithm here deals with one
 125 // parameter block in y at a time. The rows corresponding to a single
 126 // parameter block yi are known as a chunk, and the algorithm operates
 127 // on one chunk at a time. The mathematics remains the same since the
 128 // reduced linear system can be shown to be the sum of the reduced
 129 // linear systems for each chunk. This can be seen by observing two
 130 // things.
 131 //
 132 //  1. E'E is a block diagonal matrix.
 133 //
 134 //  2. When E'F is computed, only the terms within a single chunk
  135 //  interact, i.e. for y1 column blocks when transposed and multiplied
 136 //  with F, the only non-zero contribution comes from the blocks in
 137 //  chunk1.
 138 //
 139 // Thus, the reduced linear system
 140 //
  141 //  F'F - F'E (E'E)^(-1) E'F
  142 //
  143 // can be re-written as
  144 //
  145 //  sum_k F_k'F_k - F_k'E_k (E_k'E_k)^(-1) E_k'F_k
  146 //
  147 // Where the sum is over chunks and E_k'E_k is a dense matrix of size
  148 // y_k x y_k, i.e. the size of the y block eliminated by chunk k.
 149 //
  150 // Advanced usage. Until now it has been assumed that the user would
  151 // be interested in all of the Schur Complement S. However, it is also
  152 // possible to use this eliminator to obtain an arbitrary submatrix of
  153 // the full Schur complement. When the eliminator is generating the
  154 // blocks of S, it asks the BlockRandomAccessMatrix instance passed to
 155 // it if it has storage for that block. If it does, the eliminator
 156 // computes/updates it, if not it is skipped. This is useful when one
 157 // is interested in constructing a preconditioner based on the Schur
 158 // Complement, e.g., computing the block diagonal of S so that it can
 159 // be used as a preconditioner for an Iterative Substructuring based
  160 // solver [See Agarwal et al, Bundle Adjustment in the Large, ECCV
  161 // 2010 for an example of such use].
 162 //
 163 // Example usage: Please see schur_complement_solver.cc
  164 class SchurEliminatorBase {
  165  public:
        // Virtual so that a concrete eliminator can be destroyed through a
        // SchurEliminatorBase pointer, such as the one returned by Create().
  166   virtual ~SchurEliminatorBase() {}
  167
  168   // Initialize the eliminator. It is the user's responsibility to call
  169   // this function before calling Eliminate or BackSubstitute. It is
  170   // also the caller's responsibility to ensure that the
  171   // CompressedRowBlockStructure object passed to this method is the
  172   // same one (or is equivalent to) the one associated with the
  173   // BlockSparseMatrix objects below.
  174   //
  175   // assume_full_rank_ete controls how the eliminator inverts the
  176   // diagonal blocks corresponding to e blocks in A'A. If
  177   // assume_full_rank_ete is true, then a Cholesky factorization is
  178   // used to compute the inverse, otherwise a singular value
  179   // decomposition is used to compute the pseudo inverse.
        //
        // NOTE(review): num_eliminate_blocks is presumably the number of
        // column blocks of A that form E (the blocks to eliminate) --
        // confirm against the implementation.
  180   virtual void Init(int num_eliminate_blocks,
  181                     bool assume_full_rank_ete,
  182                     const CompressedRowBlockStructure* bs) = 0;
  183
  184   // Compute the Schur complement system from the augmented linear
  185   // least squares problem [A;D] x = [b;0]. The left hand side and the
  186   // right hand side of the reduced linear system are returned in lhs
  187   // and rhs respectively.
  188   //
  189   // It is the caller's responsibility to construct and initialize
  190   // lhs. Depending upon the structure of the lhs object passed here,
  191   // the full or a submatrix of the Schur complement will be computed.
  192   //
  193   // Since the Schur complement is a symmetric matrix, only the upper
  194   // triangular part of the Schur complement is computed.
        //
        // NOTE(review): rhs is presumably a caller-allocated array with one
        // entry per column of F -- confirm against the implementation.
  195   virtual void Eliminate(const BlockSparseMatrix* A,
  196                          const double* b,
  197                          const double* D,
  198                          BlockRandomAccessMatrix* lhs,
  199                          double* rhs) = 0;
  200
  201   // Given values for the variables z in the F block of A, solve for
  202   // the optimal values of the variables y corresponding to the E
  203   // block in A. The result is written to y.
  204   virtual void BackSubstitute(const BlockSparseMatrix* A,
  205                               const double* b,
  206                               const double* D,
  207                               const double* z,
  208                               double* y) = 0;
  209   // Factory: returns an eliminator configured from options.
        // NOTE(review): the result is a raw pointer; presumably the caller
        // takes ownership of it -- confirm against the definition.
  210   static SchurEliminatorBase* Create(const LinearSolver::Options& options);
  211 };
 212
  213 // Templated implementation of the SchurEliminatorBase interface. The
  214 // templating is on the sizes of the row, e and f blocks in the
  215 // input matrix. In many problems, the sizes of one or more of these
  216 // blocks are constant, in that case, it's worth passing these
  217 // parameters as template arguments so that they are visible to the
  218 // compiler and can be used for compile time optimization of the low
  219 // level linear algebra routines.
 220 //
  221 // This implementation is multithreaded using OpenMP. The level of
 222 // parallelism is controlled by LinearSolver::Options::num_threads.
 223 template <int kRowBlockSize = Eigen::Dynamic,
 224           int kEBlockSize = Eigen::Dynamic,
 225           int kFBlockSize = Eigen::Dynamic >
 226 class SchurEliminator : public SchurEliminatorBase {
 227  public:
 228   explicit SchurEliminator(const LinearSolver::Options& options)
 229       : num_threads_(options.num_threads) {
 230   }
 231
 232   // SchurEliminatorBase Interface
 233   virtual ~SchurEliminator();
 234   virtual void Init(int num_eliminate_blocks,
 235                     bool assume_full_rank_ete,
 236                     const CompressedRowBlockStructure* bs);
 237   virtual void Eliminate(const BlockSparseMatrix* A,
 238                          const double* b,
 239                          const double* D,
 240                          BlockRandomAccessMatrix* lhs,
 241                          double* rhs);
 242   virtual void BackSubstitute(const BlockSparseMatrix* A,
 243                               const double* b,
 244                               const double* D,
 245                               const double* z,
 246                               double* y);
 247
 248  private:
 249   // Chunk objects store combinatorial information needed to
 250   // efficiently eliminate a whole chunk out of the least squares
 251   // problem. Consider the first chunk in the example matrix above.
 252   //
 253   //      [ y1   0   0   0 |  z1    0    0   0    z5]
 254   //      [ y1   0   0   0 |  z1   z2    0   0     0]
 255   //
 256   // One of the intermediate quantities that needs to be calculated is
 257   // for each row the product of the y block transposed with the
 258   // non-zero z block, and the sum of these blocks across rows. A
 259   // temporary array "buffer_" is used for computing and storing them
 260   // and the buffer_layout maps the indices of the z-blocks to
 261   // position in the buffer_ array.  The size of the chunk is the
 262   // number of row blocks/residual blocks for the particular y block
 263   // being considered.
 264   //
 265   // For the example chunk shown above,
 266   //
 267   // size = 2
 268   //
 269   // The entries of buffer_layout will be filled in the following order.
 270   //
 271   // buffer_layout[z1] = 0
 272   // buffer_layout[z5] = y1 * z1
 273   // buffer_layout[z2] = y1 * z1 + y1 * z5
 274   typedef std::map<int, int> BufferLayoutType;
 275   struct Chunk {
 276     Chunk() : size(0) {}
 277     int size;
 278     int start;
 279     BufferLayoutType buffer_layout;
 280   };
 281
 282   void ChunkDiagonalBlockAndGradient(
 283       const Chunk& chunk,
 284       const BlockSparseMatrix* A,
 285       const double* b,
 286       int row_block_counter,
 287       typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* eet,
 288       double* g,
 289       double* buffer,
 290       BlockRandomAccessMatrix* lhs);
 291
 292   void UpdateRhs(const Chunk& chunk,
 293                  const BlockSparseMatrix* A,
 294                  const double* b,
 295                  int row_block_counter,
 296                  const double* inverse_ete_g,
 297                  double* rhs);
 298
 299   void ChunkOuterProduct(const CompressedRowBlockStructure* bs,
 300                          const Matrix& inverse_eet,
 301                          const double* buffer,
 302                          const BufferLayoutType& buffer_layout,
 303                          BlockRandomAccessMatrix* lhs);
 304   void EBlockRowOuterProduct(const BlockSparseMatrix* A,
 305                              int row_block_index,
 306                              BlockRandomAccessMatrix* lhs);
 307
 308
 309   void NoEBlockRowsUpdate(const BlockSparseMatrix* A,
 310                              const double* b,
 311                              int row_block_counter,
 312                              BlockRandomAccessMatrix* lhs,
 313                              double* rhs);
 314
 315   void NoEBlockRowOuterProduct(const BlockSparseMatrix* A,
 316                                int row_block_index,
 317                                BlockRandomAccessMatrix* lhs);
 318
 319   int num_threads_;
 320   int num_eliminate_blocks_;
 321   bool assume_full_rank_ete_;
 322
 323   // Block layout of the columns of the reduced linear system. Since
 324   // the f blocks can be of varying size, this vector stores the
 325   // position of each f block in the row/col of the reduced linear
 326   // system. Thus lhs_row_layout_[i] is the row/col position of the
 327   // i^th f block.
 328   std::vector<int> lhs_row_layout_;
 329
 330   // Combinatorial structure of the chunks in A. For more information
 331   // see the documentation of the Chunk object above.
 332   std::vector<Chunk> chunks_;
 333
 334   // TODO(sameeragarwal): The following two arrays contain per-thread
 335   // storage. They should be refactored into a per thread struct.
 336
 337   // Buffer to store the products of the y and z blocks generated
 338   // during the elimination phase. buffer_ is of size num_threads *
 339   // buffer_size_. Each thread accesses the chunk
 340   //
 341   //   [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_]
 342   //
 343   scoped_array<double> buffer_;
 344
 345   // Buffer to store per thread matrix matrix products used by
 346   // ChunkOuterProduct. Like buffer_ it is of size num_threads *
 347   // buffer_size_. Each thread accesses the chunk
 348   //
 349   //   [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ -1]
 350   //
 351   scoped_array<double> chunk_outer_product_buffer_;
 352
 353   int buffer_size_;
 354   int uneliminated_row_begins_;
 355
 356   // Locks for the blocks in the right hand side of the reduced linear
 357   // system.
 358   std::vector<Mutex*> rhs_locks_;
 359 };
 360
 361 }  // namespace internal
 362 }  // namespace ceres
 363
 364 #endif  // CERES_INTERNAL_SCHUR_ELIMINATOR_H_