internal/ceres/inner_product_computer.cc

   1 // Ceres Solver - A fast non-linear least squares minimizer
   2 // Copyright 2017 Google Inc. All rights reserved.
   3 // http://ceres-solver.org/
   4 //
   5 // Redistribution and use in source and binary forms, with or without
   6 // modification, are permitted provided that the following conditions are met:
   7 //
   8 // * Redistributions of source code must retain the above copyright notice,
   9 //   this list of conditions and the following disclaimer.
  10 // * Redistributions in binary form must reproduce the above copyright notice,
  11 //   this list of conditions and the following disclaimer in the documentation
  12 //   and/or other materials provided with the distribution.
  13 // * Neither the name of Google Inc. nor the names of its contributors may be
  14 //   used to endorse or promote products derived from this software without
  15 //   specific prior written permission.
  16 //
  17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  22 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  23 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  24 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  25 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  26 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  27 // POSSIBILITY OF SUCH DAMAGE.
  28 //
  29 // Author: sameeragarwal@google.com (Sameer Agarwal)
  30
  31 #include "ceres/inner_product_computer.h"
  32
  33 #include <algorithm>
  34
  35 namespace ceres {
  36 namespace internal {
  37 namespace {
  38
  39 // Compute the product (in MATLAB notation)
  40 //
  41 // c(0:a_cols, 0:b_cols) = a' * b
  42 //
  43 // Where:
  44 //
  45 //  a is ab_rows x a_cols
  46 //  b is ab_rows x b_cols
  47 //  c is a_cos x c_col_stride
  48 //
  49 // a, b and c are row-major matrices.
  50 //
  51 // Performance note:
  52 // ----------------
  53 //
  54 // Technically this function is a repeat of a similarly named function
  55 // in small_blas.h but its performance is considerably better than
  56 // that of the version there due to the way it accesses memory.
  57 //
  58 // TODO(sameeragarwal): Measure and tune the performance of
  59 // small_blas.h based on the insights gained here.
  60 EIGEN_STRONG_INLINE void MatrixTransposeMatrixMultiply(const int ab_rows,
  61                                                        const double* a,
  62                                                        const int a_cols,
  63                                                        const double* b,
  64                                                        const int b_cols,
  65                                                        double* c,
  66                                                        int c_cols) {
  67   // Compute c as the sum of ab_rows, rank 1 outer products of the
  68   // corresponding rows of a and b.
  69   for (int r = 0; r < ab_rows; ++r) {
  70     double* c_r = c;
  71     for (int i1 = 0; i1 < a_cols; ++i1) {
  72       const double a_v = a[i1];
  73       for (int i2 = 0; i2 < b_cols; ++i2) {
  74         c_r[i2] += a_v * b[i2];
  75       }
  76       c_r += c_cols;
  77     }
  78     a += a_cols;
  79     b += b_cols;
  80   }
  81 }
  82
  83 }  // namespace
  84
  85 // Create the CompressedRowSparseMatrix matrix that will contain the
  86 // inner product.
  87 //
  88 // storage_type controls whether the result matrix contains the upper
  89 // or the lower triangular part of the product.
  90 //
  91 // num_nonzeros is the number of non-zeros in the result matrix.
  92 CompressedRowSparseMatrix* InnerProductComputer::CreateResultMatrix(
  93     const CompressedRowSparseMatrix::StorageType storage_type,
  94     const int num_nonzeros) {
  95   CompressedRowSparseMatrix* matrix =
  96       new CompressedRowSparseMatrix(m_.num_cols(), m_.num_cols(), num_nonzeros);
  97   matrix->set_storage_type(storage_type);
  98
  99   const CompressedRowBlockStructure* bs = m_.block_structure();
 100   const std::vector<Block>& blocks = bs->cols;
 101   matrix->mutable_row_blocks()->resize(blocks.size());
 102   matrix->mutable_col_blocks()->resize(blocks.size());
 103   for (int i = 0; i < blocks.size(); ++i) {
 104     (*(matrix->mutable_row_blocks()))[i] = blocks[i].size;
 105     (*(matrix->mutable_col_blocks()))[i] = blocks[i].size;
 106   }
 107
 108   return matrix;
 109 }
 110
 111 // Given the set of product terms in the inner product, return the
 112 // total number of non-zeros in the result and for each row block of
 113 // the result matrix, compute the number of non-zeros in any one row
 114 // of the row block.
 115 int InnerProductComputer::ComputeNonzeros(
 116     const std::vector<InnerProductComputer::ProductTerm>& product_terms,
 117     std::vector<int>* row_nnz) {
 118   const CompressedRowBlockStructure* bs = m_.block_structure();
 119   const std::vector<Block>& blocks = bs->cols;
 120
 121   row_nnz->resize(blocks.size());
 122   std::fill(row_nnz->begin(), row_nnz->end(), 0);
 123
 124   // First product term.
 125   (*row_nnz)[product_terms[0].row] = blocks[product_terms[0].col].size;
 126   int num_nonzeros =
 127       blocks[product_terms[0].row].size * blocks[product_terms[0].col].size;
 128
 129   // Remaining product terms.
 130   for (int i = 1; i < product_terms.size(); ++i) {
 131     const ProductTerm& previous = product_terms[i - 1];
 132     const ProductTerm& current = product_terms[i];
 133
 134     // Each (row, col) block counts only once.
 135     // This check depends on product sorted on (row, col).
 136     if (current.row != previous.row || current.col != previous.col) {
 137       (*row_nnz)[current.row] += blocks[current.col].size;
 138       num_nonzeros += blocks[current.row].size * blocks[current.col].size;
 139     }
 140   }
 141
 142   return num_nonzeros;
 143 }
 144
 145 InnerProductComputer::InnerProductComputer(const BlockSparseMatrix& m,
 146                                            const int start_row_block,
 147                                            const int end_row_block)
 148     : m_(m), start_row_block_(start_row_block), end_row_block_(end_row_block) {}
 149
 150 // Compute the sparsity structure of the product m.transpose() * m
 151 // and create a CompressedRowSparseMatrix corresponding to it.
 152 //
 153 // Also compute the "program" vector, which for every term in the
 154 // block outer product provides the information for the entry in the
 155 // values array of the result matrix where it should be accumulated.
 156 //
 157 // Since the entries of the program are the same for rows with the
 158 // same sparsity structure, the program only stores the result for one
 159 // row per row block. The Compute function reuses this information for
 160 // each row in the row block.
 161 //
 162 // product_storage_type controls the form of the output matrix. It
 163 // can be LOWER_TRIANGULAR or UPPER_TRIANGULAR.
 164 InnerProductComputer* InnerProductComputer::Create(
 165     const BlockSparseMatrix& m,
 166     CompressedRowSparseMatrix::StorageType product_storage_type) {
 167   return InnerProductComputer::Create(
 168       m, 0, m.block_structure()->rows.size(), product_storage_type);
 169 }
 170
 171 InnerProductComputer* InnerProductComputer::Create(
 172     const BlockSparseMatrix& m,
 173     const int start_row_block,
 174     const int end_row_block,
 175     CompressedRowSparseMatrix::StorageType product_storage_type) {
 176   CHECK(product_storage_type == CompressedRowSparseMatrix::LOWER_TRIANGULAR ||
 177         product_storage_type == CompressedRowSparseMatrix::UPPER_TRIANGULAR);
 178   CHECK_GT(m.num_nonzeros(), 0)
 179       << "Congratulations, you found a bug in Ceres. Please report it.";
 180   InnerProductComputer* inner_product_computer =
 181       new InnerProductComputer(m, start_row_block, end_row_block);
 182   inner_product_computer->Init(product_storage_type);
 183   return inner_product_computer;
 184 }
 185
 186 void InnerProductComputer::Init(
 187     const CompressedRowSparseMatrix::StorageType product_storage_type) {
 188   std::vector<InnerProductComputer::ProductTerm> product_terms;
 189   const CompressedRowBlockStructure* bs = m_.block_structure();
 190
 191   // Give input matrix m in Block Sparse format
 192   //     (row_block, col_block)
 193   // represent each block multiplication
 194   //     (row_block, col_block1)' X (row_block, col_block2)
 195   // by its product term:
 196   //     (col_block1, col_block2, index)
 197   for (int row_block = start_row_block_; row_block < end_row_block_;
 198        ++row_block) {
 199     const CompressedRow& row = bs->rows[row_block];
 200     for (int c1 = 0; c1 < row.cells.size(); ++c1) {
 201       const Cell& cell1 = row.cells[c1];
 202       int c2_begin, c2_end;
 203       if (product_storage_type == CompressedRowSparseMatrix::LOWER_TRIANGULAR) {
 204         c2_begin = 0;
 205         c2_end = c1 + 1;
 206       } else {
 207         c2_begin = c1;
 208         c2_end = row.cells.size();
 209       }
 210
 211       for (int c2 = c2_begin; c2 < c2_end; ++c2) {
 212         const Cell& cell2 = row.cells[c2];
 213         product_terms.push_back(InnerProductComputer::ProductTerm(
 214             cell1.block_id, cell2.block_id, product_terms.size()));
 215       }
 216     }
 217   }
 218
 219   std::sort(product_terms.begin(), product_terms.end());
 220   ComputeOffsetsAndCreateResultMatrix(product_storage_type, product_terms);
 221 }
 222
 223 void InnerProductComputer::ComputeOffsetsAndCreateResultMatrix(
 224     const CompressedRowSparseMatrix::StorageType product_storage_type,
 225     const std::vector<InnerProductComputer::ProductTerm>& product_terms) {
 226   const std::vector<Block>& col_blocks = m_.block_structure()->cols;
 227
 228   std::vector<int> row_block_nnz;
 229   const int num_nonzeros = ComputeNonzeros(product_terms, &row_block_nnz);
 230
 231   result_.reset(CreateResultMatrix(product_storage_type, num_nonzeros));
 232
 233   // Populate the row non-zero counts in the result matrix.
 234   int* crsm_rows = result_->mutable_rows();
 235   crsm_rows[0] = 0;
 236   for (int i = 0; i < col_blocks.size(); ++i) {
 237     for (int j = 0; j < col_blocks[i].size; ++j, ++crsm_rows) {
 238       *(crsm_rows + 1) = *crsm_rows + row_block_nnz[i];
 239     }
 240   }
 241
 242   // The following macro FILL_CRSM_COL_BLOCK is key to understanding
 243   // how this class works.
 244   //
 245   // It does two things.
 246   //
 247   // Sets the value for the current term in the result_offsets_ array
 248   // and populates the cols array of the result matrix.
 249   //
 250   // row_block and col_block as the names imply, refer to the row and
 251   // column blocks of the current term.
 252   //
 253   // row_nnz is the number of nonzeros in the result_matrix at the
 254   // beginning of the first row of row_block.
 255   //
 256   // col_nnz is the number of nonzeros in the first row of the row
 257   // block that occur before the current column block, i.e. this is
 258   // sum of the sizes of all the column blocks in this row block that
 259   // came before this column block.
 260   //
 261   // Given these two numbers and the total number of nonzeros in this
 262   // row (nnz_in_row), we can now populate the cols array as follows:
 263   //
 264   // nnz + j * nnz_in_row is the beginning of the j^th row.
 265   //
 266   // nnz + j * nnz_in_row + col_nnz is the beginning of the column
 267   // block in the j^th row.
 268   //
 269   // nnz + j * nnz_in_row + col_nnz + k is then the j^th row and the
 270   // k^th column of the product block, whose value is
 271   //
 272   // col_blocks[col_block].position + k, which is the column number of
 273   // the k^th column of the current column block.
 274 #define FILL_CRSM_COL_BLOCK                                \
 275   const int row_block = current->row;                      \
 276   const int col_block = current->col;                      \
 277   const int nnz_in_row = row_block_nnz[row_block];         \
 278   int* crsm_cols = result_->mutable_cols();                \
 279   result_offsets_[current->index] = nnz + col_nnz;         \
 280   for (int j = 0; j < col_blocks[row_block].size; ++j) {   \
 281     for (int k = 0; k < col_blocks[col_block].size; ++k) { \
 282       crsm_cols[nnz + j * nnz_in_row + col_nnz + k] =      \
 283           col_blocks[col_block].position + k;              \
 284     }                                                      \
 285   }
 286
 287   result_offsets_.resize(product_terms.size());
 288   int col_nnz = 0;
 289   int nnz = 0;
 290
 291   // Process the first term.
 292   const InnerProductComputer::ProductTerm* current = &product_terms[0];
 293   FILL_CRSM_COL_BLOCK;
 294
 295   // Process the rest of the terms.
 296   for (int i = 1; i < product_terms.size(); ++i) {
 297     current = &product_terms[i];
 298     const InnerProductComputer::ProductTerm* previous = &product_terms[i - 1];
 299
 300     // If the current term is the same as the previous term, then it
 301     // stores its product at the same location as the previous term.
 302     if (previous->row == current->row && previous->col == current->col) {
 303       result_offsets_[current->index] = result_offsets_[previous->index];
 304       continue;
 305     }
 306
 307     if (previous->row == current->row) {
 308       // if the current and previous terms are in the same row block,
 309       // then they differ in the column block, in which case advance
 310       // col_nnz by the column size of the prevous term.
 311       col_nnz += col_blocks[previous->col].size;
 312     } else {
 313       // If we have moved to a new row-block , then col_nnz is zero,
 314       // and nnz is set to the beginning of the row block.
 315       col_nnz = 0;
 316       nnz += row_block_nnz[previous->row] * col_blocks[previous->row].size;
 317     }
 318
 319     FILL_CRSM_COL_BLOCK;
 320   }
 321 }
 322
 323 // Use the results_offsets_ array to numerically compute the product
 324 // m' * m and store it in result_.
 325 //
 326 // TODO(sameeragarwal): Multithreading support.
 327 void InnerProductComputer::Compute() {
 328   const double* m_values = m_.values();
 329   const CompressedRowBlockStructure* bs = m_.block_structure();
 330
 331   const CompressedRowSparseMatrix::StorageType storage_type =
 332       result_->storage_type();
 333   result_->SetZero();
 334   double* values = result_->mutable_values();
 335   const int* rows = result_->rows();
 336   int cursor = 0;
 337
 338   // Iterate row blocks.
 339   for (int r = start_row_block_; r < end_row_block_; ++r) {
 340     const CompressedRow& m_row = bs->rows[r];
 341     for (int c1 = 0; c1 < m_row.cells.size(); ++c1) {
 342       const Cell& cell1 = m_row.cells[c1];
 343       const int c1_size = bs->cols[cell1.block_id].size;
 344       const int row_nnz = rows[bs->cols[cell1.block_id].position + 1] -
 345           rows[bs->cols[cell1.block_id].position];
 346
 347       int c2_begin, c2_end;
 348       if (storage_type == CompressedRowSparseMatrix::LOWER_TRIANGULAR) {
 349         c2_begin = 0;
 350         c2_end = c1 + 1;
 351       } else {
 352         c2_begin = c1;
 353         c2_end = m_row.cells.size();
 354       }
 355
 356       for (int c2 = c2_begin; c2 < c2_end; ++c2, ++cursor) {
 357         const Cell& cell2 = m_row.cells[c2];
 358         const int c2_size = bs->cols[cell2.block_id].size;
 359         MatrixTransposeMatrixMultiply(m_row.block.size,
 360                                       m_values + cell1.position, c1_size,
 361                                       m_values + cell2.position, c2_size,
 362                                       values + result_offsets_[cursor],
 363                                       row_nnz);
 364       }
 365     }
 366   }
 367
 368   CHECK_EQ(cursor, result_offsets_.size());
 369 }
 370
 371 }  // namespace internal
 372 }  // namespace ceres