modules/flann/include/opencv2/flann/lsh_table.h

   1 /***********************************************************************
   2  * Software License Agreement (BSD License)
   3  *
   4  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
   5  * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
   6  *
   7  * THE BSD LICENSE
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  *
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  *************************************************************************/
  30
  31 /***********************************************************************
  32  * Author: Vincent Rabaud
  33  *************************************************************************/
  34
  35 #ifndef OPENCV_FLANN_LSH_TABLE_H_
  36 #define OPENCV_FLANN_LSH_TABLE_H_
  37
  38 #include <algorithm>
  39 #include <iostream>
  40 #include <iomanip>
  41 #include <limits.h>
  42 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
  43 #ifdef __GXX_EXPERIMENTAL_CXX0X__
  44 #  define USE_UNORDERED_MAP 1
  45 #else
  46 #  define USE_UNORDERED_MAP 0
  47 #endif
  48 #if USE_UNORDERED_MAP
  49 #include <unordered_map>
  50 #else
  51 #include <map>
  52 #endif
  53 #include <math.h>
  54 #include <stddef.h>
  55
  56 #include "dynamic_bitset.h"
  57 #include "matrix.h"
  58
  59 namespace cvflann
  60 {
  61
  62 namespace lsh
  63 {
  64
  65 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  66
  67 /** What is stored in an LSH bucket
  68  */
  69 typedef uint32_t FeatureIndex;
  70 /** The id from which we can get a bucket back in an LSH table
  71  */
  72 typedef unsigned int BucketKey;
  73
  74 /** A bucket in an LSH table
  75  */
  76 typedef std::vector<FeatureIndex> Bucket;
  77
  78 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  79
  80 /** POD for stats about an LSH table
  81  */
  82 struct LshStats
  83 {
  84     std::vector<unsigned int> bucket_sizes_;
  85     size_t n_buckets_;
  86     size_t bucket_size_mean_;
  87     size_t bucket_size_median_;
  88     size_t bucket_size_min_;
  89     size_t bucket_size_max_;
  90     size_t bucket_size_std_dev;
  91     /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin
  92      */
  93     std::vector<std::vector<unsigned int> > size_histogram_;
  94 };
  95
  96 /** Overload the << operator for LshStats
  97  * @param out the streams
  98  * @param stats the stats to display
  99  * @return the streams
 100  */
 101 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
 102 {
 103     int w = 20;
 104     out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
 105     << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
 106     << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
 107     << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
 108     << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
 109     << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
 110     << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
 111
 112     // Display the histogram
 113     out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
 114     << std::setiosflags(std::ios::left);
 115     for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
 116              stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ",  ";
 117
 118     return out;
 119 }
 120
 121
 122 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 123
 124 /** Lsh hash table. As its key is a sub-feature, and as usually
 125  * the size of it is pretty small, we keep it as a continuous memory array.
 126  * The value is an index in the corpus of features (we keep it as an unsigned
 127  * int for pure memory reasons, it could be a size_t)
 128  */
 129 template<typename ElementType>
 130 class LshTable
 131 {
 132 public:
 133     /** A container of all the feature indices. Optimized for space
 134      */
 135 #if USE_UNORDERED_MAP
 136     typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
 137 #else
 138     typedef std::map<BucketKey, Bucket> BucketsSpace;
 139 #endif
 140
 141     /** A container of all the feature indices. Optimized for speed
 142      */
 143     typedef std::vector<Bucket> BucketsSpeed;
 144
 145     /** Default constructor
 146      */
 147     LshTable()
 148     {
 149     }
 150
 151     /** Default constructor
 152      * Create the mask and allocate the memory
 153      * @param feature_size is the size of the feature (considered as a ElementType[])
 154      * @param key_size is the number of bits that are turned on in the feature
 155      */
 156     LshTable(unsigned int /*feature_size*/, unsigned int /*key_size*/)
 157     {
 158         std::cerr << "LSH is not implemented for that type" << std::endl;
 159         assert(0);
 160     }
 161
 162     /** Add a feature to the table
 163      * @param value the value to store for that feature
 164      * @param feature the feature itself
 165      */
 166     void add(unsigned int value, const ElementType* feature)
 167     {
 168         // Add the value to the corresponding bucket
 169         BucketKey key = (lsh::BucketKey)getKey(feature);
 170
 171         switch (speed_level_) {
 172         case kArray:
 173             // That means we get the buckets from an array
 174             buckets_speed_[key].push_back(value);
 175             break;
 176         case kBitsetHash:
 177             // That means we can check the bitset for the presence of a key
 178             key_bitset_.set(key);
 179             buckets_space_[key].push_back(value);
 180             break;
 181         case kHash:
 182         {
 183             // That means we have to check for the hash table for the presence of a key
 184             buckets_space_[key].push_back(value);
 185             break;
 186         }
 187         }
 188     }
 189
 190     /** Add a set of features to the table
 191      * @param dataset the values to store
 192      */
 193     void add(Matrix<ElementType> dataset)
 194     {
 195 #if USE_UNORDERED_MAP
 196         buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2);
 197 #endif
 198         // Add the features to the table
 199         for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]);
 200         // Now that the table is full, optimize it for speed/space
 201         optimize();
 202     }
 203
 204     /** Get a bucket given the key
 205      * @param key
 206      * @return
 207      */
 208     inline const Bucket* getBucketFromKey(BucketKey key) const
 209     {
 210         // Generate other buckets
 211         switch (speed_level_) {
 212         case kArray:
 213             // That means we get the buckets from an array
 214             return &buckets_speed_[key];
 215             break;
 216         case kBitsetHash:
 217             // That means we can check the bitset for the presence of a key
 218             if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
 219             else return 0;
 220             break;
 221         case kHash:
 222         {
 223             // That means we have to check for the hash table for the presence of a key
 224             BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
 225             bucket_it = buckets_space_.find(key);
 226             // Stop here if that bucket does not exist
 227             if (bucket_it == bucket_end) return 0;
 228             else return &bucket_it->second;
 229             break;
 230         }
 231         }
 232         return 0;
 233     }
 234
 235     /** Compute the sub-signature of a feature
 236      */
 237     size_t getKey(const ElementType* /*feature*/) const
 238     {
 239         std::cerr << "LSH is not implemented for that type" << std::endl;
 240         assert(0);
 241         return 1;
 242     }
 243
 244     /** Get statistics about the table
 245      * @return
 246      */
 247     LshStats getStats() const;
 248
 249 private:
 250     /** defines the speed fo the implementation
 251      * kArray uses a vector for storing data
 252      * kBitsetHash uses a hash map but checks for the validity of a key with a bitset
 253      * kHash uses a hash map only
 254      */
 255     enum SpeedLevel
 256     {
 257         kArray, kBitsetHash, kHash
 258     };
 259
 260     /** Initialize some variables
 261      */
 262     void initialize(size_t key_size)
 263     {
 264         const size_t key_size_lower_bound = 1;
 265         //a value (size_t(1) << key_size) must fit the size_t type so key_size has to be strictly less than size of size_t
 266         const size_t key_size_upper_bound = std::min(sizeof(BucketKey) * CHAR_BIT + 1, sizeof(size_t) * CHAR_BIT);
 267         if (key_size < key_size_lower_bound || key_size >= key_size_upper_bound)
 268         {
 269             CV_Error(cv::Error::StsBadArg, cv::format("Invalid key_size (=%d). Valid values for your system are %d <= key_size < %d.", (int)key_size, (int)key_size_lower_bound, (int)key_size_upper_bound));
 270         }
 271
 272         speed_level_ = kHash;
 273         key_size_ = (unsigned)key_size;
 274     }
 275
 276     /** Optimize the table for speed/space
 277      */
 278     void optimize()
 279     {
 280         // If we are already using the fast storage, no need to do anything
 281         if (speed_level_ == kArray) return;
 282
 283         // Use an array if it will be more than half full
 284         if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
 285             speed_level_ = kArray;
 286             // Fill the array version of it
 287             buckets_speed_.resize(size_t(1) << key_size_);
 288             for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
 289
 290             // Empty the hash table
 291             buckets_space_.clear();
 292             return;
 293         }
 294
 295         // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
 296         // for the vector) or less than 512MB (key_size_ <= 30)
 297         if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
 298              >= (size_t(1) << key_size_)) || (key_size_ <= 32)) {
 299             speed_level_ = kBitsetHash;
 300             key_bitset_.resize(size_t(1) << key_size_);
 301             key_bitset_.reset();
 302             // Try with the BucketsSpace
 303             for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
 304         }
 305         else {
 306             speed_level_ = kHash;
 307             key_bitset_.clear();
 308         }
 309     }
 310
 311     /** The vector of all the buckets if they are held for speed
 312      */
 313     BucketsSpeed buckets_speed_;
 314
 315     /** The hash table of all the buckets in case we cannot use the speed version
 316      */
 317     BucketsSpace buckets_space_;
 318
 319     /** What is used to store the data */
 320     SpeedLevel speed_level_;
 321
 322     /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
 323      * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset)
 324      */
 325     DynamicBitset key_bitset_;
 326
 327     /** The size of the sub-signature in bits
 328      */
 329     unsigned int key_size_;
 330
 331     // Members only used for the unsigned char specialization
 332     /** The mask to apply to a feature to get the hash key
 333      * Only used in the unsigned char case
 334      */
 335     std::vector<size_t> mask_;
 336 };
 337
 338 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 339 // Specialization for unsigned char
 340
 341 template<>
 342 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
 343 {
 344     initialize(subsignature_size);
 345     // Allocate the mask
 346     mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0);
 347
 348     // A bit brutal but fast to code
 349     static std::vector<size_t>* indices = NULL;
 350
 351     //Ensure the Nth bit will be selected only once among the different LshTables
 352     //to avoid having two different tables with signatures sharing many dimensions/many bits
 353     if( indices == NULL )
 354     {
 355         indices = new std::vector<size_t>( feature_size * CHAR_BIT );
 356     }
 357     else if( indices->size() < key_size_ )
 358     {
 359       indices->resize( feature_size * CHAR_BIT );
 360       for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) {
 361           (*indices)[i] = i;
 362       }
 363       std::random_shuffle(indices->begin(), indices->end());
 364     }
 365
 366     // Generate a random set of order of subsignature_size_ bits
 367     for (unsigned int i = 0; i < key_size_; ++i) {
 368         size_t index = (*indices)[0];
 369         indices->erase( indices->begin() );
 370
 371         // Set that bit in the mask
 372         size_t divisor = CHAR_BIT * sizeof(size_t);
 373         size_t idx = index / divisor; //pick the right size_t index
 374         mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
 375     }
 376
 377     // Set to 1 if you want to display the mask for debug
 378 #if 0
 379     {
 380         size_t bcount = 0;
 381         BOOST_FOREACH(size_t mask_block, mask_){
 382             out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
 383                 << std::endl;
 384             bcount += __builtin_popcountll(mask_block);
 385         }
 386         out << "bit count : " << std::dec << bcount << std::endl;
 387         out << "mask size : " << mask_.size() << std::endl;
 388         return out;
 389     }
 390 #endif
 391 }
 392
 393 /** Return the Subsignature of a feature
 394  * @param feature the feature to analyze
 395  */
 396 template<>
 397 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
 398 {
 399     // no need to check if T is dividable by sizeof(size_t) like in the Hamming
 400     // distance computation as we have a mask
 401     const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature);
 402
 403     // Figure out the subsignature of the feature
 404     // Given the feature ABCDEF, and the mask 001011, the output will be
 405     // 000CEF
 406     size_t subsignature = 0;
 407     size_t bit_index = 1;
 408
 409     for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) {
 410         // get the mask and signature blocks
 411         size_t feature_block = *feature_block_ptr;
 412         size_t mask_block = *pmask_block;
 413         while (mask_block) {
 414             // Get the lowest set bit in the mask block
 415             size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
 416             // Add it to the current subsignature if necessary
 417             subsignature += (feature_block & lowest_bit) ? bit_index : 0;
 418             // Reset the bit in the mask block
 419             mask_block ^= lowest_bit;
 420             // increment the bit index for the subsignature
 421             bit_index <<= 1;
 422         }
 423         // Check the next feature block
 424         ++feature_block_ptr;
 425     }
 426     return subsignature;
 427 }
 428
 429 template<>
 430 inline LshStats LshTable<unsigned char>::getStats() const
 431 {
 432     LshStats stats;
 433     stats.bucket_size_mean_ = 0;
 434     if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
 435         stats.n_buckets_ = 0;
 436         stats.bucket_size_median_ = 0;
 437         stats.bucket_size_min_ = 0;
 438         stats.bucket_size_max_ = 0;
 439         return stats;
 440     }
 441
 442     if (!buckets_speed_.empty()) {
 443         for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
 444             stats.bucket_sizes_.push_back((lsh::FeatureIndex)pbucket->size());
 445             stats.bucket_size_mean_ += pbucket->size();
 446         }
 447         stats.bucket_size_mean_ /= buckets_speed_.size();
 448         stats.n_buckets_ = buckets_speed_.size();
 449     }
 450     else {
 451         for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
 452             stats.bucket_sizes_.push_back((lsh::FeatureIndex)x->second.size());
 453             stats.bucket_size_mean_ += x->second.size();
 454         }
 455         stats.bucket_size_mean_ /= buckets_space_.size();
 456         stats.n_buckets_ = buckets_space_.size();
 457     }
 458
 459     std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
 460
 461     //  BOOST_FOREACH(int size, stats.bucket_sizes_)
 462     //          std::cout << size << " ";
 463     //  std::cout << std::endl;
 464     stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
 465     stats.bucket_size_min_ = stats.bucket_sizes_.front();
 466     stats.bucket_size_max_ = stats.bucket_sizes_.back();
 467
 468     // TODO compute mean and std
 469     /*float mean, stddev;
 470        stats.bucket_size_mean_ = mean;
 471        stats.bucket_size_std_dev = stddev;*/
 472
 473     // Include a histogram of the buckets
 474     unsigned int bin_start = 0;
 475     unsigned int bin_end = 20;
 476     bool is_new_bin = true;
 477     for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
 478          != end; )
 479         if (*iterator < bin_end) {
 480             if (is_new_bin) {
 481                 stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
 482                 stats.size_histogram_.back()[0] = bin_start;
 483                 stats.size_histogram_.back()[1] = bin_end - 1;
 484                 is_new_bin = false;
 485             }
 486             ++stats.size_histogram_.back()[2];
 487             ++iterator;
 488         }
 489         else {
 490             bin_start += 20;
 491             bin_end += 20;
 492             is_new_bin = true;
 493         }
 494
 495     return stats;
 496 }
 497
 498 // End the two namespaces
 499 }
 500 }
 501
 502 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 503
 504 #endif /* OPENCV_FLANN_LSH_TABLE_H_ */