1 /***********************************************************************
2 * Software License Agreement (BSD License)
4 * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5 * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *************************************************************************/
31 /***********************************************************************
32 * Author: Vincent Rabaud
33 *************************************************************************/
35 #ifndef OPENCV_FLANN_LSH_TABLE_H_
36 #define OPENCV_FLANN_LSH_TABLE_H_
42 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
43 #ifdef __GXX_EXPERIMENTAL_CXX0X__
44 # define USE_UNORDERED_MAP 1
46 # define USE_UNORDERED_MAP 0
49 #include <unordered_map>
56 #include "dynamic_bitset.h"
65 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
67 /** What is stored in an LSH bucket
69 typedef uint32_t FeatureIndex;
70 /** The id from which we can get a bucket back in an LSH table
72 typedef unsigned int BucketKey;
74 /** A bucket in an LSH table
76 typedef std::vector<FeatureIndex> Bucket;
78 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
80 /** POD for stats about an LSH table
84 std::vector<unsigned int> bucket_sizes_;
86 size_t bucket_size_mean_;
87 size_t bucket_size_median_;
88 size_t bucket_size_min_;
89 size_t bucket_size_max_;
90 size_t bucket_size_std_dev;
91 /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin
93 std::vector<std::vector<unsigned int> > size_histogram_;
96 /** Overload the << operator for LshStats
97 * @param out the streams
98 * @param stats the stats to display
101 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
104 out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
105 << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
106 << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
107 << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
108 << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
109 << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
110 << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
112 // Display the histogram
113 out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
114 << std::setiosflags(std::ios::left);
115 for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
116 stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ", ";
122 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
124 /** Lsh hash table. As its key is a sub-feature, and as usually
125 * the size of it is pretty small, we keep it as a continuous memory array.
126 * The value is an index in the corpus of features (we keep it as an unsigned
127 * int for pure memory reasons, it could be a size_t)
129 template<typename ElementType>
133 /** A container of all the feature indices. Optimized for space
135 #if USE_UNORDERED_MAP
136 typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
138 typedef std::map<BucketKey, Bucket> BucketsSpace;
141 /** A container of all the feature indices. Optimized for speed
143 typedef std::vector<Bucket> BucketsSpeed;
145 /** Default constructor
151 /** Default constructor
152 * Create the mask and allocate the memory
153 * @param feature_size is the size of the feature (considered as a ElementType[])
154 * @param key_size is the number of bits that are turned on in the feature
156 LshTable(unsigned int /*feature_size*/, unsigned int /*key_size*/)
158 std::cerr << "LSH is not implemented for that type" << std::endl;
162 /** Add a feature to the table
163 * @param value the value to store for that feature
164 * @param feature the feature itself
166 void add(unsigned int value, const ElementType* feature)
168 // Add the value to the corresponding bucket
169 BucketKey key = (lsh::BucketKey)getKey(feature);
171 switch (speed_level_) {
173 // That means we get the buckets from an array
174 buckets_speed_[key].push_back(value);
177 // That means we can check the bitset for the presence of a key
178 key_bitset_.set(key);
179 buckets_space_[key].push_back(value);
183 // That means we have to check for the hash table for the presence of a key
184 buckets_space_[key].push_back(value);
190 /** Add a set of features to the table
191 * @param dataset the values to store
193 void add(Matrix<ElementType> dataset)
195 #if USE_UNORDERED_MAP
196 buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2);
198 // Add the features to the table
199 for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]);
200 // Now that the table is full, optimize it for speed/space
204 /** Get a bucket given the key
208 inline const Bucket* getBucketFromKey(BucketKey key) const
210 // Generate other buckets
211 switch (speed_level_) {
213 // That means we get the buckets from an array
214 return &buckets_speed_[key];
217 // That means we can check the bitset for the presence of a key
218 if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
223 // That means we have to check for the hash table for the presence of a key
224 BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
225 bucket_it = buckets_space_.find(key);
226 // Stop here if that bucket does not exist
227 if (bucket_it == bucket_end) return 0;
228 else return &bucket_it->second;
235 /** Compute the sub-signature of a feature
237 size_t getKey(const ElementType* /*feature*/) const
239 std::cerr << "LSH is not implemented for that type" << std::endl;
244 /** Get statistics about the table
247 LshStats getStats() const;
250 /** defines the speed fo the implementation
251 * kArray uses a vector for storing data
252 * kBitsetHash uses a hash map but checks for the validity of a key with a bitset
253 * kHash uses a hash map only
257 kArray, kBitsetHash, kHash
260 /** Initialize some variables
262 void initialize(size_t key_size)
264 const size_t key_size_lower_bound = 1;
265 //a value (size_t(1) << key_size) must fit the size_t type so key_size has to be strictly less than size of size_t
266 const size_t key_size_upper_bound = std::min(sizeof(BucketKey) * CHAR_BIT + 1, sizeof(size_t) * CHAR_BIT);
267 if (key_size < key_size_lower_bound || key_size >= key_size_upper_bound)
269 CV_Error(cv::Error::StsBadArg, cv::format("Invalid key_size (=%d). Valid values for your system are %d <= key_size < %d.", (int)key_size, (int)key_size_lower_bound, (int)key_size_upper_bound));
272 speed_level_ = kHash;
273 key_size_ = (unsigned)key_size;
276 /** Optimize the table for speed/space
280 // If we are already using the fast storage, no need to do anything
281 if (speed_level_ == kArray) return;
283 // Use an array if it will be more than half full
284 if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
285 speed_level_ = kArray;
286 // Fill the array version of it
287 buckets_speed_.resize(size_t(1) << key_size_);
288 for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
290 // Empty the hash table
291 buckets_space_.clear();
295 // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
296 // for the vector) or less than 512MB (key_size_ <= 30)
297 if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
298 >= (size_t(1) << key_size_)) || (key_size_ <= 32)) {
299 speed_level_ = kBitsetHash;
300 key_bitset_.resize(size_t(1) << key_size_);
302 // Try with the BucketsSpace
303 for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
306 speed_level_ = kHash;
311 /** The vector of all the buckets if they are held for speed
313 BucketsSpeed buckets_speed_;
315 /** The hash table of all the buckets in case we cannot use the speed version
317 BucketsSpace buckets_space_;
319 /** What is used to store the data */
320 SpeedLevel speed_level_;
322 /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
323 * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset)
325 DynamicBitset key_bitset_;
327 /** The size of the sub-signature in bits
329 unsigned int key_size_;
331 // Members only used for the unsigned char specialization
332 /** The mask to apply to a feature to get the hash key
333 * Only used in the unsigned char case
335 std::vector<size_t> mask_;
338 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
339 // Specialization for unsigned char
342 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
344 initialize(subsignature_size);
346 mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0);
348 // A bit brutal but fast to code
349 static std::vector<size_t>* indices = NULL;
351 //Ensure the Nth bit will be selected only once among the different LshTables
352 //to avoid having two different tables with signatures sharing many dimensions/many bits
353 if( indices == NULL )
355 indices = new std::vector<size_t>( feature_size * CHAR_BIT );
357 else if( indices->size() < key_size_ )
359 indices->resize( feature_size * CHAR_BIT );
360 for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) {
363 std::random_shuffle(indices->begin(), indices->end());
366 // Generate a random set of order of subsignature_size_ bits
367 for (unsigned int i = 0; i < key_size_; ++i) {
368 size_t index = (*indices)[0];
369 indices->erase( indices->begin() );
371 // Set that bit in the mask
372 size_t divisor = CHAR_BIT * sizeof(size_t);
373 size_t idx = index / divisor; //pick the right size_t index
374 mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
377 // Set to 1 if you want to display the mask for debug
381 BOOST_FOREACH(size_t mask_block, mask_){
382 out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
384 bcount += __builtin_popcountll(mask_block);
386 out << "bit count : " << std::dec << bcount << std::endl;
387 out << "mask size : " << mask_.size() << std::endl;
393 /** Return the Subsignature of a feature
394 * @param feature the feature to analyze
397 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
399 // no need to check if T is dividable by sizeof(size_t) like in the Hamming
400 // distance computation as we have a mask
401 const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature);
403 // Figure out the subsignature of the feature
404 // Given the feature ABCDEF, and the mask 001011, the output will be
406 size_t subsignature = 0;
407 size_t bit_index = 1;
409 for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) {
410 // get the mask and signature blocks
411 size_t feature_block = *feature_block_ptr;
412 size_t mask_block = *pmask_block;
414 // Get the lowest set bit in the mask block
415 size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
416 // Add it to the current subsignature if necessary
417 subsignature += (feature_block & lowest_bit) ? bit_index : 0;
418 // Reset the bit in the mask block
419 mask_block ^= lowest_bit;
420 // increment the bit index for the subsignature
423 // Check the next feature block
430 inline LshStats LshTable<unsigned char>::getStats() const
433 stats.bucket_size_mean_ = 0;
434 if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
435 stats.n_buckets_ = 0;
436 stats.bucket_size_median_ = 0;
437 stats.bucket_size_min_ = 0;
438 stats.bucket_size_max_ = 0;
442 if (!buckets_speed_.empty()) {
443 for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
444 stats.bucket_sizes_.push_back((lsh::FeatureIndex)pbucket->size());
445 stats.bucket_size_mean_ += pbucket->size();
447 stats.bucket_size_mean_ /= buckets_speed_.size();
448 stats.n_buckets_ = buckets_speed_.size();
451 for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
452 stats.bucket_sizes_.push_back((lsh::FeatureIndex)x->second.size());
453 stats.bucket_size_mean_ += x->second.size();
455 stats.bucket_size_mean_ /= buckets_space_.size();
456 stats.n_buckets_ = buckets_space_.size();
459 std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
461 // BOOST_FOREACH(int size, stats.bucket_sizes_)
462 // std::cout << size << " ";
463 // std::cout << std::endl;
464 stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
465 stats.bucket_size_min_ = stats.bucket_sizes_.front();
466 stats.bucket_size_max_ = stats.bucket_sizes_.back();
468 // TODO compute mean and std
469 /*float mean, stddev;
470 stats.bucket_size_mean_ = mean;
471 stats.bucket_size_std_dev = stddev;*/
473 // Include a histogram of the buckets
474 unsigned int bin_start = 0;
475 unsigned int bin_end = 20;
476 bool is_new_bin = true;
477 for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
479 if (*iterator < bin_end) {
481 stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
482 stats.size_histogram_.back()[0] = bin_start;
483 stats.size_histogram_.back()[1] = bin_end - 1;
486 ++stats.size_histogram_.back()[2];
498 // End the two namespaces
502 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
504 #endif /* OPENCV_FLANN_LSH_TABLE_H_ */