1 // See www.openfst.org for extensive documentation on this weighted
2 // finite-state transducer library.
4 #include <fst/extensions/ngram/bitmap-index.h>
10 #include <fst/extensions/ngram/nthbit.h>
14 const size_t kPrimaryBlockBits =
15 BitmapIndex::kStorageBitSize * BitmapIndex::kSecondaryBlockSize;
17 // If [begin, begin+size) is a monotonically increasing running sum of
18 // popcounts for a bitmap, this will return the index of the word that contains
19 // the value'th zero. If value is larger then the number of zeros in the
20 // bitmap, size will be returned. The idea is that the number of zerocounts
21 // (i.e. the popcount of logical NOT of values) is offset * kStorageBitSize
22 // minus the value for each element of the running sum.
23 template <size_t BlockSize, typename Iter, typename T>
24 Iter InvertedSearch(Iter first, Iter last, T value) {
25 const Iter begin = first;
26 while (first != last) {
27 // Invariant: [first, last) is the search range.
28 Iter mid = first + ((last - first) / 2);
29 size_t mid_value = BlockSize * (1 + (mid - begin)) - *mid;
30 if (mid_value < value) {
40 size_t BitmapIndex::Rank1(size_t end) const {
41 if (end == 0) return 0;
42 CHECK_LE(end, Bits());
43 const uint32 end_word = (end - 1) >> BitmapIndex::kStorageLogBitSize;
44 const uint32 sum = get_index_ones_count(end_word);
45 const size_t masked = end & kStorageBlockMask;
47 return sum + __builtin_popcountll(bits_[end_word]);
49 const uint64 zero = 0;
50 return sum + __builtin_popcountll(bits_[end_word] &
51 (~zero >> (kStorageBitSize - masked)));
55 size_t BitmapIndex::Select1(size_t bit_index) const {
56 if (bit_index >= GetOnesCount()) return Bits();
57 // search primary index for the relevant block
58 uint32 rembits = bit_index + 1;
59 const uint32 block = find_primary_block(bit_index + 1);
62 rembits -= primary_index_[block - 1];
63 offset += block * kSecondaryBlockSize;
65 // search the secondary index
66 uint32 word = find_secondary_block(offset, rembits);
68 rembits -= secondary_index_[offset + word - 1];
71 int nth = nth_bit(bits_[offset], rembits);
72 return (offset << BitmapIndex::kStorageLogBitSize) + nth;
75 size_t BitmapIndex::Select0(size_t bit_index) const {
76 if (bit_index >= Bits() - GetOnesCount()) return Bits();
77 // search inverted primary index for relevant block
78 uint32 remzeros = bit_index + 1;
80 const uint32 block = find_inverted_primary_block(bit_index + 1);
82 remzeros -= kPrimaryBlockBits * block - primary_index_[block - 1];
83 offset += block * kSecondaryBlockSize;
85 // search the inverted secondary index
86 uint32 word = find_inverted_secondary_block(offset, remzeros);
88 remzeros -= BitmapIndex::kStorageBitSize * word -
89 secondary_index_[offset + word - 1];
92 int nth = nth_bit(~bits_[offset], remzeros);
93 return (offset << BitmapIndex::kStorageLogBitSize) + nth;
96 std::pair<size_t, size_t> BitmapIndex::Select0s(size_t bit_index) const {
97 const uint64 zero = 0;
98 const uint64 ones = ~zero;
99 size_t zeros_count = Bits() - GetOnesCount();
100 if (bit_index >= zeros_count) return std::make_pair(Bits(), Bits());
101 if (bit_index + 1 >= zeros_count) {
102 return std::make_pair(Select0(bit_index), Bits());
104 // search inverted primary index for relevant block
105 uint32 remzeros = bit_index + 1;
107 const uint32 block = find_inverted_primary_block(bit_index + 1);
108 size_t num_zeros_in_block =
109 kPrimaryBlockBits * (1 + block) - primary_index_[block];
111 size_t num_zeros_next =
112 kPrimaryBlockBits * block - primary_index_[block - 1];
113 num_zeros_in_block -= num_zeros_next;
114 remzeros -= num_zeros_next;
115 offset += block * kSecondaryBlockSize;
117 // search the inverted secondary index
118 uint32 word = find_inverted_secondary_block(offset, remzeros);
119 uint32 sum_zeros_next_word = BitmapIndex::kStorageBitSize * (1 + word) -
120 secondary_index_[offset + word];
121 uint32 sum_zeros_this_word = 0;
123 sum_zeros_this_word = BitmapIndex::kStorageBitSize * word -
124 secondary_index_[offset + word - 1];
125 remzeros -= sum_zeros_this_word;
128 int nth = nth_bit(~bits_[offset], remzeros);
129 size_t current_zero = (offset << BitmapIndex::kStorageLogBitSize) + nth;
132 // Does the current block contain the next zero?
133 if (num_zeros_in_block > remzeros + 1) {
134 if (sum_zeros_next_word - sum_zeros_this_word >= remzeros + 1) {
135 // the next zero is in this word
136 next_zero = (offset << BitmapIndex::kStorageLogBitSize) +
137 nth_bit(~bits_[offset], remzeros + 1);
139 // Find the first field that is not all ones by linear scan.
140 // In the worst case, this may scan 8Kbytes. The alternative is
141 // to inspect secondary_index_ looking for a place to jump to, but
142 // that would probably use more cache.
143 while (bits_[++offset] == ones) {
145 next_zero = (offset << BitmapIndex::kStorageLogBitSize) +
146 __builtin_ctzll(~bits_[offset]);
149 // the next zero is in a different block, a full search is required.
150 next_zero = Select0(bit_index + 1);
152 return std::make_pair(current_zero, next_zero);
155 size_t BitmapIndex::get_index_ones_count(size_t array_index) const {
157 if (array_index > 0) {
158 sum += secondary_index_[array_index - 1];
159 uint32 end_block = (array_index - 1) / kSecondaryBlockSize;
160 if (end_block > 0) sum += primary_index_[end_block - 1];
165 void BitmapIndex::BuildIndex(const uint64 *bits, size_t size) {
168 primary_index_.resize(primary_index_size());
169 secondary_index_.resize(ArraySize());
170 const uint64 zero = 0;
171 const uint64 ones = ~zero;
173 for (uint32 block = 0; block * kSecondaryBlockSize < ArraySize(); block++) {
174 uint32 block_popcount = 0;
175 uint32 block_begin = block * kSecondaryBlockSize;
176 uint32 block_end = block_begin + kSecondaryBlockSize;
177 if (block_end > ArraySize()) block_end = ArraySize();
178 for (uint32 j = block_begin; j < block_end; ++j) {
180 if (j == ArraySize() - 1) {
181 mask = ones >> (-size_ & BitmapIndex::kStorageBlockMask);
183 block_popcount += __builtin_popcountll(bits_[j] & mask);
184 secondary_index_[j] = block_popcount;
186 popcount += block_popcount;
187 primary_index_[block] = popcount;
191 size_t BitmapIndex::find_secondary_block(size_t block_begin,
192 size_t rem_bit_index) const {
193 size_t block_end = block_begin + kSecondaryBlockSize;
194 if (block_end > ArraySize()) block_end = ArraySize();
195 return std::distance(
196 secondary_index_.begin() + block_begin,
197 std::lower_bound(secondary_index_.begin() + block_begin,
198 secondary_index_.begin() + block_end, rem_bit_index));
201 size_t BitmapIndex::find_inverted_secondary_block(size_t block_begin,
202 size_t rem_bit_index) const {
203 size_t block_end = block_begin + kSecondaryBlockSize;
204 if (block_end > ArraySize()) block_end = ArraySize();
205 return std::distance(
206 secondary_index_.begin() + block_begin,
207 InvertedSearch<BitmapIndex::kStorageBitSize>(
208 secondary_index_.begin() + block_begin,
209 secondary_index_.begin() + block_end, rem_bit_index));
212 inline size_t BitmapIndex::find_primary_block(size_t bit_index) const {
213 return std::distance(
214 primary_index_.begin(),
215 std::lower_bound(primary_index_.begin(),
216 primary_index_.begin() + primary_index_size(),
220 size_t BitmapIndex::find_inverted_primary_block(size_t bit_index) const {
221 return std::distance(
222 primary_index_.begin(),
223 InvertedSearch<kPrimaryBlockBits>(primary_index_.begin(),
224 primary_index_.end(), bit_index));
226 } // end namespace fst