return *Index;
}
+ float consume(DocID ID) override { return DEFAULT_BOOST_SCORE; }
+
private:
llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
OS << '[';
DocID peek() const override { return Children.front()->peek(); }
+ // If not exhausted and points to the given item, consume() returns the
+ // product of Children->consume(ID). Otherwise, DEFAULT_BOOST_SCORE is
+ // returned.
+ float consume(DocID ID) override {
+ if (reachedEnd() || peek() != ID)
+ return DEFAULT_BOOST_SCORE;
+ return std::accumulate(
+ begin(Children), end(Children), DEFAULT_BOOST_SCORE,
+ [&](float Current, const std::unique_ptr<Iterator> &Child) {
+ return Current * Child->consume(ID);
+ });
+ }
+
private:
llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
OS << "(& ";
return Result;
}
+ // Returns the maximum boosting score among all Children when iterator is not
+ // exhausted and points to the given ID, DEFAULT_BOOST_SCORE otherwise.
+ float consume(DocID ID) override {
+ if (reachedEnd() || peek() != ID)
+ return DEFAULT_BOOST_SCORE;
+ return std::accumulate(
+ begin(Children), end(Children), DEFAULT_BOOST_SCORE,
+ [&](float Current, const std::unique_ptr<Iterator> &Child) {
+ return (!Child->reachedEnd() && Child->peek() == ID)
+ ? std::max(Current, Child->consume(ID))
+ : Current;
+ });
+ }
+
private:
llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
OS << "(| ";
return Index;
}
+ float consume(DocID) override { return DEFAULT_BOOST_SCORE; }
+
private:
llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
OS << "(TRUE {" << Index << "} out of " << Size << ")";
DocID Size;
};
+/// Boost iterator is a wrapper around its child which multiplies scores of
+/// each retrieved item by a given factor.
+class BoostIterator : public Iterator {
+public:
+ BoostIterator(std::unique_ptr<Iterator> Child, float Factor)
+ : Child(move(Child)), Factor(Factor) {}
+
+ bool reachedEnd() const override { return Child->reachedEnd(); }
+
+ void advance() override { Child->advance(); }
+
+ void advanceTo(DocID ID) override { Child->advanceTo(ID); }
+
+ DocID peek() const override { return Child->peek(); }
+
+ float consume(DocID ID) override { return Child->consume(ID) * Factor; }
+
+private:
+ llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
+ OS << "(BOOST " << Factor << ' ' << *Child << ')';
+ return OS;
+ }
+
+ std::unique_ptr<Iterator> Child;
+ float Factor;
+};
+
} // end namespace
-std::vector<DocID> consume(Iterator &It, size_t Limit) {
- std::vector<DocID> Result;
+std::vector<std::pair<DocID, float>> consume(Iterator &It, size_t Limit) {
+ std::vector<std::pair<DocID, float>> Result;
for (size_t Retrieved = 0; !It.reachedEnd() && Retrieved < Limit;
- It.advance(), ++Retrieved)
- Result.push_back(It.peek());
+ It.advance(), ++Retrieved) {
+ DocID Document = It.peek();
+ Result.push_back(std::make_pair(Document, It.consume(Document)));
+ }
return Result;
}
return llvm::make_unique<TrueIterator>(Size);
}
+std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child,
+ float Factor) {
+ return llvm::make_unique<BoostIterator>(move(Child), Factor);
+}
+
} // namespace dex
} // namespace clangd
} // namespace clang
/// (their children) to form a multi-level Query Tree. The interface is designed
/// to be extensible in order to support multiple types of iterators.
class Iterator {
- // FIXME(kbobyrev): Provide callback for matched documents.
- // FIXME(kbobyrev): Implement new types of iterators: Label, Boost (with
- // scoring), Limit.
// FIXME(kbobyrev): Implement iterator cost, an estimate of advance() calls
// before iterator exhaustion.
+ // FIXME(kbobyrev): Implement Limit iterator.
public:
/// Returns true if all valid DocIDs were processed and hence the iterator is
/// exhausted.
///
/// Note: reachedEnd() must be false.
virtual DocID peek() const = 0;
+ /// Retrieves boosting score. Query tree root should pass Root->peek() to this
+ /// function, the parameter is needed to propagate through the tree. Given ID
+ /// should be compared against BOOST iterator peek()s: some of the iterators
+ /// would not point to the item which was propagated to the top of the query
+ /// tree (e.g. if these iterators are branches of OR iterator) and hence
+ /// shouldn't apply any boosting to the consumed item.
+ virtual float consume(DocID ID) = 0;
virtual ~Iterator() {}
return Iterator.dump(OS);
}
+ constexpr static float DEFAULT_BOOST_SCORE = 1;
+
private:
virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
};
/// Advances the iterator until it is either exhausted or the number of
-/// requested items is reached. The result contains sorted DocumentIDs.
-std::vector<DocID> consume(Iterator &It,
- size_t Limit = std::numeric_limits<size_t>::max());
+/// requested items is reached. Returns pairs of document IDs with the
+/// corresponding boosting score.
+///
+/// Boosting can be seen as a compromise between retrieving too many items and
+/// calculating finals score for each of them (which might be very expensive)
+/// and not retrieving enough items so that items with very high final score
+/// would not be processed. Boosting score is a computationally efficient way
+/// to acquire preliminary scores of requested items.
+std::vector<std::pair<DocID, float>>
+consume(Iterator &It, size_t Limit = std::numeric_limits<size_t>::max());
/// Returns a document iterator over given PostingList.
+///
+/// DocumentIterator returns DEFAULT_BOOST_SCORE for each processed item.
std::unique_ptr<Iterator> create(PostingListRef Documents);
/// Returns AND Iterator which performs the intersection of the PostingLists of
/// its children.
+///
+/// consume(): AND Iterator returns the product of Childrens' boosting scores
+/// when not exhausted and DEFAULT_BOOST_SCORE otherwise.
std::unique_ptr<Iterator>
createAnd(std::vector<std::unique_ptr<Iterator>> Children);
/// Returns OR Iterator which performs the union of the PostingLists of its
/// children.
+///
+/// consume(): OR Iterator returns the highest boost value among children
+/// pointing to requested item when not exhausted and DEFAULT_BOOST_SCORE
+/// otherwise.
std::unique_ptr<Iterator>
createOr(std::vector<std::unique_ptr<Iterator>> Children);
/// Returns TRUE Iterator which iterates over "virtual" PostingList containing
/// all items in range [0, Size) in an efficient manner.
+///
+/// TRUE returns DEFAULT_BOOST_SCORE for each processed item.
std::unique_ptr<Iterator> createTrue(DocID Size);
+/// Returns BOOST iterator which multiplies the score of each item by given
+/// factor. Boosting can be used as a computationally inexpensive filtering.
+/// Users can return significantly more items using consumeAndBoost() and then
+/// trim Top K using retrieval score.
+std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child,
+ float Factor);
+
/// This allows createAnd(create(...), create(...)) syntax.
template <typename... Args> std::unique_ptr<Iterator> createAnd(Args... args) {
std::vector<std::unique_ptr<Iterator>> Children;
namespace dex {
namespace {
+std::vector<DocID>
+consumeIDs(Iterator &It, size_t Limit = std::numeric_limits<size_t>::max()) {
+ auto IDAndScore = consume(It, Limit);
+ std::vector<DocID> IDs(IDAndScore.size());
+ for (size_t I = 0; I < IDAndScore.size(); ++I)
+ IDs[I] = IDAndScore[I].first;
+ return IDs;
+}
+
TEST(DexIndexIterators, DocumentIterator) {
const PostingList L = {4, 7, 8, 20, 42, 100};
auto DocIterator = create(L);
auto AndWithEmpty = createAnd(create(L0), create(L1));
EXPECT_TRUE(AndWithEmpty->reachedEnd());
- EXPECT_THAT(consume(*AndWithEmpty), ElementsAre());
+ EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre());
}
TEST(DexIndexIterators, AndTwoLists) {
auto And = createAnd(create(L1), create(L0));
EXPECT_FALSE(And->reachedEnd());
- EXPECT_THAT(consume(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U));
+ EXPECT_THAT(consumeIDs(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U));
And = createAnd(create(L0), create(L1));
auto OrWithEmpty = createOr(create(L0), create(L1));
EXPECT_FALSE(OrWithEmpty->reachedEnd());
- EXPECT_THAT(consume(*OrWithEmpty),
+ EXPECT_THAT(consumeIDs(*OrWithEmpty),
ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U));
}
Or = createOr(create(L0), create(L1));
- EXPECT_THAT(consume(*Or),
+ EXPECT_THAT(consumeIDs(*Or),
ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U));
}
// FIXME(kbobyrev): The testcase below is similar to what is expected in real
// queries. It should be updated once new iterators (such as boosting, limiting,
// etc iterators) appear. However, it is not exhaustive and it would be
-// beneficial to implement automatic generation of query trees for more
-// comprehensive testing.
+// beneficial to implement automatic generation (e.g. fuzzing) of query trees
+// for more comprehensive testing.
TEST(DexIndexIterators, QueryTree) {
- // An example of more complicated query
//
// +-----------------+
// |And Iterator:1, 5|
// +--------+--------+
// |
// |
- // +------------------------------------+
+ // +-------------+----------------------+
// | |
// | |
- // +----------v----------+ +----------v---------+
- // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 5|
- // +----------+----------+ +----------+---------+
+ // +----------v----------+ +----------v------------+
+ // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 3, 5|
+ // +----------+----------+ +----------+------------+
// | |
- // +------+-----+ +---------+-----------+
+ // +------+-----+ +---------------------+
// | | | | |
- // +-------v-----+ +----v-----+ +--v--+ +-V--+ +---v---+
- // |1, 3, 5, 8, 9| |1, 5, 7, 9| |Empty| |0, 5| |0, 1, 5|
- // +-------------+ +----------+ +-----+ +----+ +-------+
-
+ // +-------v-----+ +----+---+ +--v--+ +---v----+ +----v---+
+ // |1, 3, 5, 8, 9| |Boost: 2| |Empty| |Boost: 3| |Boost: 4|
+ // +-------------+ +----+---+ +-----+ +---+----+ +----+---+
+ // | | |
+ // +----v-----+ +-v--+ +---v---+
+ // |1, 5, 7, 9| |1, 5| |0, 3, 5|
+ // +----------+ +----+ +-------+
+ //
const PostingList L0 = {1, 3, 5, 8, 9};
const PostingList L1 = {1, 5, 7, 9};
- const PostingList L2 = {0, 5};
- const PostingList L3 = {0, 1, 5};
- const PostingList L4;
+ const PostingList L3;
+ const PostingList L4 = {1, 5};
+ const PostingList L5 = {0, 3, 5};
// Root of the query tree: [1, 5]
auto Root = createAnd(
// Lower And Iterator: [1, 5, 9]
- createAnd(create(L0), create(L1)),
+ createAnd(create(L0), createBoost(create(L1), 2U)),
// Lower Or Iterator: [0, 1, 5]
- createOr(create(L2), create(L3), create(L4)));
+ createOr(create(L3), createBoost(create(L4), 3U),
+ createBoost(create(L5), 4U)));
EXPECT_FALSE(Root->reachedEnd());
EXPECT_EQ(Root->peek(), 1U);
Root->advanceTo(1);
Root->advanceTo(0);
EXPECT_EQ(Root->peek(), 1U);
+ auto ElementBoost = Root->consume(Root->peek());
+ EXPECT_THAT(ElementBoost, 6);
Root->advance();
EXPECT_EQ(Root->peek(), 5U);
Root->advanceTo(5);
EXPECT_EQ(Root->peek(), 5U);
+ ElementBoost = Root->consume(Root->peek());
+ EXPECT_THAT(ElementBoost, 8);
Root->advanceTo(9000);
EXPECT_TRUE(Root->reachedEnd());
}
const PostingList L5;
auto DocIterator = create(L0);
- EXPECT_THAT(consume(*DocIterator, 42), ElementsAre(4, 7, 8, 20, 42, 100));
+ EXPECT_THAT(consumeIDs(*DocIterator, 42), ElementsAre(4, 7, 8, 20, 42, 100));
DocIterator = create(L0);
- EXPECT_THAT(consume(*DocIterator), ElementsAre(4, 7, 8, 20, 42, 100));
+ EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(4, 7, 8, 20, 42, 100));
DocIterator = create(L0);
- EXPECT_THAT(consume(*DocIterator, 3), ElementsAre(4, 7, 8));
+ EXPECT_THAT(consumeIDs(*DocIterator, 3), ElementsAre(4, 7, 8));
DocIterator = create(L0);
- EXPECT_THAT(consume(*DocIterator, 0), ElementsAre());
+ EXPECT_THAT(consumeIDs(*DocIterator, 0), ElementsAre());
}
TEST(DexIndexIterators, True) {
auto TrueIterator = createTrue(0U);
EXPECT_TRUE(TrueIterator->reachedEnd());
- EXPECT_THAT(consume(*TrueIterator), ElementsAre());
+ EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre());
PostingList L0 = {1, 2, 5, 7};
TrueIterator = createTrue(7U);
EXPECT_THAT(TrueIterator->peek(), 0);
auto AndIterator = createAnd(create(L0), move(TrueIterator));
EXPECT_FALSE(AndIterator->reachedEnd());
- EXPECT_THAT(consume(*AndIterator), ElementsAre(1, 2, 5));
+ EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5));
+}
+
+TEST(DexIndexIterators, Boost) {
+ auto BoostIterator = createBoost(createTrue(5U), 42U);
+ EXPECT_FALSE(BoostIterator->reachedEnd());
+ auto ElementBoost = BoostIterator->consume(BoostIterator->peek());
+ EXPECT_THAT(ElementBoost, 42U);
+
+ const PostingList L0 = {2, 4};
+ const PostingList L1 = {1, 4};
+ auto Root = createOr(createTrue(5U), createBoost(create(L0), 2U),
+ createBoost(create(L1), 3U));
+
+ ElementBoost = Root->consume(Root->peek());
+ EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE);
+ Root->advance();
+ EXPECT_THAT(Root->peek(), 1U);
+ ElementBoost = Root->consume(Root->peek());
+ EXPECT_THAT(ElementBoost, 3);
+
+ Root->advance();
+ EXPECT_THAT(Root->peek(), 2U);
+ ElementBoost = Root->consume(Root->peek());
+ EXPECT_THAT(ElementBoost, 2);
+
+ Root->advanceTo(4);
+ ElementBoost = Root->consume(Root->peek());
+ EXPECT_THAT(ElementBoost, 3);
}
testing::Matcher<std::vector<Token>>