Skip to content

Commit

Permalink
opt9: batch vle processing
Browse files Browse the repository at this point in the history
  • Loading branch information
msm-code committed Oct 14, 2024
1 parent cdbf0c1 commit 377e3cb
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 6 deletions.
82 changes: 76 additions & 6 deletions libursa/SortedRun.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "SortedRun.h"

#include <algorithm>
#include <cstring>
#include <numeric>
#include <stdexcept>

#include "Utils.h"
Expand Down Expand Up @@ -67,19 +68,88 @@ void SortedRun::do_or(SortedRun &other) {
std::swap(new_results, sequence_);
}

// Performance critical method. As of the current version, in some tests,
// more than half of the time is spent ANDing decompressed and compressed runs.
// We expect the compressed set to be large, and sequence to be short.
//
// Intersects the encoded run [begin, end) with the vector *target, in place:
// on return *target holds only the values that were found in both inputs.
//
// About the fast case: integers on disk are Variable Length Encoded (VLE).
// This means that integers <= 0x7f are encoded as a single byte, and larger
// ones use multiple bytes (with 0x80 bit used as a continuation bit).
// This function optimizes the common case where most VLE integers are
// small, i.e. are stored as a single byte without additional encoding.
void RunIterator::do_and(RunIterator begin, RunIterator end,
                         std::vector<uint32_t> *target) {
    std::vector<uint32_t> &sequence = *target;
    // Indices are size_t so that sequence.size() is never narrowed.
    const std::size_t size = sequence.size();
    std::size_t out_ndx = 0;
    std::size_t ndx = 0;

    constexpr int BATCH_SIZE = 8;
    constexpr uint64_t VLE_MASK = 0x8080808080808080ULL;

    RunIterator it = begin;
    while (it.pos_ < end.pos_ && ndx < size) {
        // Handle the fast-case. This is purely an optimization, the function
        // will still function properly (but slower) with this `if` removed.
        // Fast case: the next 8 VLE bytes are all small (0x80 bit not set).
        //
        // The remaining length is checked with a pointer difference so that
        // no pointer past the end of the buffer is ever formed. The bytes are
        // loaded with memcpy because pos_ has no alignment guarantee and
        // reading it through a uint64_t* would be undefined behaviour.
        bool batch_is_small = false;
        if (end.pos_ - it.pos_ > BATCH_SIZE) {
            uint64_t batch = 0;
            std::memcpy(&batch, it.pos_, sizeof(batch));
            batch_is_small = (batch & VLE_MASK) == 0;
        }

        if (batch_is_small) {
            // Fast case of the fast case - if the value after processing the
            // current 8 bytes is still smaller than the next element of
            // sequence, just get the sum and skip 8 elements forward.
            // Every single-byte element contributes (byte + 1) to the value.
            uint32_t after_batch = it.prev_ + BATCH_SIZE;
            after_batch += std::accumulate(it.pos_, it.pos_ + BATCH_SIZE, 0);
            if (after_batch < sequence[ndx]) {
                it.forward(BATCH_SIZE, after_batch);
                continue;
            }

            // Regular fast case - do the intersection without decoding VLE
            // integers (remember, we know they are all small, i.e. one byte).
            // Basically the same logic as regular case (see below).
            const uint8_t *batch_end = it.pos_ + BATCH_SIZE;
            while (it.pos_ < batch_end && ndx < size) {
                const uint32_t next = it.prev_ + *it.pos_ + 1;
                if (next < sequence[ndx]) {
                    it.forward(1, next);
                    continue;
                }
                if (sequence[ndx] == next) {
                    sequence[out_ndx++] = sequence[ndx];
                    it.forward(1, next);
                }
                ndx += 1;
            }
            continue;
        }

        // Regular set intersection logic (non-optimized/default case).
        // This is basically equivalent to std::set_intersection.
        const uint32_t next = *it;
        if (next < sequence[ndx]) {
            ++it;
            continue;
        }
        if (sequence[ndx] == next) {
            sequence[out_ndx++] = sequence[ndx];
            ++it;
        }
        ndx += 1;
    }

    // Clean up elements from the end of the ANDed vector: only the first
    // out_ndx slots hold values that were matched.
    sequence.resize(out_ndx);
}

// Intersects this run with `other` and keeps the result in this object.
// This object is always decompressed first; `other` is consumed in whichever
// form it currently has.
void SortedRun::do_and(SortedRun &other) {
    // Benchmarking shows that handling a situation where this->is_compressed()
    // makes the code *slower*. I assume that's because of memory efficiency.
    decompress();
    if (other.is_compressed()) {
        // RunIterator::do_and intersects in place and trims sequence_ itself,
        // so no further erase is needed on this path.
        RunIterator::do_and(other.comp_begin(), other.comp_end(), &sequence_);
    } else {
        // set_intersection writes the common values to the front of
        // sequence_; everything from new_end onwards is leftover and dropped.
        std::vector<uint32_t>::iterator new_end = std::set_intersection(
            other.begin(), other.end(), begin(), end(), begin());
        sequence_.erase(new_end, sequence_.end());
    }
}

void SortedRun::decompress() {
Expand Down
18 changes: 18 additions & 0 deletions libursa/SortedRun.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,34 @@ class RunIterator : public std::iterator<std::forward_iterator_tag, uint32_t> {
uint8_t *pos_;
int32_t prev_;

// Advances the byte pointer by `steps` and stores `new_pos` in prev_.
// The two assignments are independent of each other.
void forward(int steps, int32_t new_pos) {
    prev_ = new_pos;
    pos_ += steps;
}
// Value of the element the iterator is positioned on. This is what operator*
// returns and what operator++ stores in prev_. Defined outside this header
// section.
uint32_t current() const;
// Pointer that operator++ assigns to pos_ when advancing. Defined outside this
// header section.
uint8_t *nextpos();

public:
// Creates an iterator positioned at `run`, with prev_ starting at -1.
RunIterator(uint8_t *run) : pos_{run}, prev_{-1} {}
// No cleanup is performed when the iterator is destroyed.
~RunIterator() = default;

// Moves to the next element (this may mean a variable number of bytes).
// Returns a reference to this iterator, as pre-increment conventionally does.
RunIterator &operator++() {
// Record the value of the element being left before pos_ is moved off it.
prev_ = current();
// nextpos() supplies the new byte position.
pos_ = nextpos();
return *this;
}

// Dereference: yields the value of the element under the iterator by
// delegating to current(). Useful in STL algorithms.
uint32_t operator*() const {
    return current();
}

// Inequality test based only on the byte position (pos_); prev_ is not
// taken into account. Useful in STL algorithms.
bool operator!=(const iterator &rhs) const {
    return !(pos_ == rhs.pos_);
}

// Fast (optimized) std::set_intersection implementation, tweaked for the
// expected data distribution.
// Intersects the encoded run [begin, end) with *target in place: on return
// *target contains only the values that were found in both, and any leftover
// elements at its end have been erased.
static void do_and(RunIterator begin, RunIterator end,
std::vector<uint32_t> *target);
};

// This class represents a "run" - a sorted list of FileIDs. This can be
Expand Down Expand Up @@ -72,11 +85,16 @@ class SortedRun {
: sequence_(other.sequence_), run_(other.run_) {}
SortedRun &operator=(SortedRun &&) = default;

// Reports whether the run holds nothing: true only when both sequence_ and
// run_ are empty. run_ is consulted only if sequence_ is empty.
bool empty() const {
    if (!sequence_.empty()) {
        return false;
    }
    return run_.empty();
}

// Does the OR operation with the other vector, overwrites this object.
void do_or(SortedRun &other);

// Does the AND operation with the other vector, overwrites this object.
void do_and(SortedRun &other);

// Does the MIN_OF operation on specified operands. Allocates a new result.
static SortedRun pick_common(int cutoff, std::vector<SortedRun *> &sources);

// When you really need to clone the run - TODO remove.
Expand Down

0 comments on commit 377e3cb

Please sign in to comment.