From 377e3cbcafcd55dd5f8b4f2216acee0d052bd0c8 Mon Sep 17 00:00:00 2001 From: msm Date: Sun, 13 Oct 2024 01:09:36 +0200 Subject: [PATCH 1/5] opt9: batch vle processing --- libursa/SortedRun.cpp | 82 +++++++++++++++++++++++++++++++++++++++---- libursa/SortedRun.h | 18 ++++++++++ 2 files changed, 94 insertions(+), 6 deletions(-) diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp index f3fac27..c46cf34 100644 --- a/libursa/SortedRun.cpp +++ b/libursa/SortedRun.cpp @@ -1,6 +1,7 @@ #include "SortedRun.h" #include +#include #include #include "Utils.h" @@ -67,19 +68,88 @@ void SortedRun::do_or(SortedRun &other) { std::swap(new_results, sequence_); } +// Performance critical method. As of the current version, in some tests, +// more than half of the time is spent ANDing decompressed and compressed runs. +// We expect the compressed set to be large, and sequence to be short. +// +// About the fast case: integers on disk are Variable Length Encoded (VLE). +// This means that interes <= 0x7f are encoded as a single byte, and larger +// ones use multiple bytes (with 0x80 bit used as a continuation bit). +// This function optimizes the common case where most VLE integers are +// small, i.e. are stored as a single byte without additional encoding. +void RunIterator::do_and(RunIterator begin, RunIterator end, + std::vector *target) { + std::vector &sequence = *target; + int out_ndx = 0; + int ndx = 0; + int size = sequence.size(); + + RunIterator it = begin; + while (it.pos_ < end.pos_ && ndx < size) { + // Handle the fast-case. This is purely an optimization, the function + // will still function properly (but slower) with this `if` removed. + // Fast case: the next 8 VLE bytes are all small (0x80 bit not set). + constexpr int BATCH_SIZE = 8; + constexpr uint64_t VLE_MASK = 0x8080808080808080UL; + uint64_t *as_qword = (uint64_t *)it.pos_; + if (it.pos_ + BATCH_SIZE < end.pos_ && (*as_qword & VLE_MASK) == 0) { + // Fast case of the fast case - if the pointer after processing the + // current 8 bytes is still smaller than the next element of + // sequence, just get the sum and skip 8 elements forward. + uint32_t after_batch = it.prev_ + BATCH_SIZE; + after_batch += std::accumulate(it.pos_, it.pos_ + BATCH_SIZE, 0); + if (after_batch < sequence[ndx]) { + it.forward(BATCH_SIZE, after_batch); + continue; + } + + // Regular fast case - do the intersection without decoding VLE + // integers (remember, we know they are all small, i.e. one byte). + // Basically the same logic as regular case (see below). + for (uint8_t *end = it.pos_ + 8; it.pos_ < end && ndx < size;) { + uint32_t next = it.prev_ + *it.pos_ + 1; + if (next < sequence[ndx]) { + it.forward(1, next); + continue; + } + if (sequence[ndx] == next) { + sequence[out_ndx++] = sequence[ndx]; + it.forward(1, next); + } + ndx += 1; + } + continue; + } + + // Regular set intersection logic (non-optimized/default case). + // This is basically equivalent to std::set_intersection. + uint32_t next = *it; + if (next < sequence[ndx]) { + ++it; + continue; + } + if (sequence[ndx] == next) { + sequence[out_ndx++] = sequence[ndx]; + ++it; + } + ndx += 1; + } + + // Clean up elements from the end of the ANDed vector. + sequence.erase(sequence.begin() + out_ndx, sequence.end()); +} + void SortedRun::do_and(SortedRun &other) { // Benchmarking shows that handling a situation where this->is_compressed() // makes the code *slower*. I assume that's because of memory efficiency. decompress(); - std::vector::iterator new_end; if (other.is_compressed()) { - new_end = std::set_intersection(other.comp_begin(), other.comp_end(), - begin(), end(), begin()); + RunIterator::do_and(other.comp_begin(), other.comp_end(), &sequence_); } else { - new_end = std::set_intersection(other.begin(), other.end(), begin(), - end(), begin()); + std::vector::iterator new_end = std::set_intersection( + other.begin(), other.end(), begin(), end(), begin()); + sequence_.erase(new_end, sequence_.end()); } - sequence_.erase(new_end, sequence_.end()); } void SortedRun::decompress() { diff --git a/libursa/SortedRun.h b/libursa/SortedRun.h index 34638ea..2dd0f91 100644 --- a/libursa/SortedRun.h +++ b/libursa/SortedRun.h @@ -9,6 +9,10 @@ class RunIterator : public std::iterator { uint8_t *pos_; int32_t prev_; + void forward(int steps, int32_t new_pos) { + pos_ += steps; + prev_ = new_pos; + } uint32_t current() const; uint8_t *nextpos(); @@ -16,14 +20,23 @@ class RunIterator : public std::iterator { RunIterator(uint8_t *run) : pos_(run), prev_(-1) {} ~RunIterator() {} + // Moves to the next element (this may mean a variable number of bytes). RunIterator &operator++() { prev_ = current(); pos_ = nextpos(); return *this; } + // Gets the current element under the iterator. Useful in STL algorithms. uint32_t operator*() const { return current(); } + + // Compares the iterators. Useful in STL algorithms. bool operator!=(const iterator &rhs) const { return pos_ != rhs.pos_; } + + // Fast (optimized) std::set_intersection implementation, tweaked for the + // expected data distribution. + static void do_and(RunIterator begin, RunIterator end, + std::vector *target); }; // This class represents a "run" - a sorted list of FileIDs. This can be @@ -72,11 +85,16 @@ class SortedRun { : sequence_(other.sequence_), run_(other.run_) {} SortedRun &operator=(SortedRun &&) = default; + // Checks if the current run is empty. bool empty() const { return sequence_.empty() && run_.empty(); } + // Does the OR operation with the other vector, overwrites this object. void do_or(SortedRun &other); + + // Does the AND operation with the other vector, overwrites this object. void do_and(SortedRun &other); + // Does the MIN_OF operation on specified operands. Allocates a new reuslt. static SortedRun pick_common(int cutoff, std::vector &sources); // When you really need to clone the run - TODO remove. From 481661a66f79859446dc84eca29f92f217e00585 Mon Sep 17 00:00:00 2001 From: msm Date: Wed, 16 Oct 2024 02:45:47 +0200 Subject: [PATCH 2/5] Refactor the code, hopefully --- libursa/SortedRun.cpp | 168 +++++++++++++++++++----------------------- libursa/SortedRun.h | 56 +++++--------- 2 files changed, 96 insertions(+), 128 deletions(-) diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp index c46cf34..9f18d70 100644 --- a/libursa/SortedRun.cpp +++ b/libursa/SortedRun.cpp @@ -6,23 +6,25 @@ #include "Utils.h" -uint32_t RunIterator::current() const { +// Read element currently under pos. +uint32_t run_read(uint8_t *pos) { uint64_t acc = 0; uint32_t shift = 0; - for (uint8_t *it = pos_;; it++) { + for (uint8_t *it = pos;; it++) { uint32_t next = *it; acc += (next & 0x7FU) << shift; shift += 7U; if ((next & 0x80U) == 0) { - return prev_ + acc + 1; + return acc + 1; } } } -uint8_t *RunIterator::nextpos() { - for (uint8_t *it = pos_;; it++) { - if ((*it & 0x80) == 0) { - return it + 1; +// Move pos to the next element. +uint8_t *run_forward(uint8_t *pos) { + for (;; pos++) { + if ((*pos & 0x80) == 0) { + return pos + 1; } } } @@ -43,113 +45,97 @@ std::vector::iterator SortedRun::end() { return sequence_.end(); } -RunIterator SortedRun::comp_begin() { - validate_compression(true); - return RunIterator(run_.data()); -} - -RunIterator SortedRun::comp_end() { - validate_compression(true); - return RunIterator(run_.data() + run_.size()); -} - void SortedRun::do_or(SortedRun &other) { // In almost every case this is already decompressed. decompress(); + other.decompress(); std::vector new_results; - if (other.is_compressed()) { - // Unlikely case, in most cases both runs are already decompressed. - std::set_union(other.comp_begin(), other.comp_end(), begin(), end(), - std::back_inserter(new_results)); - } else { - std::set_union(other.begin(), other.end(), begin(), end(), - std::back_inserter(new_results)); - } + std::set_union(other.begin(), other.end(), begin(), end(), + std::back_inserter(new_results)); std::swap(new_results, sequence_); } -// Performance critical method. As of the current version, in some tests, -// more than half of the time is spent ANDing decompressed and compressed runs. -// We expect the compressed set to be large, and sequence to be short. -// -// About the fast case: integers on disk are Variable Length Encoded (VLE). -// This means that interes <= 0x7f are encoded as a single byte, and larger -// ones use multiple bytes (with 0x80 bit used as a continuation bit). -// This function optimizes the common case where most VLE integers are -// small, i.e. are stored as a single byte without additional encoding. -void RunIterator::do_and(RunIterator begin, RunIterator end, - std::vector *target) { - std::vector &sequence = *target; - int out_ndx = 0; - int ndx = 0; - int size = sequence.size(); - - RunIterator it = begin; - while (it.pos_ < end.pos_ && ndx < size) { - // Handle the fast-case. This is purely an optimization, the function - // will still function properly (but slower) with this `if` removed. - // Fast case: the next 8 VLE bytes are all small (0x80 bit not set). - constexpr int BATCH_SIZE = 8; - constexpr uint64_t VLE_MASK = 0x8080808080808080UL; - uint64_t *as_qword = (uint64_t *)it.pos_; - if (it.pos_ + BATCH_SIZE < end.pos_ && (*as_qword & VLE_MASK) == 0) { - // Fast case of the fast case - if the pointer after processing the - // current 8 bytes is still smaller than the next element of - // sequence, just get the sum and skip 8 elements forward. - uint32_t after_batch = it.prev_ + BATCH_SIZE; - after_batch += std::accumulate(it.pos_, it.pos_ + BATCH_SIZE, 0); - if (after_batch < sequence[ndx]) { - it.forward(BATCH_SIZE, after_batch); - continue; - } +// Read VLE integer under run_it_ and do the intersection. +void IntersectionHelper::step_single() { + uint32_t next = prev_ + run_read(run_it_); + if (next < *seq_it_) { + prev_ = next; + run_it_ = run_forward(run_it_); + return; + } + if (*seq_it_ == next) { + *seq_out_++ = *seq_it_; + prev_ = next; + run_it_ = run_forward(run_it_); + } + seq_it_++; +} - // Regular fast case - do the intersection without decoding VLE - // integers (remember, we know they are all small, i.e. one byte). - // Basically the same logic as regular case (see below). - for (uint8_t *end = it.pos_ + 8; it.pos_ < end && ndx < size;) { - uint32_t next = it.prev_ + *it.pos_ + 1; - if (next < sequence[ndx]) { - it.forward(1, next); - continue; - } - if (sequence[ndx] == next) { - sequence[out_ndx++] = sequence[ndx]; - it.forward(1, next); - } - ndx += 1; - } +// Read 8 bytes under run_it_. If all are small, handle them all. +bool IntersectionHelper::step_by_8() { + constexpr int BATCH_SIZE = 8; + constexpr uint64_t VLE_MASK = 0x8080808080808080UL; + + uint64_t *as_qword = (uint64_t *)run_it_; + uint64_t hit = (*as_qword & VLE_MASK); + if (hit != 0) { + return false; + } + + uint32_t after_batch = prev_ + BATCH_SIZE; + after_batch += std::accumulate(run_it_, run_it_ + BATCH_SIZE, 0); + + if (after_batch < *seq_it_) { + run_it_ += BATCH_SIZE; + prev_ = after_batch; + return true; + } + + for (uint8_t *end = run_it_ + BATCH_SIZE; run_it_ < end && seq_it_ < seq_end_;) { + uint32_t next = prev_ + *run_it_ + 1; + if (next < *seq_it_) { + prev_ = next; + run_it_ += 1; continue; } + if (*seq_it_ == next) { + *seq_out_++ = *seq_it_; + prev_ = next; + run_it_ += 1; + } + seq_it_++; + } + return true; +} - // Regular set intersection logic (non-optimized/default case). - // This is basically equivalent to std::set_intersection. - uint32_t next = *it; - if (next < sequence[ndx]) { - ++it; +void IntersectionHelper::intersect_by_8() { + while (run_it_ < run_end_ - 8 && seq_it_ < seq_end_) { + if (step_by_8()) { continue; } - if (sequence[ndx] == next) { - sequence[out_ndx++] = sequence[ndx]; - ++it; - } - ndx += 1; + step_single(); } +} - // Clean up elements from the end of the ANDed vector. - sequence.erase(sequence.begin() + out_ndx, sequence.end()); +void IntersectionHelper::intersect() { + intersect_by_8(); + while (run_it_ < run_end_ && seq_it_ < seq_end_) { + step_single(); + } } void SortedRun::do_and(SortedRun &other) { - // Benchmarking shows that handling a situation where this->is_compressed() - // makes the code *slower*. I assume that's because of memory efficiency. decompress(); + std::vector::iterator new_end; if (other.is_compressed()) { - RunIterator::do_and(other.comp_begin(), other.comp_end(), &sequence_); + IntersectionHelper helper(&sequence_, &other.run_); + helper.intersect(); + new_end = begin() + helper.result_size(); } else { - std::vector::iterator new_end = std::set_intersection( + new_end = std::set_intersection( other.begin(), other.end(), begin(), end(), begin()); - sequence_.erase(new_end, sequence_.end()); } + sequence_.erase(new_end, end()); } void SortedRun::decompress() { diff --git a/libursa/SortedRun.h b/libursa/SortedRun.h index 2dd0f91..acbda32 100644 --- a/libursa/SortedRun.h +++ b/libursa/SortedRun.h @@ -1,42 +1,28 @@ #include "Core.h" +#include -// Iterate over a compressed run representation. -// "Run" here means a sorted list of FileIDs (this name is used in the -// codebase). And a "compressed" run format is described in the documentation -// "ondiskformat.md", in the "Index" section. -class RunIterator : public std::iterator { - typedef RunIterator iterator; - uint8_t *pos_; +uint32_t run_read(uint8_t *pos); +uint8_t *run_forward(uint8_t *pos); + +class IntersectionHelper { + uint8_t *run_it_; + uint8_t *run_end_; int32_t prev_; + uint32_t *seq_start_; + uint32_t *seq_it_; + uint32_t *seq_end_; + uint32_t *seq_out_; - void forward(int steps, int32_t new_pos) { - pos_ += steps; - prev_ = new_pos; - } - uint32_t current() const; - uint8_t *nextpos(); + bool step_by_8(); + void step_single(); + void intersect_by_8(); public: - RunIterator(uint8_t *run) : pos_(run), prev_(-1) {} - ~RunIterator() {} - - // Moves to the next element (this may mean a variable number of bytes). - RunIterator &operator++() { - prev_ = current(); - pos_ = nextpos(); - return *this; - } - - // Gets the current element under the iterator. Useful in STL algorithms. - uint32_t operator*() const { return current(); } - - // Compares the iterators. Useful in STL algorithms. - bool operator!=(const iterator &rhs) const { return pos_ != rhs.pos_; } - - // Fast (optimized) std::set_intersection implementation, tweaked for the - // expected data distribution. - static void do_and(RunIterator begin, RunIterator end, - std::vector *target); + IntersectionHelper(std::vector *seq, std::vector *run) + :run_it_(run->data()), run_end_(run->data() + run->size()), prev_(-1), seq_start_(seq->data()), seq_it_(seq->data()), seq_end_(seq->data() + seq->size()), seq_out_(seq->data()) {} + + size_t result_size() const { return seq_out_ - seq_start_; } + void intersect(); }; // This class represents a "run" - a sorted list of FileIDs. This can be @@ -65,10 +51,6 @@ class SortedRun { std::vector::iterator begin(); std::vector::iterator end(); - // Iterate over the compressed representation (throws if decompressed) - RunIterator comp_begin(); - RunIterator comp_end(); - SortedRun(const SortedRun &other) = default; public: From 0bc7d9d6e87deb096caec22d03cef5182b682b18 Mon Sep 17 00:00:00 2001 From: msm Date: Wed, 16 Oct 2024 02:52:11 +0200 Subject: [PATCH 3/5] Format with clang-format --- libursa/SortedRun.cpp | 9 +++++---- libursa/SortedRun.h | 11 +++++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp index 9f18d70..8881867 100644 --- a/libursa/SortedRun.cpp +++ b/libursa/SortedRun.cpp @@ -51,7 +51,7 @@ void SortedRun::do_or(SortedRun &other) { other.decompress(); std::vector new_results; std::set_union(other.begin(), other.end(), begin(), end(), - std::back_inserter(new_results)); + std::back_inserter(new_results)); std::swap(new_results, sequence_); } @@ -91,7 +91,8 @@ bool IntersectionHelper::step_by_8() { return true; } - for (uint8_t *end = run_it_ + BATCH_SIZE; run_it_ < end && seq_it_ < seq_end_;) { + for (uint8_t *end = run_it_ + BATCH_SIZE; + run_it_ < end && seq_it_ < seq_end_;) { uint32_t next = prev_ + *run_it_ + 1; if (next < *seq_it_) { prev_ = next; @@ -132,8 +133,8 @@ void SortedRun::do_and(SortedRun &other) { helper.intersect(); new_end = begin() + helper.result_size(); } else { - new_end = std::set_intersection( - other.begin(), other.end(), begin(), end(), begin()); + new_end = std::set_intersection(other.begin(), other.end(), begin(), + end(), begin()); } sequence_.erase(new_end, end()); } diff --git a/libursa/SortedRun.h b/libursa/SortedRun.h index acbda32..c1d2b22 100644 --- a/libursa/SortedRun.h +++ b/libursa/SortedRun.h @@ -1,6 +1,7 @@ -#include "Core.h" #include +#include "Core.h" + uint32_t run_read(uint8_t *pos); uint8_t *run_forward(uint8_t *pos); @@ -19,7 +20,13 @@ class IntersectionHelper { public: IntersectionHelper(std::vector *seq, std::vector *run) - :run_it_(run->data()), run_end_(run->data() + run->size()), prev_(-1), seq_start_(seq->data()), seq_it_(seq->data()), seq_end_(seq->data() + seq->size()), seq_out_(seq->data()) {} + : run_it_(run->data()), + run_end_(run->data() + run->size()), + prev_(-1), + seq_start_(seq->data()), + seq_it_(seq->data()), + seq_end_(seq->data() + seq->size()), + seq_out_(seq->data()) {} size_t result_size() const { return seq_out_ - seq_start_; } void intersect(); From c010742410386b9b472e300108f38faf1ab395ad Mon Sep 17 00:00:00 2001 From: msm Date: Wed, 16 Oct 2024 02:54:02 +0200 Subject: [PATCH 4/5] Fix version number --- libursa/Version.h.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libursa/Version.h.in b/libursa/Version.h.in index 877ca09..aefb917 100644 --- a/libursa/Version.h.in +++ b/libursa/Version.h.in @@ -9,5 +9,5 @@ constexpr std::string_view ursadb_format_version = "1.5.0"; // Project version. // Consider updating the version tag when doing PRs. // clang-format off -constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt8"; +constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt9"; // clang-format on From cacf13cb8cbc81b873a634a6fa40edf640b00889 Mon Sep 17 00:00:00 2001 From: msm Date: Wed, 16 Oct 2024 14:45:04 +0200 Subject: [PATCH 5/5] A few comments --- libursa/SortedRun.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp index 8881867..bcde0f1 100644 --- a/libursa/SortedRun.cpp +++ b/libursa/SortedRun.cpp @@ -6,7 +6,7 @@ #include "Utils.h" -// Read element currently under pos. +// Read VLE integer stored under pos. uint32_t run_read(uint8_t *pos) { uint64_t acc = 0; uint32_t shift = 0; @@ -20,7 +20,7 @@ uint32_t run_read(uint8_t *pos) { } } -// Move pos to the next element. +// Return a pointer to the next encoded integer. uint8_t *run_forward(uint8_t *pos) { for (;; pos++) { if ((*pos & 0x80) == 0) { @@ -71,7 +71,8 @@ void IntersectionHelper::step_single() { seq_it_++; } -// Read 8 bytes under run_it_. If all are small, handle them all. +// Read 8 bytes under run_it_. If all are small, intersect them all. +// Returns true if the method can continue, and false if a large int was found. bool IntersectionHelper::step_by_8() { constexpr int BATCH_SIZE = 8; constexpr uint64_t VLE_MASK = 0x8080808080808080UL; @@ -79,18 +80,22 @@ bool IntersectionHelper::step_by_8() { uint64_t *as_qword = (uint64_t *)run_it_; uint64_t hit = (*as_qword & VLE_MASK); if (hit != 0) { + // A large byte (>0x80) was found, handle them in a slow path. return false; } uint32_t after_batch = prev_ + BATCH_SIZE; after_batch += std::accumulate(run_it_, run_it_ + BATCH_SIZE, 0); + // Fast-fast path. Maybe we can just add all 8 bytes and still are + // below the next sequence byte (i.e. nothing to do in intersection). if (after_batch < *seq_it_) { run_it_ += BATCH_SIZE; prev_ = after_batch; return true; } + // Regular batch: like step_single but we know are only dealing with bytes. for (uint8_t *end = run_it_ + BATCH_SIZE; run_it_ < end && seq_it_ < seq_end_;) { uint32_t next = prev_ + *run_it_ + 1; @@ -109,6 +114,7 @@ bool IntersectionHelper::step_by_8() { return true; } +// Do the intersection in batches of 8 bytes at once. void IntersectionHelper::intersect_by_8() { while (run_it_ < run_end_ - 8 && seq_it_ < seq_end_) { if (step_by_8()) { @@ -118,6 +124,8 @@ void IntersectionHelper::intersect_by_8() { } } +// This function is basically std::set_intersection, but optimized as +// much as possible (since sometimes almost 50% of time is spent here). void IntersectionHelper::intersect() { intersect_by_8(); while (run_it_ < run_end_ && seq_it_ < seq_end_) {