opt9: batch vle processing

CERT-Polska · Oct 14, 2024 · 377e3cb · 377e3cb
1 parent cdbf0c1
commit 377e3cb
Show file tree

Hide file tree

Showing 2 changed files with 94 additions and 6 deletions.
diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp
@@ -1,6 +1,7 @@
 #include "SortedRun.h"
 
 #include <algorithm>
+#include <numeric>
 #include <stdexcept>
 
 #include "Utils.h"
@@ -67,19 +68,88 @@ void SortedRun::do_or(SortedRun &other) {
     std::swap(new_results, sequence_);
 }
 
+// Intersects the compressed run [begin, end) with the sorted vector *target,
+// in place: afterwards *target holds only the values found in both inputs.
+// Performance critical: in some tests more than half of the time is spent
+// ANDing decompressed and compressed runs. The compressed set is expected to
+// be large and the sequence short.
+// About the fast case: integers on disk are Variable Length Encoded (VLE).
+// Integers <= 0x7f take a single byte; larger ones use several bytes (the
+// 0x80 bit is the continuation bit). The fast path targets the common case
+// where most VLE integers are small, i.e. stored as a single byte.
+void RunIterator::do_and(RunIterator begin, RunIterator end,
+                         std::vector<uint32_t> *target) {
+    std::vector<uint32_t> &sequence = *target;
+    int out_ndx = 0;  // write cursor: sequence[0, out_ndx) is the result
+    int ndx = 0;      // read cursor into sequence
+    int size = sequence.size();  // NOTE(review): size_t narrowed to int

+    RunIterator it = begin;
+    while (it.pos_ < end.pos_ && ndx < size) {
+        // Fast case (pure optimization; results are the same without this
+        // `if`): the next 8 VLE bytes all have the 0x80 bit clear.
+        // NOTE(review): as_qword reads pos_ as uint64_t - alignment unchecked.
+        constexpr int BATCH_SIZE = 8;
+        constexpr uint64_t VLE_MASK = 0x8080808080808080UL;  // 0x80 bit of all 8 bytes
+        uint64_t *as_qword = (uint64_t *)it.pos_;
+        if (it.pos_ + BATCH_SIZE < end.pos_ && (*as_qword & VLE_MASK) == 0) {
+            // Fast case of the fast case - if the value reached after these
+            // 8 bytes (prev_ + sum of the bytes + 1 per element) is still
+            // smaller than sequence[ndx], skip all 8 elements at once.
+            uint32_t after_batch = it.prev_ + BATCH_SIZE;
+            after_batch += std::accumulate(it.pos_, it.pos_ + BATCH_SIZE, 0);
+            if (after_batch < sequence[ndx]) {
+                it.forward(BATCH_SIZE, after_batch);
+                continue;
+            }

+            // Regular fast case - intersect without decoding VLE integers (we
+            // know each is one byte: value = prev_ + byte + 1). Same logic as
+            // the regular case below. The loop-local `end` shadows the param.
+            for (uint8_t *end = it.pos_ + 8; it.pos_ < end && ndx < size;) {
+                uint32_t next = it.prev_ + *it.pos_ + 1;
+                if (next < sequence[ndx]) {
+                    it.forward(1, next);
+                    continue;
+                }
+                if (sequence[ndx] == next) {
+                    sequence[out_ndx++] = sequence[ndx];
+                    it.forward(1, next);
+                }
+                ndx += 1;  // sequence[ndx] was matched or is absent from the run
+            }
+            continue;
+        }

+        // Regular set intersection logic (non-optimized/default case).
+        // This is basically equivalent to std::set_intersection.
+        uint32_t next = *it;
+        if (next < sequence[ndx]) {
+            ++it;
+            continue;
+        }
+        if (sequence[ndx] == next) {
+            sequence[out_ndx++] = sequence[ndx];
+            ++it;
+        }
+        ndx += 1;  // sequence[ndx] was matched or is absent from the run
+    }

+    // Drop everything past the out_ndx elements that were kept.
+    sequence.erase(sequence.begin() + out_ndx, sequence.end());
+}
+
 void SortedRun::do_and(SortedRun &other) {
     // Benchmarking shows that handling a situation where this->is_compressed()
     // makes the code *slower*. I assume that's because of memory efficiency.
     decompress();
-    std::vector<uint32_t>::iterator new_end;
     if (other.is_compressed()) {
-        new_end = std::set_intersection(other.comp_begin(), other.comp_end(),
-                                        begin(), end(), begin());
+        RunIterator::do_and(other.comp_begin(), other.comp_end(), &sequence_);  // trims sequence_ itself
     } else {
-        new_end = std::set_intersection(other.begin(), other.end(), begin(),
-                                        end(), begin());
+        std::vector<uint32_t>::iterator new_end = std::set_intersection(
+            other.begin(), other.end(), begin(), end(), begin());
+        sequence_.erase(new_end, sequence_.end());  // drop the stale tail past the result
     }
-    sequence_.erase(new_end, sequence_.end());
 }
 
 void SortedRun::decompress() {

diff --git a/libursa/SortedRun.h b/libursa/SortedRun.h
@@ -9,21 +9,34 @@ class RunIterator : public std::iterator<std::forward_iterator_tag, uint32_t> {
     uint8_t *pos_;
     int32_t prev_;
 
+    void forward(int steps, int32_t new_pos) {  // raw skip, no VLE decoding
+        pos_ += steps;    // advance by `steps` bytes
+        prev_ = new_pos;  // caller passes the value of the last element skipped
+    }
     uint32_t current() const;
     uint8_t *nextpos();
 
    public:
     RunIterator(uint8_t *run) : pos_(run), prev_(-1) {}
     ~RunIterator() {}
 
+    // Moves to the next element (1+ bytes); prev_ remembers the one just left.
     RunIterator &operator++() {
         prev_ = current();
         pos_ = nextpos();
         return *this;
     }
 
+    // Returns current(): the element under the iterator. Used by STL algorithms.
     uint32_t operator*() const { return current(); }
+
+    // Iterators differ iff their byte positions differ (prev_ is ignored).
     bool operator!=(const iterator &rhs) const { return pos_ != rhs.pos_; }
+
+    // Fast std::set_intersection replacement tuned for the expected data:
+    // keeps in *target only the values also present in run [begin, end).
+    static void do_and(RunIterator begin, RunIterator end,
+                       std::vector<uint32_t> *target);
 };
 
 // This class represents a "run" - a sorted list of FileIDs. This can be
@@ -72,11 +85,16 @@ class SortedRun {
         : sequence_(other.sequence_), run_(other.run_) {}
     SortedRun &operator=(SortedRun &&) = default;
 
+    // Checks if the current run is empty.
     bool empty() const { return sequence_.empty() && run_.empty(); }
 
+    // Does the OR operation with the other vector, overwrites this object.
     void do_or(SortedRun &other);
+
+    // Does the AND operation with the other vector, overwrites this object.
     void do_and(SortedRun &other);
 
+    // Does the MIN_OF operation on specified operands. Allocates a new result.
     static SortedRun pick_common(int cutoff, std::vector<SortedRun *> &sources);
 
     // When you really need to clone the run - TODO remove.