From 377e3cbcafcd55dd5f8b4f2216acee0d052bd0c8 Mon Sep 17 00:00:00 2001
From: msm <msm@cert.pl>
Date: Sun, 13 Oct 2024 01:09:36 +0200
Subject: [PATCH 1/5] opt9: batch vle processing

---
 libursa/SortedRun.cpp | 82 +++++++++++++++++++++++++++++++++++++++----
 libursa/SortedRun.h   | 18 ++++++++++
 2 files changed, 94 insertions(+), 6 deletions(-)
diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp
index f3fac27..c46cf34 100644
--- a/libursa/SortedRun.cpp
+++ b/libursa/SortedRun.cpp
@@ -1,6 +1,7 @@
 #include "SortedRun.h"
 
 #include <algorithm>
+#include <numeric>
 #include <stdexcept>
 
 #include "Utils.h"
@@ -67,19 +68,88 @@ void SortedRun::do_or(SortedRun &other) {
     std::swap(new_results, sequence_);
 }
 
+// Performance critical method. As of the current version, in some tests,
+// more than half of the time is spent ANDing decompressed and compressed runs.
+// We expect the compressed set to be large, and sequence to be short.
+//
+// About the fast case: integers on disk are Variable Length Encoded (VLE).
+// This means that interes <= 0x7f are encoded as a single byte, and larger
+// ones use multiple bytes (with 0x80 bit used as a continuation bit).
+// This function optimizes the common case where most VLE integers are
+// small, i.e. are stored as a single byte without additional encoding.
+void RunIterator::do_and(RunIterator begin, RunIterator end,
+                         std::vector<uint32_t> *target) {
+    std::vector<uint32_t> &sequence = *target;
+    int out_ndx = 0;
+    int ndx = 0;
+    int size = sequence.size();
+
+    RunIterator it = begin;
+    while (it.pos_ < end.pos_ && ndx < size) {
+        // Handle the fast-case. This is purely an optimization, the function
+        // will still function properly (but slower) with this `if` removed.
+        // Fast case: the next 8 VLE bytes are all small (0x80 bit not set).
+        constexpr int BATCH_SIZE = 8;
+        constexpr uint64_t VLE_MASK = 0x8080808080808080UL;
+        uint64_t *as_qword = (uint64_t *)it.pos_;
+        if (it.pos_ + BATCH_SIZE < end.pos_ && (*as_qword & VLE_MASK) == 0) {
+            // Fast case of the fast case - if the pointer after processing the
+            // current 8 bytes is still smaller than the next element of
+            // sequence, just get the sum and skip 8 elements forward.
+            uint32_t after_batch = it.prev_ + BATCH_SIZE;
+            after_batch += std::accumulate(it.pos_, it.pos_ + BATCH_SIZE, 0);
+            if (after_batch < sequence[ndx]) {
+                it.forward(BATCH_SIZE, after_batch);
+                continue;
+            }
+
+            // Regular fast case - do the intersection without decoding VLE
+            // integers (remember, we know they are all small, i.e. one byte).
+            // Basically the same logic as regular case (see below).
+            for (uint8_t *end = it.pos_ + 8; it.pos_ < end && ndx < size;) {
+                uint32_t next = it.prev_ + *it.pos_ + 1;
+                if (next < sequence[ndx]) {
+                    it.forward(1, next);
+                    continue;
+                }
+                if (sequence[ndx] == next) {
+                    sequence[out_ndx++] = sequence[ndx];
+                    it.forward(1, next);
+                }
+                ndx += 1;
+            }
+            continue;
+        }
+
+        // Regular set intersection logic (non-optimized/default case).
+        // This is basically equivalent to std::set_intersection.
+        uint32_t next = *it;
+        if (next < sequence[ndx]) {
+            ++it;
+            continue;
+        }
+        if (sequence[ndx] == next) {
+            sequence[out_ndx++] = sequence[ndx];
+            ++it;
+        }
+        ndx += 1;
+    }
+
+    // Clean up elements from the end of the ANDed vector.
+    sequence.erase(sequence.begin() + out_ndx, sequence.end());
+}
+
 void SortedRun::do_and(SortedRun &other) {
     // Benchmarking shows that handling a situation where this->is_compressed()
     // makes the code *slower*. I assume that's because of memory efficiency.
     decompress();
-    std::vector<uint32_t>::iterator new_end;
     if (other.is_compressed()) {
-        new_end = std::set_intersection(other.comp_begin(), other.comp_end(),
-                                        begin(), end(), begin());
+        RunIterator::do_and(other.comp_begin(), other.comp_end(), &sequence_);
     } else {
-        new_end = std::set_intersection(other.begin(), other.end(), begin(),
-                                        end(), begin());
+        std::vector<uint32_t>::iterator new_end = std::set_intersection(
+            other.begin(), other.end(), begin(), end(), begin());
+        sequence_.erase(new_end, sequence_.end());
     }
-    sequence_.erase(new_end, sequence_.end());
 }
 
 void SortedRun::decompress() {
diff --git a/libursa/SortedRun.h b/libursa/SortedRun.h
index 34638ea..2dd0f91 100644
--- a/libursa/SortedRun.h
+++ b/libursa/SortedRun.h
@@ -9,6 +9,10 @@ class RunIterator : public std::iterator<std::forward_iterator_tag, uint32_t> {
     uint8_t *pos_;
     int32_t prev_;
 
+    void forward(int steps, int32_t new_pos) {
+        pos_ += steps;
+        prev_ = new_pos;
+    }
     uint32_t current() const;
     uint8_t *nextpos();
 
@@ -16,14 +20,23 @@ class RunIterator : public std::iterator<std::forward_iterator_tag, uint32_t> {
     RunIterator(uint8_t *run) : pos_(run), prev_(-1) {}
     ~RunIterator() {}
 
+    // Moves to the next element (this may mean a variable number of bytes).
     RunIterator &operator++() {
         prev_ = current();
         pos_ = nextpos();
         return *this;
     }
 
+    // Gets the current element under the iterator. Useful in STL algorithms.
     uint32_t operator*() const { return current(); }
+
+    // Compares the iterators. Useful in STL algorithms.
     bool operator!=(const iterator &rhs) const { return pos_ != rhs.pos_; }
+
+    // Fast (optimized) std::set_intersection implementation, tweaked for the
+    // expected data distribution.
+    static void do_and(RunIterator begin, RunIterator end,
+                       std::vector<uint32_t> *target);
 };
 
 // This class represents a "run" - a sorted list of FileIDs. This can be
@@ -72,11 +85,16 @@ class SortedRun {
         : sequence_(other.sequence_), run_(other.run_) {}
     SortedRun &operator=(SortedRun &&) = default;
 
+    // Checks if the current run is empty.
     bool empty() const { return sequence_.empty() && run_.empty(); }
 
+    // Does the OR operation with the other vector, overwrites this object.
     void do_or(SortedRun &other);
+
+    // Does the AND operation with the other vector, overwrites this object.
     void do_and(SortedRun &other);
 
+    // Does the MIN_OF operation on specified operands. Allocates a new result.
     static SortedRun pick_common(int cutoff, std::vector<SortedRun *> &sources);
 
     // When you really need to clone the run - TODO remove.

From 481661a66f79859446dc84eca29f92f217e00585 Mon Sep 17 00:00:00 2001
From: msm <msm@cert.pl>
Date: Wed, 16 Oct 2024 02:45:47 +0200
Subject: [PATCH 2/5] Refactor the code, hopefully

---
 libursa/SortedRun.cpp | 168 +++++++++++++++++++-----------------------
 libursa/SortedRun.h   |  56 +++++---------
 2 files changed, 96 insertions(+), 128 deletions(-)

diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp
index c46cf34..9f18d70 100644
--- a/libursa/SortedRun.cpp
+++ b/libursa/SortedRun.cpp
@@ -6,23 +6,25 @@
 
 #include "Utils.h"
 
-uint32_t RunIterator::current() const {
+// Read element currently under pos.
+uint32_t run_read(uint8_t *pos) {
     uint64_t acc = 0;
     uint32_t shift = 0;
-    for (uint8_t *it = pos_;; it++) {
+    for (uint8_t *it = pos;; it++) {
         uint32_t next = *it;
         acc += (next & 0x7FU) << shift;
         shift += 7U;
         if ((next & 0x80U) == 0) {
-            return prev_ + acc + 1;
+            return acc + 1;
         }
     }
 }
 
-uint8_t *RunIterator::nextpos() {
-    for (uint8_t *it = pos_;; it++) {
-        if ((*it & 0x80) == 0) {
-            return it + 1;
+// Move pos to the next element.
+uint8_t *run_forward(uint8_t *pos) {
+    for (;; pos++) {
+        if ((*pos & 0x80) == 0) {
+            return pos + 1;
         }
     }
 }
@@ -43,113 +45,97 @@ std::vector<uint32_t>::iterator SortedRun::end() {
     return sequence_.end();
 }
 
-RunIterator SortedRun::comp_begin() {
-    validate_compression(true);
-    return RunIterator(run_.data());
-}
-
-RunIterator SortedRun::comp_end() {
-    validate_compression(true);
-    return RunIterator(run_.data() + run_.size());
-}
-
 void SortedRun::do_or(SortedRun &other) {
     // In almost every case this is already decompressed.
     decompress();
+    other.decompress();
     std::vector<FileId> new_results;
-    if (other.is_compressed()) {
-        // Unlikely case, in most cases both runs are already decompressed.
-        std::set_union(other.comp_begin(), other.comp_end(), begin(), end(),
-                       std::back_inserter(new_results));
-    } else {
-        std::set_union(other.begin(), other.end(), begin(), end(),
-                       std::back_inserter(new_results));
-    }
+    std::set_union(other.begin(), other.end(), begin(), end(),
+                    std::back_inserter(new_results));
     std::swap(new_results, sequence_);
 }
 
-// Performance critical method. As of the current version, in some tests,
-// more than half of the time is spent ANDing decompressed and compressed runs.
-// We expect the compressed set to be large, and sequence to be short.
-//
-// About the fast case: integers on disk are Variable Length Encoded (VLE).
-// This means that interes <= 0x7f are encoded as a single byte, and larger
-// ones use multiple bytes (with 0x80 bit used as a continuation bit).
-// This function optimizes the common case where most VLE integers are
-// small, i.e. are stored as a single byte without additional encoding.
-void RunIterator::do_and(RunIterator begin, RunIterator end,
-                         std::vector<uint32_t> *target) {
-    std::vector<uint32_t> &sequence = *target;
-    int out_ndx = 0;
-    int ndx = 0;
-    int size = sequence.size();
-
-    RunIterator it = begin;
-    while (it.pos_ < end.pos_ && ndx < size) {
-        // Handle the fast-case. This is purely an optimization, the function
-        // will still function properly (but slower) with this `if` removed.
-        // Fast case: the next 8 VLE bytes are all small (0x80 bit not set).
-        constexpr int BATCH_SIZE = 8;
-        constexpr uint64_t VLE_MASK = 0x8080808080808080UL;
-        uint64_t *as_qword = (uint64_t *)it.pos_;
-        if (it.pos_ + BATCH_SIZE < end.pos_ && (*as_qword & VLE_MASK) == 0) {
-            // Fast case of the fast case - if the pointer after processing the
-            // current 8 bytes is still smaller than the next element of
-            // sequence, just get the sum and skip 8 elements forward.
-            uint32_t after_batch = it.prev_ + BATCH_SIZE;
-            after_batch += std::accumulate(it.pos_, it.pos_ + BATCH_SIZE, 0);
-            if (after_batch < sequence[ndx]) {
-                it.forward(BATCH_SIZE, after_batch);
-                continue;
-            }
+// Read VLE integer under run_it_ and do the intersection.
+void IntersectionHelper::step_single() {
+    uint32_t next = prev_ + run_read(run_it_);
+    if (next < *seq_it_) {
+        prev_ = next;
+        run_it_ = run_forward(run_it_);
+        return;
+    }
+    if (*seq_it_ == next) {
+        *seq_out_++ = *seq_it_;
+        prev_ = next;
+        run_it_ = run_forward(run_it_);
+    }
+    seq_it_++;
+}
 
-            // Regular fast case - do the intersection without decoding VLE
-            // integers (remember, we know they are all small, i.e. one byte).
-            // Basically the same logic as regular case (see below).
-            for (uint8_t *end = it.pos_ + 8; it.pos_ < end && ndx < size;) {
-                uint32_t next = it.prev_ + *it.pos_ + 1;
-                if (next < sequence[ndx]) {
-                    it.forward(1, next);
-                    continue;
-                }
-                if (sequence[ndx] == next) {
-                    sequence[out_ndx++] = sequence[ndx];
-                    it.forward(1, next);
-                }
-                ndx += 1;
-            }
+// Read 8 bytes under run_it_. If all are small, handle them all.
+bool IntersectionHelper::step_by_8() {
+    constexpr int BATCH_SIZE = 8;
+    constexpr uint64_t VLE_MASK = 0x8080808080808080UL;
+
+    uint64_t *as_qword = (uint64_t *)run_it_;
+    uint64_t hit = (*as_qword & VLE_MASK);
+    if (hit != 0) {
+        return false;
+    }
+
+    uint32_t after_batch = prev_ + BATCH_SIZE;
+    after_batch += std::accumulate(run_it_, run_it_ + BATCH_SIZE, 0);
+
+    if (after_batch < *seq_it_) {
+        run_it_ += BATCH_SIZE;
+        prev_ = after_batch;
+        return true;
+    }
+
+    for (uint8_t *end = run_it_ + BATCH_SIZE; run_it_ < end && seq_it_ < seq_end_;) {
+        uint32_t next = prev_ + *run_it_ + 1;
+        if (next < *seq_it_) {
+            prev_ = next;
+            run_it_ += 1;
             continue;
         }
+        if (*seq_it_ == next) {
+            *seq_out_++ = *seq_it_;
+            prev_ = next;
+            run_it_ += 1;
+        }
+        seq_it_++;
+    }
+    return true;
+}
 
-        // Regular set intersection logic (non-optimized/default case).
-        // This is basically equivalent to std::set_intersection.
-        uint32_t next = *it;
-        if (next < sequence[ndx]) {
-            ++it;
+void IntersectionHelper::intersect_by_8() {
+    while (run_it_ < run_end_ - 8 && seq_it_ < seq_end_) {
+        if (step_by_8()) {
             continue;
         }
-        if (sequence[ndx] == next) {
-            sequence[out_ndx++] = sequence[ndx];
-            ++it;
-        }
-        ndx += 1;
+        step_single();
     }
+}
 
-    // Clean up elements from the end of the ANDed vector.
-    sequence.erase(sequence.begin() + out_ndx, sequence.end());
+void IntersectionHelper::intersect() {
+    intersect_by_8();
+    while (run_it_ < run_end_ && seq_it_ < seq_end_) {
+        step_single();
+    }
 }
 
 void SortedRun::do_and(SortedRun &other) {
-    // Benchmarking shows that handling a situation where this->is_compressed()
-    // makes the code *slower*. I assume that's because of memory efficiency.
     decompress();
+    std::vector<uint32_t>::iterator new_end;
     if (other.is_compressed()) {
-        RunIterator::do_and(other.comp_begin(), other.comp_end(), &sequence_);
+        IntersectionHelper helper(&sequence_, &other.run_);
+        helper.intersect();
+        new_end = begin() + helper.result_size();
     } else {
-        std::vector<uint32_t>::iterator new_end = std::set_intersection(
+        new_end = std::set_intersection(
             other.begin(), other.end(), begin(), end(), begin());
-        sequence_.erase(new_end, sequence_.end());
     }
+    sequence_.erase(new_end, end());
 }
 
 void SortedRun::decompress() {
diff --git a/libursa/SortedRun.h b/libursa/SortedRun.h
index 2dd0f91..acbda32 100644
--- a/libursa/SortedRun.h
+++ b/libursa/SortedRun.h
@@ -1,42 +1,28 @@
 #include "Core.h"
+#include <emmintrin.h>
 
-// Iterate over a compressed run representation.
-// "Run" here means a sorted list of FileIDs (this name is used in the
-// codebase).  And a "compressed" run format is described in the documentation
-// "ondiskformat.md", in the "Index" section.
-class RunIterator : public std::iterator<std::forward_iterator_tag, uint32_t> {
-    typedef RunIterator iterator;
-    uint8_t *pos_;
+uint32_t run_read(uint8_t *pos);
+uint8_t *run_forward(uint8_t *pos);
+
+class IntersectionHelper {
+    uint8_t *run_it_;
+    uint8_t *run_end_;
     int32_t prev_;
+    uint32_t *seq_start_;
+    uint32_t *seq_it_;
+    uint32_t *seq_end_;
+    uint32_t *seq_out_;
 
-    void forward(int steps, int32_t new_pos) {
-        pos_ += steps;
-        prev_ = new_pos;
-    }
-    uint32_t current() const;
-    uint8_t *nextpos();
+    bool step_by_8();
+    void step_single();
+    void intersect_by_8();
 
    public:
-    RunIterator(uint8_t *run) : pos_(run), prev_(-1) {}
-    ~RunIterator() {}
-
-    // Moves to the next element (this may mean a variable number of bytes).
-    RunIterator &operator++() {
-        prev_ = current();
-        pos_ = nextpos();
-        return *this;
-    }
-
-    // Gets the current element under the iterator. Useful in STL algorithms.
-    uint32_t operator*() const { return current(); }
-
-    // Compares the iterators. Useful in STL algorithms.
-    bool operator!=(const iterator &rhs) const { return pos_ != rhs.pos_; }
-
-    // Fast (optimized) std::set_intersection implementation, tweaked for the
-    // expected data distribution.
-    static void do_and(RunIterator begin, RunIterator end,
-                       std::vector<uint32_t> *target);
+    IntersectionHelper(std::vector<uint32_t> *seq, std::vector<uint8_t> *run)
+    :run_it_(run->data()), run_end_(run->data() + run->size()), prev_(-1), seq_start_(seq->data()), seq_it_(seq->data()), seq_end_(seq->data() + seq->size()), seq_out_(seq->data()) {}
+
+    size_t result_size() const { return seq_out_ - seq_start_; }
+    void intersect();
 };
 
 // This class represents a "run" - a sorted list of FileIDs. This can be
@@ -65,10 +51,6 @@ class SortedRun {
     std::vector<uint32_t>::iterator begin();
     std::vector<uint32_t>::iterator end();
 
-    // Iterate over the compressed representation (throws if decompressed)
-    RunIterator comp_begin();
-    RunIterator comp_end();
-
     SortedRun(const SortedRun &other) = default;
 
    public:

From 0bc7d9d6e87deb096caec22d03cef5182b682b18 Mon Sep 17 00:00:00 2001
From: msm <msm@cert.pl>
Date: Wed, 16 Oct 2024 02:52:11 +0200
Subject: [PATCH 3/5] Format with clang-format

---
 libursa/SortedRun.cpp |  9 +++++----
 libursa/SortedRun.h   | 11 +++++++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp
index 9f18d70..8881867 100644
--- a/libursa/SortedRun.cpp
+++ b/libursa/SortedRun.cpp
@@ -51,7 +51,7 @@ void SortedRun::do_or(SortedRun &other) {
     other.decompress();
     std::vector<FileId> new_results;
     std::set_union(other.begin(), other.end(), begin(), end(),
-                    std::back_inserter(new_results));
+                   std::back_inserter(new_results));
     std::swap(new_results, sequence_);
 }
 
@@ -91,7 +91,8 @@ bool IntersectionHelper::step_by_8() {
         return true;
     }
 
-    for (uint8_t *end = run_it_ + BATCH_SIZE; run_it_ < end && seq_it_ < seq_end_;) {
+    for (uint8_t *end = run_it_ + BATCH_SIZE;
+         run_it_ < end && seq_it_ < seq_end_;) {
         uint32_t next = prev_ + *run_it_ + 1;
         if (next < *seq_it_) {
             prev_ = next;
@@ -132,8 +133,8 @@ void SortedRun::do_and(SortedRun &other) {
         helper.intersect();
         new_end = begin() + helper.result_size();
     } else {
-        new_end = std::set_intersection(
-            other.begin(), other.end(), begin(), end(), begin());
+        new_end = std::set_intersection(other.begin(), other.end(), begin(),
+                                        end(), begin());
     }
     sequence_.erase(new_end, end());
 }
diff --git a/libursa/SortedRun.h b/libursa/SortedRun.h
index acbda32..c1d2b22 100644
--- a/libursa/SortedRun.h
+++ b/libursa/SortedRun.h
@@ -1,6 +1,7 @@
-#include "Core.h"
 #include <emmintrin.h>
 
+#include "Core.h"
+
 uint32_t run_read(uint8_t *pos);
 uint8_t *run_forward(uint8_t *pos);
 
@@ -19,7 +20,13 @@ class IntersectionHelper {
 
    public:
     IntersectionHelper(std::vector<uint32_t> *seq, std::vector<uint8_t> *run)
-    :run_it_(run->data()), run_end_(run->data() + run->size()), prev_(-1), seq_start_(seq->data()), seq_it_(seq->data()), seq_end_(seq->data() + seq->size()), seq_out_(seq->data()) {}
+        : run_it_(run->data()),
+          run_end_(run->data() + run->size()),
+          prev_(-1),
+          seq_start_(seq->data()),
+          seq_it_(seq->data()),
+          seq_end_(seq->data() + seq->size()),
+          seq_out_(seq->data()) {}
 
     size_t result_size() const { return seq_out_ - seq_start_; }
     void intersect();

From c010742410386b9b472e300108f38faf1ab395ad Mon Sep 17 00:00:00 2001
From: msm <msm@cert.pl>
Date: Wed, 16 Oct 2024 02:54:02 +0200
Subject: [PATCH 4/5] Fix version number

---
 libursa/Version.h.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libursa/Version.h.in b/libursa/Version.h.in
index 877ca09..aefb917 100644
--- a/libursa/Version.h.in
+++ b/libursa/Version.h.in
@@ -9,5 +9,5 @@ constexpr std::string_view ursadb_format_version = "1.5.0";
 // Project version.
 // Consider updating the version tag when doing PRs.
 // clang-format off
-constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt8";
+constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt9";
 // clang-format on

From cacf13cb8cbc81b873a634a6fa40edf640b00889 Mon Sep 17 00:00:00 2001
From: msm <msm@cert.pl>
Date: Wed, 16 Oct 2024 14:45:04 +0200
Subject: [PATCH 5/5] A few comments

---
 libursa/SortedRun.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp
index 8881867..bcde0f1 100644
--- a/libursa/SortedRun.cpp
+++ b/libursa/SortedRun.cpp
@@ -6,7 +6,7 @@
 
 #include "Utils.h"
 
-// Read element currently under pos.
+// Decode the VLE integer stored under pos; returns the decoded value plus 1.
 uint32_t run_read(uint8_t *pos) {
     uint64_t acc = 0;
     uint32_t shift = 0;
@@ -20,7 +20,7 @@ uint32_t run_read(uint8_t *pos) {
     }
 }
 
-// Move pos to the next element.
+// Return a pointer to the next encoded integer.
 uint8_t *run_forward(uint8_t *pos) {
     for (;; pos++) {
         if ((*pos & 0x80) == 0) {
@@ -71,7 +71,8 @@ void IntersectionHelper::step_single() {
     seq_it_++;
 }
 
-// Read 8 bytes under run_it_. If all are small, handle them all.
+// Read 8 bytes under run_it_. If all are small, intersect them all.
+// Returns true if the method can continue, and false if a large int was found.
 bool IntersectionHelper::step_by_8() {
     constexpr int BATCH_SIZE = 8;
     constexpr uint64_t VLE_MASK = 0x8080808080808080UL;
@@ -79,18 +80,22 @@ bool IntersectionHelper::step_by_8() {
     uint64_t *as_qword = (uint64_t *)run_it_;
     uint64_t hit = (*as_qword & VLE_MASK);
     if (hit != 0) {
+        // A byte with the 0x80 bit set was found; fall back to the slow path.
         return false;
     }
 
     uint32_t after_batch = prev_ + BATCH_SIZE;
     after_batch += std::accumulate(run_it_, run_it_ + BATCH_SIZE, 0);
 
+    // Fast-fast path: if the value after all 8 deltas is still below the
+    // next sequence element, skip the whole batch (nothing can intersect).
     if (after_batch < *seq_it_) {
         run_it_ += BATCH_SIZE;
         prev_ = after_batch;
         return true;
     }
 
+    // Regular batch: like step_single, but every integer is a single byte.
     for (uint8_t *end = run_it_ + BATCH_SIZE;
          run_it_ < end && seq_it_ < seq_end_;) {
         uint32_t next = prev_ + *run_it_ + 1;
@@ -109,6 +114,7 @@ bool IntersectionHelper::step_by_8() {
     return true;
 }
 
+// Do the intersection in batches of 8 bytes at once.
 void IntersectionHelper::intersect_by_8() {
     while (run_it_ < run_end_ - 8 && seq_it_ < seq_end_) {
         if (step_by_8()) {
@@ -118,6 +124,8 @@ void IntersectionHelper::intersect_by_8() {
     }
 }
 
+// This function is basically std::set_intersection, but optimized as
+// much as possible (since sometimes almost 50% of time is spent here).
 void IntersectionHelper::intersect() {
     intersect_by_8();
     while (run_it_ < run_end_ && seq_it_ < seq_end_) {