From d64bc156cd4d26b88009466da16a459143f12643 Mon Sep 17 00:00:00 2001
From: msm-cert <156842376+msm-cert@users.noreply.github.com>
Date: Tue, 1 Oct 2024 14:07:49 +0000
Subject: [PATCH] opt6: reorder_subqueries (#225)

---
 libursa/QueryOptimizer.cpp | 62 +++++++++++++++++++++++++++++++++++++-
 libursa/Version.h.in       |  2 +-
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/libursa/QueryOptimizer.cpp b/libursa/QueryOptimizer.cpp
index 003dfaa..5a849e0 100644
--- a/libursa/QueryOptimizer.cpp
+++ b/libursa/QueryOptimizer.cpp
@@ -144,6 +144,66 @@ Query propagate_degenerate_queries(Query &&q, bool *changed) {
     return std::move(q);
 }
 
+// This heuristic should ideally measure "what is the chance
+// that this query returns zero results", or "how many files we expect to get".
+// Of course, fewer files and a bigger chance of zero results are better.
+// This should also be weighted by the query cost (100 queries for 10% chance
+// to get empty result is worse than 2 queries for 15% chance of empty result).
+//
+// The current implementation is a very naive heuristic, that just looks at
+// the query type, and index type for primitives, and orders based on that.
+// Lower cost sorts first. Throws std::runtime_error when the query type or
+// the index type of a primitive is not one of the values handled below.
+uint32_t query_heuristic_cost(const Query &q) {
+    // From empirical tests, order of query types doesn't seem to matter much.
+    switch (q.get_type()) {
+        case QueryType::PRIMITIVE:
+            // Sort by ngram type, then by ngram value, alphabetically first.
+            // This is (un)surprisingly important for two reasons:
+            // 1. we read sequentially as many ngrams as possible.
+            // 2. consecutive ngrams are independent: (abc, bcd) vs (abc, def).
+            // Use smaller indexes first, because they're faster to read.
+            switch (q.as_ngram().itype) {
+                case IndexType::WIDE8:
+                    return (0 << 24) + q.as_ngram().trigram;
+                case IndexType::TEXT4:
+                    return (1 << 24) + q.as_ngram().trigram;
+                case IndexType::HASH4:
+                    return (2 << 24) + q.as_ngram().trigram;
+                case IndexType::GRAM3:
+                    return (3 << 24) + q.as_ngram().trigram;
+            }
+            // Unknown index type: leave the switch so that we throw below
+            // instead of falling through and reporting the cost of an AND.
+            break;
+        case QueryType::AND:
+            return 4 << 24;
+        case QueryType::MIN_OF:
+            return 5 << 24;
+        case QueryType::OR:
+            // OR is the worst operation, since it always needs to scan
+            // all of its arguments (no chance of early exit).
+            return 6 << 24;
+    }
+    throw std::runtime_error("Unexpected query/index type.");
+}
+
+// Order queries by their heuristic cost.
+bool query_heuristic_comparer(const Query &left, const Query &right) {
+    return query_heuristic_cost(left) < query_heuristic_cost(right);
+}
+
+// Order the subqueries to maximize the chance of early exit.
+// This is done after all other optimizations, and there's no point in
+// running this in a loop.
+Query reorder_subqueries(Query &&q) {
+    if (q.get_type() == QueryType::AND) {
+        std::stable_sort(q.as_queries().begin(), q.as_queries().end(),
+                         query_heuristic_comparer);
+    }
+    return std::move(q);  // Currently only support AND operators.
+}
+
 Query q_optimize(Query &&q) {
     if (q.get_type() == QueryType::PRIMITIVE) {
         // Nothing to improve here.
@@ -160,6 +220,6 @@ Query q_optimize(Query &&q) {
         q = simplify_minof(std::move(q), &changed);
         q = propagate_degenerate_queries(std::move(q), &changed);
     }
-
+    q = reorder_subqueries(std::move(q));
     return std::move(q);
 }
diff --git a/libursa/Version.h.in b/libursa/Version.h.in
index c8124a9..57dda41 100644
--- a/libursa/Version.h.in
+++ b/libursa/Version.h.in
@@ -9,5 +9,5 @@ constexpr std::string_view ursadb_format_version = "1.5.0";
 // Project version.
 // Consider updating the version tag when doing PRs.
 // clang-format off
-constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt5";
+constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt6";
 // clang-format on