From 8467f9fd4da74907b10e481375ab5a8d93c9c56b Mon Sep 17 00:00:00 2001 From: msm-cert <156842376+msm-cert@users.noreply.github.com> Date: Tue, 1 Oct 2024 12:43:31 +0000 Subject: [PATCH] Create the optimizer framework (#219) Create the optimizer framework --- libursa/CMakeLists.txt | 2 ++ libursa/OnDiskDataset.cpp | 4 +++- libursa/Query.cpp | 7 +++++++ libursa/Query.h | 5 +++-- libursa/QueryOptimizer.cpp | 31 +++++++++++++++++++++++++++++++ libursa/QueryOptimizer.h | 8 ++++++++ libursa/Version.h.in | 5 +++-- 7 files changed, 57 insertions(+), 5 deletions(-) create mode 100644 libursa/QueryOptimizer.cpp create mode 100644 libursa/QueryOptimizer.h diff --git a/libursa/CMakeLists.txt b/libursa/CMakeLists.txt index 9d25f61..b4988c2 100644 --- a/libursa/CMakeLists.txt +++ b/libursa/CMakeLists.txt @@ -50,6 +50,8 @@ add_library( QueryParser.h QueryResult.cpp QueryResult.h + QueryOptimizer.cpp + QueryOptimizer.h RawFile.cpp RawFile.h Responses.cpp diff --git a/libursa/OnDiskDataset.cpp b/libursa/OnDiskDataset.cpp index 80e3210..fe6fc44 100644 --- a/libursa/OnDiskDataset.cpp +++ b/libursa/OnDiskDataset.cpp @@ -8,6 +8,7 @@ #include "DatabaseName.h" #include "Json.h" #include "Query.h" +#include "QueryOptimizer.h" #include "spdlog/fmt/ostr.h" #include "spdlog/spdlog.h" @@ -91,7 +92,8 @@ void OnDiskDataset::execute(const Query &query, ResultWriter *out, for (const auto &ndx : get_indexes()) { types_to_query.emplace(ndx.index_type()); } - const Query plan = query.plan(types_to_query); + Query plan = query.plan(types_to_query); + plan = q_optimize(std::move(plan)); spdlog::debug("PLAN: {}", plan); QueryResult result = this->query(plan, counters); diff --git a/libursa/Query.cpp b/libursa/Query.cpp index 262df7d..33443a8 100644 --- a/libursa/Query.cpp +++ b/libursa/Query.cpp @@ -18,7 +18,14 @@ const std::vector &Query::as_queries() const { type != QueryType::MIN_OF) { throw std::runtime_error("This query doesn\'t contain subqueries."); } + return queries; +} +std::vector 
&Query::as_queries() { + if (type != QueryType::AND && type != QueryType::OR && + type != QueryType::MIN_OF) { + throw std::runtime_error("This query doesn\'t contain subqueries."); + } return queries; } diff --git a/libursa/Query.h b/libursa/Query.h index ecc96c0..7149a80 100644 --- a/libursa/Query.h +++ b/libursa/Query.h @@ -22,8 +22,8 @@ class PrimitiveQuery { PrimitiveQuery(IndexType itype, TriGram trigram) : itype(itype), trigram(trigram) {} - const IndexType itype; - const TriGram trigram; + IndexType itype; + TriGram trigram; // We want to use PrimitiveQuery in STL containers, and this means they // must be comparable using <. Specific order doesn't matter. @@ -51,6 +51,7 @@ class Query { Query &operator=(Query &&) = default; const std::vector &as_queries() const; + std::vector &as_queries(); const QString &as_value() const; uint32_t as_count() const; std::string as_string_repr() const; diff --git a/libursa/QueryOptimizer.cpp b/libursa/QueryOptimizer.cpp new file mode 100644 index 0000000..f8198ce --- /dev/null +++ b/libursa/QueryOptimizer.cpp @@ -0,0 +1,31 @@ +#include "QueryOptimizer.h" + +#include + +// Run the optimization passes on subqueries. +// After this step, every subquery should be maximally optimized, +// so I believe there's no need to run this in a loop. +Query simplify_subqueries(Query &&q) { + // q_optimize ensures QueryType is not PRIMITIVE already + std::vector newqueries; + for (auto &&query : q.as_queries()) { + newqueries.emplace_back(q_optimize(std::move(query))); + } + if (q.get_type() == QueryType::MIN_OF) { + return q_min_of(q.as_count(), std::move(newqueries)); + } + return std::move(Query(q.get_type(), std::move(newqueries))); +} + +Query q_optimize(Query &&q) { + if (q.get_type() == QueryType::PRIMITIVE) { + // Nothing to improve here. + return std::move(q); + } + + q = simplify_subqueries(std::move(q)); + + // Optimization passes will be added here later.
+ + return std::move(q); +} diff --git a/libursa/QueryOptimizer.h b/libursa/QueryOptimizer.h new file mode 100644 index 0000000..2d267a2 --- /dev/null +++ b/libursa/QueryOptimizer.h @@ -0,0 +1,8 @@ +#pragma once + +#include "Query.h" + +// Optimizes a query, and returns the optimized version. +// Optimizations try to simplify the expression in various ways to make the +// execution faster - for example by enabling short-circuiting in some places. +Query q_optimize(Query &&query); diff --git a/libursa/Version.h.in b/libursa/Version.h.in index 73203c2..dbec52a 100644 --- a/libursa/Version.h.in +++ b/libursa/Version.h.in @@ -8,5 +8,6 @@ constexpr std::string_view ursadb_format_version = "1.5.0"; // Project version. // Consider updating the version tag when doing PRs. -constexpr std::string_view ursadb_version_string = - "@PROJECT_VERSION@+debuglogs"; +// clang-format off +constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt0"; +// clang-format on