From 18a45d8b079f18020c5a07b438e8435d3309d448 Mon Sep 17 00:00:00 2001
From: msm-cert <156842376+msm-cert@users.noreply.github.com>
Date: Tue, 1 Oct 2024 13:13:59 +0000
Subject: [PATCH] opt3: deduplicate_primitives (#222)

---
 libursa/Query.cpp          | 9 ++++++++-
 libursa/Query.h            | 1 +
 libursa/QueryOptimizer.cpp | 24 ++++++++++++++++++++++++
 libursa/Version.h.in       | 2 +-
 4 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/libursa/Query.cpp b/libursa/Query.cpp
index 33443a8..c48898a 100644
--- a/libursa/Query.cpp
+++ b/libursa/Query.cpp
@@ -80,10 +80,17 @@ const QString &Query::as_value() const {
     if (type != QueryType::PRIMITIVE) {
         throw std::runtime_error("This query doesn\'t have any value.");
     }
-
     return value;
 }
 
+// Returns the ngram of a PRIMITIVE query; throws std::runtime_error otherwise.
+PrimitiveQuery Query::as_ngram() const {
+    if (type != QueryType::PRIMITIVE) {
+        throw std::runtime_error("This query doesn\'t contain a ngram.");
+    }
+    return ngram;
+}
+
 std::string Query::as_string_repr() const {
     std::string out = "";
     if (value.empty()) {
diff --git a/libursa/Query.h b/libursa/Query.h
index 7149a80..c7d138a 100644
--- a/libursa/Query.h
+++ b/libursa/Query.h
@@ -54,6 +54,7 @@ class Query {
     std::vector<Query> &as_queries();
     const QString &as_value() const;
     uint32_t as_count() const;
+    PrimitiveQuery as_ngram() const;
     std::string as_string_repr() const;
     const QueryType &get_type() const;
     bool operator==(const Query &other) const;
diff --git a/libursa/QueryOptimizer.cpp b/libursa/QueryOptimizer.cpp
index 362c76f..316d6dc 100644
--- a/libursa/QueryOptimizer.cpp
+++ b/libursa/QueryOptimizer.cpp
@@ -53,6 +53,29 @@ Query inline_suboperations(Query &&q, bool *changed) {
     return std::move(Query(q.get_type(), std::move(newqueries)));
 }
 
+// This optimization gets rid of duplicated primitive queries.
+// AND(a, a, a, a, b, b) == AND(a, b)
+// This also applies to OR(), but it'll happen very rarely.
+Query deduplicate_primitives(Query &&q, bool *changed) {
+    if (q.get_type() != QueryType::AND && q.get_type() != QueryType::OR) {
+        return std::move(q);
+    }
+
+    std::set<PrimitiveQuery> seen;
+    std::vector<Query> newqueries;
+    for (auto &&query : q.as_queries()) {
+        // Record the ngram before moving: a moved-from query must not be read.
+        if (query.get_type() != QueryType::PRIMITIVE ||
+            seen.insert(query.as_ngram()).second) {
+            newqueries.emplace_back(std::move(query));
+        } else {
+            // Duplicate primitive: drop it and report that the query changed.
+            *changed = true;
+        }
+    }
+    return Query(q.get_type(), std::move(newqueries));
+}
+
 Query q_optimize(Query &&q) {
     if (q.get_type() == QueryType::PRIMITIVE) {
         // Nothing to improve here.
@@ -65,6 +88,7 @@ Query q_optimize(Query &&q) {
         changed = false;
         q = flatten_trivial_operations(std::move(q), &changed);
         q = inline_suboperations(std::move(q), &changed);
+        q = deduplicate_primitives(std::move(q), &changed);
     }
 
     return std::move(q);
diff --git a/libursa/Version.h.in b/libursa/Version.h.in
index e85d3b8..15d1ab5 100644
--- a/libursa/Version.h.in
+++ b/libursa/Version.h.in
@@ -9,5 +9,5 @@ constexpr std::string_view ursadb_format_version = "1.5.0";
 // Project version.
 // Consider updating the version tag when doing PRs.
 // clang-format off
-constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt2";
+constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt3";
 // clang-format on