From 90d706def486fb08dca3bde3bc53a91bb72fbb77 Mon Sep 17 00:00:00 2001 From: msm-cert <156842376+msm-cert@users.noreply.github.com> Date: Tue, 1 Oct 2024 12:01:01 +0000 Subject: [PATCH] Refactor primitive queries into AND of atoms (#217) Refactor primitive queries into AND of atoms --- libursa/CMakeLists.txt | 15 --------------- libursa/Query.cpp | 39 ++++++++++++++++----------------------- libursa/Query.h | 16 +++++++--------- libursa/Version.h.in | 6 +++--- 4 files changed, 26 insertions(+), 50 deletions(-) diff --git a/libursa/CMakeLists.txt b/libursa/CMakeLists.txt index a457448..9d25f61 100644 --- a/libursa/CMakeLists.txt +++ b/libursa/CMakeLists.txt @@ -80,19 +80,4 @@ target_clangformat_setup(ursa) # See: https://github.com/zeromq/cppzmq/issues/330 target_compile_options(ursa PUBLIC -Wno-deprecated-declarations) -find_package(Git) -if(Git_FOUND) - execute_process( - COMMAND - # get commit shorthash - ${GIT_EXECUTABLE} rev-parse --short HEAD - OUTPUT_VARIABLE - COMMIT_HASH - OUTPUT_STRIP_TRAILING_WHITESPACE - ) -else() - message(WARNING "Git not found. Hash info won't be available") - set(COMMIT_HASH "(unknown commit)") -endif() - configure_file(Version.h.in ${PROJECT_BINARY_DIR}/generated/Version.h) diff --git a/libursa/Query.cpp b/libursa/Query.cpp index 370f0f6..5df7c8b 100644 --- a/libursa/Query.cpp +++ b/libursa/Query.cpp @@ -60,7 +60,7 @@ std::ostream &operator<<(std::ostream &os, const Query &query) { } os << ")"; } else if (type == QueryType::PRIMITIVE) { - os << "'" << query.as_string_repr() << "'"; + os << query.as_string_repr(); } else { throw std::runtime_error("Unknown query type."); } @@ -79,12 +79,9 @@ const QString &Query::as_value() const { std::string Query::as_string_repr() const { std::string out = ""; - if (!query_plan.empty()) { + if (value.empty()) { // Query is already after planning stage. Show low-level representation. 
- for (const auto &token : query_plan) { - out += fmt::format("[{:x}]", token.trigram); - } - return out; + return fmt::format("{}:[{:06x}]", (int)ngram.itype, ngram.trigram); } // No query plan yet. Show stringlike representation. for (const auto &token : value) { @@ -126,9 +123,9 @@ QToken filter_qtoken(const QToken &token, uint32_t off, // For primitive queries, find a minimal covering set of ngram queries and // return it. If there are multiple disconnected components, AND them. // For example, "abcde\x??efg" will return abcd & bcde & efg -std::vector<PrimitiveQuery> plan_qstring( - const std::unordered_set<IndexType> &types_to_query, const QString &value) { - std::vector<PrimitiveQuery> plan; +Query plan_qstring(const std::unordered_set<IndexType> &types_to_query, + const QString &value) { + std::vector<Query> plan; bool has_gram3 = types_to_query.count(IndexType::GRAM3) != 0; bool has_text4 = types_to_query.count(IndexType::TEXT4) != 0; @@ -144,7 +141,7 @@ std::vector<PrimitiveQuery> plan_qstring( // If wide8 index is supported, try to add a token and skip 6 bytes. if (has_wide8) { if (const auto &gram = convert_gram(IndexType::WIDE8, i, value)) { - plan.emplace_back(IndexType::WIDE8, *gram); + plan.emplace_back(PrimitiveQuery(IndexType::WIDE8, *gram)); skip_to = i + 6; i += 2; continue; @@ -153,7 +150,7 @@ std::vector<PrimitiveQuery> plan_qstring( // If text4 index is supported, try to add a token and skip 2 bytes. if (has_text4) { if (const auto &gram = convert_gram(IndexType::TEXT4, i, value)) { - plan.emplace_back(IndexType::TEXT4, *gram); + plan.emplace_back(PrimitiveQuery(IndexType::TEXT4, *gram)); skip_to = i + 2; i += 1; continue; @@ -162,14 +159,14 @@ std::vector<PrimitiveQuery> plan_qstring( // If hash4 index is supported and current ngram is not text, try hash4. const auto &hgram = convert_gram(IndexType::HASH4, i, value); if (i >= (skip_to - 1) && has_hash4 && hgram) { - plan.emplace_back(IndexType::HASH4, *hgram); + plan.emplace_back(PrimitiveQuery(IndexType::HASH4, *hgram)); // Don't continue here - gram3 can give us more information. 
} // Otherwise, add a regular gram3 token. const auto &gram = convert_gram(IndexType::GRAM3, i, value); if (i >= skip_to && gram) { if (has_gram3) { - plan.emplace_back(IndexType::GRAM3, *gram); + plan.emplace_back(PrimitiveQuery(IndexType::GRAM3, *gram)); } i += 1; continue; @@ -178,9 +175,12 @@ std::vector<PrimitiveQuery> plan_qstring( i += 1; } - return std::move(plan); + return q_and(std::move(plan)); } +// Query provided by user contains generic strings like "asdf". +// To actually do the query, we need to convert generic strings to n-grams. +// This is done using plan_qstring, everything else is copied unchanged. Query Query::plan(const std::unordered_set<IndexType> &types_to_query) const { if (type != QueryType::PRIMITIVE) { std::vector<Query> plans; @@ -193,21 +193,14 @@ Query Query::plan(const std::unordered_set<IndexType> &types_to_query) const { return Query(type, std::move(plans)); } - return Query(plan_qstring(types_to_query, value)); + return plan_qstring(types_to_query, value); } QueryResult Query::run(const QueryPrimitive &primitive, QueryCounters *counters) const { // Case: primitive query - reduces to AND with tokens from query plan. if (type == QueryType::PRIMITIVE) { - auto result = QueryResult::everything(); - for (const auto &token : query_plan) { - result.do_and(primitive(token, counters), &counters->ands()); - if (result.is_empty()) { - break; - } - } - return result; + return primitive(ngram, counters); } // Case: and. Short circuits when result is already empty. if (type == QueryType::AND) { diff --git a/libursa/Query.h b/libursa/Query.h index 40bc94f..ecc96c0 100644 --- a/libursa/Query.h +++ b/libursa/Query.h @@ -41,17 +41,14 @@ using QueryPrimitive = // planned (using a plan() method). At this point query decides which ngrams // will actually be checked. 
class Query { - private: - explicit Query(std::vector<PrimitiveQuery> &&query_plan) - : type(QueryType::PRIMITIVE), - query_plan(std::move(query_plan)), - value() {} - public: + explicit Query(PrimitiveQuery ngram) + : type(QueryType::PRIMITIVE), ngram(ngram), value() {} explicit Query(QString &&qstr); explicit Query(uint32_t count, std::vector<Query> &&queries); explicit Query(const QueryType &type, std::vector<Query> &&queries); Query(Query &&other) = default; + Query &operator=(Query &&) = default; const std::vector<Query> &as_queries() const; const QString &as_value() const; @@ -66,9 +63,10 @@ class Query { private: QueryType type; - // used for QueryType::PRIMITIVE - QString value; // before plan() - std::vector<PrimitiveQuery> query_plan; // after plan() + // used for QueryType::PRIMITIVE before plan() + QString value; + // used for QueryType::PRIMITIVE after plan(). Initial value arbitrary. + PrimitiveQuery ngram = PrimitiveQuery(IndexType::GRAM3, 0); // used for QueryType::MIN_OF uint32_t count; // used for QueryType::AND/OR/MIN_OF diff --git a/libursa/Version.h.in b/libursa/Version.h.in index ec78d18..d8826cd 100644 --- a/libursa/Version.h.in +++ b/libursa/Version.h.in @@ -6,7 +6,7 @@ // It looks similar to the db version to make it easy to correlate them. constexpr std::string_view ursadb_format_version = "1.5.0"; -constexpr std::string_view ursadb_version = "@PROJECT_VERSION@"; -constexpr std::string_view ursadb_commit = "@COMMIT_HASH@"; +// Project version. +// Consider updating the version tag when doing PRs. constexpr std::string_view ursadb_version_string = - "@PROJECT_VERSION@+@COMMIT_HASH@"; + "@PROJECT_VERSION@+primitives";