From 177d81473ad49ed185f3d8ad779a9abe53845603 Mon Sep 17 00:00:00 2001 From: msm Date: Tue, 1 Oct 2024 13:38:04 +0200 Subject: [PATCH] Refactor primitive queries into AND of atoms --- libursa/CMakeLists.txt | 15 --------------- libursa/Query.cpp | 37 +++++++++++++++---------------------- libursa/Query.h | 16 ++++++++-------- libursa/Version.h.in | 7 +++---- 4 files changed, 26 insertions(+), 49 deletions(-) diff --git a/libursa/CMakeLists.txt b/libursa/CMakeLists.txt index a457448..9d25f61 100644 --- a/libursa/CMakeLists.txt +++ b/libursa/CMakeLists.txt @@ -80,19 +80,4 @@ target_clangformat_setup(ursa) # See: https://github.com/zeromq/cppzmq/issues/330 target_compile_options(ursa PUBLIC -Wno-deprecated-declarations) -find_package(Git) -if(Git_FOUND) - execute_process( - COMMAND - # get commit shorthash - ${GIT_EXECUTABLE} rev-parse --short HEAD - OUTPUT_VARIABLE - COMMIT_HASH - OUTPUT_STRIP_TRAILING_WHITESPACE - ) -else() - message(WARNING "Git not found. Hash info won't be available") - set(COMMIT_HASH "(unknown commit)") -endif() - configure_file(Version.h.in ${PROJECT_BINARY_DIR}/generated/Version.h) diff --git a/libursa/Query.cpp b/libursa/Query.cpp index 370f0f6..2e8bf91 100644 --- a/libursa/Query.cpp +++ b/libursa/Query.cpp @@ -60,7 +60,7 @@ std::ostream &operator<<(std::ostream &os, const Query &query) { } os << ")"; } else if (type == QueryType::PRIMITIVE) { - os << "'" << query.as_string_repr() << "'"; + os << query.as_string_repr(); } else { throw std::runtime_error("Unknown query type."); } @@ -79,12 +79,9 @@ const QString &Query::as_value() const { std::string Query::as_string_repr() const { std::string out = ""; - if (!query_plan.empty()) { + if (value.empty()) { // Query is already after planning stage. Show low-level representation. - for (const auto &token : query_plan) { - out += fmt::format("[{:x}]", token.trigram); - } - return out; + return fmt::format("{}:[{:06x}]", (int)ngram.itype, ngram.trigram); } // No query plan yet. Show stringlike representation. for (const auto &token : value) { @@ -126,9 +123,9 @@ QToken filter_qtoken(const QToken &token, uint32_t off, // For primitive queries, find a minimal covering set of ngram queries and // return it. If there are multiple disconnected components, AND them. // For example, "abcde\x??efg" will return abcd & bcde & efg -std::vector plan_qstring( +Query plan_qstring( const std::unordered_set &types_to_query, const QString &value) { - std::vector plan; + std::vector plan; bool has_gram3 = types_to_query.count(IndexType::GRAM3) != 0; bool has_text4 = types_to_query.count(IndexType::TEXT4) != 0; @@ -144,7 +141,7 @@ std::vector plan_qstring( // If wide8 index is supported, try to add a token and skip 6 bytes. if (has_wide8) { if (const auto &gram = convert_gram(IndexType::WIDE8, i, value)) { - plan.emplace_back(IndexType::WIDE8, *gram); + plan.emplace_back(PrimitiveQuery(IndexType::WIDE8, *gram)); skip_to = i + 6; i += 2; continue; @@ -153,7 +150,7 @@ std::vector plan_qstring( // If text4 index is supported, try to add a token and skip 2 bytes. if (has_text4) { if (const auto &gram = convert_gram(IndexType::TEXT4, i, value)) { - plan.emplace_back(IndexType::TEXT4, *gram); + plan.emplace_back(PrimitiveQuery(IndexType::TEXT4, *gram)); skip_to = i + 2; i += 1; continue; @@ -162,14 +159,14 @@ std::vector plan_qstring( // If hash4 index is supported and current ngram is not text, try hash4. const auto &hgram = convert_gram(IndexType::HASH4, i, value); if (i >= (skip_to - 1) && has_hash4 && hgram) { - plan.emplace_back(IndexType::HASH4, *hgram); + plan.emplace_back(PrimitiveQuery(IndexType::HASH4, *hgram)); // Don't continue here - gram3 can give us more information. } // Otherwise, add a regular gram3 token. const auto &gram = convert_gram(IndexType::GRAM3, i, value); if (i >= skip_to && gram) { if (has_gram3) { - plan.emplace_back(IndexType::GRAM3, *gram); + plan.emplace_back(PrimitiveQuery(IndexType::GRAM3, *gram)); } i += 1; continue; @@ -178,9 +175,12 @@ std::vector plan_qstring( i += 1; } - return std::move(plan); + return q_and(std::move(plan)); } +// Query provided by user contains generic strings like "asdf". +// To actually do the query, we need to convert generic strings to n-grams. +// This is done using plan_qstring, everything else is copied unchanged. Query Query::plan(const std::unordered_set &types_to_query) const { if (type != QueryType::PRIMITIVE) { std::vector plans; @@ -193,21 +193,14 @@ Query Query::plan(const std::unordered_set &types_to_query) const { return Query(type, std::move(plans)); } - return Query(plan_qstring(types_to_query, value)); + return plan_qstring(types_to_query, value); } QueryResult Query::run(const QueryPrimitive &primitive, QueryCounters *counters) const { // Case: primitive query - reduces to AND with tokens from query plan. if (type == QueryType::PRIMITIVE) { - auto result = QueryResult::everything(); - for (const auto &token : query_plan) { - result.do_and(primitive(token, counters), &counters->ands()); - if (result.is_empty()) { - break; - } - } - return result; + return primitive(ngram, counters); } // Case: and. Short circuits when result is already empty. if (type == QueryType::AND) { diff --git a/libursa/Query.h b/libursa/Query.h index 40bc94f..4911a7b 100644 --- a/libursa/Query.h +++ b/libursa/Query.h @@ -41,17 +41,16 @@ using QueryPrimitive = // planned (using a plan() method). At this point query decides which ngrams // will actually be checked. class Query { - private: - explicit Query(std::vector &&query_plan) + public: + explicit Query(PrimitiveQuery ngram) : type(QueryType::PRIMITIVE), - query_plan(std::move(query_plan)), + ngram(ngram), value() {} - - public: explicit Query(QString &&qstr); explicit Query(uint32_t count, std::vector &&queries); explicit Query(const QueryType &type, std::vector &&queries); Query(Query &&other) = default; + Query& operator=(Query&&) = default; const std::vector &as_queries() const; const QString &as_value() const; @@ -66,9 +65,10 @@ class Query { private: QueryType type; - // used for QueryType::PRIMITIVE - QString value; // before plan() - std::vector query_plan; // after plan() + // used for QueryType::PRIMITIVE before plan() + QString value; + // used for QueryType::PRIMITIVE after plan(). Initial value arbitrary. + PrimitiveQuery ngram = PrimitiveQuery(IndexType::GRAM3, 0); // used for QueryType::MIN_OF uint32_t count; // used for QueryType::AND/OR/MIN_OF diff --git a/libursa/Version.h.in b/libursa/Version.h.in index ec78d18..1a67a7d 100644 --- a/libursa/Version.h.in +++ b/libursa/Version.h.in @@ -6,7 +6,6 @@ // It looks similar to the db version to make it easy to correlate them. constexpr std::string_view ursadb_format_version = "1.5.0"; -constexpr std::string_view ursadb_version = "@PROJECT_VERSION@"; -constexpr std::string_view ursadb_commit = "@COMMIT_HASH@"; -constexpr std::string_view ursadb_version_string = - "@PROJECT_VERSION@+@COMMIT_HASH@"; +// Project version. +// Consider updating the version tag when doing PRs. +constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+primitives";