Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor primitive queries into AND of atoms #217

Merged
merged 2 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 0 additions & 15 deletions libursa/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,19 +80,4 @@ target_clangformat_setup(ursa)
# See: https://github.com/zeromq/cppzmq/issues/330
target_compile_options(ursa PUBLIC -Wno-deprecated-declarations)

find_package(Git)
if(Git_FOUND)
execute_process(
COMMAND
# get commit shorthash
${GIT_EXECUTABLE} rev-parse --short HEAD
OUTPUT_VARIABLE
COMMIT_HASH
OUTPUT_STRIP_TRAILING_WHITESPACE
)
else()
message(WARNING "Git not found. Hash info won't be available")
set(COMMIT_HASH "(unknown commit)")
endif()

configure_file(Version.h.in ${PROJECT_BINARY_DIR}/generated/Version.h)
39 changes: 16 additions & 23 deletions libursa/Query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ std::ostream &operator<<(std::ostream &os, const Query &query) {
}
os << ")";
} else if (type == QueryType::PRIMITIVE) {
os << "'" << query.as_string_repr() << "'";
os << query.as_string_repr();
} else {
throw std::runtime_error("Unknown query type.");
}
Expand All @@ -79,12 +79,9 @@ const QString &Query::as_value() const {

std::string Query::as_string_repr() const {
std::string out = "";
if (!query_plan.empty()) {
if (value.empty()) {
// Query has already been planned. Show the low-level representation.
for (const auto &token : query_plan) {
out += fmt::format("[{:x}]", token.trigram);
}
return out;
return fmt::format("{}:[{:06x}]", (int)ngram.itype, ngram.trigram);
}
// No query plan yet. Show the string-like representation.
for (const auto &token : value) {
Expand Down Expand Up @@ -126,9 +123,9 @@ QToken filter_qtoken(const QToken &token, uint32_t off,
// For primitive queries, find a minimal covering set of ngram queries and
// return it. If there are multiple disconnected components, AND them.
// For example, "abcde\x??efg" will return abcd & bcde & efg
std::vector<PrimitiveQuery> plan_qstring(
const std::unordered_set<IndexType> &types_to_query, const QString &value) {
std::vector<PrimitiveQuery> plan;
Query plan_qstring(const std::unordered_set<IndexType> &types_to_query,
const QString &value) {
std::vector<Query> plan;

bool has_gram3 = types_to_query.count(IndexType::GRAM3) != 0;
bool has_text4 = types_to_query.count(IndexType::TEXT4) != 0;
Expand All @@ -144,7 +141,7 @@ std::vector<PrimitiveQuery> plan_qstring(
// If wide8 index is supported, try to add a token and skip 6 bytes.
if (has_wide8) {
if (const auto &gram = convert_gram(IndexType::WIDE8, i, value)) {
plan.emplace_back(IndexType::WIDE8, *gram);
plan.emplace_back(PrimitiveQuery(IndexType::WIDE8, *gram));
skip_to = i + 6;
i += 2;
continue;
Expand All @@ -153,7 +150,7 @@ std::vector<PrimitiveQuery> plan_qstring(
// If text4 index is supported, try to add a token and skip 2 bytes.
if (has_text4) {
if (const auto &gram = convert_gram(IndexType::TEXT4, i, value)) {
plan.emplace_back(IndexType::TEXT4, *gram);
plan.emplace_back(PrimitiveQuery(IndexType::TEXT4, *gram));
skip_to = i + 2;
i += 1;
continue;
Expand All @@ -162,14 +159,14 @@ std::vector<PrimitiveQuery> plan_qstring(
// If hash4 index is supported and current ngram is not text, try hash4.
const auto &hgram = convert_gram(IndexType::HASH4, i, value);
if (i >= (skip_to - 1) && has_hash4 && hgram) {
plan.emplace_back(IndexType::HASH4, *hgram);
plan.emplace_back(PrimitiveQuery(IndexType::HASH4, *hgram));
// Don't continue here - gram3 can give us more information.
}
// Otherwise, add a regular gram3 token.
const auto &gram = convert_gram(IndexType::GRAM3, i, value);
if (i >= skip_to && gram) {
if (has_gram3) {
plan.emplace_back(IndexType::GRAM3, *gram);
plan.emplace_back(PrimitiveQuery(IndexType::GRAM3, *gram));
}
i += 1;
continue;
Expand All @@ -178,9 +175,12 @@ std::vector<PrimitiveQuery> plan_qstring(
i += 1;
}

return std::move(plan);
return q_and(std::move(plan));
}

// A query provided by the user contains generic strings like "asdf".
// To actually run the query, we need to convert generic strings to n-grams.
// This is done by plan_qstring; everything else is copied unchanged.
Query Query::plan(const std::unordered_set<IndexType> &types_to_query) const {
if (type != QueryType::PRIMITIVE) {
std::vector<Query> plans;
Expand All @@ -193,21 +193,14 @@ Query Query::plan(const std::unordered_set<IndexType> &types_to_query) const {
return Query(type, std::move(plans));
}

return Query(plan_qstring(types_to_query, value));
return plan_qstring(types_to_query, value);
}

QueryResult Query::run(const QueryPrimitive &primitive,
QueryCounters *counters) const {
// Case: primitive query - after planning it holds a single n-gram, which is
// evaluated directly (longer strings become an AND of such atoms in plan()).
if (type == QueryType::PRIMITIVE) {
auto result = QueryResult::everything();
for (const auto &token : query_plan) {
result.do_and(primitive(token, counters), &counters->ands());
if (result.is_empty()) {
break;
}
}
return result;
return primitive(ngram, counters);
}
// Case: and. Short circuits when result is already empty.
if (type == QueryType::AND) {
Expand Down
16 changes: 7 additions & 9 deletions libursa/Query.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,14 @@ using QueryPrimitive =
// planned (using the plan() method). At this point the query decides which ngrams
// will actually be checked.
class Query {
private:
explicit Query(std::vector<PrimitiveQuery> &&query_plan)
: type(QueryType::PRIMITIVE),
query_plan(std::move(query_plan)),
value() {}

public:
explicit Query(PrimitiveQuery ngram)
: type(QueryType::PRIMITIVE), ngram(ngram), value() {}
explicit Query(QString &&qstr);
explicit Query(uint32_t count, std::vector<Query> &&queries);
explicit Query(const QueryType &type, std::vector<Query> &&queries);
Query(Query &&other) = default;
Query &operator=(Query &&) = default;

const std::vector<Query> &as_queries() const;
const QString &as_value() const;
Expand All @@ -66,9 +63,10 @@ class Query {

private:
QueryType type;
// used for QueryType::PRIMITIVE
QString value; // before plan()
std::vector<PrimitiveQuery> query_plan; // after plan()
// used for QueryType::PRIMITIVE before plan()
QString value;
// used for QueryType::PRIMITIVE after plan(). Initial value arbitrary.
PrimitiveQuery ngram = PrimitiveQuery(IndexType::GRAM3, 0);
// used for QueryType::MIN_OF
uint32_t count;
// used for QueryType::AND/OR/MIN_OF
Expand Down
6 changes: 3 additions & 3 deletions libursa/Version.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
// It looks similar to the db version to make it easy to correlate them.
constexpr std::string_view ursadb_format_version = "1.5.0";

constexpr std::string_view ursadb_version = "@PROJECT_VERSION@";
constexpr std::string_view ursadb_commit = "@COMMIT_HASH@";
// Project version.
// Consider updating the version tag when doing PRs.
constexpr std::string_view ursadb_version_string =
"@PROJECT_VERSION@+@COMMIT_HASH@";
"@PROJECT_VERSION@+primitives";
Loading