Skip to content

Commit

Permalink
Fix performance and accuracy for wide ascii strings (#155)
Browse files Browse the repository at this point in the history
Due to a refactoring earlier (probably), we started returning
many more results than we should. This commit should fix this,
without affecting performance negatively.
  • Loading branch information
msm-code authored Jun 3, 2020
1 parent 5c41e4a commit 86a8905
Show file tree
Hide file tree
Showing 9 changed files with 159 additions and 127 deletions.
5 changes: 3 additions & 2 deletions libursa/DatabaseSnapshot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@ QueryCounters DatabaseSnapshot::execute(const Query &query,
}
}

const QueryGraphCollection graphs{query, types_to_query, config};
Query query_copy{query.clone()};
query_copy.precompute(types_to_query, config);

task->spec().estimate_work(datasets_to_query.size());

Expand All @@ -243,7 +244,7 @@ QueryCounters DatabaseSnapshot::execute(const Query &query,
if (!ds->has_all_taints(taints)) {
continue;
}
ds->execute(graphs, out, &counters);
ds->execute(query_copy, out, &counters);
}
return counters;
}
Expand Down
47 changes: 18 additions & 29 deletions libursa/OnDiskDataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,19 +64,27 @@ std::string OnDiskDataset::get_file_name(FileId fid) const {
return files_index->get_file_name(fid);
}

QueryResult OnDiskDataset::query(const QueryGraphCollection &graphs,
QueryResult OnDiskDataset::query(const Query &query,
QueryCounters *counters) const {
QueryResult result = QueryResult::everything();
for (auto &ndx : indices) {
auto subresult{ndx.query(graphs.get(ndx.index_type()), counters)};
result.do_and(subresult, &counters->ands());
}
return result;
return query.run(
[this](auto &graphs, QueryCounters *counters) {
QueryResult result = QueryResult::everything();
for (auto &ndx : indices) {
if (graphs.count(ndx.index_type()) == 0) {
throw std::runtime_error("Unexpected graph type in query");
}
auto subresult{
ndx.query(graphs.at(ndx.index_type()), counters)};
result.do_and(subresult, &counters->ands());
}
return result;
},
counters);
}

void OnDiskDataset::execute(const QueryGraphCollection &graphs,
ResultWriter *out, QueryCounters *counters) const {
QueryResult result = query(graphs, counters);
void OnDiskDataset::execute(const Query &query, ResultWriter *out,
QueryCounters *counters) const {
QueryResult result = this->query(query, counters);
if (result.is_everything()) {
files_index->for_each_filename(
[&out](const std::string &fname) { out->push_back(fname); });
Expand Down Expand Up @@ -296,22 +304,3 @@ std::vector<const OnDiskDataset *> OnDiskDataset::get_compact_candidates(

return out;
}

QueryGraphCollection::QueryGraphCollection(
const Query &query, const std::unordered_set<IndexType> &types,
const DatabaseConfig &config) {
graphs_.reserve(types.size());
for (const auto type : types) {
graphs_.emplace(type, std::move(query.to_graph(type, config)));
}
}

const QueryGraph &QueryGraphCollection::get(IndexType type) const {
const auto it = graphs_.find(type);
if (it == graphs_.end()) {
throw std::runtime_error(
"QueryGraphCollection doesn't contain a graph of the requested "
"type");
}
return it->second;
}
16 changes: 2 additions & 14 deletions libursa/OnDiskDataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,6 @@
#include "ResultWriter.h"
#include "Task.h"

class QueryGraphCollection {
std::unordered_map<IndexType, QueryGraph> graphs_;

public:
QueryGraphCollection(const Query &query,
const std::unordered_set<IndexType> &types,
const DatabaseConfig &config);

const QueryGraph &get(IndexType type) const;
};

class OnDiskDataset {
std::string name;
fs::path db_base;
Expand All @@ -37,8 +26,7 @@ class OnDiskDataset {
return taints == other.taints;
}
std::string get_file_name(FileId fid) const;
QueryResult query(const QueryGraphCollection &graphs,
QueryCounters *counters) const;
QueryResult query(const Query &query, QueryCounters *counters) const;
const OnDiskIndex &get_index_with_type(IndexType index_type) const;
void drop_file(const std::string &fname) const;

Expand All @@ -54,7 +42,7 @@ class OnDiskDataset {
}
void toggle_taint(const std::string &taint);
bool has_all_taints(const std::set<std::string> &taints) const;
void execute(const QueryGraphCollection &graphs, ResultWriter *out,
void execute(const Query &query, ResultWriter *out,
QueryCounters *counters) const;
uint64_t get_file_count() const { return files_index->get_file_count(); }
void for_each_filename(std::function<void(const std::string &)> cb) const {
Expand Down
5 changes: 0 additions & 5 deletions libursa/OnDiskIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,6 @@ class OnDiskIndex {
std::vector<FileId> query_primitive(TriGram trigram,
QueryCounter *counter) const;
std::pair<uint64_t, uint64_t> get_run_offsets(TriGram trigram) const;
bool internal_expand(QString::const_iterator qit, uint8_t *out, size_t pos,
size_t comb_len, const TrigramGenerator &gen,
QueryResult &res) const;
QueryResult expand_wildcards(const QString &qstr, size_t len,
const TrigramGenerator &gen) const;

static void on_disk_merge_core(const std::vector<IndexMergeHelper> &indexes,
RawFile *out, TaskSpec *task);
Expand Down
71 changes: 46 additions & 25 deletions libursa/Query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,17 @@ const QString &Query::as_value() const {
return value;
}

std::string Query::as_string_repr() const { return "[primitive]"; }
std::string Query::as_string_repr() const {
std::string out = "";
for (const auto &token : value) {
if (token.num_possible_values() == 1) {
out += token.possible_values()[0];
} else {
out += "?";
}
}
return out;
}

Query q(QString &&qstr) { return Query(std::move(qstr)); }

Expand Down Expand Up @@ -197,39 +207,50 @@ QueryGraph to_query_graph(const QString &str, int size,
return result;
}

QueryGraph Query::to_graph(IndexType ntype,
const DatabaseConfig &config) const {
void Query::precompute(const std::unordered_set<IndexType> &types_to_query,
const DatabaseConfig &config) {
if (type == QueryType::PRIMITIVE) {
TokenValidator validator = get_validator_for(ntype);
size_t input_len = get_ngram_size_for(ntype);
return to_query_graph(value, input_len, config, validator);
value_graphs.clear();
for (const auto &ntype : types_to_query) {
TokenValidator validator = get_validator_for(ntype);
size_t input_len = get_ngram_size_for(ntype);
auto graph{to_query_graph(value, input_len, config, validator)};
value_graphs.emplace(ntype, std::move(graph));
}
} else {
for (auto &query : queries) {
query.precompute(types_to_query, config);
}
}
}

if (type == QueryType::AND) {
QueryGraph result;
QueryResult Query::run(const QueryPrimitive &primitive,
QueryCounters *counters) const {
if (type == QueryType::PRIMITIVE) {
return primitive(value_graphs, counters);
} else if (type == QueryType::AND) {
auto result = QueryResult::everything();
for (const auto &query : queries) {
result.and_(query.to_graph(ntype, config));
result.do_and(query.run(primitive, counters), &counters->ands());
}
return result;
}

if (type == QueryType::OR) {
if (queries.empty()) {
return QueryGraph();
}
QueryGraph result = std::move(queries[0].to_graph(ntype, config));
for (size_t i = 1; i < queries.size(); i++) {
result.or_(queries[i].to_graph(ntype, config));
} else if (type == QueryType::OR) {
auto result = QueryResult::empty();
for (const auto &query : queries) {
result.do_or(query.run(primitive, counters), &counters->ors());
}
return result;
}

if (type == QueryType::MIN_OF) {
std::vector<QueryGraph> subgraphs;
} else if (type == QueryType::MIN_OF) {
std::vector<QueryResult> results;
std::vector<const QueryResult *> results_ptrs;
results.reserve(queries.size());
results_ptrs.reserve(queries.size());
for (const auto &query : queries) {
subgraphs.emplace_back(query.to_graph(ntype, config));
results.emplace_back(query.run(primitive, counters));
results_ptrs.emplace_back(&results.back());
}
return QueryGraph::min_of(count, std::move(subgraphs));
return QueryResult::do_min_of(count, results_ptrs, &counters->minofs());
} else {
throw std::runtime_error("Unexpected query type");
}
throw std::runtime_error("Unknown query type.");
}
41 changes: 34 additions & 7 deletions libursa/Query.h
Original file line number Diff line number Diff line change
@@ -1,22 +1,41 @@
#pragma once

#include <functional>
#include <ostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "DatabaseConfig.h"
#include "QString.h"
#include "QueryGraph.h"
#include "QueryResult.h"
#include "Utils.h"

enum QueryType { PRIMITIVE = 1, AND = 2, OR = 3, MIN_OF = 4 };
enum class QueryType { PRIMITIVE = 1, AND = 2, OR = 3, MIN_OF = 4 };

using QueryPrimitive = std::function<QueryResult(
const std::unordered_map<IndexType, QueryGraph> &, QueryCounters *counter)>;

class Query {
private:
Query(const Query &other)
: type(other.type), value_graphs(), count(other.count) {
queries.reserve(other.queries.size());
for (const auto &query : other.queries) {
queries.emplace_back(query.clone());
}
value.reserve(other.value.size());
for (const auto &token : other.value) {
value.emplace_back(token.clone());
}
}

public:
explicit Query(QString &&qstr);
explicit Query(uint32_t count, std::vector<Query> &&queries);
explicit Query(const QueryType &type, std::vector<Query> &&queries);
Query(const Query &other) = delete;
Query(Query &&other) = default;

const std::vector<Query> &as_queries() const;
Expand All @@ -26,14 +45,22 @@ class Query {
const QueryType &get_type() const;
bool operator==(const Query &other) const;

// Converts this instance of Query to equivalent QueryGraph.
QueryGraph to_graph(IndexType ntype, const DatabaseConfig &config) const;
QueryResult run(const QueryPrimitive &primitive,
QueryCounters *counters) const;
void precompute(const std::unordered_set<IndexType> &types_to_query,
const DatabaseConfig &config);

Query clone() const { return Query(*this); }

private:
QueryType type;
uint32_t count; // used for QueryType::MIN_OF
QString value; // used for QueryType::PRIMITIVE
std::vector<Query> queries; // used for QueryType::AND/OR
// used for QueryType::PRIMITIVE
QString value;
std::unordered_map<IndexType, QueryGraph> value_graphs;
// used for QueryType::MIN_OF
uint32_t count;
// used for QueryType::AND/OR/MIN_OF
std::vector<Query> queries;
};

// Creates a literal query. Literals can contain wildcards and alternatives.
Expand Down
10 changes: 4 additions & 6 deletions src/Tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,10 @@ QString mqs(const std::string &str) {
return out;
}

QueryGraph mqg(const std::string &str, IndexType type) {
QString out;
for (const auto &c : str) {
out.emplace_back(QToken::single(c));
}
return q(std::move(out)).to_graph(type, DatabaseConfig());
QueryGraph mqg(const std::string &str, IndexType ntype) {
TokenValidator validator = get_validator_for(ntype);
size_t input_len = get_ngram_size_for(ntype);
return to_query_graph(mqs(str), input_len, DatabaseConfig(), validator);
}

TEST_CASE("packing 3grams", "[internal]") {
Expand Down
Loading

0 comments on commit 86a8905

Please sign in to comment.