Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Double filter #299

Closed
wants to merge 7 commits into from
6 changes: 5 additions & 1 deletion src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ void parse_args(int argc,
args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'o', "one-to-one"});
args::Flag lower_triangular(mapping_opts, "", "Only compute the lower triangular for all-vs-all mapping", {'L', "lower-triangular"});
args::ValueFlag<char> skip_prefix(mapping_opts, "C", "map between sequence groups with different prefix [#]", {'Y', "group-prefix"});
args::Flag disable_grouping(mapping_opts, "", "disable sequence grouping (equivalent to -Y \\0)", {'G', "disable-grouping"});
args::ValueFlag<std::string> target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"});
args::ValueFlag<std::string> target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"});
args::ValueFlag<std::string> query_prefix(mapping_opts, "pfxs", "filter queries by comma-separated prefixes", {'Q', "query-prefix"});
Expand Down Expand Up @@ -173,7 +174,10 @@ void parse_args(int argc,
map_parameters.lower_triangular = lower_triangular ? args::get(lower_triangular) : false;
map_parameters.keep_low_pct_id = true;

if (skip_prefix) {
if (disable_grouping) {
map_parameters.prefix_delim = '\0';
map_parameters.skip_prefix = false;
} else if (skip_prefix) {
map_parameters.prefix_delim = args::get(skip_prefix);
map_parameters.skip_prefix = map_parameters.prefix_delim != '\0';
} else {
Expand Down
39 changes: 34 additions & 5 deletions src/map/include/computeMap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,21 @@ namespace skch

std::vector<std::vector<std::string>> target_subsets = createTargetSubsets(targetSequenceNames);

// Calculate and log subset statistics
uint64_t total_subset_size = 0;
for (const auto& subset : target_subsets) {
for (const auto& seqName : subset) {
seqno_t seqId = idManager->getSequenceId(seqName);
total_subset_size += idManager->getSequenceLength(seqId);
}
}
double avg_subset_size = target_subsets.size() ? (double)total_subset_size / target_subsets.size() : 0;
std::cerr << "[wfmash::mashmap] Target subsets: " << target_subsets.size();
if (param.index_by_size > 0) {
std::cerr << ", target size: " << param.index_by_size << "bp";
}
std::cerr << ", average size: " << std::fixed << std::setprecision(0) << avg_subset_size << "bp" << std::endl;

std::unordered_map<seqno_t, MappingResultsVector_t> combinedMappings;

// Build index for the current subset
Expand Down Expand Up @@ -607,16 +622,16 @@ namespace skch
std::atomic<bool> output_done(false);
aggregate_atomic_queue_t aggregate_queue;

// Get total count of mappings
// Get total count of mappings (after subset filtering)
uint64_t totalMappings = 0;
for (const auto& [querySeqId, mappings] : combinedMappings) {
totalMappings += mappings.size();
}

// Initialize progress logger
// Initialize progress logger for final filtering
progress_meter::ProgressMeter progress(
totalMappings * 2,
"[wfmash::mashmap] merging and filtering");
totalMappings,
"[wfmash::mashmap] final filtering");

// Start worker threads
std::vector<std::thread> workers;
Expand Down Expand Up @@ -706,6 +721,7 @@ namespace skch

aggregator.join();


// Reset flags and clear aggregatedMappings for next iteration
reader_done.store(false);
workers_done.store(false);
Expand Down Expand Up @@ -925,6 +941,20 @@ namespace skch

mappingBoundarySanityCheck(input, output->results);

// Apply pre-filtering immediately after mapping against this subset
if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {
MappingResultsVector_t filteredMappings;
filterByGroup(output->results, filteredMappings, param.numMappingsForSegment - 1, false, *idManager, input->progress);
output->results = std::move(filteredMappings);

// Also apply length and identity filters
filterWeakMappings(output->results, std::floor(param.block_length / param.segLength));
if (param.filterLengthMismatches) {
filterFalseHighIdentity(output->results);
}
sparsifyMappings(output->results);
}

return output;
}

Expand Down Expand Up @@ -2011,7 +2041,6 @@ namespace skch
best_it2->chainPairScore = best_score;
best_it2->chainPairId = it->splitMappingId;
}
progress.increment(1);
}

// Assign the merged mapping ids
Expand Down
Loading