Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Freq filter yes #293

Merged
merged 7 commits into from
Nov 12, 2024
4 changes: 2 additions & 2 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ void parse_args(int argc,
args::ValueFlag<double> kmer_complexity(mapping_opts, "FLOAT", "minimum k-mer complexity threshold", {'J', "kmer-cmplx"});
args::ValueFlag<std::string> hg_filter(mapping_opts, "numer,ani-Δ,conf", "hypergeometric filter params [1.0,0.0,99.9]", {"hg-filter"});
args::ValueFlag<int> min_hits(mapping_opts, "INT", "minimum number of hits for L1 filtering [auto]", {'H', "l1-hits"});
args::ValueFlag<uint64_t> max_kmer_freq(mapping_opts, "INT", "maximum allowed k-mer frequency [unlimited]", {'F', "max-kmer-freq"});
args::ValueFlag<double> max_kmer_freq(mapping_opts, "FLOAT", "filter out top FLOAT fraction of repetitive minimizers [0.0002]", {'F', "filter-freq"});

args::Group alignment_opts(options_group, "Alignment:");
args::ValueFlag<std::string> input_mapping(alignment_opts, "FILE", "input PAF file for alignment", {'i', "align-paf"});
Expand Down Expand Up @@ -557,7 +557,7 @@ void parse_args(int argc,
if (max_kmer_freq) {
map_parameters.max_kmer_freq = args::get(max_kmer_freq);
} else {
map_parameters.max_kmer_freq = std::numeric_limits<uint64_t>::max(); // unlimited
map_parameters.max_kmer_freq = 0.0002; // default filter fraction
}

//if (window_minimizers) {
Expand Down
2 changes: 1 addition & 1 deletion src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ struct Parameters
//std::unordered_set<std::string> high_freq_kmers; //
int64_t index_by_size = std::numeric_limits<int64_t>::max(); // Target total size of sequences for each index subset
int minimum_hits = -1; // Minimum number of hits required for L1 filtering (-1 means auto)
uint64_t max_kmer_freq = std::numeric_limits<uint64_t>::max(); // Maximum allowed k-mer frequency
double max_kmer_freq = 0.0002; // Maximum allowed k-mer frequency fraction (0-1) or count (>1)
};


Expand Down
28 changes: 24 additions & 4 deletions src/map/include/winSketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,15 @@ namespace skch
continue; // Should never happen
}

if (freq_it->second > param.max_kmer_freq) {
uint64_t freq_cutoff;
if (param.max_kmer_freq <= 1.0) {
// Calculate cutoff based on fraction of total windows
freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq));
} else {
// Use direct count cutoff
freq_cutoff = (uint64_t)param.max_kmer_freq;
}
if (freq_it->second > freq_cutoff) {
filtered_kmers++;
continue;
}
Expand All @@ -282,12 +290,24 @@ namespace skch
// Finish second progress meter
index_progress.finish();

double filtered_pct = (filtered_kmers * 100.0) / total_kmers;
uint64_t freq_cutoff;
if (param.max_kmer_freq <= 1.0) {
freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq));
} else {
freq_cutoff = (uint64_t)param.max_kmer_freq;
}
std::cerr << "[wfmash::mashmap] Processed " << totalSeqProcessed << " sequences (" << totalSeqSkipped << " skipped, " << total_seq_length << " total bp), "
<< minmerPosLookupIndex.size() << " unique hashes, " << minmerIndex.size() << " windows" << std::endl
<< "[wfmash::mashmap] Filtered " << filtered_kmers << "/" << total_kmers
<< " k-mers (" << std::fixed << std::setprecision(2) << filtered_pct << "%) exceeding frequency threshold of "
<< param.max_kmer_freq << std::endl;
<< " k-mers occurring > " << freq_cutoff << " times"
<< " (target: " << (param.max_kmer_freq <= 1.0 ?
([&]() {
std::stringstream ss;
ss << std::fixed << std::setprecision(2) << (param.max_kmer_freq * 100);
return ss.str();
})() + "%" :
">" + std::to_string((int)param.max_kmer_freq) + " occurrences")
<< ")" << std::endl;
}

std::chrono::duration<double> timeRefSketch = skch::Time::now() - t0;
Expand Down
Loading