From 165294b78cbf360239035d4155384e7b7b9327d6 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Tue, 12 Nov 2024 15:20:22 -0600 Subject: [PATCH 1/7] feat: Implement flexible k-mer frequency filtering with fraction and count modes --- src/interface/parse_args.hpp | 4 ++-- src/map/include/map_parameters.hpp | 2 +- src/map/include/winSketch.hpp | 10 +++++++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index cf61348a..b72cca77 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -98,7 +98,7 @@ void parse_args(int argc, args::ValueFlag kmer_complexity(mapping_opts, "FLOAT", "minimum k-mer complexity threshold", {'J', "kmer-cmplx"}); args::ValueFlag hg_filter(mapping_opts, "numer,ani-Δ,conf", "hypergeometric filter params [1.0,0.0,99.9]", {"hg-filter"}); args::ValueFlag min_hits(mapping_opts, "INT", "minimum number of hits for L1 filtering [auto]", {'H', "l1-hits"}); - args::ValueFlag max_kmer_freq(mapping_opts, "INT", "maximum allowed k-mer frequency [unlimited]", {'F', "max-kmer-freq"}); + args::ValueFlag max_kmer_freq(mapping_opts, "FLOAT", "filter out top FLOAT fraction of repetitive minimizers [0.0002]", {'f', "filter-freq"}); args::Group alignment_opts(options_group, "Alignment:"); args::ValueFlag input_mapping(alignment_opts, "FILE", "input PAF file for alignment", {'i', "align-paf"}); @@ -557,7 +557,7 @@ void parse_args(int argc, if (max_kmer_freq) { map_parameters.max_kmer_freq = args::get(max_kmer_freq); } else { - map_parameters.max_kmer_freq = std::numeric_limits::max(); // unlimited + map_parameters.max_kmer_freq = 0.0002; // default filter fraction } //if (window_minimizers) { diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index e022878e..b0f72286 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -87,7 +87,7 @@ struct Parameters //std::unordered_set high_freq_kmers; // int64_t index_by_size = std::numeric_limits::max(); // Target total size of sequences for each index subset int minimum_hits = -1; // Minimum number of hits required for L1 filtering (-1 means auto) - uint64_t max_kmer_freq = std::numeric_limits::max(); // Maximum allowed k-mer frequency + double max_kmer_freq = 0.0002; // Maximum allowed k-mer frequency fraction (0-1) or count (>1) }; diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 60099e6e..3c26a682 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -256,7 +256,15 @@ namespace skch continue; // Should never happen } - if (freq_it->second > param.max_kmer_freq) { + uint64_t freq_cutoff; + if (param.max_kmer_freq <= 1.0) { + // Calculate cutoff based on fraction of total windows + freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq)); + } else { + // Use direct count cutoff + freq_cutoff = (uint64_t)param.max_kmer_freq; + } + if (freq_it->second > freq_cutoff) { filtered_kmers++; continue; } From b8111f81811c6c2130ce4d379d61882742220278 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Tue, 12 Nov 2024 15:20:39 -0600 Subject: [PATCH 2/7] refactor: Improve k-mer frequency filtering with dynamic cutoff calculation --- src/map/include/winSketch.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 3c26a682..91e29813 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -291,11 +291,17 @@ namespace skch index_progress.finish(); double filtered_pct = (filtered_kmers * 100.0) / total_kmers; + uint64_t freq_cutoff; + if (param.max_kmer_freq <= 1.0) { + freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq)); + } else { + freq_cutoff = (uint64_t)param.max_kmer_freq; + } std::cerr << "[wfmash::mashmap] Processed " << totalSeqProcessed << " sequences (" << totalSeqSkipped << " skipped, " << total_seq_length << " total bp), " << minmerPosLookupIndex.size() << " unique hashes, " << minmerIndex.size() << " windows" << std::endl << "[wfmash::mashmap] Filtered " << filtered_kmers << "/" << total_kmers << " k-mers (" << std::fixed << std::setprecision(2) << filtered_pct << "%) exceeding frequency threshold of " - << param.max_kmer_freq << std::endl; + << freq_cutoff << " occurrences (filter fraction: " << param.max_kmer_freq << ")" << std::endl; } std::chrono::duration timeRefSketch = skch::Time::now() - t0; From bb22ff58f62988fc57b3fe9fa91916f4c50f0bcf Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Tue, 12 Nov 2024 15:22:22 -0600 Subject: [PATCH 3/7] refactor: Improve k-mer filtering output message for clarity and precision --- src/map/include/winSketch.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 91e29813..efdd07df 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -290,7 +290,6 @@ namespace skch // Finish second progress meter index_progress.finish(); - double filtered_pct = (filtered_kmers * 100.0) / total_kmers; uint64_t freq_cutoff; if (param.max_kmer_freq <= 1.0) { freq_cutoff = std::max(1UL, (uint64_t)(total_windows * param.max_kmer_freq)); @@ -300,8 +299,11 @@ namespace skch std::cerr << "[wfmash::mashmap] Processed " << totalSeqProcessed << " sequences (" << totalSeqSkipped << " skipped, " << total_seq_length << " total bp), " << minmerPosLookupIndex.size() << " unique hashes, " << minmerIndex.size() << " windows" << std::endl << "[wfmash::mashmap] Filtered " << filtered_kmers << "/" << total_kmers - << " k-mers (" << std::fixed << std::setprecision(2) << filtered_pct << "%) exceeding frequency threshold of " - << freq_cutoff << " occurrences (filter fraction: " << param.max_kmer_freq << ")" << std::endl; + << " k-mers occurring > " << freq_cutoff << " times" + << " (target: " << (param.max_kmer_freq <= 1.0 ? + std::to_string(param.max_kmer_freq * 100) + "% most frequent" : + ">" + std::to_string((int)param.max_kmer_freq) + " occurrences") + << ")" << std::endl; } std::chrono::duration timeRefSketch = skch::Time::now() - t0; From 220e694638e2493f537560960a8774a0c7d2b09d Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Tue, 12 Nov 2024 15:24:01 -0600 Subject: [PATCH 4/7] style: Simplify percentage display by removing "most frequent" text --- src/map/include/winSketch.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index efdd07df..97a25f89 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -301,7 +301,7 @@ namespace skch << "[wfmash::mashmap] Filtered " << filtered_kmers << "/" << total_kmers << " k-mers occurring > " << freq_cutoff << " times" << " (target: " << (param.max_kmer_freq <= 1.0 ? - std::to_string(param.max_kmer_freq * 100) + "% most frequent" : + std::to_string(param.max_kmer_freq * 100) + "%" : ">" + std::to_string((int)param.max_kmer_freq) + " occurrences") << ")" << std::endl; } From 37f4589655624622b5d5310b2129fcc02cd91d59 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Tue, 12 Nov 2024 15:25:47 -0600 Subject: [PATCH 5/7] fix: Format k-mer frequency percentage with proper decimal precision --- src/map/include/winSketch.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 97a25f89..7a07bf23 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -301,7 +301,7 @@ namespace skch << "[wfmash::mashmap] Filtered " << filtered_kmers << "/" << total_kmers << " k-mers occurring > " << freq_cutoff << " times" << " (target: " << (param.max_kmer_freq <= 1.0 ? - std::to_string(param.max_kmer_freq * 100) + "%" : + (std::stringstream() << std::fixed << std::setprecision(2) << (param.max_kmer_freq * 100)).str() + "%" : ">" + std::to_string((int)param.max_kmer_freq) + " occurrences") << ")" << std::endl; } From 27e849b4dd76551b9b3b6a90a912142e3519fe73 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Tue, 12 Nov 2024 15:26:56 -0600 Subject: [PATCH 6/7] fix: Change -f flag to -F to avoid conflict with --no-filter --- src/interface/parse_args.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index b72cca77..5a4c4208 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -98,7 +98,7 @@ void parse_args(int argc, args::ValueFlag kmer_complexity(mapping_opts, "FLOAT", "minimum k-mer complexity threshold", {'J', "kmer-cmplx"}); args::ValueFlag hg_filter(mapping_opts, "numer,ani-Δ,conf", "hypergeometric filter params [1.0,0.0,99.9]", {"hg-filter"}); args::ValueFlag min_hits(mapping_opts, "INT", "minimum number of hits for L1 filtering [auto]", {'H', "l1-hits"}); - args::ValueFlag max_kmer_freq(mapping_opts, "FLOAT", "filter out top FLOAT fraction of repetitive minimizers [0.0002]", {'f', "filter-freq"}); + args::ValueFlag max_kmer_freq(mapping_opts, "FLOAT", "filter out top FLOAT fraction of repetitive minimizers [0.0002]", {'F', "filter-freq"}); args::Group alignment_opts(options_group, "Alignment:"); args::ValueFlag input_mapping(alignment_opts, "FILE", "input PAF file for alignment", {'i', "align-paf"}); From 2427c758d57b6658611b8589a7b4d66a584f48e1 Mon Sep 17 00:00:00 2001 From: "Erik Garrison (aider)" Date: Tue, 12 Nov 2024 15:30:56 -0600 Subject: [PATCH 7/7] fix: Improve stringstream compatibility for older GCC versions --- src/map/include/winSketch.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 7a07bf23..ee02632a 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -301,7 +301,11 @@ namespace skch << "[wfmash::mashmap] Filtered " << filtered_kmers << "/" << total_kmers << " k-mers occurring > " << freq_cutoff << " times" << " (target: " << (param.max_kmer_freq <= 1.0 ? - (std::stringstream() << std::fixed << std::setprecision(2) << (param.max_kmer_freq * 100)).str() + "%" : + ([&]() { + std::stringstream ss; + ss << std::fixed << std::setprecision(2) << (param.max_kmer_freq * 100); + return ss.str(); + })() + "%" : ">" + std::to_string((int)param.max_kmer_freq) + " occurrences") << ")" << std::endl; }