Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

High sensitivity defaults #268

Merged
merged 9 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ Specifying `-m, --approx-map` lets us stop before alignment and obtain the appro

Together, these settings allow us to precisely define an alignment space to consider.
During all-to-all mapping, `-X` can additionally help us by removing self mappings from the reported set, and `-Y` extends this capability to prevent mapping between sequences with the same name prefix.
When working with large sequence collections we frequently use [PanSN](https://github.com/pangenome/PanSN-spec) naming convention and `-Y'#'` to specify that we want to group mappings by prefix, which in this context means genome or haplotype groupings.


## input indexing
Expand Down
2 changes: 1 addition & 1 deletion src/common/wflign/src/wflign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ namespace wavefront {
/*
* Configuration
*/
#define MAX_LEN_FOR_STANDARD_WFA 25000 // default --block-length. Only for low-divergence, otherwise disabled
#define MAX_LEN_FOR_STANDARD_WFA 1000
#define MIN_WF_LENGTH 256

/*
Expand Down
29 changes: 14 additions & 15 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,12 @@ void parse_args(int argc,
args::Positional<std::string> query_sequence_file(io_opts, "query", "query sequence file (optional)");

args::Group mapping_opts(parser, "[ Mapping Options ]");
args::ValueFlag<float> map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 90]", {'p', "map-pct-id"});
args::ValueFlag<std::string> segment_length(mapping_opts, "N", "segment seed length for mapping [default: 5k]", {'s', "segment-length"});
args::ValueFlag<std::string> block_length(mapping_opts, "N", "keep merged mappings supported by homologies of this total length [default: 5*segment-length]", {'l', "block-length"});
args::ValueFlag<float> map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 70]", {'p', "map-pct-id"});
args::ValueFlag<std::string> segment_length(mapping_opts, "N", "segment seed length for mapping [default: 1k]", {'s', "segment-length"});
args::ValueFlag<std::string> block_length(mapping_opts, "N", "keep merged mappings supported by homologies of this total length [default: 3*segment-length]", {'l', "block-length"});
args::ValueFlag<uint32_t> num_mappings_for_segments(mapping_opts, "N", "number of mappings to retain for each query/reference pair [default: 1]", {'n', "num-mappings-for-segment"});
args::ValueFlag<uint32_t> num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each query/reference pair where the query sequence is shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"});
args::ValueFlag<int> kmer_size(mapping_opts, "N", "kmer size [default: 19]", {'k', "kmer"});
args::ValueFlag<int> kmer_size(mapping_opts, "N", "kmer size [default: 15]", {'k', "kmer"});
args::ValueFlag<float> kmer_pct_threshold(mapping_opts, "%", "ignore the top % most-frequent kmers [default: 0.001]", {'H', "kmer-threshold"});
args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"});
args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
Expand All @@ -85,8 +85,8 @@ void parse_args(int argc,
args::ValueFlag<std::string> query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"});
args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"});
args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"});
args::ValueFlag<std::string> chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 6*segment_length, up to 30k]", {'c', "chain-gap"});
args::ValueFlag<std::string> max_mapping_length(mapping_opts, "N", "maximum length of a single mapping before breaking [default: inf]", {'P', "max-mapping-length"});
args::ValueFlag<std::string> chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 30k]", {'c', "chain-gap"});
args::ValueFlag<std::string> max_mapping_length(mapping_opts, "N", "maximum length of a single mapping before breaking (inf to unset) [default: 50k]", {'P', "max-mapping-length"});
args::Flag drop_low_map_pct_identity(mapping_opts, "K", "drop mappings with estimated identity below --map-pct-id=%", {'K', "drop-low-map-id"});
args::ValueFlag<double> overlap_threshold(mapping_opts, "F", "drop mappings overlapping more than fraction F with a higher scoring mapping [default: 0.5]", {'O', "overlap-threshold"});
args::Flag no_filter(mapping_opts, "MODE", "disable mapping filtering", {'f', "no-filter"});
Expand Down Expand Up @@ -390,7 +390,7 @@ void parse_args(int argc,
}
map_parameters.segLength = s;
} else {
map_parameters.segLength = 5000;
map_parameters.segLength = 1000;
}

if (map_pct_identity) {
Expand All @@ -413,9 +413,7 @@ void parse_args(int argc,

map_parameters.block_length = l;
} else {
// n.b. we map-merge across gaps up to 3x segment length
// and then filter for things that are at least block_length long
map_parameters.block_length = 5 * map_parameters.segLength;
map_parameters.block_length = 3 * map_parameters.segLength;
}

if (chain_gap) {
Expand All @@ -427,19 +425,20 @@ void parse_args(int argc,
map_parameters.chain_gap = l;
align_parameters.chain_gap = l;
} else {
map_parameters.chain_gap = std::min((int64_t)30000, 6*map_parameters.segLength);
align_parameters.chain_gap = std::min((int64_t)30000, 6*map_parameters.segLength);
map_parameters.chain_gap = 30000;
align_parameters.chain_gap = 30000;
}

if (max_mapping_length) {
const int64_t l = wfmash::handy_parameter(args::get(max_mapping_length));
const int64_t l = args::get(max_mapping_length) == "inf" ? std::numeric_limits<int64_t>::max()
: wfmash::handy_parameter(args::get(max_mapping_length));
if (l <= 0) {
std::cerr << "[wfmash] ERROR: max mapping length must be greater than 0." << std::endl;
exit(1);
}
map_parameters.max_mapping_length = l;
} else {
map_parameters.max_mapping_length = std::numeric_limits<int64_t>::max();
map_parameters.max_mapping_length = 50000;
}

if (drop_low_map_pct_identity) {
Expand All @@ -464,7 +463,7 @@ void parse_args(int argc,
map_parameters.kmerSize = (map_parameters.percentageIdentity >= 0.97 ? 18 :
(map_parameters.percentageIdentity >= 0.9 ? 17 : 15));
*/
map_parameters.kmerSize = 19;
map_parameters.kmerSize = 15;
}

if (kmer_pct_threshold) {
Expand Down
2 changes: 1 addition & 1 deletion src/map/include/computeMap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1650,7 +1650,7 @@ namespace skch
dist = std::sqrt(std::pow(query_dist,2) + std::pow(ref_dist,2));
awed = axis_weighted_euclidean_distance(query_dist, ref_dist, 0.9);
}
if (awed < max_dist) {
if (dist < max_dist) {
distances.push_back(std::make_tuple(awed, dist, it2->splitMappingId));
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ namespace fixed
double ss_table_max = 1000.0; // Maximum size of dp table for filtering
double pval_cutoff = 1e-3; // p-value cutoff for determining window size
float confidence_interval = 0.95; // Confidence interval to relax jaccard cutoff for mapping (0-1)
float percentage_identity = 0.90; // Percent identity in the mapping step
float percentage_identity = 0.70; // Percent identity in the mapping step
float ANIDiff = 0.0; // Stage 1 ANI diff threshold
float ANIDiffConf = 0.999; // ANI diff confidence
std::string VERSION = "3.1.1"; // Version of MashMap
Expand Down
Loading