Skip to content

Commit

Permalink
feat: log basic seed and align stats at debug level
Browse files Browse the repository at this point in the history
  • Loading branch information
corneliusroemer committed Aug 30, 2023
1 parent e66d566 commit 27d669e
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 9 deletions.
20 changes: 15 additions & 5 deletions packages_rs/nextclade/src/align/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::alphabet::letter::Letter;
use crate::alphabet::nuc::Nuc;
use crate::make_error;
use eyre::{Report, WrapErr};
use log::{info, trace};
use log::{debug, info, trace};

fn align_pairwise<T: Letter<T>>(
qry_seq: &[T],
Expand Down Expand Up @@ -48,7 +48,7 @@ pub fn align_nuc(
if ref_len + qry_len < (10 * params.seed_length) {
// for very short sequences, use full square
let stripes = full_matrix(ref_len, qry_len);
trace!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band construction: short sequences, using full matrix");
debug!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band construction: short sequences, using full matrix");
return Ok(align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes));
}

Expand Down Expand Up @@ -77,24 +77,34 @@ pub fn align_nuc(
params.max_band_area,
)?;

if log::max_level() == log::Level::Debug {
let band_area: usize = stripes.iter().map(Stripe::len).sum();
let mean_band_width = band_area / ref_len;
debug!("Nucleotide alignment band area={band_area}, mean band width={mean_band_width}");
}

let mut alignment = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes);
alignment.is_reverse_complement = is_reverse_complement;

let alignment_score = alignment.alignment_score;

debug!("Attempt: {attempt}, Alignment score: {alignment_score}");

if alignment.hit_boundary {
terminal_bandwidth *= 2;
excess_bandwidth *= 2;
allowed_mismatches *= 2;
attempt += 1;
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band boundary is hit on attempt {}. Retrying with relaxed parameters. Alignment score was: {}", attempt, alignment.alignment_score);
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band boundary is hit on attempt {}. Retrying with relaxed parameters. Alignment score was: {alignment_score}", attempt);
} else {
if attempt > 0 {
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Succeeded without hitting band boundary on attempt {}. Alignment score was: {}", attempt+1, alignment.alignment_score);
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Succeeded without hitting band boundary on attempt {}. Alignment score was: {alignment_score}", attempt+1);
}
return Ok(alignment);
}

if attempt > params.max_alignment_attempts {
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Attempted to relax band parameters {attempt} times, but still hitting the band boundary. Alignment score was: {}", alignment.alignment_score);
info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Attempted to relax band parameters {attempt} times, but still hitting the band boundary. Alignment score was: {alignment_score}");
return Ok(alignment);
}
}
Expand Down
3 changes: 2 additions & 1 deletion packages_rs/nextclade/src/align/score_matrix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ pub fn score_matrix<T: Letter<T>>(
let mut scores = Band2d::<i32>::new(stripes);
let band_size = paths.data_len();

trace!("Score matrix: allocated alignment band of size={band_size}");
let mean_band_width = band_size / ref_len;
trace!("Score matrix: allocated alignment band of size={band_size}, mean band width={mean_band_width}");

let left_align = match params.gap_alignment_side {
GapAlignmentSide::Left => 1,
Expand Down
37 changes: 34 additions & 3 deletions packages_rs/nextclade/src/align/seed_match2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use eyre::Report;
use gcollections::ops::{Bounded, Intersection, IsEmpty, Union};
use interval::interval_set::{IntervalSet, ToIntervalSet};
use itertools::Itertools;
use log::{debug, trace};
use std::borrow::Cow;
use std::cmp::{max, min};
use std::collections::{BTreeMap, VecDeque};
Expand Down Expand Up @@ -481,14 +482,44 @@ pub fn get_seed_matches2(
let seed_matches = chain_seeds(&matches);
// write_matches_to_file(&seed_matches, "chained_matches.csv");

// Trace basic seed alignment stats
if log::max_level() >= log::Level::Debug {
let max_shift = seed_matches.iter().map(|sm| sm.offset.abs()).max().unwrap_or(0);
// Need to iterate in pairs
let max_unmatched_ref_stretch = seed_matches
.iter()
.tuple_windows()
.map(|(sm1, sm2)| sm2.ref_pos - (sm1.ref_pos + sm1.length))
.max()
.unwrap_or(0);
let max_unmatched_qry_stretch = seed_matches
.iter()
.tuple_windows()
.map(|(sm1, sm2)| sm2.qry_pos - (sm1.qry_pos + sm1.length))
.max()
.unwrap_or(0);
let first_match = seed_matches.first().unwrap();
let last_match = seed_matches.last().unwrap();
let first_match_ref_pos = first_match.ref_pos;
let first_match_qry_pos = first_match.qry_pos;
let last_match_ref_pos = ref_seq.len() - last_match.ref_pos - last_match.length;
let last_match_qry_pos = qry_seq.len() - last_match.qry_pos - last_match.length;

let n_matches = seed_matches.len();
debug!("Chained seed stats. max indel: {max_shift}, # matches: {n_matches}, first/last match distance from start/end [start: (ref: {first_match_ref_pos}, qry: {first_match_qry_pos}), end: (ref: {last_match_ref_pos}, qry: {last_match_qry_pos})], max unmatched stretch (ref: {max_unmatched_ref_stretch}, qry stretch: {max_unmatched_qry_stretch})");
}

let sum_of_seed_length: usize = seed_matches.iter().map(|sm| sm.length).sum();
if (sum_of_seed_length as f64 / qry_seq.len() as f64) < params.min_seed_cover {
let qry_coverage = sum_of_seed_length as f64 / qry_seq.len() as f64;
debug!("Seed alignment covers {:.2}% of query length", 100.0 * qry_coverage);
if qry_coverage < params.min_seed_cover {
let query_knowns = qry_seq.iter().filter(|n| n.is_acgt()).count();
let qry_coverage_knowns = sum_of_seed_length as f64 / query_knowns as f64;
if (sum_of_seed_length as f64 / query_knowns as f64) < params.min_seed_cover {
return make_error!(
"Unable to align: seed alignment covers {:.2}% of the query sequence, which is less than expected {:.2}% \
"Unable to align: seed alignment covers {:.2}% of query sequence ACGTs, which is less than required {:.2}% \
(configurable using 'min seed cover' CLI flag or dataset property). This is likely due to low quality of the \
provided sequence, or due to using incorrect reference sequence.",
provided sequence, or due to using an incorrect reference sequence.",
100.0 * (sum_of_seed_length as f64) / (query_knowns as f64),
100.0 * params.min_seed_cover
);
Expand Down

0 comments on commit 27d669e

Please sign in to comment.