diff --git a/Cargo.lock b/Cargo.lock index f6669ee..925f544 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -867,6 +867,7 @@ dependencies = [ "indexmap", "indicatif", "itertools", + "lazy_static", "noodles", "num-format", "plotly", diff --git a/Cargo.toml b/Cargo.toml index 306ad22..c9081d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ git-testament = "0.2.1" indexmap = "1.9.1" indicatif = "0.16.2" itertools = "0.10.5" +lazy_static = "1.4.0" noodles = { version = "0.34.0", features = [ "async", "bam", diff --git a/src/derive/command/endedness.rs b/src/derive/command/endedness.rs index 902c26b..8bae07c 100644 --- a/src/derive/command/endedness.rs +++ b/src/derive/command/endedness.rs @@ -12,6 +12,7 @@ use tracing::info; use tracing::trace; use crate::derive::endedness::compute; +use crate::derive::endedness::compute::{BOTH, FIRST, LAST, NEITHER, OVERALL, UNKNOWN_READ_GROUP}; use crate::utils::formats::bam::ParsedBAMFile; use crate::utils::formats::utils::IndexCheck; @@ -31,7 +32,7 @@ pub fn deviance_in_range(deviance_raw: &str) -> Result { /// Clap arguments for the `ngs derive endedness` subcommand. #[derive(Args)] pub struct DeriveEndednessArgs { - // Source BAM. + /// Source BAM. #[arg(value_name = "BAM")] src: PathBuf, @@ -64,32 +65,25 @@ struct ReadGroup { both: usize, neither: usize, } + /// Main function for the `ngs derive endedness` subcommand. pub fn derive(args: DeriveEndednessArgs) -> anyhow::Result<()> { info!("Starting derive endedness subcommand."); let mut new_ordering_flags: HashMap = HashMap::new(); - new_ordering_flags.insert("f+l-".to_string(), 0); - new_ordering_flags.insert("f-l+".to_string(), 0); - new_ordering_flags.insert("f+l+".to_string(), 0); - new_ordering_flags.insert("f-l-".to_string(), 0); + new_ordering_flags.insert(*FIRST, 0); + new_ordering_flags.insert(*LAST, 0); + new_ordering_flags.insert(*BOTH, 0); + new_ordering_flags.insert(*NEITHER, 0); let mut ordering_flags: HashMap, HashMap> = HashMap::new(); - ordering_flags.insert(Rc::new("overall".to_string()), new_ordering_flags.clone()); - ordering_flags.insert(Rc::new("unknown_read_group".to_string()), new_ordering_flags.clone()); - - new_ordering_flags - .entry("f+l-".to_string()) - .and_modify(|e| *e += 1); - new_ordering_flags - .entry("f-l+".to_string()) - .and_modify(|e| *e += 1); - new_ordering_flags - .entry("f+l+".to_string()) - .and_modify(|e| *e += 1); - new_ordering_flags - .entry("f-l-".to_string()) - .and_modify(|e| *e += 1); + ordering_flags.insert(Rc::new(*OVERALL), new_ordering_flags.clone()); + ordering_flags.insert(Rc::new(*UNKNOWN_READ_GROUP), new_ordering_flags.clone()); + + new_ordering_flags.entry(*FIRST).and_modify(|e| *e += 1); + new_ordering_flags.entry(*LAST).and_modify(|e| *e += 1); + new_ordering_flags.entry(*BOTH).and_modify(|e| *e += 1); + new_ordering_flags.entry(*NEITHER).and_modify(|e| *e += 1); // only used if args.calc_rpt is true let mut found_rgs = HashSet::new(); @@ -118,18 +112,28 @@ pub fn derive(args: DeriveEndednessArgs) -> anyhow::Result<()> { continue; } - let read_group = record - .data() - .get(Tag::ReadGroup) - .map(|v| Rc::new(v.to_string())) - .unwrap_or(Rc::new(String::from("unknown_read_group"))); + let read_group = match record.data().get(Tag::ReadGroup) { + Some(rg) => Rc::new(String::from(rg.as_str().unwrap())), + None => Rc::new(*UNKNOWN_READ_GROUP), + }; if args.calc_rpt { found_rgs.insert(Rc::clone(&read_group)); match record.read_name() { Some(rn) => { - read_names.insert(rn.to_string(), vec![Rc::clone(&read_group)]); + let rg_vec = read_names.get_mut(&rn.to_string()); + + match rg_vec { + Some(rg_vec) => { + rg_vec.push(Rc::clone(&read_group)); + } + None => { + let mut rg_vec = Vec::new(); + rg_vec.push(Rc::clone(&read_group)); + read_names.insert(rn.to_string(), rg_vec); + } + } } None => { trace!("Could not parse a QNAME from a read in the file."); @@ -140,43 +144,47 @@ pub fn derive(args: DeriveEndednessArgs) -> anyhow::Result<()> { } if record.flags().is_first_segment() && !record.flags().is_last_segment() { - ordering_flags.entry(Rc::new("overall".to_string())).and_modify(|e| { - e.entry("f+l-".to_string()).and_modify(|e| *e += 1); + ordering_flags.entry(Rc::new(*OVERALL)).and_modify(|e| { + e.entry(*FIRST).and_modify(|e| *e += 1); }); + ordering_flags .entry(read_group) .and_modify(|e| { - e.entry("f+l-".to_string()).and_modify(|e| *e += 1); + e.entry(*FIRST).and_modify(|e| *e += 1); }) .or_insert(new_ordering_flags.clone()); } else if !record.flags().is_first_segment() && record.flags().is_last_segment() { - ordering_flags.entry(Rc::new("overall".to_string())).and_modify(|e| { - e.entry("f-l+".to_string()).and_modify(|e| *e += 1); + ordering_flags.entry(Rc::new(*OVERALL)).and_modify(|e| { + e.entry(*LAST).and_modify(|e| *e += 1); }); + ordering_flags .entry(read_group) .and_modify(|e| { - e.entry("f-l+".to_string()).and_modify(|e| *e += 1); + e.entry(*LAST).and_modify(|e| *e += 1); }) .or_insert(new_ordering_flags.clone()); } else if record.flags().is_first_segment() && record.flags().is_last_segment() { - ordering_flags.entry(Rc::new("overall".to_string())).and_modify(|e| { - e.entry("f+l+".to_string()).and_modify(|e| *e += 1); + ordering_flags.entry(Rc::new(*OVERALL)).and_modify(|e| { + e.entry(*BOTH).and_modify(|e| *e += 1); }); + ordering_flags .entry(read_group) .and_modify(|e| { - e.entry("f+l+".to_string()).and_modify(|e| *e += 1); + e.entry(*BOTH).and_modify(|e| *e += 1); }) .or_insert(new_ordering_flags.clone()); } else if !record.flags().is_first_segment() && !record.flags().is_last_segment() { - ordering_flags.entry(Rc::new("overall".to_string())).and_modify(|e| { - e.entry("f-l-".to_string()).and_modify(|e| *e += 1); + ordering_flags.entry(Rc::new(*OVERALL)).and_modify(|e| { + e.entry(*NEITHER).and_modify(|e| *e += 1); }); + ordering_flags .entry(read_group) .and_modify(|e| { - e.entry("f-l-".to_string()).and_modify(|e| *e += 1); + e.entry(*NEITHER).and_modify(|e| *e += 1); }) .or_insert(new_ordering_flags.clone()); } else { @@ -192,7 +200,8 @@ pub fn derive(args: DeriveEndednessArgs) -> anyhow::Result<()> { } // (2) Derive the consensus endedness based on the ordering flags gathered. - let result = compute::predict(ordering_flags, args.paired_deviance.unwrap()).unwrap(); + let result = + compute::predict(ordering_flags, read_names, args.paired_deviance.unwrap()).unwrap(); // (3) Print the output to stdout as JSON (more support for different output // types may be added in the future, but for now, only JSON). diff --git a/src/derive/endedness/compute.rs b/src/derive/endedness/compute.rs index e0a2608..9e496b9 100644 --- a/src/derive/endedness/compute.rs +++ b/src/derive/endedness/compute.rs @@ -1,10 +1,25 @@ //! Module holding the logic for computing the endedness of a BAM. use anyhow::bail; +use lazy_static::lazy_static; +use radix_trie::iter; +use radix_trie::Trie; use serde::Serialize; use std::collections::HashMap; use std::rc::Rc; +lazy_static! { + // Strings used to index into the HashMaps used to store the Read Group ordering flags. + pub static ref OVERALL: String = String::from("overall"); + pub static ref UNKNOWN_READ_GROUP: String = String::from("unknown_read_group"); + + // Strings used to index into the HashMaps used to store the ordering flag counts. + pub static ref FIRST: String = String::from("f+l-"); + pub static ref LAST: String = String::from("f-l+"); + pub static ref BOTH: String = String::from("f+l+"); + pub static ref NEITHER: String = String::from("f-l-"); +} + /// Struct holding the final results for an `ngs derive endedness` subcommand /// call. #[derive(Debug, Serialize)] @@ -49,60 +64,81 @@ impl DerivedEndednessResult { } } +fn calculate_reads_per_template( + read_names: Trie>>, +) -> HashMap, f64> { + let mut total_reads: usize = 0; + let mut templates_templates: usize = 0; + let mut reads_per_template: HashMap, f64> = HashMap::new(); + + for read_groups in iter::Iter::new(read_names) {} + + reads_per_template +} + /// Main method to evaluate the collected ordering flags and /// return a result for the endedness of the file. This may fail, and the /// resulting [`DerivedEndednessResult`] should be evaluated accordingly. pub fn predict( ordering_flags: HashMap, HashMap>, + read_names: Trie>>, paired_deviance: f64, ) -> Result { - let first = ordering_flags[&String::from("overall")]["f+l-"]; - let last = ordering_flags[&String::from("overall")]["f-l+"]; - let both = ordering_flags[&String::from("overall")]["f+l+"]; - let neither = ordering_flags[&String::from("overall")]["f-l-"]; + let overall_ordering_flags = ordering_flags.get(&*OVERALL).unwrap(); + + let overall_first = *overall_ordering_flags.get(&*FIRST).unwrap(); + let overall_last = *overall_ordering_flags.get(&*LAST).unwrap(); + let overall_both = *overall_ordering_flags.get(&*BOTH).unwrap(); + let overall_neither = *overall_ordering_flags.get(&*NEITHER).unwrap(); - let mut result = - DerivedEndednessResult::new(false, "Unknown".to_string(), first, last, both, neither); + let mut result = DerivedEndednessResult::new( + false, + "Unknown".to_string(), + overall_first, + overall_last, + overall_both, + overall_neither, + ); // all zeroes - if first == 0 && last == 0 && both == 0 && neither == 0 { + if overall_first == 0 && overall_last == 0 && overall_both == 0 && overall_neither == 0 { bail!("No reads were detected in the file."); } // only first present - if first > 0 && last == 0 && both == 0 && neither == 0 { + if overall_first > 0 && overall_last == 0 && overall_both == 0 && overall_neither == 0 { return Ok(result); } // only last present - if first == 0 && last > 0 && both == 0 && neither == 0 { + if overall_first == 0 && overall_last > 0 && overall_both == 0 && overall_neither == 0 { return Ok(result); } // only both present - if first == 0 && last == 0 && both > 0 && neither == 0 { + if overall_first == 0 && overall_last == 0 && overall_both > 0 && overall_neither == 0 { result.succeeded = true; result.endedness = "Single-End".to_string(); return Ok(result); } // only neither present - if first == 0 && last == 0 && both == 0 && neither > 0 { + if overall_first == 0 && overall_last == 0 && overall_both == 0 && overall_neither > 0 { return Ok(result); } // first/last mixed with both/neither - if (first > 0 || last > 0) && (both > 0 || neither > 0) { + if (overall_first > 0 || overall_last > 0) && (overall_both > 0 || overall_neither > 0) { return Ok(result); } // any mix of both/neither, regardless of first/last - if both > 0 && neither > 0 { + if overall_both > 0 && overall_neither > 0 { return Ok(result); } // both and neither are now guarenteed to be 0 // We only need to check first and last - let first_frac = first as f64 / (first + last) as f64; + let first_frac = overall_first as f64 / (overall_first + overall_last) as f64; let lower_limit = 0.5 - paired_deviance; let upper_limit = 0.5 + paired_deviance; - if (first == last) || (lower_limit <= first_frac && first_frac <= upper_limit) { + if (overall_first == overall_last) || (lower_limit <= first_frac && first_frac <= upper_limit) { result.succeeded = true; result.endedness = "Paired-End".to_string(); return Ok(result);