From c60c771826f2a811962679c4673fdd5440567abb Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Sun, 8 Dec 2024 17:04:16 -0500 Subject: [PATCH 1/4] Try a little reverse-printing sequence wrapper --- flatgfa/src/flatgfa.rs | 45 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/flatgfa/src/flatgfa.rs b/flatgfa/src/flatgfa.rs index a7f0e5dd..fb3599f3 100644 --- a/flatgfa/src/flatgfa.rs +++ b/flatgfa/src/flatgfa.rs @@ -248,6 +248,47 @@ pub enum LineKind { Link, } +pub struct Sequence<'a> { + data: &'a [u8], + revcmp: bool, +} + +impl<'a> Sequence<'a> { + pub fn new(data: &'a [u8], ori: Orientation) -> Self { + Self { + data, + revcmp: ori == Orientation::Backward, + } + } +} + +fn nucleotide_complement(c: u8) -> u8 { + match c { + b'A' => b'T', + b'T' => b'A', + b'C' => b'G', + b'G' => b'C', + b'a' => b't', + b't' => b'a', + b'c' => b'g', + b'g' => b'c', + x => x, + } +} + +impl<'a> std::fmt::Display for Sequence<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.revcmp { + for &c in self.data.iter().rev() { + write!(f, "{}", nucleotide_complement(c))?; + } + } else { + write!(f, "{}", BStr::new(self.data))?; + } + Ok(()) + } +} + impl<'a> FlatGFA<'a> { /// Get the base-pair sequence for a segment. pub fn get_seq(&self, seg: &Segment) -> &BStr { @@ -287,8 +328,8 @@ impl<'a> FlatGFA<'a> { /// Look up a CIGAR alignment. pub fn get_alignment(&self, overlap: Span) -> Alignment { - Alignment { - ops: &self.alignment[overlap] + Alignment { + ops: &self.alignment[overlap], } } From 8d19988712f46acbc2bfc5104a768088a4f4fc01 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Sun, 8 Dec 2024 17:23:34 -0500 Subject: [PATCH 2/4] Use new Sequence struct to print slices --- flatgfa/src/flatgfa.rs | 31 ++++++++++++++++++++++++++++++- flatgfa/src/gaf.rs | 41 ++++------------------------------------- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/flatgfa/src/flatgfa.rs b/flatgfa/src/flatgfa.rs index fb3599f3..8c6bf31e 100644 --- a/flatgfa/src/flatgfa.rs +++ b/flatgfa/src/flatgfa.rs @@ -1,3 +1,4 @@ +use std::ops::Range; use std::str::FromStr; use crate::pool::{self, Id, Pool, Span, Store}; @@ -260,6 +261,28 @@ impl<'a> Sequence<'a> { revcmp: ori == Orientation::Backward, } } + + pub fn index(&self, idx: usize) -> u8 { + if self.revcmp { + nucleotide_complement(self.data[self.data.len() - idx - 1]) + } else { + self.data[idx] + } + } + + pub fn slice(&self, range: Range) -> Self { + let data = if self.revcmp { + // The range starts at the end of the buffer: + // [----- u8 { @@ -280,7 +303,7 @@ impl<'a> std::fmt::Display for Sequence<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if self.revcmp { for &c in self.data.iter().rev() { - write!(f, "{}", nucleotide_complement(c))?; + write!(f, "{}", nucleotide_complement(c) as char)?; } } else { write!(f, "{}", BStr::new(self.data))?; @@ -295,6 +318,12 @@ impl<'a> FlatGFA<'a> { self.seq_data[seg.seq].as_ref() } + pub fn get_seq_oriented(&self, handle: Handle) -> Sequence { + let seg = self.get_handle_seg(handle); + let seq_data = self.seq_data[seg.seq].as_ref(); + Sequence::new(seq_data, handle.orient()) + } + /// Look up a segment by its name. pub fn find_seg(&self, name: usize) -> Option> { // TODO Make this more efficient by maintaining the name index? This would not be diff --git a/flatgfa/src/gaf.rs b/flatgfa/src/gaf.rs index 540b2bc6..50f23551 100644 --- a/flatgfa/src/gaf.rs +++ b/flatgfa/src/gaf.rs @@ -89,48 +89,15 @@ fn print_event(gfa: &flatgfa::FlatGFA, event: ChunkEvent) { } } -fn reverse_complement(seq: &[u8]) -> Vec { - seq.iter() - .rev() - .map(|&b| match b { - b'A' => b'T', - b'T' => b'A', - b'C' => b'G', - b'G' => b'C', - b'a' => b't', - b't' => b'a', - b'c' => b'g', - b'g' => b'c', - x => x, - }) - .collect() -} - fn print_seq(gfa: &flatgfa::FlatGFA, event: ChunkEvent) { - let seg = gfa.segs[event.handle.segment()]; - let seq = gfa.get_seq(&seg); + let seq = gfa.get_seq_oriented(event.handle); match event.range { ChunkRange::Partial(start, end) => { - let chunk = if event.handle.orient() == flatgfa::Orientation::Forward { - &seq[start..end] - } else { - let len = seq.len(); - &seq[(len - end)..(len - start)] - }; - - if event.handle.orient() == flatgfa::Orientation::Forward { - print!("{}", chunk); - } else { - print!("{}", String::from_utf8_lossy(&reverse_complement(chunk))); - } + print!("{}", &seq.slice(start..end)); } ChunkRange::All => { - if event.handle.orient() == flatgfa::Orientation::Forward { - print!("{}", seq); - } else { - print!("{}", String::from_utf8_lossy(&reverse_complement(seq))); - } + print!("{}", seq); } ChunkRange::None => {} } @@ -284,7 +251,7 @@ impl<'a, 'b> Iterator for PathChunker<'a, 'b> { false => flatgfa::Orientation::Backward, }; let handle = flatgfa::Handle::new(seg_id, dir); - + // Accumulate the length to track our position in the path. let seg_len = self.gfa.segs[seg_id].len(); let next_pos = self.pos + seg_len; From 810bf876b6c7b844bce4f65cfe7c64764af57d28 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Sun, 8 Dec 2024 17:32:01 -0500 Subject: [PATCH 3/4] Try allocating data buffer to print --- flatgfa/src/flatgfa.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/flatgfa/src/flatgfa.rs b/flatgfa/src/flatgfa.rs index 8c6bf31e..95212035 100644 --- a/flatgfa/src/flatgfa.rs +++ b/flatgfa/src/flatgfa.rs @@ -283,6 +283,18 @@ impl<'a> Sequence<'a> { revcmp: self.revcmp, } } + + pub fn as_vec(&self) -> Vec { + if self.revcmp { + self.data + .iter() + .rev() + .map(|&c| nucleotide_complement(c)) + .collect() + } else { + self.data.to_vec() + } + } } fn nucleotide_complement(c: u8) -> u8 { @@ -302,9 +314,8 @@ fn nucleotide_complement(c: u8) -> u8 { impl<'a> std::fmt::Display for Sequence<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if self.revcmp { - for &c in self.data.iter().rev() { - write!(f, "{}", nucleotide_complement(c) as char)?; - } + let bytes = self.as_vec(); + write!(f, "{}", BStr::new(&bytes))?; } else { write!(f, "{}", BStr::new(self.data))?; } From dd5f87b2c5298ab508e46bdb9c491e0f2c0ae152 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Sun, 8 Dec 2024 17:35:15 -0500 Subject: [PATCH 4/4] Comment about the efficient printing --- flatgfa/src/flatgfa.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flatgfa/src/flatgfa.rs b/flatgfa/src/flatgfa.rs index 95212035..a8899c45 100644 --- a/flatgfa/src/flatgfa.rs +++ b/flatgfa/src/flatgfa.rs @@ -314,6 +314,10 @@ fn nucleotide_complement(c: u8) -> u8 { impl<'a> std::fmt::Display for Sequence<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if self.revcmp { + // For small sequences, it's faster to allocate the reverse-complement + // string and write the whole buffer at once. For larger sequences, it + // may be more efficient to do it one character at a time to avoid an + // allocation? Not sure, but could be worth a try. let bytes = self.as_vec(); write!(f, "{}", BStr::new(&bytes))?; } else {