Skip to content

Commit

Permalink
Avoid allocations when handling reverse-complement sequence data (#197)
Browse files Browse the repository at this point in the history
  • Loading branch information
sampsyo authored Dec 8, 2024
2 parents 53c0908 + dd5f87b commit 47b67b9
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 39 deletions.
89 changes: 87 additions & 2 deletions flatgfa/src/flatgfa.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::ops::Range;
use std::str::FromStr;

use crate::pool::{self, Id, Pool, Span, Store};
Expand Down Expand Up @@ -248,12 +249,96 @@ pub enum LineKind {
Link,
}

pub struct Sequence<'a> {
data: &'a [u8],
revcmp: bool,
}

impl<'a> Sequence<'a> {
pub fn new(data: &'a [u8], ori: Orientation) -> Self {
Self {
data,
revcmp: ori == Orientation::Backward,
}
}

pub fn index(&self, idx: usize) -> u8 {
if self.revcmp {
nucleotide_complement(self.data[self.data.len() - idx - 1])
} else {
self.data[idx]
}
}

pub fn slice(&self, range: Range<usize>) -> Self {
let data = if self.revcmp {
// The range starts at the end of the buffer:
// [-----<end<******<start<------]
&self.data[(self.data.len() - range.end)..(self.data.len() - range.start)]
} else {
&self.data[range]
};
Self {
data,
revcmp: self.revcmp,
}
}

pub fn as_vec(&self) -> Vec<u8> {
if self.revcmp {
self.data
.iter()
.rev()
.map(|&c| nucleotide_complement(c))
.collect()
} else {
self.data.to_vec()
}
}
}

fn nucleotide_complement(c: u8) -> u8 {
match c {
b'A' => b'T',
b'T' => b'A',
b'C' => b'G',
b'G' => b'C',
b'a' => b't',
b't' => b'a',
b'c' => b'g',
b'g' => b'c',
x => x,
}
}

impl<'a> std::fmt::Display for Sequence<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.revcmp {
// For small sequences, it's faster to allocate the reverse-complement
// string and write the whole buffer at once. For larger sequences, it
// may be more efficient to do it one character at a time to avoid an
// allocation? Not sure, but could be worth a try.
let bytes = self.as_vec();
write!(f, "{}", BStr::new(&bytes))?;
} else {
write!(f, "{}", BStr::new(self.data))?;
}
Ok(())
}
}

impl<'a> FlatGFA<'a> {
/// Get the base-pair sequence for a segment.
pub fn get_seq(&self, seg: &Segment) -> &BStr {
self.seq_data[seg.seq].as_ref()
}

pub fn get_seq_oriented(&self, handle: Handle) -> Sequence {
let seg = self.get_handle_seg(handle);
let seq_data = self.seq_data[seg.seq].as_ref();
Sequence::new(seq_data, handle.orient())
}

/// Look up a segment by its name.
pub fn find_seg(&self, name: usize) -> Option<Id<Segment>> {
// TODO Make this more efficient by maintaining the name index? This would not be
Expand Down Expand Up @@ -287,8 +372,8 @@ impl<'a> FlatGFA<'a> {

/// Look up a CIGAR alignment.
pub fn get_alignment(&self, overlap: Span<AlignOp>) -> Alignment {
Alignment {
ops: &self.alignment[overlap]
Alignment {
ops: &self.alignment[overlap],
}
}

Expand Down
41 changes: 4 additions & 37 deletions flatgfa/src/gaf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,48 +89,15 @@ fn print_event(gfa: &flatgfa::FlatGFA, event: ChunkEvent) {
}
}

fn reverse_complement(seq: &[u8]) -> Vec<u8> {
seq.iter()
.rev()
.map(|&b| match b {
b'A' => b'T',
b'T' => b'A',
b'C' => b'G',
b'G' => b'C',
b'a' => b't',
b't' => b'a',
b'c' => b'g',
b'g' => b'c',
x => x,
})
.collect()
}

fn print_seq(gfa: &flatgfa::FlatGFA, event: ChunkEvent) {
let seg = gfa.segs[event.handle.segment()];
let seq = gfa.get_seq(&seg);
let seq = gfa.get_seq_oriented(event.handle);

match event.range {
ChunkRange::Partial(start, end) => {
let chunk = if event.handle.orient() == flatgfa::Orientation::Forward {
&seq[start..end]
} else {
let len = seq.len();
&seq[(len - end)..(len - start)]
};

if event.handle.orient() == flatgfa::Orientation::Forward {
print!("{}", chunk);
} else {
print!("{}", String::from_utf8_lossy(&reverse_complement(chunk)));
}
print!("{}", &seq.slice(start..end));
}
ChunkRange::All => {
if event.handle.orient() == flatgfa::Orientation::Forward {
print!("{}", seq);
} else {
print!("{}", String::from_utf8_lossy(&reverse_complement(seq)));
}
print!("{}", seq);
}
ChunkRange::None => {}
}
Expand Down Expand Up @@ -284,7 +251,7 @@ impl<'a, 'b> Iterator for PathChunker<'a, 'b> {
false => flatgfa::Orientation::Backward,
};
let handle = flatgfa::Handle::new(seg_id, dir);

// Accumulate the length to track our position in the path.
let seg_len = self.gfa.segs[seg_id].len();
let next_pos = self.pos + seg_len;
Expand Down

0 comments on commit 47b67b9

Please sign in to comment.