diff --git a/.gitignore b/.gitignore index 2c542cdd..f4e54928 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,7 @@ pollen/target polbin/target pollen/*.rlib +flatgfa/target +flatgfa/**/target + slow_odgi/dist/ diff --git a/Makefile b/Makefile index 15b77ce4..e2c374df 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ test-slow-odgi: fetch test-flatgfa: fetch cd flatgfa ; cargo build - turnt -e flatgfa_mem -e flatgfa_file -e flatgfa_file_inplace tests/*.gfa + turnt -v -e flatgfa_mem -e flatgfa_file -e flatgfa_file_inplace tests/*.gfa -turnt --save -v -e chop_oracle_fgfa tests/*.gfa turnt -v -e flatgfa_chop tests/*.gfa diff --git a/flatgfa-py/Cargo.lock b/flatgfa-py/Cargo.lock index 3b85ab62..bb352e08 100644 --- a/flatgfa-py/Cargo.lock +++ b/flatgfa-py/Cargo.lock @@ -56,9 +56,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "bstr" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" +checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" dependencies = [ "memchr", "regex-automata", @@ -120,9 +120,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "indexmap" -version = "2.2.6" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown", @@ -158,9 +158,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.2" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memmap" @@ -183,27 +183,27 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] [[package]] name = "num_enum" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" +checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179" dependencies = [ "num_enum_derive", ] [[package]] name = "num_enum_derive" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" +checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -248,9 +248,9 @@ checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" [[package]] name = "proc-macro-crate" -version = "3.1.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" dependencies = [ "toml_edit", ] @@ -348,9 +348,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" [[package]] name = "scopeguard" @@ -403,21 +403,21 @@ checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" [[package]] name = "toml_datetime" -version = "0.6.5" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" -version = "0.21.1" +version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" +checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ "indexmap", "toml_datetime", @@ -524,18 +524,18 @@ checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" [[package]] name = "winnow" -version = "0.5.40" +version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" dependencies = [ "memchr", ] [[package]] name = "zerocopy" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ "byteorder", "zerocopy-derive", @@ -543,9 +543,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", diff --git a/flatgfa-py/src/lib.rs b/flatgfa-py/src/lib.rs index 85452d48..8a3a4a5f 100644 --- a/flatgfa-py/src/lib.rs +++ b/flatgfa-py/src/lib.rs @@ -1,5 +1,5 @@ -use flatgfa::pool::Id; -use flatgfa::{self, file, print, FlatGFA, HeapGFAStore}; +use flatgfa::fgfa_ds::{file, print, pool::Id}; +use flatgfa::{self, FlatGFA, HeapGFAStore}; use pyo3::exceptions::PyIndexError; use pyo3::prelude::*; use pyo3::types::{PyBytes, PySlice}; diff --git a/flatgfa/Cargo.lock b/flatgfa/Cargo.lock index d09b04b8..7a0c0020 100644 --- a/flatgfa/Cargo.lock +++ b/flatgfa/Cargo.lock @@ -44,15 +44,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "bstr" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" +checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" dependencies = [ "memchr", "regex-automata", @@ -87,15 +87,15 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "indexmap" -version = "2.2.5" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown", @@ -103,15 +103,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.153" +version = "0.2.159" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memmap" @@ -125,27 +125,27 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] [[package]] name = "num_enum" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" +checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179" dependencies = [ "num_enum_derive", ] [[package]] name = "num_enum_derive" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" +checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -155,9 +155,9 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "3.1.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" dependencies = [ "toml_edit", ] @@ -182,9 +182,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" [[package]] name = "serde" @@ -219,21 +219,21 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" [[package]] name = "toml_datetime" -version = "0.6.5" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" -version = "0.21.1" +version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" +checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ "indexmap", "toml_datetime", @@ -270,18 +270,18 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "winnow" -version = "0.5.40" +version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" dependencies = [ "memchr", ] [[package]] name = "zerocopy" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ "byteorder", "zerocopy-derive", @@ -289,9 +289,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", diff --git a/flatgfa/src/cmds.rs b/flatgfa/src/cmds.rs deleted file mode 100644 index 48437e4c..00000000 --- a/flatgfa/src/cmds.rs +++ /dev/null @@ -1,547 +0,0 @@ -use crate::flatgfa::{self, Handle, Link, Orientation, Path, Segment}; -use crate::pool::{self, Id, Span, Store}; -use crate::{GFAStore, HeapFamily}; -use argh::FromArgs; -use std::collections::{HashMap, HashSet}; - -/// print the FlatGFA table of contents -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "toc")] -pub struct Toc {} - -pub fn toc(gfa: &flatgfa::FlatGFA) { - eprintln!("header: {}", gfa.header.len()); - eprintln!("segs: {}", gfa.segs.len()); - eprintln!("paths: {}", gfa.paths.len()); - eprintln!("links: {}", gfa.links.len()); - eprintln!("steps: {}", gfa.steps.len()); - eprintln!("seq_data: {}", gfa.seq_data.len()); - eprintln!("overlaps: {}", gfa.overlaps.len()); - eprintln!("alignment: {}", gfa.alignment.len()); - eprintln!("name_data: {}", gfa.name_data.len()); - eprintln!("optional_data: {}", gfa.optional_data.len()); - eprintln!("line_order: {}", gfa.line_order.len()); -} - -/// list the paths -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "paths")] -pub struct Paths {} - -pub fn paths(gfa: &flatgfa::FlatGFA) { - for path in gfa.paths.all().iter() { - println!("{}", gfa.get_path_name(path)); - } -} - -/// calculate graph statistics -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "stats")] -pub struct Stats { - /// show basic metrics - #[argh(switch, short = 'S')] - summarize: bool, - - /// number of segments with at least one self-loop link - #[argh(switch, short = 'L')] - self_loops: bool, -} - -pub fn stats(gfa: &flatgfa::FlatGFA, args: Stats) { - if args.summarize { - println!("#length\tnodes\tedges\tpaths\tsteps"); - println!( - "{}\t{}\t{}\t{}\t{}", - gfa.seq_data.len(), - gfa.segs.len(), - gfa.links.len(), - gfa.paths.len(), - gfa.steps.len() - ); - } else if args.self_loops { - let mut counts: HashMap, usize> = HashMap::new(); - let mut total: usize = 0; - for link in gfa.links.all().iter() { - if link.from.segment() == link.to.segment() { - let count = counts.entry(link.from.segment()).or_insert(0); - *count += 1; - total += 1; - } - } - println!("#type\tnum"); - println!("total\t{}", total); - println!("unique\t{}", counts.len()); - } -} - -/// find a nucleotide position within a path -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "position")] -pub struct Position { - /// path_name,offset,orientation - #[argh(option, short = 'p')] - path_pos: String, -} - -pub fn position(gfa: &flatgfa::FlatGFA, args: Position) -> Result<(), &'static str> { - // Parse the position triple, which looks like `path,42,+`. - let (path_name, offset, orientation) = { - let parts: Vec<_> = args.path_pos.split(',').collect(); - if parts.len() != 3 { - return Err("position must be path_name,offset,orientation"); - } - let off: usize = parts[1].parse().or(Err("offset must be a number"))?; - let ori: flatgfa::Orientation = parts[2].parse().or(Err("orientation must be + or -"))?; - (parts[0], off, ori) - }; - - let path_id = gfa.find_path(path_name.into()).ok_or("path not found")?; - let path = &gfa.paths[path_id]; - assert_eq!( - orientation, - flatgfa::Orientation::Forward, - "only + is implemented so far" - ); - - // Traverse the path until we reach the position. - let mut cur_pos = 0; - let mut found = None; - for step in &gfa.steps[path.steps] { - let seg = gfa.get_handle_seg(*step); - let end_pos = cur_pos + seg.len(); - if offset < end_pos { - // Found it! - found = Some((*step, offset - cur_pos)); - break; - } - cur_pos = end_pos; - } - - // Print the match. - if let Some((handle, seg_off)) = found { - let seg = gfa.get_handle_seg(handle); - let seg_name = seg.name; - println!("#source.path.pos\ttarget.graph.pos"); - println!( - "{},{},{}\t{},{},{}", - path_name, - offset, - orientation, - seg_name, - seg_off, - handle.orient() - ); - } - - Ok(()) -} - -/// create a subset graph -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "extract")] -pub struct Extract { - /// segment to extract around - #[argh(option, short = 'n')] - seg_name: usize, - - /// number of edges "away" from the node to include - #[argh(option, short = 'c')] - link_distance: usize, - - /// maximum number of basepairs allowed between subpaths s.t. the subpaths are merged together - #[argh(option, short = 'd', long = "max-distance-subpaths", default = "300000")] - max_distance_subpaths: usize, // TODO: possibly make this bigger - - /// maximum number of iterations before we stop merging subpaths - #[argh(option, short = 'e', long = "max-merging-iterations", default = "6")] - num_iterations: usize // TODO: probably make this smaller -} - -pub fn extract( - gfa: &flatgfa::FlatGFA, - args: Extract, -) -> Result { - let origin_seg = gfa.find_seg(args.seg_name).ok_or("segment not found")?; - - let mut subgraph = SubgraphBuilder::new(gfa); - subgraph.add_header(); - subgraph.extract(origin_seg, args.link_distance, args.max_distance_subpaths, args.num_iterations); - Ok(subgraph.store) -} - -/// A helper to construct a new graph that includes part of an old graph. -struct SubgraphBuilder<'a> { - old: &'a flatgfa::FlatGFA<'a>, - store: flatgfa::HeapGFAStore, - seg_map: HashMap, Id>, -} - -struct SubpathStart { - step: Id, // The id of the first step in the subpath. - pos: usize, // The bp position at the start of the subpath. -} - -impl<'a> SubgraphBuilder<'a> { - fn new(old: &'a flatgfa::FlatGFA) -> Self { - Self { - old, - store: flatgfa::HeapGFAStore::default(), - seg_map: HashMap::new(), - } - } - - /// Include the old graph's header - fn add_header(&mut self) { - // pub fn add_header(&mut self, version: &[u8]) { - // assert!(self.header.as_ref().is_empty()); - // self.header.add_slice(version); - // } - assert!(self.store.header.as_ref().is_empty()); - self.store.header.add_slice(self.old.header.all()); - } - - /// Add a segment from the source graph to this subgraph. - fn include_seg(&mut self, seg_id: Id) { - let seg = &self.old.segs[seg_id]; - let new_seg_id = self.store.add_seg( - seg.name, - self.old.get_seq(seg), - self.old.get_optional_data(seg), - ); - self.seg_map.insert(seg_id, new_seg_id); - } - - /// Add a link from the source graph to the subgraph. - fn include_link(&mut self, link: &flatgfa::Link) { - let from = self.tr_handle(link.from); - let to = self.tr_handle(link.to); - let overlap = self.old.get_alignment(link.overlap); - self.store.add_link(from, to, overlap.ops.into()); - } - - /// Add a single subpath from the given path to the subgraph. - fn include_subpath(&mut self, path: &flatgfa::Path, start: &SubpathStart, end_pos: usize) { - let steps = pool::Span::new(start.step, self.store.steps.next_id()); // why the next id? - let name = format!("{}:{}-{}", self.old.get_path_name(path), start.pos, end_pos); - self.store - .add_path(name.as_bytes(), steps, std::iter::empty()); - } - - /// Identify all the subpaths in a path from the original graph that cross through - /// segments in this subgraph and merge them if possible. - fn merge_subpaths(&mut self, path: &flatgfa::Path, max_distance_subpaths: usize) { - // these are subpaths which *aren't* already included in the new graph - let mut cur_subpath_start: Option = Some(0); - let mut subpath_length = 0; - let mut ignore_path = true; - - for (idx, step) in self.old.steps[path.steps].iter().enumerate() { - let in_neighb = self.seg_map.contains_key(&step.segment()); - - if let (Some(start), true) = (&cur_subpath_start, in_neighb) { - // We just entered the subgraph. End the current subpath. - if !ignore_path && subpath_length <= max_distance_subpaths { - // TODO: type safety - let subpath_span = Span::new(path.steps.start + *start as u32, path.steps.start + idx as u32); - for step in &self.old.steps[subpath_span] { - if !self.seg_map.contains_key(&step.segment()) { - self.include_seg(step.segment()); - } - } - } - cur_subpath_start = None; - ignore_path = false; - } else if let (None, false) = (&cur_subpath_start, in_neighb) { - // We've exited the current subgraph, start a new subpath - cur_subpath_start = Some(idx); - } - - // Track the current bp position in the path. - subpath_length += self.old.get_handle_seg(*step).len(); - } - } - - /// Identify all the subpaths in a path from the original graph that cross through - /// segments in this subgraph and add them. - fn find_subpaths(&mut self, path: &flatgfa::Path) { - let mut cur_subpath_start: Option = None; - let mut path_pos = 0; - - for step in &self.old.steps[path.steps] { - let in_neighb = self.seg_map.contains_key(&step.segment()); - - if let (Some(start), false) = (&cur_subpath_start, in_neighb) { - // End the current subpath. - self.include_subpath(path, start, path_pos); - cur_subpath_start = None; - } else if let (None, true) = (&cur_subpath_start, in_neighb) { - // Start a new subpath. - cur_subpath_start = Some(SubpathStart { - step: self.store.steps.next_id(), - pos: path_pos, - }); - } - - // Add the (translated) step to the new graph. - if in_neighb { - self.store.add_step(self.tr_handle(*step)); - } - - // Track the current bp position in the path. - path_pos += self.old.get_handle_seg(*step).len(); - } - - // Did we reach the end of the path while still in the neighborhood? - if let Some(start) = cur_subpath_start { - self.include_subpath(path, &start, path_pos); - } - } - - /// Translate a handle from the source graph to this subgraph. - fn tr_handle(&self, old_handle: flatgfa::Handle) -> flatgfa::Handle { - // TODO: is this just generating the handle or should we add it to the new graph? - flatgfa::Handle::new(self.seg_map[&old_handle.segment()], old_handle.orient()) - } - - /// Check whether a segment from the old graph is in the subgraph. - fn contains(&self, old_seg_id: Id) -> bool { - self.seg_map.contains_key(&old_seg_id) - } - - /// Extract a subgraph consisting of a neighborhood of segments up to `dist` links away - /// from the given segment in the original graph. - /// - /// Include any links between the segments in the neighborhood and subpaths crossing - /// through the neighborhood. - fn extract(&mut self, origin: Id, dist: usize, max_distance_subpaths: usize, num_iterations: usize) { - self.include_seg(origin); - - // Find the set of all segments that are c links away. - let mut frontier: Vec> = Vec::new(); - let mut next_frontier: Vec> = Vec::new(); - frontier.push(origin); - for _ in 0..dist { - while let Some(seg_id) = frontier.pop() { - for link in self.old.links.all().iter() { - if let Some(other_seg) = link.incident_seg(seg_id) { - // Add other_seg to the frontier set if it is not already in the frontier set or the seg_map - if !self.seg_map.contains_key(&other_seg) { - self.include_seg(other_seg); - next_frontier.push(other_seg); - } - } - } - } - (frontier, next_frontier) = (next_frontier, frontier); - } - - // Merge subpaths within max_distance_subpaths bp of each other, num_iterations times - for _ in 0..num_iterations { - for path in self.old.paths.all().iter() { - self.merge_subpaths(path, max_distance_subpaths); - } - } - - // Include all links within the subgraph. - for link in self.old.links.all().iter() { - if self.contains(link.from.segment()) && self.contains(link.to.segment()) { - self.include_link(link); - } - } - - // Find subpaths within the subgraph. - for path in self.old.paths.all().iter() { - self.find_subpaths(path); - } - } -} - -/// compute node depth, the number of times paths cross a node -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "depth")] -pub struct Depth {} - -pub fn depth(gfa: &flatgfa::FlatGFA) { - // Initialize node depth - let mut depths = vec![0; gfa.segs.len()]; - // Initialize uniq_paths - let mut uniq_paths = Vec::>::new(); - uniq_paths.resize(gfa.segs.len(), HashSet::new()); - // do not assume that each handle in `gfa.steps()` is unique - for (idx, path) in gfa.paths.all().iter().enumerate() { - for step in &gfa.steps[path.steps] { - let seg_id = step.segment().index(); - // Increment depths - depths[seg_id] += 1; - // Update uniq_paths - uniq_paths[seg_id].insert(idx); - } - } - // print out depth and depth.uniq - println!("#node.id\tdepth\tdepth.uniq"); - for (id, seg) in gfa.segs.items() { - let name: u32 = seg.name as u32; - println!( - "{}\t{}\t{}", - name, - depths[id.index()], - uniq_paths[id.index()].len() - ); - } -} - -/// chop the segments in a graph into sizes of N or smaller -#[derive(FromArgs, PartialEq, Debug)] -#[argh(subcommand, name = "chop")] -pub struct Chop { - /// maximimum segment size. - // Use c in keeping with odgi convention - #[argh(option, short = 'c')] - c: usize, - - /// compute new links - #[argh(switch, short = 'l')] - l: bool, -} - -/// Chop a graph into segments of size no larger than c -/// By default, compact node ids -/// CIGAR strings, links, and optional Segment data are invalidated by chop -/// Generates a new graph, rather than modifying the old one in place -pub fn chop<'a>( - gfa: &'a flatgfa::FlatGFA<'a>, - args: Chop, -) -> Result { - - let mut flat = flatgfa::HeapGFAStore::default(); - - // when segment S is chopped into segments S1 through S2 (exclusive), - // seg_map[S.name] = Span(Id(S1.name), Id(S2.name)). If S is not chopped: S=S1, S2.name = S1.name+1 - let mut seg_map: Vec> = Vec::new(); - // The smallest id (>0) which does not already belong to a segment in `flat` - let mut max_node_id = 1; - - fn link_forward(flat: &mut GFAStore<'static, HeapFamily>, span: &Span) { - // Link segments spanned by `span` from head to tail - let overlap = Span::new_empty(); - flat.add_links((span.start.index()..span.end.index() - 1).map(|idx| Link { - from: Handle::new(Id::new(idx), Orientation::Forward), - to: Handle::new(Id::new(idx + 1), Orientation::Forward), - overlap, - })); - } - - // Add new, chopped segments - for seg in gfa.segs.all().iter() { - let len = seg.len(); - if len <= args.c { - // Leave the segment as is - let id = flat.segs.add(Segment { - name: max_node_id, - seq: seg.seq, - optional: Span::new_empty(), // TODO: Optional data may stay valid when seg not chopped? - }); - max_node_id += 1; - seg_map.push(Span::new(id, flat.segs.next_id())); - } else { - let seq_end = seg.seq.end; - let mut offset = seg.seq.start.index(); - let segs_start = flat.segs.next_id(); - // Could also generate end_id by setting it equal to the start_id and - // updating it for each segment that is added - only benefits us if we - // don't unroll the last iteration of this loop - while offset < seq_end.index() - args.c { - // Generate a new segment of length c - flat.segs.add(Segment { - name: max_node_id, - seq: Span::new(Id::new(offset), Id::new(offset + args.c)), - optional: Span::new_empty() - }); - offset += args.c; - max_node_id += 1; - } - // Generate the last segment - flat.segs.add(Segment { - name: max_node_id, - seq: Span::new(Id::new(offset), seq_end), - optional: Span::new_empty(), - }); - max_node_id += 1; - let new_seg_span = Span::new(segs_start, flat.segs.next_id()); - seg_map.push(new_seg_span); - if args.l { - link_forward(&mut flat, &new_seg_span); - } - } - } - - // For each path, add updated handles. Then add the updated path - for path in gfa.paths.all().iter() { - let path_start = flat.steps.next_id(); - let mut path_end = flat.steps.next_id(); - // Generate the new handles - // Tentative to-do: see if it is faster to read Id from segs than to re-generate it? - for step in gfa.get_path_steps(path) { - let range = { - let span = seg_map[step.segment().index()]; - std::ops::Range::from(span) - }; - match step.orient() { - Orientation::Forward => { - // In this builder, Id.index() == seg.name - 1 for all seg - path_end = flat - .add_steps(range.map(|idx| Handle::new(Id::new(idx), Orientation::Forward))) - .end; - } - Orientation::Backward => { - path_end = flat - .add_steps( - range - .rev() - .map(|idx| Handle::new(Id::new(idx), Orientation::Backward)), - ) - .end; - } - } - } - - // Add the updated path - flat.paths.add(Path { - name: path.name, - steps: Span::new(path_start, path_end), - overlaps: Span::new_empty(), - }); - } - - // If the 'l' flag is specified, compute the links in the new graph - if args.l { - // For each link in the old graph, from handle A -> B: - // Add a link from - // (A.forward ? (A.end, forward) : (A.begin, backwards)) - // -> (B.forward ? (B.begin, forward) : (B.end ? backwards)) - - for link in gfa.links.all().iter() { - let new_from = { - let old_from = link.from; - let chopped_segs = seg_map[old_from.segment().index()]; - let seg_id = match old_from.orient() { - Orientation::Forward => chopped_segs.end - 1, - Orientation::Backward => chopped_segs.start, - }; - Handle::new(seg_id, old_from.orient()) - }; - let new_to = { - let old_to = link.to; - let chopped_segs = seg_map[old_to.segment().index()]; - let seg_id = match old_to.orient() { - Orientation::Forward => chopped_segs.start, - Orientation::Backward => chopped_segs.end - 1, - }; - Handle::new(seg_id, old_to.orient()) - }; - flat.add_link(new_from, new_to, vec![]); - } - } - - Ok(flat) -} diff --git a/flatgfa/src/commands/basic_cmds.rs b/flatgfa/src/commands/basic_cmds.rs new file mode 100644 index 00000000..537b49e8 --- /dev/null +++ b/flatgfa/src/commands/basic_cmds.rs @@ -0,0 +1,136 @@ +use crate::fgfa_ds::flatgfa::{self, Orientation, Segment}; +use crate::fgfa_ds::pool::Id; +use argh::FromArgs; +use std::collections::HashMap; + +/// print the FlatGFA table of contents +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "toc")] +pub struct Toc {} + +pub fn toc(gfa: &flatgfa::FlatGFA) { + eprintln!("header: {}", gfa.header.len()); + eprintln!("segs: {}", gfa.segs.len()); + eprintln!("paths: {}", gfa.paths.len()); + eprintln!("links: {}", gfa.links.len()); + eprintln!("steps: {}", gfa.steps.len()); + eprintln!("seq_data: {}", gfa.seq_data.len()); + eprintln!("overlaps: {}", gfa.overlaps.len()); + eprintln!("alignment: {}", gfa.alignment.len()); + eprintln!("name_data: {}", gfa.name_data.len()); + eprintln!("optional_data: {}", gfa.optional_data.len()); + eprintln!("line_order: {}", gfa.line_order.len()); +} + +/// list the paths +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "paths")] +pub struct Paths {} + +pub fn paths(gfa: &flatgfa::FlatGFA) { + for path in gfa.paths.all().iter() { + println!("{}", gfa.get_path_name(path)); + } +} + +/// calculate graph statistics +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "stats")] +pub struct Stats { + /// show basic metrics + #[argh(switch, short = 'S')] + summarize: bool, + + /// number of segments with at least one self-loop link + #[argh(switch, short = 'L')] + self_loops: bool, +} + +pub fn stats(gfa: &flatgfa::FlatGFA, args: Stats) { + if args.summarize { + println!("#length\tnodes\tedges\tpaths\tsteps"); + println!( + "{}\t{}\t{}\t{}\t{}", + gfa.seq_data.len(), + gfa.segs.len(), + gfa.links.len(), + gfa.paths.len(), + gfa.steps.len() + ); + } else if args.self_loops { + let mut counts: HashMap, usize> = HashMap::new(); + let mut total: usize = 0; + for link in gfa.links.all().iter() { + if link.from.segment() == link.to.segment() { + let count = counts.entry(link.from.segment()).or_insert(0); + *count += 1; + total += 1; + } + } + println!("#type\tnum"); + println!("total\t{}", total); + println!("unique\t{}", counts.len()); + } +} + +/// find a nucleotide position within a path +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "position")] +pub struct Position { + /// path_name,offset,orientation + #[argh(option, short = 'p')] + path_pos: String, +} + +pub fn position(gfa: &flatgfa::FlatGFA, args: Position) -> Result<(), &'static str> { + // Parse the position triple, which looks like `path,42,+`. + let (path_name, offset, orientation) = { + let parts: Vec<_> = args.path_pos.split(',').collect(); + if parts.len() != 3 { + return Err("position must be path_name,offset,orientation"); + } + let off: usize = parts[1].parse().or(Err("offset must be a number"))?; + let ori: Orientation = parts[2].parse().or(Err("orientation must be + or -"))?; + (parts[0], off, ori) + }; + + let path_id = gfa.find_path(path_name.into()).ok_or("path not found")?; + let path = &gfa.paths[path_id]; + assert_eq!( + orientation, + Orientation::Forward, + "only + is implemented so far" + ); + + // Traverse the path until we reach the position. + let mut cur_pos = 0; + let mut found = None; + for step in &gfa.steps[path.steps] { + let seg = gfa.get_handle_seg(*step); + let end_pos = cur_pos + seg.len(); + if offset < end_pos { + // Found it! + found = Some((*step, offset - cur_pos)); + break; + } + cur_pos = end_pos; + } + + // Print the match. + if let Some((handle, seg_off)) = found { + let seg = gfa.get_handle_seg(handle); + let seg_name = seg.name; + println!("#source.path.pos\ttarget.graph.pos"); + println!( + "{},{},{}\t{},{},{}", + path_name, + offset, + orientation, + seg_name, + seg_off, + handle.orient() + ); + } + + Ok(()) +} \ No newline at end of file diff --git a/flatgfa/src/commands/chop.rs b/flatgfa/src/commands/chop.rs new file mode 100644 index 00000000..695fd52a --- /dev/null +++ b/flatgfa/src/commands/chop.rs @@ -0,0 +1,160 @@ +use crate::fgfa_ds::flatgfa::{self, Handle, Link, Orientation, Path, Segment}; +use crate::fgfa_ds::pool::{Id, Span, Store}; +use crate::fgfa_ds::flatgfa::{GFAStore, HeapFamily}; +use argh::FromArgs; + +/// chop the segments in a graph into sizes of N or smaller +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "chop")] +pub struct Chop { + /// maximimum segment size. + // Use c in keeping with odgi convention + #[argh(option, short = 'c')] + c: usize, + + /// compute new links + #[argh(switch, short = 'l')] + l: bool, +} + +/// Chop a graph into segments of size no larger than c +/// By default, compact node ids +/// CIGAR strings, links, and optional Segment data are invalidated by chop +/// Generates a new graph, rather than modifying the old one in place +pub fn chop<'a>( + gfa: &'a flatgfa::FlatGFA<'a>, + args: Chop, +) -> Result { + + let mut flat = flatgfa::HeapGFAStore::default(); + + // when segment S is chopped into segments S1 through S2 (exclusive), + // seg_map[S.name] = Span(Id(S1.name), Id(S2.name)). If S is not chopped: S=S1, S2.name = S1.name+1 + let mut seg_map: Vec> = Vec::new(); + // The smallest id (>0) which does not already belong to a segment in `flat` + let mut max_node_id = 1; + + fn link_forward(flat: &mut GFAStore<'static, HeapFamily>, span: &Span) { + // Link segments spanned by `span` from head to tail + let overlap = Span::new_empty(); + flat.add_links((span.start.index()..span.end.index() - 1).map(|idx| Link { + from: Handle::new(Id::new(idx), Orientation::Forward), + to: Handle::new(Id::new(idx + 1), Orientation::Forward), + overlap, + })); + } + + // Add new, chopped segments + for seg in gfa.segs.all().iter() { + let len = seg.len(); + if len <= args.c { + // Leave the segment as is + let id = flat.segs.add(Segment { + name: max_node_id, + seq: seg.seq, + optional: Span::new_empty(), // TODO: Optional data may stay valid when seg not chopped? + }); + max_node_id += 1; + seg_map.push(Span::new(id, flat.segs.next_id())); + } else { + let seq_end = seg.seq.end; + let mut offset = seg.seq.start.index(); + let segs_start = flat.segs.next_id(); + // Could also generate end_id by setting it equal to the start_id and + // updating it for each segment that is added - only benefits us if we + // don't unroll the last iteration of this loop + while offset < seq_end.index() - args.c { + // Generate a new segment of length c + flat.segs.add(Segment { + name: max_node_id, + seq: Span::new(Id::new(offset), Id::new(offset + args.c)), + optional: Span::new_empty() + }); + offset += args.c; + max_node_id += 1; + } + // Generate the last segment + flat.segs.add(Segment { + name: max_node_id, + seq: Span::new(Id::new(offset), seq_end), + optional: Span::new_empty(), + }); + max_node_id += 1; + let new_seg_span = Span::new(segs_start, flat.segs.next_id()); + seg_map.push(new_seg_span); + if args.l { + link_forward(&mut flat, &new_seg_span); + } + } + } + + // For each path, add updated handles. Then add the updated path + for path in gfa.paths.all().iter() { + let path_start = flat.steps.next_id(); + let mut path_end = flat.steps.next_id(); + // Generate the new handles + // Tentative to-do: see if it is faster to read Id from segs than to re-generate it? + for step in gfa.get_path_steps(path) { + let range = { + let span = seg_map[step.segment().index()]; + std::ops::Range::from(span) + }; + match step.orient() { + Orientation::Forward => { + // In this builder, Id.index() == seg.name - 1 for all seg + path_end = flat + .add_steps(range.map(|idx| Handle::new(Id::new(idx), Orientation::Forward))) + .end; + } + Orientation::Backward => { + path_end = flat + .add_steps( + range + .rev() + .map(|idx| Handle::new(Id::new(idx), Orientation::Backward)), + ) + .end; + } + } + } + + // Add the updated path + flat.paths.add(Path { + name: path.name, + steps: Span::new(path_start, path_end), + overlaps: Span::new_empty(), + }); + } + + // If the 'l' flag is specified, compute the links in the new graph + if args.l { + // For each link in the old graph, from handle A -> B: + // Add a link from + // (A.forward ? (A.end, forward) : (A.begin, backwards)) + // -> (B.forward ? (B.begin, forward) : (B.end ? backwards)) + + for link in gfa.links.all().iter() { + let new_from = { + let old_from = link.from; + let chopped_segs = seg_map[old_from.segment().index()]; + let seg_id = match old_from.orient() { + Orientation::Forward => chopped_segs.end - 1, + Orientation::Backward => chopped_segs.start, + }; + Handle::new(seg_id, old_from.orient()) + }; + let new_to = { + let old_to = link.to; + let chopped_segs = seg_map[old_to.segment().index()]; + let seg_id = match old_to.orient() { + Orientation::Forward => chopped_segs.start, + Orientation::Backward => chopped_segs.end - 1, + }; + Handle::new(seg_id, old_to.orient()) + }; + flat.add_link(new_from, new_to, vec![]); + } + } + + Ok(flat) +} \ No newline at end of file diff --git a/flatgfa/src/commands/depth.rs b/flatgfa/src/commands/depth.rs new file mode 100644 index 00000000..ef0b97ca --- /dev/null +++ b/flatgfa/src/commands/depth.rs @@ -0,0 +1,37 @@ +use crate::fgfa_ds::flatgfa::FlatGFA; +use argh::FromArgs; +use std::collections::HashSet; + +/// compute node depth, the number of times paths cross a node +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "depth")] +pub struct Depth {} + +pub fn depth(gfa: &FlatGFA) { + // Initialize node depth + let mut depths = vec![0; gfa.segs.len()]; + // Initialize uniq_paths + let mut uniq_paths = Vec::>::new(); + uniq_paths.resize(gfa.segs.len(), HashSet::new()); + // do not assume that each handle in `gfa.steps()` is unique + for (idx, path) in gfa.paths.all().iter().enumerate() { + for step in &gfa.steps[path.steps] { + let seg_id = step.segment().index(); + // Increment depths + depths[seg_id] += 1; + // Update uniq_paths + uniq_paths[seg_id].insert(idx); + } + } + // print out depth and depth.uniq + println!("#node.id\tdepth\tdepth.uniq"); + for (id, seg) in gfa.segs.items() { + let name: u32 = seg.name as u32; + println!( + "{}\t{}\t{}", + name, + depths[id.index()], + uniq_paths[id.index()].len() + ); + } +} \ No newline at end of file diff --git a/flatgfa/src/commands/extract.rs b/flatgfa/src/commands/extract.rs new file mode 100644 index 00000000..1a0ada61 --- /dev/null +++ b/flatgfa/src/commands/extract.rs @@ -0,0 +1,224 @@ +use crate::fgfa_ds::flatgfa::{self, Handle, Link, Path, Segment}; +use crate::fgfa_ds::pool::{self, Id, Span, Store}; +use argh::FromArgs; +use std::collections::HashMap; + +/// create a subset graph +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "extract")] +pub struct Extract { + /// segment to extract around + #[argh(option, short = 'n')] + seg_name: usize, + + /// number of edges "away" from the node to include + #[argh(option, short = 'c')] + link_distance: usize, + + /// maximum number of basepairs allowed between subpaths s.t. the subpaths are merged together + #[argh(option, short = 'd', long = "max-distance-subpaths", default = "300000")] + max_distance_subpaths: usize, // TODO: possibly make this bigger + + /// maximum number of iterations before we stop merging subpaths + #[argh(option, short = 'e', long = "max-merging-iterations", default = "6")] + num_iterations: usize // TODO: probably make this smaller +} + +pub fn extract( + gfa: &flatgfa::FlatGFA, + args: Extract, +) -> Result { + let origin_seg = gfa.find_seg(args.seg_name).ok_or("segment not found")?; + + let mut subgraph = SubgraphBuilder::new(gfa); + subgraph.add_header(); + subgraph.extract(origin_seg, args.link_distance, args.max_distance_subpaths, args.num_iterations); + Ok(subgraph.store) +} + +/// A helper to construct a new graph that includes part of an old graph. +struct SubgraphBuilder<'a> { + old: &'a flatgfa::FlatGFA<'a>, + store: flatgfa::HeapGFAStore, + seg_map: HashMap, Id>, +} + +struct SubpathStart { + step: Id, // The id of the first step in the subpath. + pos: usize, // The bp position at the start of the subpath. +} + +impl<'a> SubgraphBuilder<'a> { + fn new(old: &'a flatgfa::FlatGFA) -> Self { + Self { + old, + store: flatgfa::HeapGFAStore::default(), + seg_map: HashMap::new(), + } + } + + /// Include the old graph's header + fn add_header(&mut self) { + // pub fn add_header(&mut self, version: &[u8]) { + // assert!(self.header.as_ref().is_empty()); + // self.header.add_slice(version); + // } + assert!(self.store.header.as_ref().is_empty()); + self.store.header.add_slice(self.old.header.all()); + } + + /// Add a segment from the source graph to this subgraph. + fn include_seg(&mut self, seg_id: Id) { + let seg = &self.old.segs[seg_id]; + let new_seg_id = self.store.add_seg( + seg.name, + self.old.get_seq(seg), + self.old.get_optional_data(seg), + ); + self.seg_map.insert(seg_id, new_seg_id); + } + + /// Add a link from the source graph to the subgraph. + fn include_link(&mut self, link: &Link) { + let from = self.tr_handle(link.from); + let to = self.tr_handle(link.to); + let overlap = self.old.get_alignment(link.overlap); + self.store.add_link(from, to, overlap.ops.into()); + } + + /// Add a single subpath from the given path to the subgraph. + fn include_subpath(&mut self, path: &Path, start: &SubpathStart, end_pos: usize) { + let steps = pool::Span::new(start.step, self.store.steps.next_id()); // why the next id? + let name = format!("{}:{}-{}", self.old.get_path_name(path), start.pos, end_pos); + self.store + .add_path(name.as_bytes(), steps, std::iter::empty()); + } + + /// Identify all the subpaths in a path from the original graph that cross through + /// segments in this subgraph and merge them if possible. + fn merge_subpaths(&mut self, path: &Path, max_distance_subpaths: usize) { + // these are subpaths which *aren't* already included in the new graph + let mut cur_subpath_start: Option = Some(0); + let mut subpath_length = 0; + let mut ignore_path = true; + + for (idx, step) in self.old.steps[path.steps].iter().enumerate() { + let in_neighb = self.seg_map.contains_key(&step.segment()); + + if let (Some(start), true) = (&cur_subpath_start, in_neighb) { + // We just entered the subgraph. End the current subpath. + if !ignore_path && subpath_length <= max_distance_subpaths { + // TODO: type safety + let subpath_span = Span::new(path.steps.start + *start as u32, path.steps.start + idx as u32); + for step in &self.old.steps[subpath_span] { + if !self.seg_map.contains_key(&step.segment()) { + self.include_seg(step.segment()); + } + } + } + cur_subpath_start = None; + ignore_path = false; + } else if let (None, false) = (&cur_subpath_start, in_neighb) { + // We've exited the current subgraph, start a new subpath + cur_subpath_start = Some(idx); + } + + // Track the current bp position in the path. + subpath_length += self.old.get_handle_seg(*step).len(); + } + } + + /// Identify all the subpaths in a path from the original graph that cross through + /// segments in this subgraph and add them. + fn find_subpaths(&mut self, path: &Path) { + let mut cur_subpath_start: Option = None; + let mut path_pos = 0; + + for step in &self.old.steps[path.steps] { + let in_neighb = self.seg_map.contains_key(&step.segment()); + + if let (Some(start), false) = (&cur_subpath_start, in_neighb) { + // End the current subpath. + self.include_subpath(path, start, path_pos); + cur_subpath_start = None; + } else if let (None, true) = (&cur_subpath_start, in_neighb) { + // Start a new subpath. + cur_subpath_start = Some(SubpathStart { + step: self.store.steps.next_id(), + pos: path_pos, + }); + } + + // Add the (translated) step to the new graph. + if in_neighb { + self.store.add_step(self.tr_handle(*step)); + } + + // Track the current bp position in the path. + path_pos += self.old.get_handle_seg(*step).len(); + } + + // Did we reach the end of the path while still in the neighborhood? + if let Some(start) = cur_subpath_start { + self.include_subpath(path, &start, path_pos); + } + } + + /// Translate a handle from the source graph to this subgraph. + fn tr_handle(&self, old_handle: Handle) -> Handle { + // TODO: is this just generating the handle or should we add it to the new graph? + Handle::new(self.seg_map[&old_handle.segment()], old_handle.orient()) + } + + /// Check whether a segment from the old graph is in the subgraph. + fn contains(&self, old_seg_id: Id) -> bool { + self.seg_map.contains_key(&old_seg_id) + } + + /// Extract a subgraph consisting of a neighborhood of segments up to `dist` links away + /// from the given segment in the original graph. + /// + /// Include any links between the segments in the neighborhood and subpaths crossing + /// through the neighborhood. + fn extract(&mut self, origin: Id, dist: usize, max_distance_subpaths: usize, num_iterations: usize) { + self.include_seg(origin); + + // Find the set of all segments that are c links away. + let mut frontier: Vec> = Vec::new(); + let mut next_frontier: Vec> = Vec::new(); + frontier.push(origin); + for _ in 0..dist { + while let Some(seg_id) = frontier.pop() { + for link in self.old.links.all().iter() { + if let Some(other_seg) = link.incident_seg(seg_id) { + // Add other_seg to the frontier set if it is not already in the frontier set or the seg_map + if !self.seg_map.contains_key(&other_seg) { + self.include_seg(other_seg); + next_frontier.push(other_seg); + } + } + } + } + (frontier, next_frontier) = (next_frontier, frontier); + } + + // Merge subpaths within max_distance_subpaths bp of each other, num_iterations times + for _ in 0..num_iterations { + for path in self.old.paths.all().iter() { + self.merge_subpaths(path, max_distance_subpaths); + } + } + + // Include all links within the subgraph. + for link in self.old.links.all().iter() { + if self.contains(link.from.segment()) && self.contains(link.to.segment()) { + self.include_link(link); + } + } + + // Find subpaths within the subgraph. + for path in self.old.paths.all().iter() { + self.find_subpaths(path); + } + } +} \ No newline at end of file diff --git a/flatgfa/src/commands/mod.rs b/flatgfa/src/commands/mod.rs new file mode 100644 index 00000000..b4801f6b --- /dev/null +++ b/flatgfa/src/commands/mod.rs @@ -0,0 +1,4 @@ +pub mod basic_cmds; +pub mod chop; +pub mod depth; +pub mod extract; \ No newline at end of file diff --git a/flatgfa/src/file.rs b/flatgfa/src/fgfa_ds/file.rs similarity index 92% rename from flatgfa/src/file.rs rename to flatgfa/src/fgfa_ds/file.rs index 77bf7bf0..a0a55515 100644 --- a/flatgfa/src/file.rs +++ b/flatgfa/src/fgfa_ds/file.rs @@ -1,5 +1,5 @@ -use crate::flatgfa; -use crate::pool::{FixedStore, Pool, Span, Store}; +use super::pool::{FixedStore, Pool, Span, Store}; +use super::flatgfa::{AlignOp, FlatGFA, FixedGFAStore, Handle, Link, Path, Segment}; use memmap::{Mmap, MmapMut}; use std::mem::{size_of, size_of_val}; use tinyvec::SliceVec; @@ -65,20 +65,20 @@ impl Toc { pub fn size(&self) -> usize { size_of::() + self.header.bytes::() - + self.segs.bytes::() - + self.paths.bytes::() - + self.links.bytes::() - + self.steps.bytes::() + + self.segs.bytes::() + + self.paths.bytes::() + + self.links.bytes::() + + self.steps.bytes::() + self.seq_data.bytes::() - + self.overlaps.bytes::>() - + self.alignment.bytes::() + + self.overlaps.bytes::>() + + self.alignment.bytes::() + self.name_data.bytes::() + self.optional_data.bytes::() + self.line_order.bytes::() } /// Get a table of contents that fits a FlatGFA with no spare space. - fn full(gfa: &flatgfa::FlatGFA) -> Self { + fn full(gfa: &FlatGFA) -> Self { Self { magic: MAGIC_NUMBER, header: Size::of_pool(gfa.header), @@ -95,7 +95,7 @@ impl Toc { } } - pub fn for_fixed_store(store: &flatgfa::FixedGFAStore) -> Self { + pub fn for_fixed_store(store: &FixedGFAStore) -> Self { Self { magic: MAGIC_NUMBER, header: Size::of_store(&store.header), @@ -183,7 +183,7 @@ fn read_toc_mut(data: &mut [u8]) -> (&mut Toc, &mut [u8]) { } /// Get a FlatGFA backed by the data in a byte buffer. -pub fn view(data: &[u8]) -> flatgfa::FlatGFA { +pub fn view(data: &[u8]) -> FlatGFA { let (toc, rest) = read_toc(data); let (header, rest) = slice_prefix(rest, toc.header); @@ -198,7 +198,7 @@ pub fn view(data: &[u8]) -> flatgfa::FlatGFA { let (optional_data, rest) = slice_prefix(rest, toc.optional_data); let (line_order, _) = slice_prefix(rest, toc.line_order); - flatgfa::FlatGFA { + FlatGFA { header: header.into(), segs: segs.into(), paths: paths.into(), @@ -224,7 +224,7 @@ fn slice_vec_prefix( } /// Get a FlatGFA `SliceStore` from the suffix of a file just following the table of contents. -fn slice_store<'a>(data: &'a mut [u8], toc: &Toc) -> flatgfa::FixedGFAStore<'a> { +fn slice_store<'a>(data: &'a mut [u8], toc: &Toc) -> FixedGFAStore<'a> { let (header, rest) = slice_vec_prefix(data, toc.header); let (segs, rest) = slice_vec_prefix(rest, toc.segs); let (paths, rest) = slice_vec_prefix(rest, toc.paths); @@ -237,7 +237,7 @@ fn slice_store<'a>(data: &'a mut [u8], toc: &Toc) -> flatgfa::FixedGFAStore<'a> let (optional_data, rest) = slice_vec_prefix(rest, toc.optional_data); let (line_order, _) = slice_vec_prefix(rest, toc.line_order); - flatgfa::FixedGFAStore { + FixedGFAStore { header: header.into(), segs: segs.into(), paths: paths.into(), @@ -253,13 +253,13 @@ fn slice_store<'a>(data: &'a mut [u8], toc: &Toc) -> flatgfa::FixedGFAStore<'a> } /// Get a mutable FlatGFA `SliceStore` backed by a byte buffer. -pub fn view_store(data: &mut [u8]) -> flatgfa::FixedGFAStore { +pub fn view_store(data: &mut [u8]) -> FixedGFAStore { let (toc, rest) = read_toc_mut(data); slice_store(rest, toc) } /// Initialize a buffer with an empty FlatGFA store. -pub fn init(data: &mut [u8], toc: Toc) -> (&mut Toc, flatgfa::FixedGFAStore) { +pub fn init(data: &mut [u8], toc: Toc) -> (&mut Toc, FixedGFAStore) { // Write the table of contents. assert!(data.len() == toc.size()); toc.write_to_prefix(data).unwrap(); @@ -285,7 +285,7 @@ fn write_bytes<'a>(buf: &'a mut [u8], data: &[u8]) -> Option<&'a mut [u8]> { } /// Copy a FlatGFA into a byte buffer. -pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) { +pub fn dump(gfa: &FlatGFA, buf: &mut [u8]) { // Table of contents. let toc = Toc::full(gfa); let rest = write_bump(buf, &toc).unwrap(); @@ -306,7 +306,7 @@ pub fn dump(gfa: &flatgfa::FlatGFA, buf: &mut [u8]) { /// Get the total size in bytes of a FlatGFA structure. This should result in a big /// enough buffer to write the entire FlatGFA into with `dump`. -pub fn size(gfa: &flatgfa::FlatGFA) -> usize { +pub fn size(gfa: &FlatGFA) -> usize { Toc::full(gfa).size() } diff --git a/flatgfa/src/flatgfa.rs b/flatgfa/src/fgfa_ds/flatgfa.rs similarity index 99% rename from flatgfa/src/flatgfa.rs rename to flatgfa/src/fgfa_ds/flatgfa.rs index a7f0e5dd..9a9e53d4 100644 --- a/flatgfa/src/flatgfa.rs +++ b/flatgfa/src/fgfa_ds/flatgfa.rs @@ -1,6 +1,6 @@ use std::str::FromStr; -use crate::pool::{self, Id, Pool, Span, Store}; +use super::pool::{self, Id, Pool, Span, Store}; use bstr::BStr; use num_enum::{IntoPrimitive, TryFromPrimitive}; use zerocopy::{AsBytes, FromBytes, FromZeroes}; @@ -287,8 +287,8 @@ impl<'a> FlatGFA<'a> { /// Look up a CIGAR alignment. pub fn get_alignment(&self, overlap: Span) -> Alignment { - Alignment { - ops: &self.alignment[overlap] + Alignment { + ops: &self.alignment[overlap], } } diff --git a/flatgfa/src/gfaline.rs b/flatgfa/src/fgfa_ds/gfaline.rs similarity index 96% rename from flatgfa/src/gfaline.rs rename to flatgfa/src/fgfa_ds/gfaline.rs index 36408d42..87178bbf 100644 --- a/flatgfa/src/gfaline.rs +++ b/flatgfa/src/fgfa_ds/gfaline.rs @@ -1,4 +1,4 @@ -use crate::flatgfa::{AlignOp, Orientation}; +use super::flatgfa::{AlignOp, AlignOpcode, Orientation}; use atoi::FromRadix10; type ParseResult = Result; @@ -174,10 +174,10 @@ fn parse_orient(line: &[u8]) -> PartialParseResult { fn parse_align_op(s: &[u8]) -> PartialParseResult { let (len, rest) = parse_num::(s)?; let op = match rest[0] { - b'M' => crate::flatgfa::AlignOpcode::Match, - b'N' => crate::flatgfa::AlignOpcode::Gap, - b'D' => crate::flatgfa::AlignOpcode::Deletion, - b'I' => crate::flatgfa::AlignOpcode::Insertion, + b'M' => AlignOpcode::Match, + b'N' => AlignOpcode::Gap, + b'D' => AlignOpcode::Deletion, + b'I' => AlignOpcode::Insertion, _ => return Err("expected align op"), }; Ok((AlignOp::new(op, len), &rest[1..])) diff --git a/flatgfa/src/fgfa_ds/mod.rs b/flatgfa/src/fgfa_ds/mod.rs new file mode 100644 index 00000000..32fd106b --- /dev/null +++ b/flatgfa/src/fgfa_ds/mod.rs @@ -0,0 +1,7 @@ +pub mod file; +pub mod flatgfa; +pub mod gfaline; +pub mod parse; +pub mod pool; +pub mod print; + diff --git a/flatgfa/src/parse.rs b/flatgfa/src/fgfa_ds/parse.rs similarity index 97% rename from flatgfa/src/parse.rs rename to flatgfa/src/fgfa_ds/parse.rs index 7b3ac373..e0eebf9e 100644 --- a/flatgfa/src/parse.rs +++ b/flatgfa/src/fgfa_ds/parse.rs @@ -1,5 +1,6 @@ -use crate::flatgfa::{self, Handle, LineKind, Orientation}; -use crate::gfaline; +use super::file::Toc; +use super::flatgfa::{self, Handle, LineKind, Orientation}; +use super::gfaline; use std::collections::HashMap; use std::io::BufRead; @@ -209,7 +210,7 @@ impl NameMap { /// Scan a GFA text file to count the number of each type of line and measure some sizes /// that are useful in estimating the final size of the FlatGFA file. -pub fn estimate_toc(buf: &[u8]) -> crate::file::Toc { +pub fn estimate_toc(buf: &[u8]) -> Toc { let mut segs = 0; let mut links = 0; let mut paths = 0; @@ -248,7 +249,7 @@ pub fn estimate_toc(buf: &[u8]) -> crate::file::Toc { rest = &rest[next + 1..]; } - crate::file::Toc::estimate(segs, links, paths, header_bytes, seg_bytes, path_bytes) + Toc::estimate(segs, links, paths, header_bytes, seg_bytes, path_bytes) } struct MemchrSplit<'a> { diff --git a/flatgfa/src/pool.rs b/flatgfa/src/fgfa_ds/pool.rs similarity index 98% rename from flatgfa/src/pool.rs rename to flatgfa/src/fgfa_ds/pool.rs index 2872388a..6055c166 100644 --- a/flatgfa/src/pool.rs +++ b/flatgfa/src/fgfa_ds/pool.rs @@ -134,10 +134,10 @@ pub trait Store { /// Get the number of items in the pool. fn len(&self) -> usize; - /// Check whether the pool is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } + // /// Check whether the pool is empty. + // fn is_empty(&self) -> bool { + // self.len() == 0 + // } /// Get the next available ID. fn next_id(&self) -> Id { diff --git a/flatgfa/src/print.rs b/flatgfa/src/fgfa_ds/print.rs similarity index 99% rename from flatgfa/src/print.rs rename to flatgfa/src/fgfa_ds/print.rs index b6d28502..09532389 100644 --- a/flatgfa/src/print.rs +++ b/flatgfa/src/fgfa_ds/print.rs @@ -1,4 +1,4 @@ -use crate::flatgfa; +use super::flatgfa; use std::fmt; impl fmt::Display for flatgfa::Orientation { diff --git a/flatgfa/src/lib.rs b/flatgfa/src/lib.rs index d6ec729e..b0f1f4bc 100644 --- a/flatgfa/src/lib.rs +++ b/flatgfa/src/lib.rs @@ -1,9 +1,4 @@ -pub mod cmds; -pub mod file; -pub mod flatgfa; -pub mod gfaline; -pub mod parse; -pub mod pool; -pub mod print; +pub mod fgfa_ds; -pub use flatgfa::*; +pub use fgfa_ds::*; +pub use fgfa_ds::flatgfa::*; \ No newline at end of file diff --git a/flatgfa/src/main.rs b/flatgfa/src/main.rs index 376404c4..3541b49a 100644 --- a/flatgfa/src/main.rs +++ b/flatgfa/src/main.rs @@ -1,8 +1,17 @@ use argh::FromArgs; -use flatgfa::flatgfa::FlatGFA; -use flatgfa::parse::Parser; -use flatgfa::pool::Store; -use flatgfa::{cmds, file, parse}; // TODO: hopefully remove at some point, this breaks a lot of principles + +mod fgfa_ds; +use fgfa_ds::flatgfa::FlatGFA; +use fgfa_ds::parse::Parser; +use fgfa_ds::pool::Store; +use fgfa_ds::{file, parse}; // TODO: hopefully remove at some point, this breaks a lot of principles + +mod commands; +use commands::basic_cmds::{Toc, Paths, Stats, Position}; +use commands::{chop::Chop, depth::Depth, extract::Extract}; + +use commands::basic_cmds::{toc, paths, stats, position}; +use commands::{chop::chop, depth::depth, extract::extract}; #[derive(FromArgs)] /// Convert between GFA text and FlatGFA binary formats. @@ -34,13 +43,13 @@ struct PolBin { #[derive(FromArgs, PartialEq, Debug)] #[argh(subcommand)] enum Command { - Toc(cmds::Toc), - Paths(cmds::Paths), - Stats(cmds::Stats), - Position(cmds::Position), - Extract(cmds::Extract), - Depth(cmds::Depth), - Chop(cmds::Chop), + Toc(Toc), + Paths(Paths), + Stats(Stats), + Position(Position), + Extract(Extract), + Depth(Depth), + Chop(Chop), } fn main() -> Result<(), &'static str> { @@ -88,31 +97,31 @@ fn main() -> Result<(), &'static str> { match args.command { Some(Command::Toc(_)) => { - cmds::toc(&gfa); + toc(&gfa); } Some(Command::Paths(_)) => { - cmds::paths(&gfa); + paths(&gfa); } Some(Command::Stats(sub_args)) => { - cmds::stats(&gfa, sub_args); + stats(&gfa, sub_args); } Some(Command::Position(sub_args)) => { - cmds::position(&gfa, sub_args)?; + position(&gfa, sub_args)?; } Some(Command::Extract(sub_args)) => { - let store = cmds::extract(&gfa, sub_args)?; + let store = extract(&gfa, sub_args)?; dump(&store.as_ref(), &args.output); } Some(Command::Depth(_)) => { - cmds::depth(&gfa); + depth(&gfa); } Some(Command::Chop(sub_args)) => { - let store = cmds::chop(&gfa, sub_args)?; + let store = chop(&gfa, sub_args)?; // TODO: Ideally, find a way to encapsulate the logic of chop in `cmd.rs`, instead of // defining here which values from out input `gfa` are needed by our final `flat` gfa. // Here we are reference values in two different Stores to create this Flatgfa, and // have not yet found a good rust-safe way to do this - let flat = flatgfa::FlatGFA { + let flat = FlatGFA { header: gfa.header, seq_data: gfa.seq_data, name_data: gfa.name_data,