From b2a3890ac4da60de4589210f49276526a3c99238 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Sun, 29 Dec 2024 15:09:30 -0500 Subject: [PATCH 1/5] Simplify return type of depth utility --- flatgfa/src/cli/cmds.rs | 2 +- flatgfa/src/ops/depth.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flatgfa/src/cli/cmds.rs b/flatgfa/src/cli/cmds.rs index f0c7592c..a9d1274d 100644 --- a/flatgfa/src/cli/cmds.rs +++ b/flatgfa/src/cli/cmds.rs @@ -204,7 +204,7 @@ pub fn depth(gfa: &flatgfa::FlatGFA) { "{}\t{}\t{}", name, depths[id.index()], - uniq_paths[id.index()].len() + uniq_paths[id.index()], ); } } diff --git a/flatgfa/src/ops/depth.rs b/flatgfa/src/ops/depth.rs index 5307afa7..9f583b5f 100644 --- a/flatgfa/src/ops/depth.rs +++ b/flatgfa/src/ops/depth.rs @@ -1,8 +1,7 @@ use crate::flatgfa; use std::collections::HashSet; -pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec, Vec>) { - // Initialize node depth +pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec, Vec) { let mut depths = vec![0; gfa.segs.len()]; // Initialize uniq_paths let mut uniq_paths = Vec::>::new(); @@ -18,5 +17,6 @@ pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec, Vec>) { } } - (depths, uniq_paths) + let uniq_depths = uniq_paths.iter().map(|set| set.len()).collect(); + (depths, uniq_depths) } From 3bebd190f283c14b52b3401b7c4b0ebf302bbd14 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Sun, 29 Dec 2024 15:18:05 -0500 Subject: [PATCH 2/5] Try using dense vector for depth uniqueness On chr22 (already converted to FlatGFA), this seems like a pretty big performance win: depth goes from 14.7 to 2.2 seconds. --- flatgfa/src/ops/depth.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/flatgfa/src/ops/depth.rs b/flatgfa/src/ops/depth.rs index 9f583b5f..1542d873 100644 --- a/flatgfa/src/ops/depth.rs +++ b/flatgfa/src/ops/depth.rs @@ -1,22 +1,20 @@ use crate::flatgfa; -use std::collections::HashSet; pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec, Vec) { - let mut depths = vec![0; gfa.segs.len()]; - // Initialize uniq_paths - let mut uniq_paths = Vec::>::new(); - uniq_paths.resize(gfa.segs.len(), HashSet::new()); - // do not assume that each handle in `gfa.steps()` is unique - for (idx, path) in gfa.paths.all().iter().enumerate() { + let mut depths = vec![0; gfa.segs.len()]; // The number of paths that traverse each segment. + let mut uniq_depths = vec![0; gfa.segs.len()]; // The number of distinct paths that traverse them. + + for path in gfa.paths.all().iter() { + let mut seen = vec![0; gfa.segs.len()]; // Has this path traversed this segment? for step in &gfa.steps[path.steps] { let seg_id = step.segment().index(); - // Increment depths depths[seg_id] += 1; - // Update uniq_paths - uniq_paths[seg_id].insert(idx); + if seen[seg_id] == 0 { + uniq_depths[seg_id] += 1; + seen[seg_id] = 1; + } } } - let uniq_depths = uniq_paths.iter().map(|set| set.len()).collect(); (depths, uniq_depths) } From 9e3fc846d1223d1f070e80a6a8ce225f4edf981a Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Sun, 29 Dec 2024 15:26:37 -0500 Subject: [PATCH 3/5] Use a proper bit-vector for depth uniqueness Further speeds up chr22 depth computation, from 2.2 to 2.0 seconds. --- flatgfa/Cargo.lock | 7 +++++++ flatgfa/Cargo.toml | 1 + flatgfa/src/ops/depth.rs | 7 ++++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/flatgfa/Cargo.lock b/flatgfa/Cargo.lock index 1a1a4b4c..4416ef80 100644 --- a/flatgfa/Cargo.lock +++ b/flatgfa/Cargo.lock @@ -48,6 +48,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bstr" version = "1.10.0" @@ -108,6 +114,7 @@ version = "0.1.0" dependencies = [ "argh", "atoi", + "bit-vec", "bstr", "memchr", "memmap", diff --git a/flatgfa/Cargo.toml b/flatgfa/Cargo.toml index b46e4528..56cc9151 100644 --- a/flatgfa/Cargo.toml +++ b/flatgfa/Cargo.toml @@ -10,6 +10,7 @@ path = "src/cli/main.rs" [dependencies] argh = "0.1.12" atoi = "2.0.0" +bit-vec = "0.8.0" bstr = "1.10.0" memchr = "2.7.4" memmap = "0.7.0" diff --git a/flatgfa/src/ops/depth.rs b/flatgfa/src/ops/depth.rs index 1542d873..f13ddcde 100644 --- a/flatgfa/src/ops/depth.rs +++ b/flatgfa/src/ops/depth.rs @@ -1,17 +1,18 @@ use crate::flatgfa; +use bit_vec::BitVec; pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec, Vec) { let mut depths = vec![0; gfa.segs.len()]; // The number of paths that traverse each segment. let mut uniq_depths = vec![0; gfa.segs.len()]; // The number of distinct paths that traverse them. for path in gfa.paths.all().iter() { - let mut seen = vec![0; gfa.segs.len()]; // Has this path traversed this segment? + let mut seen = BitVec::from_elem(gfa.segs.len(), false); // Has this path traversed this segment? for step in &gfa.steps[path.steps] { let seg_id = step.segment().index(); depths[seg_id] += 1; - if seen[seg_id] == 0 { + if seen[seg_id] { uniq_depths[seg_id] += 1; - seen[seg_id] = 1; + seen.set(seg_id, true); } } } From dabc0ba4d1ac45c84d1f55b7f509f9f8a5f95d8e Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Sun, 29 Dec 2024 15:27:45 -0500 Subject: [PATCH 4/5] Reuse a single BitVec Seems to have no impact on performance, but it makes me feel better anyway. --- flatgfa/src/ops/depth.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flatgfa/src/ops/depth.rs b/flatgfa/src/ops/depth.rs index f13ddcde..25f91490 100644 --- a/flatgfa/src/ops/depth.rs +++ b/flatgfa/src/ops/depth.rs @@ -5,8 +5,9 @@ pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec, Vec) { let mut depths = vec![0; gfa.segs.len()]; // The number of paths that traverse each segment. let mut uniq_depths = vec![0; gfa.segs.len()]; // The number of distinct paths that traverse them. + let mut seen = BitVec::from_elem(gfa.segs.len(), false); // Has the current path traversed each segment? for path in gfa.paths.all().iter() { - let mut seen = BitVec::from_elem(gfa.segs.len(), false); // Has this path traversed this segment? + seen.clear(); for step in &gfa.steps[path.steps] { let seg_id = step.segment().index(); depths[seg_id] += 1; From f02cfe4564f840349bc4139ad1252ce99daaa121 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Sun, 29 Dec 2024 15:31:06 -0500 Subject: [PATCH 5/5] More comments on depth --- flatgfa/src/ops/depth.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/flatgfa/src/ops/depth.rs b/flatgfa/src/ops/depth.rs index 25f91490..3d5ab66b 100644 --- a/flatgfa/src/ops/depth.rs +++ b/flatgfa/src/ops/depth.rs @@ -1,17 +1,30 @@ use crate::flatgfa; use bit_vec::BitVec; +/// Compute the *depth* of each segment in the variation graph. +/// +/// The depth is defined to be the number of times that a path traverses a given +/// segment. We return two values: the ordinary depth and the *unique* depth, +/// which only counts each path that tarverses a given segment once. +/// +/// Both outputs are depth values indexed by segment ID. pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec, Vec) { - let mut depths = vec![0; gfa.segs.len()]; // The number of paths that traverse each segment. - let mut uniq_depths = vec![0; gfa.segs.len()]; // The number of distinct paths that traverse them. + // Our output vectors: the ordinary and unique depths of each segment. + let mut depths = vec![0; gfa.segs.len()]; + let mut uniq_depths = vec![0; gfa.segs.len()]; + + // This bit vector keeps track of whether the current path has already + // traversed a given segment, and therefore whether we should ignore + // subsequent traversals (for the purpose of counting unique depth). + let mut seen = BitVec::from_elem(gfa.segs.len(), false); - let mut seen = BitVec::from_elem(gfa.segs.len(), false); // Has the current path traversed each segment? for path in gfa.paths.all().iter() { - seen.clear(); + seen.clear(); // All segments are unseen. for step in &gfa.steps[path.steps] { let seg_id = step.segment().index(); depths[seg_id] += 1; if seen[seg_id] { + // The first traversal of this path over this segment. uniq_depths[seg_id] += 1; seen.set(seg_id, true); }