Reorganize GFA operations into ops and cli modules (#198)

cucapra · Dec 28, 2024 · 30868ef · 30868ef
2 parents 47b67b9 + 3360c4f
commit 30868ef
Show file tree

Hide file tree

Showing 12 changed files with 633 additions and 597 deletions.
diff --git a/flatgfa/Cargo.toml b/flatgfa/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"
 
 [[bin]]
 name = "fgfa"
-path = "src/main.rs"
+path = "src/cli/main.rs"
 
 [dependencies]
 argh = "0.1.12"

diff --git a/flatgfa/src/cli/cmds.rs b/flatgfa/src/cli/cmds.rs
@@ -0,0 +1,231 @@
+use crate::flatgfa::{self, Segment};
+use crate::ops;
+use crate::pool::Id;
+use argh::FromArgs;
+use std::collections::HashMap;
+
+/// print the FlatGFA table of contents
+#[derive(FromArgs, PartialEq, Debug)]
+#[argh(subcommand, name = "toc")]
+pub struct Toc {}
+
+pub fn toc(gfa: &flatgfa::FlatGFA) {
+    eprintln!("header: {}", gfa.header.len());
+    eprintln!("segs: {}", gfa.segs.len());
+    eprintln!("paths: {}", gfa.paths.len());
+    eprintln!("links: {}", gfa.links.len());
+    eprintln!("steps: {}", gfa.steps.len());
+    eprintln!("seq_data: {}", gfa.seq_data.len());
+    eprintln!("overlaps: {}", gfa.overlaps.len());
+    eprintln!("alignment: {}", gfa.alignment.len());
+    eprintln!("name_data: {}", gfa.name_data.len());
+    eprintln!("optional_data: {}", gfa.optional_data.len());
+    eprintln!("line_order: {}", gfa.line_order.len());
+}
+
+/// list the paths
+#[derive(FromArgs, PartialEq, Debug)]
+#[argh(subcommand, name = "paths")]
+pub struct Paths {}
+
+pub fn paths(gfa: &flatgfa::FlatGFA) {
+    for path in gfa.paths.all().iter() {
+        println!("{}", gfa.get_path_name(path));
+    }
+}
+
+/// calculate graph statistics
+#[derive(FromArgs, PartialEq, Debug)]
+#[argh(subcommand, name = "stats")]
+pub struct Stats {
+    /// show basic metrics
+    #[argh(switch, short = 'S')]
+    summarize: bool,
+
+    /// number of segments with at least one self-loop link
+    #[argh(switch, short = 'L')]
+    self_loops: bool,
+}
+
+pub fn stats(gfa: &flatgfa::FlatGFA, args: Stats) {
+    if args.summarize {
+        println!("#length\tnodes\tedges\tpaths\tsteps");
+        println!(
+            "{}\t{}\t{}\t{}\t{}",
+            gfa.seq_data.len(),
+            gfa.segs.len(),
+            gfa.links.len(),
+            gfa.paths.len(),
+            gfa.steps.len()
+        );
+    } else if args.self_loops {
+        let mut counts: HashMap<Id<Segment>, usize> = HashMap::new();
+        let mut total: usize = 0;
+        for link in gfa.links.all().iter() {
+            if link.from.segment() == link.to.segment() {
+                let count = counts.entry(link.from.segment()).or_insert(0);
+                *count += 1;
+                total += 1;
+            }
+        }
+        println!("#type\tnum");
+        println!("total\t{}", total);
+        println!("unique\t{}", counts.len());
+    }
+}
+
+/// find a nucleotide position within a path
+#[derive(FromArgs, PartialEq, Debug)]
+#[argh(subcommand, name = "position")]
+pub struct Position {
+    /// path_name,offset,orientation
+    #[argh(option, short = 'p')]
+    path_pos: String,
+}
+
+pub fn position(gfa: &flatgfa::FlatGFA, args: Position) -> Result<(), &'static str> {
+    // Parse the position triple, which looks like `path,42,+`.
+    let (path_name, offset, orientation) = {
+        let parts: Vec<_> = args.path_pos.split(',').collect();
+        if parts.len() != 3 {
+            return Err("position must be path_name,offset,orientation");
+        }
+        let off: usize = parts[1].parse().or(Err("offset must be a number"))?;
+        let ori: flatgfa::Orientation = parts[2].parse().or(Err("orientation must be + or -"))?;
+        (parts[0], off, ori)
+    };
+
+    let path_id = gfa.find_path(path_name.into()).ok_or("path not found")?;
+    let path = &gfa.paths[path_id];
+    assert_eq!(
+        orientation,
+        flatgfa::Orientation::Forward,
+        "only + is implemented so far"
+    );
+
+    // Print the match.
+    let found = ops::position::position(gfa, path, offset);
+    if let Some((handle, seg_off)) = found {
+        let seg = gfa.get_handle_seg(handle);
+        let seg_name = seg.name;
+        println!("#source.path.pos\ttarget.graph.pos");
+        println!(
+            "{},{},{}\t{},{},{}",
+            path_name,
+            offset,
+            orientation,
+            seg_name,
+            seg_off,
+            handle.orient()
+        );
+    }
+
+    Ok(())
+}
+
+/// benchmarks
+#[derive(FromArgs, PartialEq, Debug)]
+#[argh(subcommand, name = "bench")]
+pub struct Bench {
+    /// count lines in a text file
+    #[argh(option)]
+    wcl: Option<String>,
+
+    /// enable parallelism when available
+    #[argh(switch, short = 'p')]
+    parallel: bool,
+}
+
+pub fn bench(args: Bench) {
+    // TODO: We don't need a GFA for (some of) these? So avoid opening it.
+    if let Some(filename) = args.wcl {
+        println!("{}", ops::bench::line_count(&filename, args.parallel));
+    }
+}
+
+/// create a subset graph
+#[derive(FromArgs, PartialEq, Debug)]
+#[argh(subcommand, name = "extract")]
+pub struct Extract {
+    /// segment to extract around
+    #[argh(option, short = 'n')]
+    seg_name: usize,
+
+    /// number of edges "away" from the node to include
+    #[argh(option, short = 'c')]
+    link_distance: usize,
+
+    /// maximum number of basepairs allowed between subpaths s.t. the subpaths are merged together
+    #[argh(
+        option,
+        short = 'd',
+        long = "max-distance-subpaths",
+        default = "300000"
+    )]
+    max_distance_subpaths: usize, // TODO: possibly make this bigger
+
+    /// maximum number of iterations before we stop merging subpaths
+    #[argh(option, short = 'e', long = "max-merging-iterations", default = "6")]
+    num_iterations: usize, // TODO: probably make this smaller
+}
+
+pub fn extract(
+    gfa: &flatgfa::FlatGFA,
+    args: Extract,
+) -> Result<flatgfa::HeapGFAStore, &'static str> {
+    let origin_seg = gfa.find_seg(args.seg_name).ok_or("segment not found")?;
+
+    let mut subgraph = ops::extract::SubgraphBuilder::new(gfa);
+    subgraph.add_header();
+    subgraph.extract(
+        origin_seg,
+        args.link_distance,
+        args.max_distance_subpaths,
+        args.num_iterations,
+    );
+    Ok(subgraph.store)
+}
+
+/// compute node depth, the number of times paths cross a node
+#[derive(FromArgs, PartialEq, Debug)]
+#[argh(subcommand, name = "depth")]
+pub struct Depth {}
+
+pub fn depth(gfa: &flatgfa::FlatGFA) {
+    let (depths, uniq_paths) = ops::depth::depth(gfa);
+
+    println!("#node.id\tdepth\tdepth.uniq");
+    for (id, seg) in gfa.segs.items() {
+        let name: u32 = seg.name as u32;
+        println!(
+            "{}\t{}\t{}",
+            name,
+            depths[id.index()],
+            uniq_paths[id.index()].len()
+        );
+    }
+}
+
+/// chop the segments in a graph into sizes of N or smaller
+#[derive(FromArgs, PartialEq, Debug)]
+#[argh(subcommand, name = "chop")]
+pub struct Chop {
+    /// maximimum segment size
+    #[argh(option, short = 'c')]
+    count: usize,
+
+    /// compute new links
+    #[argh(switch, short = 'l')]
+    links: bool,
+}
+
+/// Chop a graph into segments of size no larger than c
+/// By default, compact node ids
+/// CIGAR strings, links, and optional Segment data are invalidated by chop
+/// Generates a new graph, rather than modifying the old one in place
+pub fn chop<'a>(
+    gfa: &'a flatgfa::FlatGFA<'a>,
+    args: Chop,
+) -> Result<flatgfa::HeapGFAStore, &'static str> {
+    Ok(ops::chop::chop(gfa, args.count, args.links))
+}
diff --git a/flatgfa/src/main.rs → flatgfa/src/cli/main.rs b/flatgfa/src/main.rs → flatgfa/src/cli/main.rs
@@ -3,7 +3,7 @@ use flatgfa::flatgfa::FlatGFA;
 use flatgfa::gaf;
 use flatgfa::parse::Parser;
 use flatgfa::pool::Store;
-use flatgfa::{cmds, file, memfile, parse}; // TODO: hopefully remove at some point, this breaks a lot of principles
+use flatgfa::{cli::cmds, file, memfile, parse};
 
 #[derive(FromArgs)]
 /// Convert between GFA text and FlatGFA binary formats.

diff --git a/flatgfa/src/cli/mod.rs b/flatgfa/src/cli/mod.rs
@@ -0,0 +1 @@
+pub mod cmds;