Skip to content

Commit

Permalink
Stop using odgi's Python bindings in calyx_depth.py (#116)
Browse files Browse the repository at this point in the history
Co-authored-by: susan-garry <[email protected]>
  • Loading branch information
anshumanmohan and susan-garry authored Nov 17, 2023
1 parent 43b24bc commit 3ebf978
Show file tree
Hide file tree
Showing 23 changed files with 199 additions and 9,110 deletions.
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ fetch: $(TEST_FILES:%=tests/%.gfa)

og: $(OG_FILES)

test: og test-depth
test: fetch test-depth

test-depth: og
test-depth: fetch og
-turnt --save --env baseline tests/depth/subset-paths/*.txt
turnt tests/depth/subset-paths/*.txt
turnt --env calyx-depth tests/depth/subset-paths/*.txt

-turnt --save --env baseline $(DEPTH_OG_FILES)
turnt $(DEPTH_OG_FILES)
turnt --env calyx $(DEPTH_OG_FILES)


test-data-gen: og
Expand Down
8 changes: 5 additions & 3 deletions mygfa/mygfa/mygfa.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import re
import sys

from collections import OrderedDict
from dataclasses import dataclass
from typing import List, Tuple, Optional, Dict, TextIO, Iterator
from enum import Enum
import re
from typing import List, Tuple, Optional, Dict, TextIO, Iterator


def parse_orientation(ori: str) -> bool:
Expand Down Expand Up @@ -274,7 +276,7 @@ class Graph:
@classmethod
def parse(cls, infile: TextIO) -> "Graph":
"""Parse a GFA file."""
graph = Graph([], {}, [], {})
graph = Graph([], {}, [], OrderedDict())

for line in nonblanks(infile):
fields = line.split()
Expand Down
6 changes: 5 additions & 1 deletion mygfa/mygfa/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ def pathseq(graph: mygfa.Graph) -> Dict[str, str]:


def get_maxes(graph: mygfa.Graph) -> Tuple[int, int, int]:
"""Return the maximum number of nodes, steps, and paths in the graph."""
"""Given a graph, returns:
- the number of nodes
- the maximum number of steps that cross any single node
- the number of paths in the graph.
"""
max_nodes = len(graph.segments)
max_steps = max([len(steps) for steps in node_steps(graph).values()])
max_paths = len(graph.paths)
Expand Down
54 changes: 31 additions & 23 deletions pollen_data_gen/pollen_data_gen/__main__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import sys
from typing import List, Optional

from mygfa import mygfa

from . import depth, simple

Expand All @@ -16,26 +17,23 @@ def parse_args() -> tuple[argparse.ArgumentParser, argparse.Namespace]:
simple_parser = subparsers.add_parser(
"simple", help="Produces a simple JSON serialization of the graph."
)
# Optional arguments - argparse automatically infers flags beginning with '-' as optional
simple_parser.add_argument(
"-n",
nargs="?",
const="d",
help="The max number of nodes.",
required=False,
)
simple_parser.add_argument(
"-e",
nargs="?",
const="d",
help="The max number of steps per node.",
required=False,
)
simple_parser.add_argument(
"-p",
nargs="?",
const="d",
help="The max number of paths.",
required=False,
)
simple_parser.add_argument(
"-s",
"--subset-paths",
help="A file where each line is a path of the graph to consider when calculating node depth",
)

_ = subparsers.add_parser(
Expand All @@ -48,46 +46,56 @@ def parse_args() -> tuple[argparse.ArgumentParser, argparse.Namespace]:
)
depth_parser.add_argument(
"-n",
nargs="?",
const="d",
help="The max number of nodes.",
required=False,
)
depth_parser.add_argument(
"-e",
nargs="?",
const="d",
help="The max number of steps per node.",
required=False,
)
depth_parser.add_argument(
"-p",
nargs="?",
const="d",
help="The max number of paths.",
required=False,
)
depth_parser.add_argument(
"-s",
"--subset-paths",
help="A file where each line is a path of the graph to consider when calculating node depth",
)

# Add the graph argument to all subparsers.
# Doing it this way means that the graph argument is sought _after_ the
# command name.
for subparser in subparsers.choices.values():
subparser.add_argument(
"graph", nargs="?", help="Input GFA file", metavar="GRAPH"
)
subparser.add_argument("graph", help="Input GFA file", metavar="GRAPH")

args = parser.parse_args()

return parser, args


def parse_subset_paths(filename: Optional[str]) -> List[str]:
    """Return the path names listed in the file `filename`, one per line.

    Args:
        filename: A text file containing one path name per line, or None
            when no subset file was given on the command line.

    Returns:
        The path names in file order. Leading/trailing whitespace is
        stripped and blank lines are skipped. Returns the default, an
        empty list, when `filename` is None.

    Raises:
        OSError: If `filename` cannot be opened for reading.
    """
    if filename is None:  # Return the default value
        return []

    with open(filename, "r", encoding="utf-8") as paths_file:
        # A trailing newline or stray blank line must not become an empty
        # "path name"; that would fail later when names are looked up.
        stripped = (line.strip() for line in paths_file)
        return [name for name in stripped if name]


def dispatch(args: argparse.Namespace) -> None:
"""Parse the graph from filename,
then dispatch to the appropriate pollen_data_gen command.
"""
subset_paths = parse_subset_paths(args.subset_paths)
name_to_func = {
"depth": lambda g: depth.depth_stdout(g, args.n, args.e, args.p),
"simple": lambda g: simple.dump(g, sys.stdout, args.n, args.e, args.p),
"depth": lambda g: depth.depth_stdout(g, args.n, args.e, args.p, subset_paths),
"simple": lambda g: simple.dump(
g, sys.stdout, args.n, args.e, args.p, subset_paths
),
"roundtrip": simple.roundtrip_test,
}
graph = mygfa.Graph.parse(open(args.graph, "r", encoding="utf-8"))
Expand Down
49 changes: 40 additions & 9 deletions pollen_data_gen/pollen_data_gen/depth.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import sys
from typing import Any, Collection, Dict, Union, Optional
from typing import Any, Collection, Dict, OrderedDict, Union, Optional, List
import json
from json import JSONEncoder
from mygfa import mygfa, preprocess
Expand All @@ -23,6 +23,7 @@ def paths_viewed_from_nodes(
path2id = {path: id for id, path in enumerate(graph.paths, start=1)}
output = {}
json_format = format_gen(max_p.bit_length())
# segment name, (path name, index on path, direction) list
for seg, crossings in preprocess.node_steps(graph).items():
data = list(path2id[c[0]] for c in crossings)
data = data + [0] * (max_e - len(data))
Expand All @@ -33,7 +34,9 @@ def paths_viewed_from_nodes(
return output


def paths_to_consider(max_n: int, max_p: int) -> OutputType:
def paths_to_consider(
subset_paths_idx: List[int], max_n: int, max_p: int
) -> OutputType:
"""Build a bitvector of length MAX_PATHS + 1, where the i'th index is 1 if
the i'th path is to be considered during depth calculation (index 0 is
always 0, since path ids start at 1). If `subset_paths_idx` is empty,
every path is considered.
Expand All @@ -42,8 +45,15 @@ def paths_to_consider(max_n: int, max_p: int) -> OutputType:
are nodes in the graph.
"""
output = {}
data = []
if subset_paths_idx:
data = [0] * (max_p + 1)
for path_idx in subset_paths_idx:
data[path_idx] = 1
else:
data = [0] + ([1] * max_p)

for i in range(1, max_n + 1):
data = [0] + [1] * (max_p)
output[f"paths_to_consider{i}"] = {"data": data, "format": format_gen(1)}
return output

Expand All @@ -54,11 +64,25 @@ class NodeDepthEncoder(JSONEncoder):
The exine command `depth` is the oracle for this encoding.
"""

def __init__(self, max_n: int, max_e: int, max_p: int, **kwargs: Any) -> None:
def __init__(
self,
max_n: int,
max_e: int,
max_p: int,
subset_paths: Optional[List[str]],
**kwargs: Any,
) -> None:
super(NodeDepthEncoder, self).__init__(**kwargs)
self.max_n = max_n
self.max_e = max_e
self.max_p = max_p
self.subset_paths = subset_paths

def paths_to_idxs(self, o: mygfa.Graph) -> List[int]:
    """Translate the names in `self.subset_paths` into path ids.

    Ids are assigned by enumerating `o.paths` starting from 1.

    Args:
        o: The graph whose paths define the name-to-id mapping.

    Returns:
        One id per entry of `self.subset_paths`, in the same order, or an
        empty list when `self.subset_paths` is None or empty.

    Raises:
        KeyError: If a requested path name is not a path of `o`.
    """
    if not self.subset_paths:
        return []

    path2id = {path: idx for idx, path in enumerate(o.paths, start=1)}

    # Report every unknown name at once instead of failing on the first
    # lookup with a bare, context-free KeyError.
    unknown = [name for name in self.subset_paths if name not in path2id]
    if unknown:
        raise KeyError(f"subset paths not found in graph: {', '.join(unknown)}")

    return [path2id[name] for name in self.subset_paths]

def default(self, o: Any) -> Dict[str, Dict[str, Collection[object]]]:
answer_field = {
Expand All @@ -73,15 +97,20 @@ def default(self, o: Any) -> Dict[str, Dict[str, Collection[object]]]:
"format": format_gen(self.max_p.bit_length()),
}
}
subset_paths_idx = self.paths_to_idxs(o)
paths = paths_viewed_from_nodes(
o, self.max_n, self.max_e, self.max_p
) | paths_to_consider(self.max_n, self.max_p)
) | paths_to_consider(subset_paths_idx, self.max_n, self.max_p)

return answer_field | paths | answer_field_uniq


def depth_json(
graph: mygfa.Graph, max_n: Optional[int], max_e: Optional[int], max_p: Optional[int]
graph: mygfa.Graph,
max_n: Optional[int],
max_e: Optional[int],
max_p: Optional[int],
subset_paths: Optional[List[str]],
) -> str:
"""Returns a JSON representation of `graph`
that is specific to the exine command `depth`.
Expand All @@ -97,13 +126,15 @@ def depth_json(
max_p = p_tight

return NodeDepthEncoder(
max_n=int(max_n), max_e=int(max_e), max_p=int(max_p)
max_n=int(max_n), max_e=int(max_e), max_p=int(max_p), subset_paths=subset_paths
).encode(graph)


def depth_stdout(graph: mygfa.Graph, max_n: int, max_e: int, max_p: int) -> None:
def depth_stdout(
graph: mygfa.Graph, max_n: int, max_e: int, max_p: int, subset_paths: List[str]
) -> None:
"""Prints a JSON representation of `graph` to stdout."""
encoding = depth_json(graph, max_n, max_e, max_p)
encoding = depth_json(graph, max_n, max_e, max_p, subset_paths)

json.dump(
json.loads(encoding),
Expand Down
3 changes: 2 additions & 1 deletion pollen_data_gen/pollen_data_gen/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def dump(
max_n: Optional[int],
max_e: Optional[int],
max_p: Optional[int],
subset_paths: Optional[List[str]] = None,
) -> None:
"""Outputs the graph as a JSON, along with precomputed data for the
calculation of node depth.
Expand All @@ -140,7 +141,7 @@ def dump(
| {f"path_details_{k}": v for k, v in graph.paths.items()}
)

depth_encoding = depth.depth_json(graph, max_n, max_e, max_p)
depth_encoding = depth.depth_json(graph, max_n, max_e, max_p, subset_paths)

json.dump(
{
Expand Down
34 changes: 33 additions & 1 deletion pollen_py/pollen/depth/calyx_depth.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@
from calyx.py_ast import *
from . import parse_data

# from mygfa import mygfa, preprocess

# Defaults for the maximum possible number of nodes,
# steps per node, and paths to consider
MAX_NODES = 16
MAX_STEPS = 15
MAX_PATHS = 15


def node_depth(max_nodes, max_steps, max_paths):
stdlib = Stdlib()
Expand Down Expand Up @@ -545,9 +553,33 @@ def config_parser(parser):
)


# def get_maxes(filename):
# print("In `get_maxes`. Filename: ", filename)
# """Returns the maximum number of nodes, steps per node, and paths."""
# with open(filename, "r", encoding="utf-8") as infile:
# graph = mygfa.Graph.parse(infile)
# return preprocess.get_maxes(graph)


# def get_dimensions(args):
# """
# Compute the node depth accelerator's dimensions from commandline input.
# """
# if args.auto_size:
# filename = args.filename if args.auto_size == "d" else args.auto_size
# max_nodes, max_steps, max_paths = get_maxes(filename)
# else:
# max_nodes, max_steps, max_paths = MAX_NODES, MAX_STEPS, MAX_PATHS

# max_nodes = args.max_nodes if args.max_nodes else max_nodes
# max_steps = args.max_steps if args.max_steps else max_steps
# max_paths = args.max_paths if args.max_paths else max_paths

# return max_nodes, max_steps, max_paths


def run(args):
max_nodes, max_steps, max_paths = parse_data.get_dimensions(args)

program = node_depth(max_nodes, max_steps, max_paths)
output = program.doc()

Expand Down
Loading

0 comments on commit 3ebf978

Please sign in to comment.