Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds newick output support for the build and search commands in the CLI #22

Merged
merged 6 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,25 @@ Validating a list of TaxIDs against a Tree data structure in ``pickle`` format:
taxonomy-resolver validate -in tree.pickle -taxids testdata/taxids_validate.txt


Load a previously built Tree data structure in ``pickle`` format and search for one or more TaxIDs (for example human, TaxID '9606'). Included, excluded and filter lists can be optionally passed as shown above.

.. code-block:: bash

taxonomy-resolver search -in tree.pickle -taxid "9606" -out testdata/taxids_human.txt

Writing the 'complete' human sub-tree (i.e. all levels of the hierarchy) in ``newick`` format can be done via the build command and passing a TaxID filter file, which can be generated by running the search command shown previously.

.. code-block:: bash

taxonomy-resolver build -in tree.pickle -inf pickle -taxidsf testdata/taxids_human.txt -out testdata/taxids_human.nwk -outf newick

Alternatively, ``newick`` output can be generated for the 'local' human sub-tree (i.e. species and sub-species levels only), directly using the search command, as shown below:

.. code-block:: bash

taxonomy-resolver search -in tree.pickle -taxid "9606" -out testdata/taxids_human_local.nwk -outf newick


Contributing
============

Expand Down
32 changes: 27 additions & 5 deletions taxonomyresolver/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
print_and_exit,
validate_inputs_outputs,
)
from taxonomyresolver.tree import write_tree


# reusing click args and options
Expand Down Expand Up @@ -174,7 +175,7 @@ def download(
default="pickle",
required=False,
multiple=False,
help="Output format (currently: 'pickle').",
help="Output format (currently: 'pickle' or 'newick').",
)
@click.option(
"-taxidsf",
Expand Down Expand Up @@ -289,6 +290,16 @@ def build(
multiple=False,
help="Input format (currently: 'pickle').",
)
@click.option(
"-outf",
"--outformat",
"outformat",
type=str,
default="txt",
required=False,
multiple=False,
help="Input format (currently: 'txt' or 'newick').",
)
@click.option(
"-taxid",
"--taxid",
Expand Down Expand Up @@ -354,6 +365,7 @@ def search(
infile: str,
outfile: str | None,
informat: str,
outformat: str,
taxids: str | None,
taxidincludes: str | None,
taxidsexcludes: str | None,
Expand Down Expand Up @@ -421,10 +433,20 @@ def search(
ignoreinvalid=ignoreinvalid,
)
if outfile:
with open(outfile, "w") as outf:
if tax_ids:
outf.write("\n".join(list(tax_ids)))
logging.info(f"Wrote list of TaxIDS in {outfile}.")
if outformat == "newick" and resolver.tree is not None and tax_ids:
subset = (
resolver.tree[resolver.tree["id"].isin(list(tax_ids))]
.sort_values("lft")
.reset_index()
)
write_tree(subset, outputfile=outfile, outputformat=outformat)
else:
with open(outfile, "w") as outf:
if tax_ids:

outf.write("\n".join(list(tax_ids)))
outf.write("\n".join(list(tax_ids)))
logging.info(f"Wrote list of TaxIDS in {outfile} in '{outformat}' format.")
else:
try:
if tax_ids:
Expand Down
6 changes: 5 additions & 1 deletion taxonomyresolver/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
split_line,
tree_reparenting,
tree_traversal,
tree_to_newick,
)


Expand Down Expand Up @@ -84,11 +85,14 @@ def write_tree(

:param tree: pandas DataFrame
:param outputfile: Path to outputfile
:param outputformat: currently "pickle" format
:param outputformat: currently "pickle" or "newick" format
:return: (side-effects) writes to file
"""
if outputformat == "pickle" and tree is not None:
tree.to_pickle(outputfile, protocol=4)
elif outputformat == "newick" and tree is not None:
with open(outputfile, "w") as outfile:
outfile.write(tree_to_newick(tree) + "\n")
# elif outputformat == "shelve":
# with shelve.open(outputfile) as outfile:
# outfile["tree"] = tree
Expand Down
37 changes: 37 additions & 0 deletions taxonomyresolver/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import pandas as pd
import requests
from tqdm import tqdm
from collections import defaultdict


def get_logging_level(level: str = "INFO"):
Expand Down Expand Up @@ -229,3 +230,39 @@ def get_parents(tree: pd.DataFrame, lft: int, rgt: int) -> list:
:return: list of TaxIDs
"""
return list(tree[(tree["lft"] < lft) & (tree["rgt"] > rgt)]["id"].values)


def tree_to_newick(tree: pd.DataFrame) -> str:
"""
Converts a hierarchical tree DataFrame into a Newick string.

:param tree: pandas DataFrame
:return: A string in Newick format.
"""

# find the root of the tree (where id == parent_id)
try:
root_id = tree[tree["id"] == tree["parent_id"]]["id"].iloc[0]
except IndexError:
# assume the tree is sorted and use the first line (sort of rootless)
root_id = tree["id"].iloc[0]

# build a dictionary of children for each node
children = defaultdict(list)
for _, row in tqdm(tree.iterrows(), total=len(tree), desc="Generating Newick"):
if row["id"] != row["parent_id"]:
children[row["parent_id"]].append(row["id"])

# process nodes in reverse order of `depth` (from leaf nodes to the root)
newick = {}
for depth in sorted(tree["depth"].unique(), reverse=True):
nodes_at_depth = tree[tree["depth"] == depth]["id"]
for node_id in nodes_at_depth:
if node_id in children:
subtree = ",".join(newick[child] for child in children[node_id])
newick[node_id] = f"({subtree}){node_id}"
else:
# leaf node
newick[node_id] = f"{node_id}"

return newick[root_id] + ";"
1 change: 1 addition & 0 deletions testdata/taxids_human.nwk
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(63221,741158)9606;
3 changes: 3 additions & 0 deletions testdata/taxids_human.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
9606
741158
63221
11 changes: 10 additions & 1 deletion tests/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import pytest

from taxonomyresolver import TaxonResolver
from taxonomyresolver.utils import load_logging
from taxonomyresolver.utils import load_logging, tree_to_newick


@pytest.fixture
Expand Down Expand Up @@ -339,3 +339,12 @@ def test_resolver_validate_mock_tree(self, context, cwd):
assert resolver.validate(taxidinclude=["9"])
assert resolver.validate(taxidinclude=["10"])
assert not resolver.validate(taxidinclude=["9606"])

def test_resolver_mock_tree_to_newick(self, context, cwd):
resolver = TaxonResolver(logging=context)
resolver.load(os.path.join(cwd, "../testdata/tree_mock.pickle"), "pickle")
taxidfilter = ["28", "29"]
resolver.filter(taxidfilter)
if resolver.tree is not None:
newick = tree_to_newick(resolver.tree)
assert newick == "(((((((28,29)27)24)18)9)4)2)1;"
Loading