diff --git a/README.rst b/README.rst index 419f68f..f851fb4 100644 --- a/README.rst +++ b/README.rst @@ -273,6 +273,25 @@ Validating a list of TaxIDs against a Tree data structure in ``pickle`` format: taxonomy-resolver validate -in tree.pickle -taxids testdata/taxids_validate.txt +Load a previously built Tree data structure in ``pickle`` format and search for one or more TaxIDs (for example human, TaxID '9606'). Included, excluded and filter lists can be optionally passed as shown above. + +.. code-block:: bash + + taxonomy-resolver search -in tree.pickle -taxid "9606" -out testdata/taxids_human.txt + +Writing the 'complete' human sub-tree (i.e. all levels of the hierarchy) in ``newick`` format can be done via the build command and passing a TaxID filter file, which can be generated by running the search command shown previously. + +.. code-block:: bash + + taxonomy-resolver build -in tree.pickle -inf pickle -taxidsf testdata/taxids_human.txt -out testdata/taxids_human.nwk -outf newick + +Alternatively, ``newick`` output can be generated for the 'local' human sub-tree (i.e. species and sub-species levels only), directly using the search command, as shown below: + +.. code-block:: bash + + taxonomy-resolver search -in tree.pickle -taxid "9606" -out testdata/taxids_human_local.nwk -outf newick + + Contributing ============ diff --git a/taxonomyresolver/cli.py b/taxonomyresolver/cli.py index 3bc6577..c0717fb 100644 --- a/taxonomyresolver/cli.py +++ b/taxonomyresolver/cli.py @@ -17,6 +17,7 @@ print_and_exit, validate_inputs_outputs, ) +from taxonomyresolver.tree import write_tree # reusing click args and options @@ -174,7 +175,7 @@ def download( default="pickle", required=False, multiple=False, - help="Output format (currently: 'pickle').", + help="Output format (currently: 'pickle' or 'newick').", ) @click.option( "-taxidsf", @@ -289,6 +290,16 @@ def build( multiple=False, help="Input format (currently: 'pickle').", ) +@click.option( + "-outf", + "--outformat", + "outformat", + type=str, + default="txt", + required=False, + multiple=False, + help="Input format (currently: 'txt' or 'newick').", +) @click.option( "-taxid", "--taxid", @@ -354,6 +365,7 @@ def search( infile: str, outfile: str | None, informat: str, + outformat: str, taxids: str | None, taxidincludes: str | None, taxidsexcludes: str | None, @@ -421,10 +433,20 @@ def search( ignoreinvalid=ignoreinvalid, ) if outfile: - with open(outfile, "w") as outf: - if tax_ids: - outf.write("\n".join(list(tax_ids))) - logging.info(f"Wrote list of TaxIDS in {outfile}.") + if outformat == "newick" and resolver.tree is not None and tax_ids: + subset = ( + resolver.tree[resolver.tree["id"].isin(list(tax_ids))] + .sort_values("lft") + .reset_index() + ) + write_tree(subset, outputfile=outfile, outputformat=outformat) + else: + with open(outfile, "w") as outf: + if tax_ids: + + outf.write("\n".join(list(tax_ids))) + outf.write("\n".join(list(tax_ids))) + logging.info(f"Wrote list of TaxIDS in {outfile} in '{outformat}' format.") else: try: if tax_ids: diff --git a/taxonomyresolver/tree.py b/taxonomyresolver/tree.py index f655f96..6a36735 100644 --- a/taxonomyresolver/tree.py +++ b/taxonomyresolver/tree.py @@ -24,6 +24,7 @@ split_line, tree_reparenting, tree_traversal, + tree_to_newick, ) @@ -84,11 +85,14 @@ def write_tree( :param tree: pandas DataFrame :param outputfile: Path to outputfile - :param outputformat: currently "pickle" format + :param outputformat: currently "pickle" or "newick" format :return: (side-effects) writes to file """ if outputformat == "pickle" and tree is not None: tree.to_pickle(outputfile, protocol=4) + elif outputformat == "newick" and tree is not None: + with open(outputfile, "w") as outfile: + outfile.write(tree_to_newick(tree) + "\n") # elif outputformat == "shelve": # with shelve.open(outputfile) as outfile: # outfile["tree"] = tree diff --git a/taxonomyresolver/utils.py b/taxonomyresolver/utils.py index bcc8841..37be153 100644 --- a/taxonomyresolver/utils.py +++ b/taxonomyresolver/utils.py @@ -15,6 +15,7 @@ import pandas as pd import requests from tqdm import tqdm +from collections import defaultdict def get_logging_level(level: str = "INFO"): @@ -229,3 +230,39 @@ def get_parents(tree: pd.DataFrame, lft: int, rgt: int) -> list: :return: list of TaxIDs """ return list(tree[(tree["lft"] < lft) & (tree["rgt"] > rgt)]["id"].values) + + +def tree_to_newick(tree: pd.DataFrame) -> str: + """ + Converts a hierarchical tree DataFrame into a Newick string. + + :param tree: pandas DataFrame + :return: A string in Newick format. + """ + + # find the root of the tree (where id == parent_id) + try: + root_id = tree[tree["id"] == tree["parent_id"]]["id"].iloc[0] + except IndexError: + # assume the tree is sorted and use the first line (sort of rootless) + root_id = tree["id"].iloc[0] + + # build a dictionary of children for each node + children = defaultdict(list) + for _, row in tqdm(tree.iterrows(), total=len(tree), desc="Generating Newick"): + if row["id"] != row["parent_id"]: + children[row["parent_id"]].append(row["id"]) + + # process nodes in reverse order of `depth` (from leaf nodes to the root) + newick = {} + for depth in sorted(tree["depth"].unique(), reverse=True): + nodes_at_depth = tree[tree["depth"] == depth]["id"] + for node_id in nodes_at_depth: + if node_id in children: + subtree = ",".join(newick[child] for child in children[node_id]) + newick[node_id] = f"({subtree}){node_id}" + else: + # leaf node + newick[node_id] = f"{node_id}" + + return newick[root_id] + ";" diff --git a/testdata/taxids_human.nwk b/testdata/taxids_human.nwk new file mode 100644 index 0000000..86b12e6 --- /dev/null +++ b/testdata/taxids_human.nwk @@ -0,0 +1 @@ +(63221,741158)9606; diff --git a/testdata/taxids_human.txt b/testdata/taxids_human.txt new file mode 100644 index 0000000..f1387a5 --- /dev/null +++ b/testdata/taxids_human.txt @@ -0,0 +1,3 @@ +9606 +741158 +63221 \ No newline at end of file diff --git a/tests/test_tree.py b/tests/test_tree.py index bd66b2c..22589c5 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -13,7 +13,7 @@ import pytest from taxonomyresolver import TaxonResolver -from taxonomyresolver.utils import load_logging +from taxonomyresolver.utils import load_logging, tree_to_newick @pytest.fixture @@ -339,3 +339,12 @@ def test_resolver_validate_mock_tree(self, context, cwd): assert resolver.validate(taxidinclude=["9"]) assert resolver.validate(taxidinclude=["10"]) assert not resolver.validate(taxidinclude=["9606"]) + + def test_resolver_mock_tree_to_newick(self, context, cwd): + resolver = TaxonResolver(logging=context) + resolver.load(os.path.join(cwd, "../testdata/tree_mock.pickle"), "pickle") + taxidfilter = ["28", "29"] + resolver.filter(taxidfilter) + if resolver.tree is not None: + newick = tree_to_newick(resolver.tree) + assert newick == "(((((((28,29)27)24)18)9)4)2)1;"