diff --git a/paper/paper.bib b/paper/paper.bib index 60b6741..d30b828 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -37,18 +37,18 @@ @article{camacho_blast_2009 } @article{yu_ggtree_2017, -author = {Yu, Guangchuang and Smith, David K. and Zhu, Huachen and Guan, Yi and Lam, Tommy Tsan-Yuk}, -title = {ggtree: an r package for visualization and annotation of phylogenetic trees with their covariates and other associated data}, -journal = {Methods in Ecology and Evolution}, -volume = {8}, -number = {1}, -pages = {28-36}, -keywords = {annotation, bioconductor, evolution, phylogeny, r package, visualization}, -doi = {https://doi.org/10.1111/2041-210X.12628}, -url = {https://besjournals.onlinelibrary.wiley.com/doi/abs/10.1111/2041-210X.12628}, -eprint = {https://besjournals.onlinelibrary.wiley.com/doi/pdf/10.1111/2041-210X.12628}, -abstract = {Summary We present an r package, ggtree, which provides programmable visualization and annotation of phylogenetic trees. ggtree can read more tree file formats than other softwares, including newick, nexus, NHX, phylip and jplace formats, and support visualization of phylo, multiphylo, phylo4, phylo4d, obkdata and phyloseq tree objects defined in other r packages. It can also extract the tree/branch/node-specific and other data from the analysis outputs of beast, epa, hyphy, paml, phylodog, pplacer, r8s, raxml and revbayes software, and allows using these data to annotate the tree. The package allows colouring and annotation of a tree by numerical/categorical node attributes, manipulating a tree by rotating, collapsing and zooming out clades, highlighting user selected clades or operational taxonomic units and exploration of a large tree by zooming into a selected portion. A two-dimensional tree can be drawn by scaling the tree width based on an attribute of the nodes. A tree can be annotated with an associated numerical matrix (as a heat map), multiple sequence alignment, subplots or silhouette images. 
The package ggtree is released under the artistic-2.0 license. The source code and documents are freely available through bioconductor (http://www.bioconductor.org/packages/ggtree).}, -year = {2017} + author = {Yu, Guangchuang and Smith, David K. and Zhu, Huachen and Guan, Yi and Lam, Tommy Tsan-Yuk}, + title = {ggtree: an {R} package for visualization and annotation of phylogenetic trees with their covariates and other associated data}, + journal = {Methods in Ecology and Evolution}, + volume = {8}, + number = {1}, + pages = {28--36}, + keywords = {annotation, bioconductor, evolution, phylogeny, r package, visualization}, + doi = {10.1111/2041-210X.12628}, + url = {https://besjournals.onlinelibrary.wiley.com/doi/abs/10.1111/2041-210X.12628}, + eprint = {https://besjournals.onlinelibrary.wiley.com/doi/pdf/10.1111/2041-210X.12628}, + abstract = {Summary We present an r package, ggtree, which provides programmable visualization and annotation of phylogenetic trees. ggtree can read more tree file formats than other softwares, including newick, nexus, NHX, phylip and jplace formats, and support visualization of phylo, multiphylo, phylo4, phylo4d, obkdata and phyloseq tree objects defined in other r packages. It can also extract the tree/branch/node-specific and other data from the analysis outputs of beast, epa, hyphy, paml, phylodog, pplacer, r8s, raxml and revbayes software, and allows using these data to annotate the tree. The package allows colouring and annotation of a tree by numerical/categorical node attributes, manipulating a tree by rotating, collapsing and zooming out clades, highlighting user selected clades or operational taxonomic units and exploration of a large tree by zooming into a selected portion. A two-dimensional tree can be drawn by scaling the tree width based on an attribute of the nodes. A tree can be annotated with an associated numerical matrix (as a heat map), multiple sequence alignment, subplots or silhouette images. 
The package ggtree is released under the artistic-2.0 license. The source code and documents are freely available through bioconductor (http://www.bioconductor.org/packages/ggtree).}, + year = {2017} } @@ -73,7 +73,7 @@ @misc{pandas_2024 abstract = {Pandas is a powerful data structures for data analysis, time series, and statistics.}, urldate = {2024-09-25}, publisher = {Zenodo}, - author = {The pandas development team}, + author = {{The pandas development team}}, month = sep, year = {2024}, doi = {10.5281/zenodo.13819579}, @@ -135,19 +135,19 @@ @incollection{celko_chapter_2004 } @misc{anytree, - author = {Cofe Code and contributors}, - title = {Anytree: Python tree data library}, - year = {2024}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/c0fec0de/anytree} + author = {{Cofe Code and contributors}}, + title = {Anytree: Python tree data library}, + year = {2024}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/c0fec0de/anytree} } @misc{bigtree, - author = {Kay Jan W. and contributors}, - title = {BigTree: Tree Implementation and Methods for Python, integrated with list, dictionary, pandas and polars DataFrame.}, - year = {2024}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {ttps://github.com/kayjan/bigtree} + author = {{Kay Jan W. 
and contributors}}, + title = {BigTree: Tree Implementation and Methods for Python, integrated with list, dictionary, pandas and polars DataFrame.}, + year = {2024}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/kayjan/bigtree} } diff --git a/paper/paper.md b/paper/paper.md index 4f046ad..6dec8bb 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -53,7 +53,7 @@ Taxonomy Resolver has been developed with simplicity in mind and it can be used * **filtering** a tree based on the inclusion and/or exclusion of certain TaxIDs * **writing and loading** tree data structures using Python’s object serialisation -A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. 
Finding a subtree is as simple as searching for the nodes of interest where $lft > node lft$ and $rgt < node rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node lft$ and $rgt > node rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. +A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes of interest where $lft > node's\ \ lft$ and $rgt < node's\ \ rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node's\ \ lft$ and $rgt > node's\ \ rgt$. 
Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. In conclusion, Taxonomy Resolver has been developed to take advantage of the Nested Set Model tree structure, so it can perform fast validation and create lists of taxa that compose a particular subtree. Inclusion and exclusion lists can also be seamlessly used to produce subset trees with wide applications, particularly for sequence similarity search.