diff --git a/.github/workflows/compile_paper.yml b/.github/workflows/compile_paper.yml new file mode 100644 index 0000000..b7053c0 --- /dev/null +++ b/.github/workflows/compile_paper.yml @@ -0,0 +1,24 @@ +name: Draft PDF +on: [push] + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper/paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: paper/paper.pdf diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..095ab67 --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,169 @@ +@article{schoch_ncbi_2020, + title = {{NCBI} {Taxonomy}: a comprehensive update on curation, resources and tools}, + volume = {2020}, + issn = {1758-0463}, + shorttitle = {{NCBI} {Taxonomy}}, + doi = {10.1093/database/baaa062}, + abstract = {The National Center for Biotechnology Information (NCBI) Taxonomy includes organism names and classifications for every sequence in the nucleotide and protein sequence databases of the International Nucleotide Sequence Database Collaboration. Since the last review of this resource in 2012, it has undergone several improvements. Most notable is the shift from a single SQL database to a series of linked databases tied to a framework of data called NameBank. This means that relations among data elements can be adjusted in more detail, resulting in expanded annotation of synonyms, the ability to flag names with specific nomenclatural properties, enhanced tracking of publications tied to names and improved annotation of scientific authorities and types. 
Additionally, practices utilized by NCBI Taxonomy curators specific to major taxonomic groups are described, terms peculiar to NCBI Taxonomy are explained, external resources are acknowledged and updates to tools and other resources are documented. Database URL: https://www.ncbi.nlm.nih.gov/taxonomy.}, + language = {eng}, + journal = {Database: The Journal of Biological Databases and Curation}, + author = {Schoch, Conrad L. and Ciufo, Stacy and Domrachev, Mikhail and Hotton, Carol L. and Kannan, Sivakumar and Khovanskaya, Rogneda and Leipe, Detlef and Mcveigh, Richard and O'Neill, Kathleen and Robbertse, Barbara and Sharma, Shobha and Soussov, Vladimir and Sullivan, John P. and Sun, Lu and Turner, Seán and Karsch-Mizrachi, Ilene}, + month = jan, + year = {2020}, + pmid = {32761142}, + pmcid = {PMC7408187}, + keywords = {Databases, Genetic, Animals, Humans, Bacteria, Classification, Database Management Systems, National Library of Medicine (U.S.), Plants, United States, Viruses}, + pages = {baaa062}, +} + +@article{camacho_blast_2009, + title = {{BLAST}+: architecture and applications}, + volume = {10}, + issn = {1471-2105}, + shorttitle = {{BLAST}+}, + doi = {10.1186/1471-2105-10-421}, + abstract = {BACKGROUND: Sequence similarity searching is a very important bioinformatics task. While Basic Local Alignment Search Tool (BLAST) outperforms exact methods through its use of heuristics, the speed of the current BLAST software is suboptimal for very long queries or database sequences. There are also some shortcomings in the user-interface of the current command-line applications. +RESULTS: We describe features and improvements of rewritten BLAST software and introduce new command-line applications. Long query sequences are broken into chunks for processing, in some cases leading to dramatically shorter run times. 
For long database sequences, it is possible to retrieve only the relevant parts of the sequence, reducing CPU time and memory usage for searches of short queries against databases of contigs or chromosomes. The program can now retrieve masking information for database sequences from the BLAST databases. A new modular software library can now access subject sequence data from arbitrary data sources. We introduce several new features, including strategy files that allow a user to save and reuse their favorite set of options. The strategy files can be uploaded to and downloaded from the NCBI BLAST web site. +CONCLUSION: The new BLAST command-line applications, compared to the current BLAST tools, demonstrate substantial speed improvements for long queries as well as chromosome length database sequences. We have also improved the user interface of the command-line applications.}, + language = {eng}, + journal = {BMC bioinformatics}, + author = {Camacho, Christiam and Coulouris, George and Avagyan, Vahram and Ma, Ning and Papadopoulos, Jason and Bealer, Kevin and Madden, Thomas L.}, + month = dec, + year = {2009}, + pmid = {20003500}, + pmcid = {PMC2803857}, + keywords = {Computational Biology, Databases, Genetic, Sequence Alignment, Software}, + pages = {421}, +} + +@article{yu_ggtree_2017, + author = {Yu, Guangchuang and Smith, David K. 
and Zhu, Huachen and Guan, Yi and Lam, Tommy Tsan-Yuk}, + title = {ggtree: an {R} package for visualization and annotation of phylogenetic trees with their covariates and other associated data}, + journal = {Methods in Ecology and Evolution}, + volume = {8}, + number = {1}, + pages = {28--36}, + keywords = {annotation, bioconductor, evolution, phylogeny, r package, visualization}, + doi = {10.1111/2041-210X.12628}, + url = {https://besjournals.onlinelibrary.wiley.com/doi/abs/10.1111/2041-210X.12628}, + eprint = {https://besjournals.onlinelibrary.wiley.com/doi/pdf/10.1111/2041-210X.12628}, + abstract = {Summary We present an r package, ggtree, which provides programmable visualization and annotation of phylogenetic trees. ggtree can read more tree file formats than other softwares, including newick, nexus, NHX, phylip and jplace formats, and support visualization of phylo, multiphylo, phylo4, phylo4d, obkdata and phyloseq tree objects defined in other r packages. It can also extract the tree/branch/node-specific and other data from the analysis outputs of beast, epa, hyphy, paml, phylodog, pplacer, r8s, raxml and revbayes software, and allows using these data to annotate the tree. The package allows colouring and annotation of a tree by numerical/categorical node attributes, manipulating a tree by rotating, collapsing and zooming out clades, highlighting user selected clades or operational taxonomic units and exploration of a large tree by zooming into a selected portion. A two-dimensional tree can be drawn by scaling the tree width based on an attribute of the nodes. A tree can be annotated with an associated numerical matrix (as a heat map), multiple sequence alignment, subplots or silhouette images. The package ggtree is released under the artistic-2.0 license. 
The source code and documents are freely available through bioconductor (http://www.bioconductor.org/packages/ggtree).}, + year = {2017} +} + + +@incollection{kans_entrez_2024, + title = {Entrez {Direct}: {E}-utilities on the {Unix} {Command} {Line}}, + shorttitle = {Entrez {Direct}}, + url = {https://www.ncbi.nlm.nih.gov/books/NBK179288/}, + abstract = {Entrez Direct (EDirect) provides access to the NCBI's suite of interconnected databases (publication, sequence, structure, gene, variation, expression, etc.) from a Unix terminal window. Search terms are entered as command-line arguments. Individual operations are connected with Unix pipes to construct multi-step queries. Selected records can then be retrieved in a variety of formats.}, + language = {en}, + urldate = {2024-09-25}, + booktitle = {Entrez {Programming} {Utilities} {Help} [{Internet}]}, + publisher = {National Center for Biotechnology Information (US)}, + author = {Kans, Jonathan}, + month = jul, + year = {2024}, +} + +@misc{pandas_2024, + title = {pandas-dev/pandas: {Pandas}}, + shorttitle = {pandas-dev/pandas}, + url = {https://zenodo.org/records/13819579}, + abstract = {Pandas is a powerful data structures for data analysis, time series, and statistics.}, + urldate = {2024-09-25}, + publisher = {Zenodo}, + author = {{The pandas development team}}, + month = sep, + year = {2024}, + doi = {10.5281/zenodo.13819579}, + keywords = {data science, python}, +} + +@article{ochoterena_search_2019, + title = {The {Search} for {Common} {Origin}: {Homology} {Revisited}}, + volume = {68}, + issn = {1063-5157}, + shorttitle = {The {Search} for {Common} {Origin}}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6701455/}, + doi = {10.1093/sysbio/syz013}, + abstract = {Understanding the evolution of biodiversity on Earth is a central aim in biology. 
Currently, various disciplines of science contribute to unravel evolution at all levels of life, from individual organisms to species and higher ranks, using different approaches and specific terminologies. The search for common origin, traditionally called homology, is a connecting paradigm of all studies related to evolution. However, it is not always sufficiently taken into account that defining homology depends on the hierarchical level studied (organism, population, and species), which can cause confusion. Therefore, we propose a framework to define homologies making use of existing terms, which refer to homology in different fields, but restricting them to an unambiguous meaning and a particular hierarchical level. We propose to use the overarching term “homology” only when “morphological homology,” “vertical gene transfer,” and “phylogenetic homology” are confirmed. Consequently, neither phylogenetic nor morphological homology is equal to homology. This article is intended for readers with different research backgrounds. We challenge their traditional approaches, inviting them to consider the proposed framework and offering them a new perspective for their own research.}, + number = {5}, + urldate = {2024-09-26}, + journal = {Systematic Biology}, + author = {Ochoterena, Helga and Vrijdaghs, Alexander and Smets, Erik and Claßen-Bockhoff, Regine}, + month = sep, + year = {2019}, + pmid = {30796841}, + pmcid = {PMC6701455}, + pages = {767--780}, +} + +@article{sandall_globally_2023, + title = {A globally integrated structure of taxonomy to support biodiversity science and conservation}, + volume = {38}, + issn = {0169-5347}, + url = {https://www.sciencedirect.com/science/article/pii/S016953472300215X}, + doi = {10.1016/j.tree.2023.08.004}, + abstract = {All aspects of biodiversity research, from taxonomy to conservation, rely on data associated with species names. 
Effective integration of names across multiple fields is paramount and depends on the coordination and organization of taxonomic data. We assess current efforts and find that even key applications for well-studied taxa still lack commonality in taxonomic information required for integration. We identify essential taxonomic elements from our interoperability assessment to support improved access and integration of taxonomic data. A stronger focus on these elements has the potential to involve taxonomic communities in biodiversity science and overcome broken linkages currently limiting research capacity. We encourage a community effort to democratize taxonomic expertise and language in order to facilitate maximum interoperability and integration.}, + number = {12}, + urldate = {2024-09-26}, + journal = {Trends in Ecology \& Evolution}, + author = {Sandall, Emily L. and Maureaud, Aurore A. and Guralnick, Robert and McGeoch, Melodie A. and Sica, Yanina V. and Rogan, Matthew S. and Booher, Douglas B. and Edwards, Robert and Franz, Nico and Ingenloff, Kate and Lucas, Maisha and Marsh, Charles J. and McGowan, Jennifer and Pinkert, Stefan and Ranipeta, Ajay and Uetz, Peter and Wieczorek, John and Jetz, Walter}, + month = dec, + year = {2023}, + keywords = {biodiversity conservation, data linkage, integrative science, social infrastructure, taxonomic backbone}, + pages = {1143--1153} +} + +@incollection{celko_chapter_2004, + address = {San Francisco}, + series = {The {Morgan} {Kaufmann} {Series} in {Data} {Management} {Systems}}, + title = {Chapter 4 - {Nested} {Set} {Model} of {Hierarchies}}, + isbn = {978-1-55860-920-4}, + url = {https://www.sciencedirect.com/science/article/pii/B9781558609204500052}, + abstract = {Trees are often drawn as boxes-and-arrows charts that tend to fix the mental image of a tree into a graph structure. Another way of representing trees is to show them as nested sets. The chapter highlights this approach. 
To show a tree as nested sets, replace the boxes with ovals and then nest subordinate ovals inside their parents. Containment represents subordination. The root will be the largest oval and will contain every other node. The leaf nodes will be the innermost ovals, with nothing else inside them, and the nesting will show the hierarchical relationship. This is a natural way to model a parts explosion because a final assembly is made of physically nested assemblies that finally break down into separate parts. This approach is used to model a tree with nested sets with number pairs that always contain the pairs of their subordinates so that a child node is within the bounds of its parent. The chapter uses this approach of representation of a tree graph to present techniques to perform operations, such as finding root and leaf nodes, finding subtrees, finding levels and paths in a tree, finding the height of a tree, deleting nodes and subtrees, closing gaps in the tree, using summary functions on trees, and inserting and updating trees. The chapter also provides techniques to convert nested sets into adjacent list models, and compare nodes and structures of these models. All of these techniques are explained using simple SQL codes maintaining an organizational chart table to represent the hierarchy and people of a sample organization.}, + urldate = {2024-09-26}, + booktitle = {Joe {Celko}'s {Trees} and {Hierarchies} in {SQL} for {Smarties}}, + publisher = {Morgan Kaufmann}, + author = {Celko, Joe}, + editor = {Celko, Joe}, + month = jan, + year = {2004}, + doi = {10.1016/B978-155860920-4/50005-2}, + pages = {45--99}, +} + +@misc{anytree, + author = {{Cofe Code and contributors}}, + title = {Anytree: Python tree data library}, + year = {2024}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/c0fec0de/anytree} +} + +@misc{bigtree, + author = {{Kay Jan W. 
and contributors}}, + title = {BigTree: Tree Implementation and Methods for Python, integrated with list, dictionary, pandas and polars DataFrame.}, + year = {2024}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/kayjan/bigtree} +} + +@article{madeira_2024, + author = {Madeira, Fábio and Madhusoodanan, Nandana and Lee, Joonheung and Eusebi, Alberto and Niewielska, Ania and Tivey, Adrian R N and Lopez, Rodrigo and Butcher, Sarah}, + title = "{The EMBL-EBI Job Dispatcher sequence analysis tools framework in 2024}", + journal = {Nucleic Acids Research}, + volume = {52}, + number = {W1}, + pages = {W521--W525}, + year = {2024}, + month = {04}, + abstract = "{The EMBL-EBI Job Dispatcher sequence analysis tools framework (https://www.ebi.ac.uk/jdispatcher) enables the scientific community to perform a diverse range of sequence analyses using popular bioinformatics applications. Free access to the tools and required sequence datasets is provided through user-friendly web applications, as well as via RESTful and SOAP-based APIs. These are integrated into popular EMBL-EBI resources such as UniProt, InterPro, ENA and Ensembl Genomes. 
This paper overviews recent improvements to Job Dispatcher, including its brand new website and documentation, enhanced visualisations, improved job management, and a rising trend of user reliance on the service from low- and middle-income regions.}", + issn = {0305-1048}, + doi = {10.1093/nar/gkae241}, + url = {https://doi.org/10.1093/nar/gkae241}, + eprint = {https://academic.oup.com/nar/article-pdf/52/W1/W521/58436149/gkae241.pdf}, +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..9d3d495 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,76 @@ +--- +title: 'Taxonomy Resolver: A Python package for building and filtering taxonomy trees' +tags: + - Python + - Taxonomy + - Tree + - Hierarchy + - NCBI Taxonomy + - NCBI BLAST+ + - Nested Set Model + - Modified Preorder Tree Traversal +authors: + - name: Fábio Madeira + orcid: 0000-0001-8728-9449 + corresponding: true + affiliation: 1 + - name: Nandana Madhusoodanan + orcid: 0000-0001-5004-152X + affiliation: 1 + - name: Alberto Eusebi + orcid: 0000-0001-5179-7724 + affiliation: 1 + - name: Joonheung Lee + orcid: 0000-0002-5760-2761 + affiliation: 1 + - name: Ania Niewielska + orcid: 0000-0003-0989-3389 + affiliation: 1 + - name: Sarah Butcher + orcid: 0000-0002-4494-5124 + affiliation: 1 +affiliations: + - name: | + European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI), + Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SD, UK + index: 1 +date: 26 September 2024 +bibliography: paper.bib + +--- + +# Summary + +Taxonomy classification provides an important source of information for studying biological systems. It is a key component for many areas of biological sciences research, particularly genetics, evolutionary biology, biodiversity and conservation [@sandall_globally_2023]. 
Common ancestry, homology and conservation of sequence and structure are all central ideas in biology that are directly related to the evolutionary history of any group of organisms [@ochoterena_search_2019]. The National Center for Biotechnology Information (NCBI) Taxonomy [@schoch_ncbi_2020] provides a curated classification and nomenclature for all the organisms in the public sequence databases, across the taxonomic ranks (i.e. Domain, Kingdom, Phylum, Class, Order, Family, Genus and Species). + +Here we describe ``Taxonomy Resolver``, a Python module and command-line interface (CLI) application for building and filtering taxonomy trees based on the NCBI Taxonomy. Taxonomy Resolver streamlines the process of manipulating trees, enabling fast tree traversal, searching and filtering. + +# Statement of need + +The NCBI Taxonomy Database [@schoch_ncbi_2020] provides a hierarchically arranged list of organisms across all domains of life found in the sequence databases. Tree filtering, i.e. generation of tree subsets, referred to as subtrees, has various applications for sequence analysis, particularly for reducing the search space of sequence similarity searching algorithms. A sequence dataset composed of sequences from diverse taxa can be searched more quickly if only the subset of sequences that belong to the taxa of interest is selected. + +The NCBI BLAST+ suite is the most widely used toolset in bioinformatics for performing sequence similarity searches [@camacho_blast_2009]. The suite provides a Bash script (`get_species_taxids.sh`) to convert NCBI Taxonomy identifiers (TaxIDs) or text into TaxIDs suitable for filtering sequence searches. While this is a useful utility, it only works with sequences submitted to GenBank or other NCBI-hosted databases, and more importantly, it relies on making API calls via Entrez Direct (EDirect) [@kans_entrez_2024]. EDirect requires an internet connection and it does not scale well when working with large sequence datasets. 
Other general-purpose tree libraries exist for Python (e.g. ``anytree`` [@anytree] and ``bigtree`` [@bigtree]) and R (e.g. ``ggtree`` [@yu_ggtree_2017]), but they either do not support the core features provided by Taxonomy Resolver or focus mainly on tree visualisation. The development of Taxonomy Resolver started in 2020 and aims to provide user-friendly interfaces for working directly with the NCBI Taxonomy hierarchical dataset. + +# Features + +Taxonomy Resolver has been developed with simplicity in mind and it can be used both as a standard Python module and as a CLI application. The main tasks performed by Taxonomy Resolver are: + +* **downloading** the NCBI Taxonomy classification hierarchy “dump” from the NCBI FTP server +* **building** complete taxonomy tree data structures or partial trees, i.e. subtrees +* **searching** particular TaxIDs at any level of the taxonomy hierarchy, performing fast tree traversal +* **validating** TaxIDs against the NCBI Taxonomy or any given subtree +* **generating** taxonomy lists that compose any subtree, at any level of the taxonomy hierarchy +* **filtering** a tree based on the inclusion and/or exclusion of certain TaxIDs +* **writing and loading** tree data structures using Python’s object serialisation + +# Implementation + +A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the child nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. 
This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, followed by a recursive preorder traversal of the left subtree and then a recursive preorder traversal of the right subtree, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding the subtree of a given node is as simple as searching for the nodes where $lft > lft_{node}$ and $rgt < rgt_{node}$, where $lft_{node}$ and $rgt_{node}$ are the boundaries of that node. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < lft_{node}$ and $rgt > rgt_{node}$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. + +Taxonomy Resolver has been developed to take advantage of the Nested Set Model tree structure, so it can perform fast validation and create lists of taxa that compose a particular subtree. Inclusion and exclusion lists can also be seamlessly used to produce subset trees with wide applications, particularly for sequence similarity search. Taxonomy Resolver has been in production since 2020, serving thousands of users every month. It provides taxonomy filtering features for NCBI BLAST+ provided by the popular EMBL-EBI Job Dispatcher service, available from https://www.ebi.ac.uk/jdispatcher/sss/ncbiblast [@madeira_2024]. + +# Acknowledgements + +We would like to thank past and current members of the EMBL-EBI for their continued support. We would also like to thank EMBL and its funders. + +# References