From db33b1d436c5db7ce307efcce855cf6a6698db12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Tue, 24 Sep 2024 17:06:10 +0100 Subject: [PATCH 01/12] adds demo JOSS paper (markdown and bib files) --- paper/paper.bib | 59 ++++++++++++++++++++++++ paper/paper.md | 120 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 paper/paper.bib create mode 100644 paper/paper.md diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..4e4544a --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,59 @@ +@article{Pearson:2017, + url = {http://adsabs.harvard.edu/abs/2017arXiv170304627P}, + Archiveprefix = {arXiv}, + Author = {{Pearson}, S. and {Price-Whelan}, A.~M. and {Johnston}, K.~V.}, + Eprint = {1703.04627}, + Journal = {ArXiv e-prints}, + Keywords = {Astrophysics - Astrophysics of Galaxies}, + Month = mar, + Title = {{Gaps in Globular Cluster Streams: Pal 5 and the Galactic Bar}}, + Year = 2017 +} + +@book{Binney:2008, + url = {http://adsabs.harvard.edu/abs/2008gady.book.....B}, + Author = {{Binney}, J. 
and {Tremaine}, S.}, + Booktitle = {Galactic Dynamics: Second Edition, by James Binney and Scott Tremaine.~ISBN 978-0-691-13026-2 (HB).~Published by Princeton University Press, Princeton, NJ USA, 2008.}, + Publisher = {Princeton University Press}, + Title = {{Galactic Dynamics: Second Edition}}, + Year = 2008 +} + +@article{gaia, + author = {{Gaia Collaboration}}, + title = "{The Gaia mission}", + journal = {Astronomy and Astrophysics}, + archivePrefix = "arXiv", + eprint = {1609.04153}, + primaryClass = "astro-ph.IM", + keywords = {space vehicles: instruments, Galaxy: structure, astrometry, parallaxes, proper motions, telescopes}, + year = 2016, + month = nov, + volume = 595, + doi = {10.1051/0004-6361/201629272}, + url = {http://adsabs.harvard.edu/abs/2016A%26A...595A...1G}, +} + +@article{astropy, + author = {{Astropy Collaboration}}, + title = "{Astropy: A community Python package for astronomy}", + journal = {Astronomy and Astrophysics}, + archivePrefix = "arXiv", + eprint = {1307.6212}, + primaryClass = "astro-ph.IM", + keywords = {methods: data analysis, methods: miscellaneous, virtual observatory tools}, + year = 2013, + month = oct, + volume = 558, + doi = {10.1051/0004-6361/201322068}, + url = {http://adsabs.harvard.edu/abs/2013A%26A...558A..33A} +} + +@misc{fidgit, + author = {A. M. Smith and K. Thaney and M. Hahnel}, + title = {Fidgit: An ungodly union of GitHub and Figshare}, + year = {2020}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/arfon/fidgit} +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..720e1e7 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,120 @@ +--- +title: 'Gala: A Python package for galactic dynamics' +tags: + - Python + - astronomy + - dynamics + - galactic dynamics + - milky way +authors: + - name: Adrian M. 
Price-Whelan + orcid: 0000-0000-0000-0000 + equal-contrib: true + affiliation: "1, 2" # (Multiple affiliations must be quoted) + - name: Author Without ORCID + equal-contrib: true # (This is how you can denote equal contributions between multiple authors) + affiliation: 2 + - name: Author with no affiliation + corresponding: true # (This is how to denote the corresponding author) + affiliation: 3 + - given-names: Ludwig + dropping-particle: van + surname: Beethoven + affiliation: 3 +affiliations: + - name: Lyman Spitzer, Jr. Fellow, Princeton University, USA + index: 1 + ror: 00hx57361 + - name: Institution Name, Country + index: 2 + - name: Independent Researcher, Country + index: 3 +date: 13 August 2017 +bibliography: paper.bib + +# Optional fields if submitting to a AAS journal too, see this blog post: +# https://blog.joss.theoj.org/2018/12/a-new-collaboration-with-aas-publishing +aas-doi: 10.3847/xxxxx <- update this with the DOI from AAS once you know it. +aas-journal: Astrophysical Journal <- The name of the AAS journal. +--- + +# Summary + +The forces on stars, galaxies, and dark matter under external gravitational +fields lead to the dynamical evolution of structures in the universe. The orbits +of these bodies are therefore key to understanding the formation, history, and +future state of galaxies. The field of "galactic dynamics," which aims to model +the gravitating components of galaxies to study their structure and evolution, +is now well-established, commonly taught, and frequently used in astronomy. +Aside from toy problems and demonstrations, the majority of problems require +efficient numerical tools, many of which require the same base code (e.g., for +performing numerical orbit integration). + +# Statement of need + +`Gala` is an Astropy-affiliated Python package for galactic dynamics. Python +enables wrapping low-level languages (e.g., C) for speed without losing +flexibility or ease-of-use in the user-interface. 
The API for `Gala` was +designed to provide a class-based and user-friendly interface to fast (C or +Cython-optimized) implementations of common operations such as gravitational +potential and force evaluation, orbit integration, dynamical transformations, +and chaos indicators for nonlinear dynamics. `Gala` also relies heavily on and +interfaces well with the implementations of physical units and astronomical +coordinate systems in the `Astropy` package [@astropy] (`astropy.units` and +`astropy.coordinates`). + +`Gala` was designed to be used by both astronomical researchers and by +students in courses on gravitational dynamics or astronomy. It has already been +used in a number of scientific publications [@Pearson:2017] and has also been +used in graduate courses on Galactic dynamics to, e.g., provide interactive +visualizations of textbook material [@Binney:2008]. The combination of speed, +design, and support for Astropy functionality in `Gala` will enable exciting +scientific explorations of forthcoming data releases from the *Gaia* mission +[@gaia] by students and experts alike. + +# Mathematics + +Single dollars ($) are required for inline mathematics e.g. $f(x) = e^{\pi/x}$ + +Double dollars make self-standing equations: + +$$\Theta(x) = \left\{\begin{array}{l} +0\textrm{ if } x < 0\cr +1\textrm{ else} +\end{array}\right.$$ + +You can also use plain \LaTeX for equations +\begin{equation}\label{eq:fourier} +\hat f(\omega) = \int_{-\infty}^{\infty} f(x) e^{i\omega x} dx +\end{equation} +and refer to \autoref{eq:fourier} from text. + +# Citations + +Citations to entries in paper.bib should be in +[rMarkdown](http://rmarkdown.rstudio.com/authoring_bibliographies_and_citations.html) +format. + +If you want to cite a software repository URL (e.g. something on GitHub without a preferred +citation) then you can do it with the example BibTeX entry below for @fidgit. 
+ +For a quick reference, the following citation commands can be used: +- `@author:2001` -> "Author et al. (2001)" +- `[@author:2001]` -> "(Author et al., 2001)" +- `[@author1:2001; @author2:2001]` -> "(Author1 et al., 2001; Author2 et al., 2002)" + +# Figures + +Figures can be included like this: +![Caption for example figure.\label{fig:example}](figure.png) +and referenced from text using \autoref{fig:example}. + +Figure sizes can be customized by adding an optional second parameter: +![Caption for example figure.](figure.png){ width=20% } + +# Acknowledgements + +We acknowledge contributions from Brigitta Sipocz, Syrtis Major, and Semyeong +Oh, and support from Kathryn Johnston during the genesis of this project. + +# References From e74a7519eb533729a60ab761b2426af359755277 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Tue, 24 Sep 2024 17:06:27 +0100 Subject: [PATCH 02/12] cicd: adds new github action to compile the paper --- .github/workflows/compile_paper.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/compile_paper.yml diff --git a/.github/workflows/compile_paper.yml b/.github/workflows/compile_paper.yml new file mode 100644 index 0000000..11ea3c4 --- /dev/null +++ b/.github/workflows/compile_paper.yml @@ -0,0 +1,24 @@ +on: + workflow_dispatch: + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper/paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. 
Note, this should be the same directory as the input + # paper.md + path: paper/paper.pdf From 7efe142212b87a70dfe895f017f80335d19f8206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Tue, 24 Sep 2024 17:11:46 +0100 Subject: [PATCH 03/12] cicd: minor update to compite on push --- .github/workflows/compile_paper.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/compile_paper.yml b/.github/workflows/compile_paper.yml index 11ea3c4..ec0b5b5 100644 --- a/.github/workflows/compile_paper.yml +++ b/.github/workflows/compile_paper.yml @@ -1,5 +1,5 @@ -on: - workflow_dispatch: +name: Draft PDF +on: [push] jobs: paper: From aeaa487484d4d0c80d33d088816d2fca9a5bbe5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Thu, 26 Sep 2024 16:55:28 +0100 Subject: [PATCH 04/12] cicd: bumping checkout action to v4 --- .github/workflows/compile_paper.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/compile_paper.yml b/.github/workflows/compile_paper.yml index ec0b5b5..b7053c0 100644 --- a/.github/workflows/compile_paper.yml +++ b/.github/workflows/compile_paper.yml @@ -7,7 +7,7 @@ jobs: name: Paper Draft steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build draft PDF uses: openjournals/openjournals-draft-action@master with: From 7678024fa8b4778fbb8faf5a611be52c52ec3b58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Thu, 26 Sep 2024 16:56:01 +0100 Subject: [PATCH 05/12] paper: pushing draft to check compilation --- paper/paper.bib | 206 +++++++++++++++++++++++++++++++++++------------- paper/paper.md | 130 +++++++++--------------------- 2 files changed, 187 insertions(+), 149 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 4e4544a..ebca762 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -1,59 +1,153 @@ -@article{Pearson:2017, - url = 
{http://adsabs.harvard.edu/abs/2017arXiv170304627P}, - Archiveprefix = {arXiv}, - Author = {{Pearson}, S. and {Price-Whelan}, A.~M. and {Johnston}, K.~V.}, - Eprint = {1703.04627}, - Journal = {ArXiv e-prints}, - Keywords = {Astrophysics - Astrophysics of Galaxies}, - Month = mar, - Title = {{Gaps in Globular Cluster Streams: Pal 5 and the Galactic Bar}}, - Year = 2017 -} - -@book{Binney:2008, - url = {http://adsabs.harvard.edu/abs/2008gady.book.....B}, - Author = {{Binney}, J. and {Tremaine}, S.}, - Booktitle = {Galactic Dynamics: Second Edition, by James Binney and Scott Tremaine.~ISBN 978-0-691-13026-2 (HB).~Published by Princeton University Press, Princeton, NJ USA, 2008.}, - Publisher = {Princeton University Press}, - Title = {{Galactic Dynamics: Second Edition}}, - Year = 2008 -} - -@article{gaia, - author = {{Gaia Collaboration}}, - title = "{The Gaia mission}", - journal = {Astronomy and Astrophysics}, - archivePrefix = "arXiv", - eprint = {1609.04153}, - primaryClass = "astro-ph.IM", - keywords = {space vehicles: instruments, Galaxy: structure, astrometry, parallaxes, proper motions, telescopes}, - year = 2016, - month = nov, - volume = 595, - doi = {10.1051/0004-6361/201629272}, - url = {http://adsabs.harvard.edu/abs/2016A%26A...595A...1G}, -} - -@article{astropy, - author = {{Astropy Collaboration}}, - title = "{Astropy: A community Python package for astronomy}", - journal = {Astronomy and Astrophysics}, - archivePrefix = "arXiv", - eprint = {1307.6212}, - primaryClass = "astro-ph.IM", - keywords = {methods: data analysis, methods: miscellaneous, virtual observatory tools}, - year = 2013, - month = oct, - volume = 558, - doi = {10.1051/0004-6361/201322068}, - url = {http://adsabs.harvard.edu/abs/2013A%26A...558A..33A} -} - -@misc{fidgit, - author = {A. M. Smith and K. Thaney and M. 
Hahnel}, - title = {Fidgit: An ungodly union of GitHub and Figshare}, - year = {2020}, +@article{schoch_ncbi_2020, + title = {{NCBI} {Taxonomy}: a comprehensive update on curation, resources and tools}, + volume = {2020}, + issn = {1758-0463}, + shorttitle = {{NCBI} {Taxonomy}}, + doi = {10.1093/database/baaa062}, + abstract = {The National Center for Biotechnology Information (NCBI) Taxonomy includes organism names and classifications for every sequence in the nucleotide and protein sequence databases of the International Nucleotide Sequence Database Collaboration. Since the last review of this resource in 2012, it has undergone several improvements. Most notable is the shift from a single SQL database to a series of linked databases tied to a framework of data called NameBank. This means that relations among data elements can be adjusted in more detail, resulting in expanded annotation of synonyms, the ability to flag names with specific nomenclatural properties, enhanced tracking of publications tied to names and improved annotation of scientific authorities and types. Additionally, practices utilized by NCBI Taxonomy curators specific to major taxonomic groups are described, terms peculiar to NCBI Taxonomy are explained, external resources are acknowledged and updates to tools and other resources are documented. Database URL: https://www.ncbi.nlm.nih.gov/taxonomy.}, + language = {eng}, + journal = {Database: The Journal of Biological Databases and Curation}, + author = {Schoch, Conrad L. and Ciufo, Stacy and Domrachev, Mikhail and Hotton, Carol L. and Kannan, Sivakumar and Khovanskaya, Rogneda and Leipe, Detlef and Mcveigh, Richard and O'Neill, Kathleen and Robbertse, Barbara and Sharma, Shobha and Soussov, Vladimir and Sullivan, John P. 
and Sun, Lu and Turner, Seán and Karsch-Mizrachi, Ilene}, + month = jan, + year = {2020}, + pmid = {32761142}, + pmcid = {PMC7408187}, + keywords = {Databases, Genetic, Animals, Humans, Bacteria, Classification, Database Management Systems, National Library of Medicine (U.S.), Plants, United States, Viruses}, + pages = {baaa062}, +} + +@article{camacho_blast_2009, + title = {{BLAST}+: architecture and applications}, + volume = {10}, + issn = {1471-2105}, + shorttitle = {{BLAST}+}, + doi = {10.1186/1471-2105-10-421}, + abstract = {BACKGROUND: Sequence similarity searching is a very important bioinformatics task. While Basic Local Alignment Search Tool (BLAST) outperforms exact methods through its use of heuristics, the speed of the current BLAST software is suboptimal for very long queries or database sequences. There are also some shortcomings in the user-interface of the current command-line applications. +RESULTS: We describe features and improvements of rewritten BLAST software and introduce new command-line applications. Long query sequences are broken into chunks for processing, in some cases leading to dramatically shorter run times. For long database sequences, it is possible to retrieve only the relevant parts of the sequence, reducing CPU time and memory usage for searches of short queries against databases of contigs or chromosomes. The program can now retrieve masking information for database sequences from the BLAST databases. A new modular software library can now access subject sequence data from arbitrary data sources. We introduce several new features, including strategy files that allow a user to save and reuse their favorite set of options. The strategy files can be uploaded to and downloaded from the NCBI BLAST web site. +CONCLUSION: The new BLAST command-line applications, compared to the current BLAST tools, demonstrate substantial speed improvements for long queries as well as chromosome length database sequences. 
We have also improved the user interface of the command-line applications.}, + language = {eng}, + journal = {BMC bioinformatics}, + author = {Camacho, Christiam and Coulouris, George and Avagyan, Vahram and Ma, Ning and Papadopoulos, Jason and Bealer, Kevin and Madden, Thomas L.}, + month = dec, + year = {2009}, + pmid = {20003500}, + pmcid = {PMC2803857}, + keywords = {Computational Biology, Databases, Genetic, Sequence Alignment, Software}, + pages = {421}, +} + +@article{yu_ggtree_2017, +author = {Yu, Guangchuang and Smith, David K. and Zhu, Huachen and Guan, Yi and Lam, Tommy Tsan-Yuk}, +title = {ggtree: an r package for visualization and annotation of phylogenetic trees with their covariates and other associated data}, +journal = {Methods in Ecology and Evolution}, +volume = {8}, +number = {1}, +pages = {28-36}, +keywords = {annotation, bioconductor, evolution, phylogeny, r package, visualization}, +doi = {10.1111/2041-210X.12628}, +url = {https://besjournals.onlinelibrary.wiley.com/doi/abs/10.1111/2041-210X.12628}, +eprint = {https://besjournals.onlinelibrary.wiley.com/doi/pdf/10.1111/2041-210X.12628}, +abstract = {Summary We present an r package, ggtree, which provides programmable visualization and annotation of phylogenetic trees. ggtree can read more tree file formats than other softwares, including newick, nexus, NHX, phylip and jplace formats, and support visualization of phylo, multiphylo, phylo4, phylo4d, obkdata and phyloseq tree objects defined in other r packages. It can also extract the tree/branch/node-specific and other data from the analysis outputs of beast, epa, hyphy, paml, phylodog, pplacer, r8s, raxml and revbayes software, and allows using these data to annotate the tree. 
The package allows colouring and annotation of a tree by numerical/categorical node attributes, manipulating a tree by rotating, collapsing and zooming out clades, highlighting user selected clades or operational taxonomic units and exploration of a large tree by zooming into a selected portion. A two-dimensional tree can be drawn by scaling the tree width based on an attribute of the nodes. A tree can be annotated with an associated numerical matrix (as a heat map), multiple sequence alignment, subplots or silhouette images. The package ggtree is released under the artistic-2.0 license. The source code and documents are freely available through bioconductor (http://www.bioconductor.org/packages/ggtree).}, +year = {2017} +} + + +@incollection{kans_entrez_2024, + title = {Entrez {Direct}: {E}-utilities on the {Unix} {Command} {Line}}, + shorttitle = {Entrez {Direct}}, + url = {https://www.ncbi.nlm.nih.gov/books/NBK179288/}, + abstract = {Entrez Direct (EDirect) provides access to the NCBI's suite of interconnected databases (publication, sequence, structure, gene, variation, expression, etc.) from a Unix terminal window. Search terms are entered as command-line arguments. Individual operations are connected with Unix pipes to construct multi-step queries. 
Selected records can then be retrieved in a variety of formats.}, + language = {en}, + urldate = {2024-09-25}, + booktitle = {Entrez {Programming} {Utilities} {Help} [{Internet}]}, + publisher = {National Center for Biotechnology Information (US)}, + author = {Kans, Jonathan}, + month = jul, + year = {2024}, +} + +@misc{team_pandas-devpandas_2024, + title = {pandas-dev/pandas: {Pandas}}, + shorttitle = {pandas-dev/pandas}, + url = {https://zenodo.org/records/13819579}, + abstract = {Pandas is a powerful data structures for data analysis, time series, and statistics.}, + urldate = {2024-09-25}, + publisher = {Zenodo}, + author = {{The pandas development team}}, + month = sep, + year = {2024}, + doi = {10.5281/zenodo.13819579}, + keywords = {data science, python}, +} + +@article{ochoterena_search_2019, + title = {The {Search} for {Common} {Origin}: {Homology} {Revisited}}, + volume = {68}, + issn = {1063-5157}, + shorttitle = {The {Search} for {Common} {Origin}}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6701455/}, + doi = {10.1093/sysbio/syz013}, + abstract = {Understanding the evolution of biodiversity on Earth is a central aim in biology. Currently, various disciplines of science contribute to unravel evolution at all levels of life, from individual organisms to species and higher ranks, using different approaches and specific terminologies. The search for common origin, traditionally called homology, is a connecting paradigm of all studies related to evolution. However, it is not always sufficiently taken into account that defining homology depends on the hierarchical level studied (organism, population, and species), which can cause confusion. Therefore, we propose a framework to define homologies making use of existing terms, which refer to homology in different fields, but restricting them to an unambiguous meaning and a particular hierarchical level. 
We propose to use the overarching term “homology” only when “morphological homology,” “vertical gene transfer,” and “phylogenetic homology” are confirmed. Consequently, neither phylogenetic nor morphological homology is equal to homology. This article is intended for readers with different research backgrounds. We challenge their traditional approaches, inviting them to consider the proposed framework and offering them a new perspective for their own research.}, + number = {5}, + urldate = {2024-09-26}, + journal = {Systematic Biology}, + author = {Ochoterena, Helga and Vrijdaghs, Alexander and Smets, Erik and Claßen-Bockhoff, Regine}, + month = sep, + year = {2019}, + pmid = {30796841}, + pmcid = {PMC6701455}, + pages = {767--780}, +} + +@article{sandall_globally_2023, + title = {A globally integrated structure of taxonomy to support biodiversity science and conservation}, + volume = {38}, + issn = {0169-5347}, + url = {https://www.sciencedirect.com/science/article/pii/S016953472300215X}, + doi = {10.1016/j.tree.2023.08.004}, + abstract = {All aspects of biodiversity research, from taxonomy to conservation, rely on data associated with species names. Effective integration of names across multiple fields is paramount and depends on the coordination and organization of taxonomic data. We assess current efforts and find that even key applications for well-studied taxa still lack commonality in taxonomic information required for integration. We identify essential taxonomic elements from our interoperability assessment to support improved access and integration of taxonomic data. A stronger focus on these elements has the potential to involve taxonomic communities in biodiversity science and overcome broken linkages currently limiting research capacity. 
We encourage a community effort to democratize taxonomic expertise and language in order to facilitate maximum interoperability and integration.}, + number = {12}, + urldate = {2024-09-26}, + journal = {Trends in Ecology \& Evolution}, + author = {Sandall, Emily L. and Maureaud, Aurore A. and Guralnick, Robert and McGeoch, Melodie A. and Sica, Yanina V. and Rogan, Matthew S. and Booher, Douglas B. and Edwards, Robert and Franz, Nico and Ingenloff, Kate and Lucas, Maisha and Marsh, Charles J. and McGowan, Jennifer and Pinkert, Stefan and Ranipeta, Ajay and Uetz, Peter and Wieczorek, John and Jetz, Walter}, + month = dec, + year = {2023}, + keywords = {biodiversity conservation, data linkage, integrative science, social infrastructure, taxonomic backbone}, + pages = {1143--1153} +} + +@incollection{celko_chapter_2004, + address = {San Francisco}, + series = {The {Morgan} {Kaufmann} {Series} in {Data} {Management} {Systems}}, + title = {Chapter 4 - {Nested} {Set} {Model} of {Hierarchies}}, + isbn = {978-1-55860-920-4}, + url = {https://www.sciencedirect.com/science/article/pii/B9781558609204500052}, + abstract = {Trees are often drawn as boxes-and-arrows charts that tend to fix the mental image of a tree into a graph structure. Another way of representing trees is to show them as nested sets. The chapter highlights this approach. To show a tree as nested sets, replace the boxes with ovals and then nest subordinate ovals inside their parents. Containment represents subordination. The root will be the largest oval and will contain every other node. The leaf nodes will be the innermost ovals, with nothing else inside them, and the nesting will show the hierarchical relationship. This is a natural way to model a parts explosion because a final assembly is made of physically nested assemblies that finally break down into separate parts. 
This approach is used to model a tree with nested sets with number pairs that always contain the pairs of their subordinates so that a child node is within the bounds of its parent. The chapter uses this approach of representation of a tree graph to present techniques to perform operations, such as finding root and leaf nodes, finding subtrees, finding levels and paths in a tree, finding the height of a tree, deleting nodes and subtrees, closing gaps in the tree, using summary functions on trees, and inserting and updating trees. The chapter also provides techniques to convert nested sets into adjacent list models, and compare nodes and structures of these models. All of these techniques are explained using simple SQL codes maintaining an organizational chart table to represent the hierarchy and people of a sample organization.}, + urldate = {2024-09-26}, + booktitle = {Joe {Celko}'s {Trees} and {Hierarchies} in {SQL} for {Smarties}}, + publisher = {Morgan Kaufmann}, + author = {Celko, Joe}, + editor = {Celko, Joe}, + month = jan, + year = {2004}, + doi = {10.1016/B978-155860920-4/50005-2}, + pages = {45--99}, +} + +@misc{anytree, + author = {Cofe Code and contributors}, + title = {Anytree: Python tree data library}, + year = {2024}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/c0fec0de/anytree} +} + +@misc{bigtree, + author = {Kay Jan W. 
and contributors}, + title = {BigTree: Tree Implementation and Methods for Python, integrated with list, dictionary, pandas and polars DataFrame.}, + year = {2024}, publisher = {GitHub}, journal = {GitHub repository}, - url = {https://github.com/arfon/fidgit} + url = {https://github.com/kayjan/bigtree} } diff --git a/paper/paper.md b/paper/paper.md index 720e1e7..633846e 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,120 +1,64 @@ --- -title: 'Gala: A Python package for galactic dynamics' +title: 'Taxonomy Resolver: A Python package for building and filtering taxonomy trees' tags: - Python - - astronomy - - dynamics - - galactic dynamics - - milky way + - Taxonomy + - Tree + - Hierarchy + - NCBI Taxonomy + - NCBI BLAST+ + - Nested Set Model + - Modified Preorder Tree Traversal authors: - - name: Adrian M. Price-Whelan - orcid: 0000-0000-0000-0000 - equal-contrib: true - affiliation: "1, 2" # (Multiple affiliations must be quoted) - - name: Author Without ORCID - equal-contrib: true # (This is how you can denote equal contributions between multiple authors) - affiliation: 2 - - name: Author with no affiliation - corresponding: true # (This is how to denote the corresponding author) - affiliation: 3 - - given-names: Ludwig - dropping-particle: van - surname: Beethoven - affiliation: 3 + - name: Fábio Madeira + orcid: 0000-0001-8728-9449 + corresponding: true + affiliation: 1 + - name: Ania Niewielska + orcid: 0000-0003-0989-3389 + affiliation: 1 + - name: Sarah Butcher + orcid: 0000-0002-4494-5124 + affiliation: 1 affiliations: - - name: Lyman Spitzer, Jr. 
Fellow, Princeton University, USA + - name: European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI), Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SD, UK index: 1 ror: 00hx57361 - - name: Institution Name, Country - index: 2 - - name: Independent Researcher, Country - index: 3 -date: 13 August 2017 +date: 26 September 2024 bibliography: paper.bib -# Optional fields if submitting to a AAS journal too, see this blog post: -# https://blog.joss.theoj.org/2018/12/a-new-collaboration-with-aas-publishing -aas-doi: 10.3847/xxxxx <- update this with the DOI from AAS once you know it. -aas-journal: Astrophysical Journal <- The name of the AAS journal. --- # Summary -The forces on stars, galaxies, and dark matter under external gravitational -fields lead to the dynamical evolution of structures in the universe. The orbits -of these bodies are therefore key to understanding the formation, history, and -future state of galaxies. The field of "galactic dynamics," which aims to model -the gravitating components of galaxies to study their structure and evolution, -is now well-established, commonly taught, and frequently used in astronomy. -Aside from toy problems and demonstrations, the majority of problems require -efficient numerical tools, many of which require the same base code (e.g., for -performing numerical orbit integration). +Taxonomy classification provides an important source of information for studying biological systems. It is a key component for many areas of biological sciences research, particularly genetics, evolutionary biology, biodiversity and conservation [@sandall_globally_2023]. Common ancestry, homology and conservation of sequence and structure are all central ideas in biology that are directly related to the evolutionary history of any group of organisms [@ochoterena_search_2019]. 
The National Center for Biotechnology Information (NCBI) Taxonomy [@schoch_ncbi_2020] provides a curated classification and nomenclature for all the organisms in the public sequence databases, across the taxonomic ranks (i.e. Domain, Kingdom, Phylum, Class, Order, Family, Genus and Species). -# Statement of need - -`Gala` is an Astropy-affiliated Python package for galactic dynamics. Python -enables wrapping low-level languages (e.g., C) for speed without losing -flexibility or ease-of-use in the user-interface. The API for `Gala` was -designed to provide a class-based and user-friendly interface to fast (C or -Cython-optimized) implementations of common operations such as gravitational -potential and force evaluation, orbit integration, dynamical transformations, -and chaos indicators for nonlinear dynamics. `Gala` also relies heavily on and -interfaces well with the implementations of physical units and astronomical -coordinate systems in the `Astropy` package [@astropy] (`astropy.units` and -`astropy.coordinates`). - -`Gala` was designed to be used by both astronomical researchers and by -students in courses on gravitational dynamics or astronomy. It has already been -used in a number of scientific publications [@Pearson:2017] and has also been -used in graduate courses on Galactic dynamics to, e.g., provide interactive -visualizations of textbook material [@Binney:2008]. The combination of speed, -design, and support for Astropy functionality in `Gala` will enable exciting -scientific explorations of forthcoming data releases from the *Gaia* mission -[@gaia] by students and experts alike. - -# Mathematics +Here we describe ``Taxonomy Resolver``, a Python module and command-line interface (CLI) application for building and filtering taxonomy trees based on the NCBI Taxonomy. Taxonomy Resolver streamlines the process of manipulating trees, enabling fast tree traversal, searching and filtering. -Single dollars ($) are required for inline mathematics e.g. 
$f(x) = e^{\pi/x}$ - -Double dollars make self-standing equations: - -$$\Theta(x) = \left\{\begin{array}{l} -0\textrm{ if } x < 0\cr -1\textrm{ else} -\end{array}\right.$$ - -You can also use plain \LaTeX for equations -\begin{equation}\label{eq:fourier} -\hat f(\omega) = \int_{-\infty}^{\infty} f(x) e^{i\omega x} dx -\end{equation} -and refer to \autoref{eq:fourier} from text. +# Statement of need -# Citations +The NCBI Taxonomy Database [@schoch_ncbi_2020] provides a hierarchically arranged list of organisms across all domains of life found in the sequence databases. Tree filtering, i.e. generation of tree subsets, referred to as subtrees, has various applications for sequence analysis, particularly for reducing the search space of sequence similarity searching algorithms. A sequence dataset composed of sequences from diverse taxa can be more quickly searched if only a subset of sequences which belong to taxonomies of interest are selected. -Citations to entries in paper.bib should be in -[rMarkdown](http://rmarkdown.rstudio.com/authoring_bibliographies_and_citations.html) -format. +The NCBI BLAST+ suite is the most widely used toolset in bioinformatics for performing sequence similarity search [@camacho_blast_2009]. The suite provides a Bash script (`get_species_taxids.sh`) to convert NCBI Taxonomy identifiers (TaxIDs) or text into TaxIDs suitable for filtering sequence searches. While this is a useful utility, it only works with sequences submitted to GenBank or other NCBI-hosted databases, and more importantly, it relies on making API calls via Entrez Direct (EDirect) [@kans_entrez_2024]. EDirect requires an internet connection and it does not scale well when working with large sequence datasets. Other general-purpose tree libraries exist for Python (e.g. ``anytree`` [@anytree] and ``bigtree`` [@bigtree]) and R (e.g. ``ggtree`` [@yu_ggtree_2017]), but they do not support the core features provided by Taxonomy Resolver or focus mainly on tree visualisation. 
The development of Taxonomy Resolver started in 2020 and aims to provide user-friendly interfaces for working directly with the NCBI Taxonomy hierarchical dataset. -If you want to cite a software repository URL (e.g. something on GitHub without a preferred -citation) then you can do it with the example BibTeX entry below for @fidgit. +# Implementation -For a quick reference, the following citation commands can be used: -- `@author:2001` -> "Author et al. (2001)" -- `[@author:2001]` -> "(Author et al., 2001)" -- `[@author1:2001; @author2:2001]` -> "(Author1 et al., 2001; Author2 et al., 2002)" +Taxonomy Resolver has been developed with simplicity in mind and it can be used either as a standard Python module or as a CLI application. The main tasks performed by Taxonomy Resolver are: -# Figures +* **downloading** the NCBI Taxonomy classification hierarchy “dump” from the NCBI FTP server +* **building** complete taxonomy tree data structures or partial trees, i.e. subtrees +* **searching** particular TaxIDs at any level of the taxonomy hierarchy, performing fast tree traversal +* **validating** TaxIDs against the NCBI Taxonomy or any given subtree +* **generating** taxonomy lists that compose any subtree, at any level of the taxonomy hierarchy +* **filtering** a tree based on the inclusion and/or exclusion of certain TaxIDs +* **writing** and loading tree data structures using Python’s object serialisation -Figures can be included like this: -![Caption for example figure.\label{fig:example}](figure.png) -and referenced from text using \autoref{fig:example}. +A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves.
An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' (*lft* and *rgt*, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes where $lft > Node's lft$ and $rgt < Node's rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < Node's lft$ and $rgt > Node's rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@team_pandas-devpandas_2024]. -Figure sizes can be customized by adding an optional second parameter: -![Caption for example figure.](figure.png){ width=20% } +In conclusion, Taxonomy Resolver has been developed to take advantage of the Nested Set Model tree structure, so it can perform fast validation and create lists of taxa that compose a particular subtree. Inclusion and exclusion lists can also be seamlessly used to produce subset trees with wide applications, particularly for sequence similarity search. 
# Acknowledgements -We acknowledge contributions from Brigitta Sipocz, Syrtis Major, and Semyeong -Oh, and support from Kathryn Johnston during the genesis of this project. +We would like to thank past and current members of the EMBL-EBI for their continued support. We would like to also thank EMBL and its funders. # References From 4814904d6c4d519ed80a58fd24b700fb2a6180cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Thu, 26 Sep 2024 17:06:53 +0100 Subject: [PATCH 06/12] paper: some fixes and improvements --- paper/paper.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 633846e..234ccc6 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -21,9 +21,9 @@ authors: orcid: 0000-0002-4494-5124 affiliation: 1 affiliations: - - name: European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI), Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SD, UK + - name: 'European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI), Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SD, UK' index: 1 - ror: 00hx57361 + ror: 02catss52 date: 26 September 2024 bibliography: paper.bib @@ -39,7 +39,7 @@ Here we describe ``Taxonomy Resolver``, a Python module and command-line interfa The NCBI Taxonomy Database [@schoch_ncbi_2020] provides a hierarchically arranged list of organisms across all domains of life found in the sequence databases. Tree filtering, i.e. generation of tree subsets, referred to as subtrees, has various applications for sequence analysis, particularly for reducing the search space of sequence similarity searching algorithms. A sequence dataset composed of sequences from diverse taxa can be more quickly searched if only a subset of sequences which belong to taxonomies of interest are selected. -The NCBI BLAST+ suite is the most widely used toolset in bioinformatics for performing sequence similarity search (4). 
The suite provides a Bash script (`get_species_taxids.sh`) to convert NCBI Taxonomy identifiers (TaxIDs) or text into TaxIDs suitable for filtering sequence searches. While this is a useful utility, it only works with sequences submitted to GenBank or other NCBI-hosted databases, and more importantly, it relies on making API calls via Entrez Direct (EDirect) [@kans_entrez_2024]. EDirect requires an internet connection and it does not scale well when working with large sequence datasets. Other general-purpose tree libraries exist for Python (e.g. ``anytree`` [@anytree] and ``bigtree`` [@bigtree]) and R (e.g. ``ggtree`` [@yu_ggtree_2017]), but they do not support the core features provided by Taxonomy Resolver or focus mainly on tree visualisation. The development of Taxonomy Resolver started in 2020 and aims to provide user-friendly interfaces for working directly with the NCBI Taxonomy hierarchical dataset. +The NCBI BLAST+ suite is the most widely used toolset in bioinformatics for performing sequence similarity search [@camacho_blast_2009]. The suite provides a Bash script (`get_species_taxids.sh`) to convert NCBI Taxonomy identifiers (TaxIDs) or text into TaxIDs suitable for filtering sequence searches. While this is a useful utility, it only works with sequences submitted to GenBank or other NCBI-hosted databases, and more importantly, it relies on making API calls via Entrez Direct (EDirect) [@kans_entrez_2024]. EDirect requires an internet connection and it does not scale well when working with large sequence datasets. Other general-purpose tree libraries exist for Python (e.g. ``anytree`` [@anytree] and ``bigtree`` [@bigtree]) and R (e.g. ``ggtree`` [@yu_ggtree_2017]), but they do not support the core features provided by Taxonomy Resolver or focus mainly on tree visualisation. The development of Taxonomy Resolver started in 2020 and aims to provide user-friendly interfaces for working directly with the NCBI Taxonomy hierarchical dataset. 
# Implementation @@ -51,9 +51,9 @@ Taxonomy Resolver has been developed with simplicity in mind and it can be used * **validating** TaxIDs against the NCBI Taxonomy or any given subtree * **generating** taxonomy lists that compose any subtree, at any level of the taxonomy hierarchy * **filtering** a tree based on the inclusion and/or exclusion of certain TaxIDs -* **writing** and loading tree data structures using Python’s object serialisation +* **writing and loading** tree data structures using Python’s object serialisation -A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' (*lft* and *rgt*, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes where $lft > Node's lft$ and $rgt < Node's rgt$. 
Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < Node's lft$ and $rgt > Node's rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@team_pandas-devpandas_2024]. +A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes of interest where $lft > node lft$ and $rgt < node rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node lft$ and $rgt > node rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@team_pandas-devpandas_2024]. 
In conclusion, Taxonomy Resolver has been developed to take advantage of the Nested Set Model tree structure, so it can perform fast validation and create lists of taxa that compose a particular subtree. Inclusion and exclusion lists can also be seamlessly used to produce subset trees with wide applications, particularly for sequence similarity search. From 3cfc42f570e74e241403926199923ba1fb7e823e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Thu, 26 Sep 2024 17:10:29 +0100 Subject: [PATCH 07/12] paper: fixing pandas citation --- paper/paper.bib | 4 ++-- paper/paper.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index ebca762..60b6741 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -66,14 +66,14 @@ @incollection{kans_entrez_2024 year = {2024}, } -@misc{team_pandas-devpandas_2024, +@misc{pandas_2024, title = {pandas-dev/pandas: {Pandas}}, shorttitle = {pandas-dev/pandas}, url = {https://zenodo.org/records/13819579}, abstract = {Pandas is a powerful data structures for data analysis, time series, and statistics.}, urldate = {2024-09-25}, publisher = {Zenodo}, - author = {team, The pandas development}, + author = {The pandas development team}, month = sep, year = {2024}, doi = {10.5281/zenodo.13819579}, diff --git a/paper/paper.md b/paper/paper.md index 234ccc6..4f046ad 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -53,7 +53,7 @@ Taxonomy Resolver has been developed with simplicity in mind and it can be used * **filtering** a tree based on the inclusion and/or exclusion of certain TaxIDs * **writing and loading** tree data structures using Python’s object serialisation -A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. 
An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes of interest where $lft > node lft$ and $rgt < node rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node lft$ and $rgt > node rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@team_pandas-devpandas_2024]. +A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. 
To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes of interest where $lft > node lft$ and $rgt < node rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node lft$ and $rgt > node rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. In conclusion, Taxonomy Resolver has been developed to take advantage of the Nested Set Model tree structure, so it can perform fast validation and create lists of taxa that compose a particular subtree. Inclusion and exclusion lists can also be seamlessly used to produce subset trees with wide applications, particularly for sequence similarity search. 
From 0f0a11a1b2c1c3fe2f4bccf5ae0883c52686df9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Thu, 26 Sep 2024 17:28:13 +0100 Subject: [PATCH 08/12] paper: additional fixes --- paper/paper.bib | 50 ++++++++++++++++++++++++------------------------- paper/paper.md | 2 +- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 60b6741..d30b828 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -37,18 +37,18 @@ @article{camacho_blast_2009 } @article{yu_ggtree_2017, -author = {Yu, Guangchuang and Smith, David K. and Zhu, Huachen and Guan, Yi and Lam, Tommy Tsan-Yuk}, -title = {ggtree: an r package for visualization and annotation of phylogenetic trees with their covariates and other associated data}, -journal = {Methods in Ecology and Evolution}, -volume = {8}, -number = {1}, -pages = {28-36}, -keywords = {annotation, bioconductor, evolution, phylogeny, r package, visualization}, -doi = {https://doi.org/10.1111/2041-210X.12628}, -url = {https://besjournals.onlinelibrary.wiley.com/doi/abs/10.1111/2041-210X.12628}, -eprint = {https://besjournals.onlinelibrary.wiley.com/doi/pdf/10.1111/2041-210X.12628}, -abstract = {Summary We present an r package, ggtree, which provides programmable visualization and annotation of phylogenetic trees. ggtree can read more tree file formats than other softwares, including newick, nexus, NHX, phylip and jplace formats, and support visualization of phylo, multiphylo, phylo4, phylo4d, obkdata and phyloseq tree objects defined in other r packages. It can also extract the tree/branch/node-specific and other data from the analysis outputs of beast, epa, hyphy, paml, phylodog, pplacer, r8s, raxml and revbayes software, and allows using these data to annotate the tree.
The package allows colouring and annotation of a tree by numerical/categorical node attributes, manipulating a tree by rotating, collapsing and zooming out clades, highlighting user selected clades or operational taxonomic units and exploration of a large tree by zooming into a selected portion. A two-dimensional tree can be drawn by scaling the tree width based on an attribute of the nodes. A tree can be annotated with an associated numerical matrix (as a heat map), multiple sequence alignment, subplots or silhouette images. The package ggtree is released under the artistic-2.0 license. The source code and documents are freely available through bioconductor (http://www.bioconductor.org/packages/ggtree).}, -year = {2017} + author = {Yu, Guangchuang and Smith, David K. and Zhu, Huachen and Guan, Yi and Lam, Tommy Tsan-Yuk}, + title = {ggtree: an r package for visualization and annotation of phylogenetic trees with their covariates and other associated data}, + journal = {Methods in Ecology and Evolution}, + volume = {8}, + number = {1}, + pages = {28-36}, + keywords = {annotation, bioconductor, evolution, phylogeny, r package, visualization}, + doi = {10.1111/2041-210X.12628}, + url = {https://besjournals.onlinelibrary.wiley.com/doi/abs/10.1111/2041-210X.12628}, + eprint = {https://besjournals.onlinelibrary.wiley.com/doi/pdf/10.1111/2041-210X.12628}, + abstract = {Summary We present an r package, ggtree, which provides programmable visualization and annotation of phylogenetic trees. ggtree can read more tree file formats than other softwares, including newick, nexus, NHX, phylip and jplace formats, and support visualization of phylo, multiphylo, phylo4, phylo4d, obkdata and phyloseq tree objects defined in other r packages. It can also extract the tree/branch/node-specific and other data from the analysis outputs of beast, epa, hyphy, paml, phylodog, pplacer, r8s, raxml and revbayes software, and allows using these data to annotate the tree.
The package allows colouring and annotation of a tree by numerical/categorical node attributes, manipulating a tree by rotating, collapsing and zooming out clades, highlighting user selected clades or operational taxonomic units and exploration of a large tree by zooming into a selected portion. A two-dimensional tree can be drawn by scaling the tree width based on an attribute of the nodes. A tree can be annotated with an associated numerical matrix (as a heat map), multiple sequence alignment, subplots or silhouette images. The package ggtree is released under the artistic-2.0 license. The source code and documents are freely available through bioconductor (http://www.bioconductor.org/packages/ggtree).}, + year = {2017} } @@ -73,7 +73,7 @@ @misc{pandas_2024 abstract = {Pandas is a powerful data structures for data analysis, time series, and statistics.}, urldate = {2024-09-25}, publisher = {Zenodo}, - author = {The pandas development team}, + author = {"The pandas development team"}, month = sep, year = {2024}, doi = {10.5281/zenodo.13819579}, @@ -135,19 +135,19 @@ @incollection{celko_chapter_2004 } @misc{anytree, - author = {Cofe Code and contributors}, - title = {Anytree: Python tree data library}, - year = {2024}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/c0fec0de/anytree} + author = {"Cofe Code and contributors"}, + title = {Anytree: Python tree data library}, + year = {2024}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/c0fec0de/anytree} } @misc{bigtree, - author = {Kay Jan W. and contributors}, - title = {BigTree: Tree Implementation and Methods for Python, integrated with list, dictionary, pandas and polars DataFrame.}, - year = {2024}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {ttps://github.com/kayjan/bigtree} + author = {"Kay Jan W. 
and contributors"}, + title = {BigTree: Tree Implementation and Methods for Python, integrated with list, dictionary, pandas and polars DataFrame.}, + year = {2024}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/kayjan/bigtree} } diff --git a/paper/paper.md b/paper/paper.md index 4f046ad..6dec8bb 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -53,7 +53,7 @@ Taxonomy Resolver has been developed with simplicity in mind and it can be used * **filtering** a tree based on the inclusion and/or exclusion of certain TaxIDs * **writing and loading** tree data structures using Python’s object serialisation -A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes.
Finding a subtree is as simple as searching for the nodes of interest where $lft > node lft$ and $rgt < node rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node lft$ and $rgt > node rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. +A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes of interest where $lft > node's\ \ lft$ and $rgt < node's\ \ rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node's\ \ lft$ and $rgt > node's\ \ rgt$. 
Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. In conclusion, Taxonomy Resolver has been developed to take advantage of the Nested Set Model tree structure, so it can perform fast validation and create lists of taxa that compose a particular subtree. Inclusion and exclusion lists can also be seamlessly used to produce subset trees with wide applications, particularly for sequence similarity search. From 60f3775da0afb552c601ae7fc1cdfc38edfd730f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Thu, 26 Sep 2024 17:36:08 +0100 Subject: [PATCH 09/12] paper: attempting to fix citation --- paper/paper.bib | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index d30b828..127c823 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -73,7 +73,7 @@ @misc{pandas_2024 abstract = {Pandas is a powerful data structures for data analysis, time series, and statistics.}, urldate = {2024-09-25}, publisher = {Zenodo}, - author = {"The pandas development team"}, + author = {{The pandas development team}}, month = sep, year = {2024}, doi = {10.5281/zenodo.13819579}, @@ -135,7 +135,7 @@ @incollection{celko_chapter_2004 } @misc{anytree, - author = {"Cofe Code and contributors"}, + author = {{Cofe Code and contributors}}, title = {Anytree: Python tree data library}, year = {2024}, publisher = {GitHub}, @@ -144,7 +144,7 @@ @misc{anytree } @misc{bigtree, - author = {"Kay Jan W. and contributors"}, + author = {{Kay Jan W. 
and contributors}}, title = {BigTree: Tree Implementation and Methods for Python, integrated with list, dictionary, pandas and polars DataFrame.}, year = {2024}, publisher = {GitHub}, From 61149a20394eea31121996162dfc9f64925316a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Thu, 26 Sep 2024 18:04:55 +0100 Subject: [PATCH 10/12] paper: additional fixes and authors --- paper/paper.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 6dec8bb..d538403 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -14,6 +14,15 @@ authors: orcid: 0000-0001-8728-9449 corresponding: true affiliation: 1 + - name: Nandana Madhusoodanan + orcid: 0000-0001-5004-152X + affiliation: 1 + - name: Alberto Eusebi + orcid: 0000-0001-5179-7724 + affiliation: 1 + - name: Joonheung Lee + orcid: 0000-0002-5760-2761 + affiliation: 1 - name: Ania Niewielska orcid: 0000-0003-0989-3389 affiliation: 1 @@ -21,9 +30,10 @@ authors: orcid: 0000-0002-4494-5124 affiliation: 1 affiliations: - - name: 'European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI), Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SD, UK' - index: 1 - ror: 02catss52 + - name: | + European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI), + Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SD, UK + index: 1 date: 26 September 2024 bibliography: paper.bib @@ -53,7 +63,7 @@ Taxonomy Resolver has been developed with simplicity in mind and it can be used * **filtering** a tree based on the inclusion and/or exclusion of certain TaxIDs * **writing and loading** tree data structures using Python’s object serialisation -A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. 
An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes of interest where $lft > node's\ \ lft$ and $rgt < node's\ \ rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node's\ \ lft$ and $rgt > node's\ \ rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. +A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. 
To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes of interest where $lft > node's\ lft$ and $rgt < node's\ rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node's\ lft$ and $rgt > node's\ rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. In conclusion, Taxonomy Resolver has been developed to take advantage of the Nested Set Model tree structure, so it can perform fast validation and create lists of taxa that compose a particular subtree. Inclusion and exclusion lists can also be seamlessly used to produce subset trees with wide applications, particularly for sequence similarity search. 
From 0c386470af58eafc699bc61334d9cbd58057aa4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Fri, 27 Sep 2024 14:18:05 +0100 Subject: [PATCH 11/12] paper: updated conclusion --- paper/paper.bib | 16 ++++++++++++++++ paper/paper.md | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/paper/paper.bib b/paper/paper.bib index 127c823..095ab67 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -151,3 +151,19 @@ @misc{bigtree journal = {GitHub repository}, url = {https://github.com/kayjan/bigtree} } + +@article{madeira_2024, + author = {Madeira, Fábio and Madhusoodanan, Nandana and Lee, Joonheung and Eusebi, Alberto and Niewielska, Ania and Tivey, Adrian R N and Lopez, Rodrigo and Butcher, Sarah}, + title = "{The EMBL-EBI Job Dispatcher sequence analysis tools framework in 2024}", + journal = {Nucleic Acids Research}, + volume = {52}, + number = {W1}, + pages = {W521-W525}, + year = {2024}, + month = {04}, + abstract = "{The EMBL-EBI Job Dispatcher sequence analysis tools framework (https://www.ebi.ac.uk/jdispatcher) enables the scientific community to perform a diverse range of sequence analyses using popular bioinformatics applications. Free access to the tools and required sequence datasets is provided through user-friendly web applications, as well as via RESTful and SOAP-based APIs. These are integrated into popular EMBL-EBI resources such as UniProt, InterPro, ENA and Ensembl Genomes. 
This paper overviews recent improvements to Job Dispatcher, including its brand new website and documentation, enhanced visualisations, improved job management, and a rising trend of user reliance on the service from low- and middle-income regions.}", + issn = {0305-1048}, + doi = {10.1093/nar/gkae241}, + url = {https://doi.org/10.1093/nar/gkae241}, + eprint = {https://academic.oup.com/nar/article-pdf/52/W1/W521/58436149/gkae241.pdf}, +} diff --git a/paper/paper.md b/paper/paper.md index d538403..080d86d 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -65,7 +65,8 @@ Taxonomy Resolver has been developed with simplicity in mind and it can be used A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. 
Finding a subtree is as simple as searching for the nodes of interest where $lft > node's\ lft$ and $rgt < node's\ rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node's\ lft$ and $rgt > node's\ rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. -In conclusion, Taxonomy Resolver has been developed to take advantage of the Nested Set Model tree structure, so it can perform fast validation and create lists of taxa that compose a particular subtree. Inclusion and exclusion lists can also be seamlessly used to produce subset trees with wide applications, particularly for sequence similarity search. +# Conclusion +Taxonomy Resolver has been developed to take advantage of the Nested Set Model tree structure, so it can perform fast validation and create lists of taxa that compose a particular subtree. Inclusion and exclusion lists can also be seamlessly used to produce subset trees with wide applications, particularly for sequence similarity search. Taxonomy Resolver has been in production since 2020, serving thousands of users every month. It provides taxonomy filtering features for NCBI BLAST+ offered by the popular EMBL-EBI Job Dispatcher service, available from https://www.ebi.ac.uk/jdispatcher/sss/ncbiblast [@madeira_2024]. 
# Acknowledgements From ae247a661e183ad0e28606964b28ccf709e4cc57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fa=CC=81bio=20Madeira?= Date: Mon, 30 Sep 2024 17:01:15 +0100 Subject: [PATCH 12/12] paper: small edits --- paper/paper.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 080d86d..9d3d495 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -51,7 +51,7 @@ The NCBI Taxonomy Database [@schoch_ncbi_2020] provides a hierarchically arrange The NCBI BLAST+ suite is the most widely used toolset in bioinformatics for performing sequence similarity search [@camacho_blast_2009]. The suite provides a Bash script (`get_species_taxids.sh`) to convert NCBI Taxonomy identifiers (TaxIDs) or text into TaxIDs suitable for filtering sequence searches. While this is a useful utility, it only works with sequences submitted to GenBank or other NCBI-hosted databases, and more importantly, it relies on making API calls via Entrez Direct (EDirect) [@kans_entrez_2024]. EDirect requires an internet connection and it does not scale well when working with large sequence datasets. Other general-purpose tree libraries exist for Python (e.g. ``anytree`` [@anytree] and ``bigtree`` [@bigtree]) and R (e.g. ``ggtree`` [@yu_ggtree_2017]), but they do not support the core features provided by Taxonomy Resolver or focus mainly on tree visualisation. The development of Taxonomy Resolver started in 2020 and aims to provide user-friendly interfaces for working directly with the NCBI Taxonomy hierarchical dataset. -# Implementation +# Features Taxonomy Resolver has been developed with simplicity in mind and it can be used both as a standard Python module and as a CLI application. 
The main tasks performed by Taxonomy Resolver are: @@ -63,9 +63,10 @@ Taxonomy Resolver has been developed with simplicity in mind and it can be used * **filtering** a tree based on the inclusion and/or exclusion of certain TaxIDs * **writing and loading** tree data structures using Python’s object serialisation -A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left sub-tree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes of interest where $lft > node's\ lft$ and $rgt < node's\ rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node's\ lft$ and $rgt > node's\ rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. 
+# Implementation + +A taxonomy tree is a hierarchical structure that can be seen as a collection of deeply nested containers - nodes connected by edges, following the hierarchy, from the parent node - the root, all the way down to the children nodes - the leaves. An object-oriented programming (OOP) tree implementation based on recursion does not typically scale well for large trees, such as the NCBI Taxonomy, which is composed of >2.6 million nodes. To improve performance, Taxonomy Resolver represents the tree structure following the Nested Set Model, which is a technique developed to represent hierarchical data in relational databases lacking recursion capabilities. This allows for efficient and inexpensive querying of parent-child relationships. The full tree is traversed following the Modified Preorder Tree Traversal (MPTT) strategy [@celko_chapter_2004], in which each node in the tree is visited twice. In a preorder traversal, the root node is visited first, then recursively a preorder traversal of the left subtree, followed by a recursive preorder traversal of the right subtree, in order, until every node has been visited. The modified strategy allows capturing the 'left' and 'right' ($lft$ and $rgt$, respectively) boundaries of each subtree, which are stored as two additional attributes. Finding a subtree is as simple as searching for the nodes of interest where $lft > node's\ lft$ and $rgt < node's\ rgt$. Likewise, finding the full path to a node is as simple as searching for the nodes where $lft < node's\ lft$ and $rgt > node's\ rgt$. Traversal attributes, depth and node indexes are captured for each tree node and are stored as a pandas DataFrame [@pandas_2024]. -# Conclusion Taxonomy Resolver has been developed to take advantage of the Nested Set Model tree structure, so it can perform fast validation and create lists of taxa that compose a particular subtree. 
Inclusion and exclusion lists can also be seamlessly used to produce subset trees with wide applications, particularly for sequence similarity search. Taxonomy Resolver has been in production since 2020, serving thousands of users every month. It provides taxonomy filtering features for NCBI BLAST+ offered by the popular EMBL-EBI Job Dispatcher service, available from https://www.ebi.ac.uk/jdispatcher/sss/ncbiblast [@madeira_2024]. # Acknowledgements