ebi-jdispatcher · biomadeira · Oct 15, 2024 · Sep 24, 2024 · Sep 24, 2024 · Sep 24, 2024
diff --git a/.github/workflows/compile_paper.yml b/.github/workflows/compile_paper.yml
@@ -0,0 +1,24 @@
+name: Draft PDF
+on: [push]
+
+jobs:
+  paper:
+    runs-on: ubuntu-latest
+    name: Paper Draft
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Build draft PDF
+        uses: openjournals/openjournals-draft-action@master
+        with:
+          journal: joss
+          # This should be the path to the paper within your repo.
+          paper-path: paper/paper.md
+      - name: Upload
+        uses: actions/upload-artifact@v4
+        with:
+          name: paper
+          # This is the output path where Pandoc will write the compiled
+          # PDF. Note, this should be the same directory as the input
+          # paper.md
+          path: paper/paper.pdf
diff --git a/paper/paper.bib b/paper/paper.bib
@@ -0,0 +1,169 @@
+@article{schoch_ncbi_2020,
+	title = {{NCBI} {Taxonomy}: a comprehensive update on curation, resources and tools},
+	volume = {2020},
+	issn = {1758-0463},
+	shorttitle = {{NCBI} {Taxonomy}},
+	doi = {10.1093/database/baaa062},
+	abstract = {The National Center for Biotechnology Information (NCBI) Taxonomy includes organism names and classifications for every sequence in the nucleotide and protein sequence databases of the International Nucleotide Sequence Database Collaboration. Since the last review of this resource in 2012, it has undergone several improvements. Most notable is the shift from a single SQL database to a series of linked databases tied to a framework of data called NameBank. This means that relations among data elements can be adjusted in more detail, resulting in expanded annotation of synonyms, the ability to flag names with specific nomenclatural properties, enhanced tracking of publications tied to names and improved annotation of scientific authorities and types. Additionally, practices utilized by NCBI Taxonomy curators specific to major taxonomic groups are described, terms peculiar to NCBI Taxonomy are explained, external resources are acknowledged and updates to tools and other resources are documented. Database URL: https://www.ncbi.nlm.nih.gov/taxonomy.},
+	language = {eng},
+	journal = {Database: The Journal of Biological Databases and Curation},
+	author = {Schoch, Conrad L. and Ciufo, Stacy and Domrachev, Mikhail and Hotton, Carol L. and Kannan, Sivakumar and Khovanskaya, Rogneda and Leipe, Detlef and Mcveigh, Richard and O'Neill, Kathleen and Robbertse, Barbara and Sharma, Shobha and Soussov, Vladimir and Sullivan, John P. and Sun, Lu and Turner, Seán and Karsch-Mizrachi, Ilene},
+	month = jan,
+	year = {2020},
+	pmid = {32761142},
+	pmcid = {PMC7408187},
+	keywords = {Databases, Genetic, Animals, Humans, Bacteria, Classification, Database Management Systems, National Library of Medicine (U.S.), Plants, United States, Viruses},
+	pages = {baaa062},
+}
+
+@article{camacho_blast_2009,
+	title = {{BLAST}+: architecture and applications},
+	volume = {10},
+	issn = {1471-2105},
+	shorttitle = {{BLAST}+},
+	doi = {10.1186/1471-2105-10-421},
+	abstract = {BACKGROUND: Sequence similarity searching is a very important bioinformatics task. While Basic Local Alignment Search Tool (BLAST) outperforms exact methods through its use of heuristics, the speed of the current BLAST software is suboptimal for very long queries or database sequences. There are also some shortcomings in the user-interface of the current command-line applications.
+RESULTS: We describe features and improvements of rewritten BLAST software and introduce new command-line applications. Long query sequences are broken into chunks for processing, in some cases leading to dramatically shorter run times. For long database sequences, it is possible to retrieve only the relevant parts of the sequence, reducing CPU time and memory usage for searches of short queries against databases of contigs or chromosomes. The program can now retrieve masking information for database sequences from the BLAST databases. A new modular software library can now access subject sequence data from arbitrary data sources. We introduce several new features, including strategy files that allow a user to save and reuse their favorite set of options. The strategy files can be uploaded to and downloaded from the NCBI BLAST web site.
+CONCLUSION: The new BLAST command-line applications, compared to the current BLAST tools, demonstrate substantial speed improvements for long queries as well as chromosome length database sequences. We have also improved the user interface of the command-line applications.},
+	language = {eng},
+	journal = {BMC bioinformatics},
+	author = {Camacho, Christiam and Coulouris, George and Avagyan, Vahram and Ma, Ning and Papadopoulos, Jason and Bealer, Kevin and Madden, Thomas L.},
+	month = dec,
+	year = {2009},
+	pmid = {20003500},
+	pmcid = {PMC2803857},
+	keywords = {Computational Biology, Databases, Genetic, Sequence Alignment, Software},
+	pages = {421},
+}
+
+@article{yu_ggtree_2017,
+	author = {Yu, Guangchuang and Smith, David K. and Zhu, Huachen and Guan, Yi and Lam, Tommy Tsan-Yuk},
+	title = {{ggtree}: an {R} package for visualization and annotation of phylogenetic trees with their covariates and other associated data},
+	journal = {Methods in Ecology and Evolution},
+	volume = {8},
+	number = {1},
+	pages = {28--36},
+	keywords = {annotation, bioconductor, evolution, phylogeny, r package, visualization},
+	doi = {10.1111/2041-210X.12628},
+	url = {https://besjournals.onlinelibrary.wiley.com/doi/abs/10.1111/2041-210X.12628},
+	eprint = {https://besjournals.onlinelibrary.wiley.com/doi/pdf/10.1111/2041-210X.12628},
+	abstract = {Summary We present an r package, ggtree, which provides programmable visualization and annotation of phylogenetic trees. ggtree can read more tree file formats than other softwares, including newick, nexus, NHX, phylip and jplace formats, and support visualization of phylo, multiphylo, phylo4, phylo4d, obkdata and phyloseq tree objects defined in other r packages. It can also extract the tree/branch/node-specific and other data from the analysis outputs of beast, epa, hyphy, paml, phylodog, pplacer, r8s, raxml and revbayes software, and allows using these data to annotate the tree. The package allows colouring and annotation of a tree by numerical/categorical node attributes, manipulating a tree by rotating, collapsing and zooming out clades, highlighting user selected clades or operational taxonomic units and exploration of a large tree by zooming into a selected portion. A two-dimensional tree can be drawn by scaling the tree width based on an attribute of the nodes. A tree can be annotated with an associated numerical matrix (as a heat map), multiple sequence alignment, subplots or silhouette images. The package ggtree is released under the artistic-2.0 license. The source code and documents are freely available through bioconductor (http://www.bioconductor.org/packages/ggtree).},
+	year = {2017}
+}
+
+
+@incollection{kans_entrez_2024,
+	title = {Entrez {Direct}: {E}-utilities on the {Unix} {Command} {Line}},
+	shorttitle = {Entrez {Direct}},
+	url = {https://www.ncbi.nlm.nih.gov/books/NBK179288/},
+	abstract = {Entrez Direct (EDirect) provides access to the NCBI's suite of interconnected databases (publication, sequence, structure, gene, variation, expression, etc.) from a Unix terminal window. Search terms are entered as command-line arguments. Individual operations are connected with Unix pipes to construct multi-step queries. Selected records can then be retrieved in a variety of formats.},
+	language = {en},
+	urldate = {2024-09-25},
+	booktitle = {Entrez {Programming} {Utilities} {Help} [{Internet}]},
+	publisher = {National Center for Biotechnology Information (US)},
+	author = {Kans, Jonathan},
+	month = jul,
+	year = {2024},
+}
+
+@misc{pandas_2024,
+	title = {pandas-dev/pandas: {Pandas}},
+	shorttitle = {pandas-dev/pandas},
+	url = {https://zenodo.org/records/13819579},
+	abstract = {Pandas is a powerful data structures for data analysis, time series, and statistics.},
+	urldate = {2024-09-25},
+	publisher = {Zenodo},
+	author = {{The pandas development team}},
+	month = sep,
+	year = {2024},
+	doi = {10.5281/zenodo.13819579},
+	keywords = {data science, python},
+}
+
+@article{ochoterena_search_2019,
+	title = {The {Search} for {Common} {Origin}: {Homology} {Revisited}},
+	volume = {68},
+	issn = {1063-5157},
+	shorttitle = {The {Search} for {Common} {Origin}},
+	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6701455/},
+	doi = {10.1093/sysbio/syz013},
+	abstract = {Understanding the evolution of biodiversity on Earth is a central aim in biology. Currently, various disciplines of science contribute to unravel evolution at all levels of life, from individual organisms to species and higher ranks, using different approaches and specific terminologies. The search for common origin, traditionally called homology, is a connecting paradigm of all studies related to evolution. However, it is not always sufficiently taken into account that defining homology depends on the hierarchical level studied (organism, population, and species), which can cause confusion. Therefore, we propose a framework to define homologies making use of existing terms, which refer to homology in different fields, but restricting them to an unambiguous meaning and a particular hierarchical level. We propose to use the overarching term “homology” only when “morphological homology,” “vertical gene transfer,” and “phylogenetic homology” are confirmed. Consequently, neither phylogenetic nor morphological homology is equal to homology. This article is intended for readers with different research backgrounds. We challenge their traditional approaches, inviting them to consider the proposed framework and offering them a new perspective for their own research.},
+	number = {5},
+	urldate = {2024-09-26},
+	journal = {Systematic Biology},
+	author = {Ochoterena, Helga and Vrijdaghs, Alexander and Smets, Erik and Claßen-Bockhoff, Regine},
+	month = sep,
+	year = {2019},
+	pmid = {30796841},
+	pmcid = {PMC6701455},
+	pages = {767--780},
+}
+
+@article{sandall_globally_2023,
+	title = {A globally integrated structure of taxonomy to support biodiversity science and conservation},
+	volume = {38},
+	issn = {0169-5347},
+	url = {https://www.sciencedirect.com/science/article/pii/S016953472300215X},
+	doi = {10.1016/j.tree.2023.08.004},
+	abstract = {All aspects of biodiversity research, from taxonomy to conservation, rely on data associated with species names. Effective integration of names across multiple fields is paramount and depends on the coordination and organization of taxonomic data. We assess current efforts and find that even key applications for well-studied taxa still lack commonality in taxonomic information required for integration. We identify essential taxonomic elements from our interoperability assessment to support improved access and integration of taxonomic data. A stronger focus on these elements has the potential to involve taxonomic communities in biodiversity science and overcome broken linkages currently limiting research capacity. We encourage a community effort to democratize taxonomic expertise and language in order to facilitate maximum interoperability and integration.},
+	number = {12},
+	urldate = {2024-09-26},
+	journal = {Trends in Ecology \& Evolution},
+	author = {Sandall, Emily L. and Maureaud, Aurore A. and Guralnick, Robert and McGeoch, Melodie A. and Sica, Yanina V. and Rogan, Matthew S. and Booher, Douglas B. and Edwards, Robert and Franz, Nico and Ingenloff, Kate and Lucas, Maisha and Marsh, Charles J. and McGowan, Jennifer and Pinkert, Stefan and Ranipeta, Ajay and Uetz, Peter and Wieczorek, John and Jetz, Walter},
+	month = dec,
+	year = {2023},
+	keywords = {biodiversity conservation, data linkage, integrative science, social infrastructure, taxonomic backbone},
+	pages = {1143--1153}
+}
+
+@incollection{celko_chapter_2004,
+	address = {San Francisco},
+	series = {The {Morgan} {Kaufmann} {Series} in {Data} {Management} {Systems}},
+	title = {Chapter 4 - {Nested} {Set} {Model} of {Hierarchies}},
+	isbn = {978-1-55860-920-4},
+	url = {https://www.sciencedirect.com/science/article/pii/B9781558609204500052},
+	abstract = {Trees are often drawn as boxes-and-arrows charts that tend to fix the mental image of a tree into a graph structure. Another way of representing trees is to show them as nested sets. The chapter highlights this approach. To show a tree as nested sets, replace the boxes with ovals and then nest subordinate ovals inside their parents. Containment represents subordination. The root will be the largest oval and will contain every other node. The leaf nodes will be the innermost ovals, with nothing else inside them, and the nesting will show the hierarchical relationship. This is a natural way to model a parts explosion because a final assembly is made of physically nested assemblies that finally break down into separate parts. This approach is used to model a tree with nested sets with number pairs that always contain the pairs of their subordinates so that a child node is within the bounds of its parent. The chapter uses this approach of representation of a tree graph to present techniques to perform operations, such as finding root and leaf nodes, finding subtrees, finding levels and paths in a tree, finding the height of a tree, deleting nodes and subtrees, closing gaps in the tree, using summary functions on trees, and inserting and updating trees. The chapter also provides techniques to convert nested sets into adjacent list models, and compare nodes and structures of these models. All of these techniques are explained using simple SQL codes maintaining an organizational chart table to represent the hierarchy and people of a sample organization.},
+	urldate = {2024-09-26},
+	booktitle = {Joe {Celko}'s {Trees} and {Hierarchies} in {SQL} for {Smarties}},
+	publisher = {Morgan Kaufmann},
+	author = {Celko, Joe},
+	editor = {Celko, Joe},
+	month = jan,
+	year = {2004},
+	doi = {10.1016/B978-155860920-4/50005-2},
+	pages = {45--99},
+}
+
+@misc{anytree,
+	author = {{Cofe Code and contributors}},
+	title = {Anytree: Python tree data library},
+	year = {2024},
+	publisher = {GitHub},
+	journal = {GitHub repository},
+	url = {https://github.com/c0fec0de/anytree}
+}
+
+@misc{bigtree,
+	author = {{Kay Jan W. and contributors}},
+	title = {BigTree: Tree Implementation and Methods for Python, integrated with list, dictionary, pandas and polars DataFrame.},
+	year = {2024},
+	publisher = {GitHub},
+	journal = {GitHub repository},
+	url = {https://github.com/kayjan/bigtree}
+}
+
+@article{madeira_2024,
+    author = {Madeira, Fábio and Madhusoodanan, Nandana and Lee, Joonheung and Eusebi, Alberto and Niewielska, Ania and Tivey, Adrian R N and Lopez, Rodrigo and Butcher, Sarah},
+    title = "{The EMBL-EBI Job Dispatcher sequence analysis tools framework in 2024}",
+    journal = {Nucleic Acids Research},
+    volume = {52},
+    number = {W1},
+    pages = {W521--W525},
+    year = {2024},
+    month = {04},
+    abstract = "{The EMBL-EBI Job Dispatcher sequence analysis tools framework (https://www.ebi.ac.uk/jdispatcher) enables the scientific community to perform a diverse range of sequence analyses using popular bioinformatics applications. Free access to the tools and required sequence datasets is provided through user-friendly web applications, as well as via RESTful and SOAP-based APIs. These are integrated into popular EMBL-EBI resources such as UniProt, InterPro, ENA and Ensembl Genomes. This paper overviews recent improvements to Job Dispatcher, including its brand new website and documentation, enhanced visualisations, improved job management, and a rising trend of user reliance on the service from low- and middle-income regions.}",
+    issn = {0305-1048},
+    doi = {10.1093/nar/gkae241},
+    url = {https://doi.org/10.1093/nar/gkae241},
+    eprint = {https://academic.oup.com/nar/article-pdf/52/W1/W521/58436149/gkae241.pdf},
+}