Skip to content

Commit

Permalink
ENH: initial draft of get-gtdb (#154)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikerobeson authored Mar 7, 2023
1 parent 2ff974b commit fcfb9fe
Show file tree
Hide file tree
Showing 8 changed files with 338 additions and 1 deletion.
26 changes: 26 additions & 0 deletions rescript/citations.bib
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,29 @@ @article{benson2012genbank
year={2012},
publisher={Oxford University Press}
}

@article{Parks2020gtdb,
title={A complete domain-to-species taxonomy for Bacteria and Archaea.},
author={Parks, Donovan H. and Chuvochina, Maria and Chaumeil, Pierre-Alain and Rinke, Christian and Mussig, Aaron J. and Hugenholtz, Philip},
journal={Nature Biotechnology},
volume={38},
pages={1079-1086},
year={2020},
doi={10.1038/s41587-020-0501-8},
url={https://doi.org/10.1038/s41587-020-0501-8}
}

@article{Parks2021gtdb,
author = {Parks, Donovan H and Chuvochina, Maria and Rinke, Christian and Mussig, Aaron J and Chaumeil, Pierre-Alain and Hugenholtz, Philip},
title = "{GTDB: an ongoing census of bacterial and archaeal diversity through a phylogenetically consistent, rank normalized and complete genome-based taxonomy}",
journal = {Nucleic Acids Research},
volume = {50},
number = {D1},
pages = {D785-D794},
year = {2021},
month = {09},
issn = {0305-1048},
doi = {10.1093/nar/gkab776},
url = {https://doi.org/10.1093/nar/gkab776},
eprint = {https://academic.oup.com/nar/article-pdf/50/D1/D785/42058271/gkab776\_supplemental\_file.pdf},
}
143 changes: 143 additions & 0 deletions rescript/get_gtdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2019-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
import tempfile
import tarfile
import warnings

import qiime2
from urllib.request import urlretrieve
from collections import defaultdict
from urllib.error import HTTPError
from rescript.get_data import _gzip_decompress

# Maps each supported GTDB release to the per-domain file-name prefix that
# is substituted into the download URLs for that release.
# Different versions may have different file names for archaea and
# bacteria. For example, 'ar53' and 'bac120' mean that the GTDB phylogeny
# is based on 53 and 120 concatenated proteins (cp), respectively.
# If this changes we can set up a conditional statement below.
VERSION_MAP_DICT = {'207': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'202': {'Archaea': 'ar122', 'Bacteria': 'bac120'}}


def get_gtdb_data(ctx, version='207', domain='Both'):
    """Fetch GTDB taxonomy and sequence data for one release.

    Parameters
    ----------
    ctx
        Pipeline context; only used to look up the ``feature_table``
        ``merge_seqs`` and ``merge_taxa`` actions when both domains are
        requested.
    version : str
        Key into ``VERSION_MAP_DICT`` selecting the GTDB release.
    domain : str
        ``'Both'`` to fetch every domain listed for the release, otherwise
        a single domain key of ``VERSION_MAP_DICT[version]``.

    Returns
    -------
    tuple
        ``(gtdb_tax, gtdb_seqs)``: the taxonomy result followed by the
        sequence result.

    Raises
    ------
    KeyError
        If ``version`` or ``domain`` is not present in
        ``VERSION_MAP_DICT``.
    """
    fetch_both = domain == 'Both'

    # Always hand `_assemble_queries` the same {version: {domain: prefix}}
    # layout, narrowed to one domain when only one was requested.
    release_prefixes = VERSION_MAP_DICT[version]
    if fetch_both:
        selection = {version: release_prefixes}
    else:
        selection = {version: {domain: release_prefixes[domain]}}

    tax_q, seqs_q = _retrieve_data_from_gtdb(_assemble_queries(selection))

    if not fetch_both:
        # A merge of a single item would work, but it would add a
        # pointless merge step to the provenance, so the lone results are
        # returned untouched.
        gtdb_tax, gtdb_seqs = tax_q[0], seqs_q[0]
    else:
        merge_gtdb_seqs = ctx.get_action('feature_table', 'merge_seqs')
        merge_gtdb_taxonomy = ctx.get_action('feature_table', 'merge_taxa')
        print('\n Merging taxonomy data...')
        gtdb_tax, = merge_gtdb_taxonomy(data=tax_q)
        print('\n Merging sequence data...')
        gtdb_seqs, = merge_gtdb_seqs(data=seqs_q)

    print('\n Saving files...\n')
    return gtdb_tax, gtdb_seqs


def _assemble_queries(ver_dom_dict):
queries = defaultdict(lambda: list())

base_seq_url = ('https://data.gtdb.ecogenomic.org/releases/release{ver}/'
'{ver}.0/genomic_files_reps/{cp}_ssu_reps_r{ver}.tar.gz')
base_tax_url = ('https://data.gtdb.ecogenomic.org/releases/release{ver}/'
'{ver}.0/{cp}_taxonomy_r{ver}.tsv.gz')

for version, dcp in ver_dom_dict.items():
for dom, cp in dcp.items():

queries['Taxonomy'].append(
(dom,
base_tax_url.format(**{'ver': version, 'cp': cp}),
'FeatureData[Taxonomy]',
'HeaderlessTSVTaxonomyFormat'))

queries['Sequence'].append(
(dom,
base_seq_url.format(**{'ver': version, 'cp': cp}),
'FeatureData[Sequence]',
'DNAFASTAFormat'))
return queries


def _retrieve_data_from_gtdb(queries):
'''
Download data from gtdb, given a list of queries.
queries: {'Taxonomy':(domain, url, type, format), (...),
'Sequence':(domain, url, type, format), (...)}
'''

tax_results = []
seq_results = []

print('\nDownloading and processing raw files ... \n')

with tempfile.TemporaryDirectory() as tmpdirname:
for sttype, q_info in queries.items():
for domain, url, dtype, fmt in q_info:
print('Retrieving {0} for {1} from {2}'.format(
sttype, domain, url))
# grab url
bn = os.path.basename(url)
destination = os.path.join(tmpdirname, bn)
try:
urlretrieve(url, destination)
except HTTPError:
msg = ("Unable to retrieve the followng file from GTDB:\n "
+ url)
warnings.warn(msg, UserWarning)
# seq files are contained within `tar.gz`
if tarfile.is_tarfile(destination):
seq_results.append(get_tar_data(destination, bn,
tmpdirname, dtype, fmt))
else:
tax_results.append(get_gzipped_data(destination, bn,
dtype, fmt))
return tax_results, seq_results


def get_tar_data(tar_loc, base_fn, tmpdirname, dtype, fmt):
    """Extract the FASTA file from a downloaded tarball and import it.

    Parameters
    ----------
    tar_loc : str
        Path to the tar archive.
    base_fn : str
        File name of the archive. Everything before its first ``.`` plus
        ``.fna`` is the name of the member that gets extracted.
    tmpdirname : str
        Directory the member is extracted into.
    dtype, fmt
        Passed through to ``qiime2.Artifact.import_data`` as the type and
        view type.

    Returns
    -------
    The artifact returned by ``qiime2.Artifact.import_data``.

    Raises
    ------
    OSError
        If the archive is missing, is not a readable tar file, or does not
        contain the expected member.
    """
    untarred_fn = base_fn.split('.')[0] + '.fna'
    print(' Untarring {0}...\n'.format(base_fn))
    try:
        with tarfile.open(tar_loc, 'r') as tar:
            tar.extract(member=untarred_fn, path=tmpdirname)
    # tarfile.TarError (e.g. a corrupt archive) and KeyError (member not in
    # the archive) are not OSError subclasses, so they are listed
    # explicitly to be reported the same way.
    except (OSError, tarfile.TarError, KeyError) as err:
        raise OSError(('{0}: either does not exist or can not '
                       'be untarred!'.format(tar_loc))) from err
    # Imported outside the `try` so that an import failure is not
    # misreported as an untar failure.
    return qiime2.Artifact.import_data(
        dtype, os.path.join(tmpdirname, untarred_fn), fmt)


def get_gzipped_data(zipped_loc, base_fn, dtype, fmt):
    """Decompress a downloaded gzip file and import the result.

    Parameters
    ----------
    zipped_loc : str
        Path to the gzip-compressed file. The decompressed copy is written
        next to it, at the same path minus its final extension.
    base_fn : str
        File name of the compressed file; only used in the progress
        message.
    dtype, fmt
        Passed through to ``qiime2.Artifact.import_data`` as the type and
        view type.

    Returns
    -------
    The artifact returned by ``qiime2.Artifact.import_data``.

    Raises
    ------
    OSError
        If decompressing ``zipped_loc`` fails with an ``OSError``.
    """
    unzipped_destination = os.path.splitext(zipped_loc)[0]
    print(' Unzipping {0}...\n'.format(base_fn))
    try:
        _gzip_decompress(zipped_loc, unzipped_destination)
    except OSError as err:
        # Report the archive that could not be read, not the output path
        # that was never produced.
        raise OSError(('{0}: either does not exist or can not '
                       'be unzipped!'.format(zipped_loc))) from err
    # Imported outside the `try` so that an import failure is not
    # misreported as a decompression failure.
    return qiime2.Artifact.import_data(dtype, unzipped_destination, fmt)
39 changes: 38 additions & 1 deletion rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
from rescript.types.methods import reverse_transcribe
from rescript.ncbi import (
get_ncbi_data, _default_ranks, _allowed_ranks, get_ncbi_data_protein)

from .get_gtdb import get_gtdb_data

citations = Citations.load('citations.bib', package='rescript')

Expand All @@ -68,6 +68,12 @@
'and be aware that earlier versions may be released under a different '
'license.')

# Notice appended to the description of the action that downloads data
# from GTDB; it points users to GTDB's "about" page.
GTDB_LICENSE_NOTE = (
'NOTE: THIS ACTION ACQUIRES DATA FROM GTDB. SEE '
'https://gtdb.ecogenomic.org/about FOR MORE INFORMATION '
'and be aware that earlier versions may be released under a different '
'license.')

VOLATILITY_PLOT_XAXIS_INTERPRETATION = (
'The x-axis in these plots represents the taxonomic '
'levels present in the input taxonomies so are labeled numerically '
Expand Down Expand Up @@ -892,6 +898,37 @@
)


# Register `get_gtdb_data` as a pipeline. It takes no input artifacts, only
# the `version` and `domain` parameters (each restricted to a fixed set of
# choices), and produces a taxonomy and a sequence artifact.
plugin.pipelines.register_function(
function=get_gtdb_data,
inputs={},
parameters={
'version': Str % Choices(['202', '207']),
'domain': Str % Choices(['Both', 'Bacteria', 'Archaea']),
},
outputs=[('gtdb_taxonomy', FeatureData[Taxonomy]),
('gtdb_sequences', FeatureData[Sequence])],
input_descriptions={},
parameter_descriptions={
'version': 'GTDB database version to download.',
'domain': 'Sequence and taxonomy data to download from a given '
'microbial domain from GTDB. \'Both\' will fetch both '
'bacterial and archaeal data. \'Bacteria\' will only '
'fetch bacterial data. \'Archaea\' will only fetch '
'archaeal data.'},
output_descriptions={
'gtdb_taxonomy': 'GTDB reference taxonomy.',
'gtdb_sequences': 'GTDB reference sequences.'},
name='Download, parse, and import GTDB reference data.',
description=(
'Download, parse, and import GTDB files, given a version '
'number. Downloads data directly from GTDB, '
'parses the taxonomy files, and outputs ready-to-use sequence and '
'taxonomy artifacts. REQUIRES STABLE INTERNET CONNECTION. ' +
# The shared GTDB notice is appended to the end of the description.
GTDB_LICENSE_NOTE),
# Both GTDB publications are attached as citations for this action.
citations=[citations['Parks2020gtdb'], citations['Parks2021gtdb']]
)


plugin.methods.register_function(
function=filter_taxa,
inputs={'taxonomy': FeatureData[Taxonomy]},
Expand Down
4 changes: 4 additions & 0 deletions rescript/tests/data/gtdb-seqs-archaea.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>GB_GCA_000008085.1
CCCGTTGATCCTGCGGGAGGCCACCGCTATCTCCGTCCGGCTAACCCATGGAAGGCGAGGGTCCCCGGGTAAGGGGGCCCGCCGCACGGCTGAGTAACACGTCGGTAACCTACCCTCGGGACGGGGATAACCCCGGGAAACTGGGGCTAATCCCCGATAGGGGATGGGTGCTGGAAGGCCCCATCCCCGAGAGGGGCTAGCGGTACTTCCCCCGCTAGCCCGCCCGAGGATGGGCCGGCGGCCCATCAGGTAGTTGGCGGGGTAATGGCCCGCCAAGCCGAAGACGGGTAGGGGCCGTGAGAGCGGGAGCCCCCAGATCGGCACTGAGACAAGGGCCGAGGCCCTACGGGGCGCACCAGGGGCGAAACCTCCGCAATGCGGGAAACCGTGACGGGGGGACGGAGAGTGCCGGAGGGCGTTATGCTCTCCGGCTTTTGGGGAGTGTAAGTAGCTCCCCGAATAAGCGGTGGGCAAGAGGGGTGGCAGCCGCCGCGGGAACACCCCCACCGCGAGCGGTGGCCGTGATTATTGGGCCTAAAGGGGCCGTAGCCGGGCCGGTGTGGCTCCGGTGAAATCCTCGGGCTCAACCCGAGGGCGCGCCGGAGCTACTACCGGCCTAGGGACCGGGAGGGGCCGACCGTACTCCCGGGGGAGCGGTGAAATGCTGTAATCCCGGGAGGACGACCCGTGGCGAAAGCGGTCGGCCAGAACGGGTCCGACGGTGAGGGCCGAAGGCCGGGGGCTAGAACGGGATTAGAGACCCCGGTATTCCCGGCTGTCAACGCTGCGGGCTACCTGCTGGGCGGGCTACGAGCCCGCCCAGTGGGGTAGGGAAGCCGTTAAGCCCGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATAGGCGGGGGAGCACACAAGAGGTGGGGTGCGCGGTTTAATTGGATTCGACGCCGGGAACCTCACCGGGGCTGACAGCACAATGATGGTCGGCCTGAAGGGCCTACCGGAGGCGCTGAGAGGAGGTGCATGGCCGCCGTCAGCCTGTGCCGTGAGGTGCCCTGTTAAGTCAGGAAACAGGCGAGACCCGCGCCCGCAGTTGCGACGGCCGAAAGGCCGGCACACTGCGGGGACTGCCGGGGAAACCCGGAGGAAGGTGCGGGCGACGGCAGGTATGCATGCCCCGAATGCCCCGGGCTACACGCGCGCATCAATGGGCGGGACAGGGGGCCGCGACCCCGAAAGGGGGAGCAAATCCCCAAACCCGCTCTCAGTCCAGATCGAGGGCTGCAACTCGCCCTCGTGACGGCGGAATCTCTAGTAGTCGGACGTCACCAGCGTCCGGCGAATACGTCCCTGCTCCTTGCACTCACCGCCCGTCAAGCCACCCGAGCTGGGGCCTAGCGAGGCCGTGGGGGGTTCGCCCCCCACGGTCGAGCTAGGCCCCGGCGAGGGGGGCTAAGTCGACACAAGGTAGCCGTAGGGGAACCTGCGGCTGGATCACCTCCTA
>GB_GCA_000016605.1
TCCGGTTGATCCTGCCGGACCCGATCGCTATAGGGGTAGGGCTAAGCCATGGGAGTCGTACGCTCTCGGGAAGAGGGCGTGGCGGACGGCTGAGTAACACGTGGCTAACCTGCCCTTGGGATCTGGATAACCCCGGGAAACTGGGGCTAATCCGGAGCGGGCAAGGGAATCTGGAATGATCTCTTGCCTAAAAGCCTCTCGGCTGATCCCGTCGAGAGGCGCCCAAGGATGGGGCTGCGGCCCATCAGGCTGTTGGGGGAGTAAAGGTCCCCCAAACCGATAACGGGTAGGGGCCGTGGGAGCGGGAGCCCCCAGTTGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCACCAGGCGCGGAACGTCCCCAATGCGGGAAACCGTGAGGGCGTTACCCCTAGTGCCCTCGCAAGAGGGCTTTTCTCCACTCCAGAAAGGTGGAGGAATAAGCGGGGGGCAAGACTGGTGTCAGCCGCCGCGGTAATACCAGCCCCGCGAGTGATCGGGACGTTTATTGGGCTTAAAGCGCCCGTAGCCGGCCTGTAAAGTCACCGTTTAAAGACCCGGGCTCAACTCGGGGAACGGCGGTGATACTTACAGGCTAGGGGGCGGGAGAGGTCGGAGGTACTCCCGGAGTAGGGGCGAAATCCTCAGATCCCGGGAGGACCACCAGTGGCGAAAGCGTCCGGCTAGAACGCGCCCGACGGTGAGGGGCGAAAGCCGGGGTAGCAAATAGGATTAGATACCCTAGTAGTCCCGGCTGTAAACGATGCAGGCTAGGTGTCGCGTAGGCTTTGTGCCTGCGCGGTGCCGCAGGAAAACTGGTAAGCCCGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGGGTGGAACCTGCGGCTCAATTGGAGTCAACGCCTGGAATCTCACCGGGGGAGACCGCAGGATGACGGCCAGGCTAACGACCTTGCCAGACTCGCGGAGAGGAGGTGCATGGCCGTCGCCAGCTCGTGTTGTGAAATGTCCGGTTAAGTCCGGCAACGAGCGAGACCCCCACTTCTAGTTGGTAACCGTCTCTCCGGAGACGGTCCACACTAGAAGGACTGCCGGTGTTAAACCGGAGGAAGGAGGGGGCCACGGCAGGTCAGCATGCCCCGAAACTTCCGGGCCGCACGCGGGTTACAATGGCAGGGACAGCGGGATCCGACCCCGAGAGGGGAAGGCAATCCCACAAACCCTGCCTCAGTTGGGATCGAGGGCTGAAACTCGCCCTCGTGAACGAGGAATCCCTAGTAACCGCGGGTCAACAACCCGCGGTGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCGCTCCACCCGAGTGGAGGGGAAGTGAGGCCTCTTGCCCCTCGGGGTGGGAGGTCGAGCTTCTCCTCCGCGAGGGGGGAGAAGTCGTAACAAGGTAGCCGTAGGGGAACCTGCGGCTGGATCACCTCA
4 changes: 4 additions & 0 deletions rescript/tests/data/gtdb-seqs-bacteria.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>GB_GCA_000007325.1
GAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAAGTCTACTTGAATTTGGGTTTTTTAACTTCGATTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTATAGGGCATCCTAGAATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGAACGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGACCGAGAGTCTGATCCAGCAATTCTGTGTGCACGATGACGTTTTTCGGAATGTAAAGTGCTTTCAGTTGGGAAGAAAAAAATGACGGTACCAACAGAAGAAGTGACGGCTAAATACGTGCCAGCAGCCGCGGTAATACGTATGTCACGAGCGTTATCCGGATTTATTGGGCGTAAAGCGCGTCTAGGTGGTTATGTAAGTCTGATGTGAAAATGCAGGGCTCAACTCTGTATTGCGTTGGAAACTGTGTAACTAGAGTACTGGAGAGGTAAGCGGAACTACAAGTGTAGAGGTGAAATTCGTAGATATTTGTAGGAATGCCGATGGGGAAGCCAGCTTACTGGACAGATACTGACGCTGAAGCGCGAAAGCGTGGGTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTTGGGGGTCGAACCTCAGCGCCCAAGCAAACGCGATAAGTAATCCGCCTGGGGAGTACGTACGCAAGTATGAAACTCAAAGGAATTGACGGGGACCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGACGCAACGCGAGGAACCTTACCAGCGTTTGACATCTTAGGAATGAGACAGAGATGTTTCAGTGTCCCTTCGGGGAAACCTAAAGACAGGTGGTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTTTCGTATGTTACCATCATTAAGTTGGGGACTCATGCGATACTGCCTACGATGAGTAGGAGGAAGGTGGGGATGACGTCAAGTCATCATGCCCCTTATACGCTGGGCTACACACGTGCTACAATGGGTAGAACAGAGAGTTGCAAAGCCGTGAGGTGGAGCTAATCTCAGAAAACTATTCTTAGTTCGGATTGTACTCTGCAACTCGAGTACATGAAGTTGGAATCGCTAGTAATCGCGAATCAGCAATGTCGCGGTGAATACGTTCTCGGGTCTTGTACACACCGCCCGTCACACCACGAGAGTTGGTTGCACCTGAAGTAGCAGGCCTAACCGTAAGGAGGGATGTTCCGAGGGTGTGATTAGCGATTGGGGTGAAGTCGTAACAAGGTATCCGTACGGGAACGTGCGGATGGATCACCTCCTTTC
>GB_GCA_000008885.1
GGAGGTGATCCAACCACAGGTTCCCCTACGGTTACCTTGTTACGACTTCACCCCAGTTATGAACCACAAAGTGGTAAGCGCCCTCCGAAAGGTTAAGCTACCTGCTTCTTTTGCAGCTCACTTCCATGGTGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCATGCTGATCCGCGATTACTAGCGATTCCGACTTCATGGAGTCGAGTTGCAGACTCCAATCCGGACTAAGACGTACTTTATGAGATTAGCTTACTTTCGCAAGTTTGCTGCCCTTTGTATACGCCATTGTAGCACGTGTGTAGCCCTACTCGTAAGGGCCATGATGACTTGACGTCATCCCCACCTTCCTCCGGTTTATCACCGGCAGTCTCCTTTGAGTTCCCGACTTTTTCGCTGGCAAAAAAGGATAGGGGTTGCGCTCGTTGCGGGACTTAACCCAACATTTCACAACACGAGCTGACGACAGCCATGCAGCACCTGTTTTAAAGCTCCCGAAGGCACTAAAGCATCTCTGCTAAATTCTTTAAATGTCAAGAGTAGGTAAGGTTTTTCGCGTTGCATCGAATTAAACCACATGCTCCACCGCTTGTGCGGGCCCCCGTCAATTCATTTGAGTTTTAACCTTGCGATTGTACTCCCCAGGCGGTCGATTTAACGCGTTAGCTTCGAAAACCCCGAGTAAACTCGCAACCTTCAAATCGACATCGTTTACGGCATGGACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACATGAGCGTCAGTTTTCGCCCAGGAGGCCGCCTTCGCCGCTGGTATTCCTCCAGATATCTACGCATTTCACCGCTACACCTGGAATTCTACCTCCCTCTACGATACTCTAGTTTATTAGTTTCAAATGCAGTTCCTAGGTTGAGCCTAGGGATTTCACATCTGACTTAATAAACCGCCTGCGTACGCTTTACGCCCAGTAATTCCGATTAACGCTTGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCTTCTGCAAGTAACGTCACATAAATATGGTATTATCACATTTACTTTCTTCCCTGCTGAAAGTGCTTTACAATCCGAAGACCTTCTTCACACACGCGGCATAGCTGCATCAGGGTTTCCCCCATTGTGCAATATTCCCCACTGCTGCCTCCCGTAGGAGTCTGGACCGTATCTCAGTTCCAGTGTGGCTGGTTATCCTCTCAGACCAGCTAGAGATCGTAGCCTAGGTGAGCATTTACCTCACCTACTAGCTAATCTCATCTGGGTTCATCTAAAAACGCAAGGCTGATATAAAGTATTATATTAGTCCCCTGCTTTGATCTTTCGATATTATGCGGTATTAGCTACCGTTTCCAGTAGTTGTCCCCCTTTTTTAGGCAGATCCCCAGACATTACTCACCCGTTCGCCGCTCGCCGTCAAAGAAAAATCTCTACGCTGCCGCACGACTTGCATGTGTTAGGCTTGCCGCTAGCGTTCAATCTGAGCCATGATCAAACTCTTCAAT
3 changes: 3 additions & 0 deletions rescript/tests/data/gtdb-taxa-archaea.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Feature ID Taxon
GB_GCA_000008085.1 d__Archaea;p__Nanoarchaeota;c__Nanoarchaeia;o__Nanoarchaeales;f__Nanoarchaeaceae;g__Nanoarchaeum;s__Nanoarchaeum equitans
GB_GCA_000016605.1 d__Archaea;p__Thermoproteota;c__Thermoproteia;o__Sulfolobales;f__Sulfolobaceae;g__Metallosphaera;s__Metallosphaera sedula
3 changes: 3 additions & 0 deletions rescript/tests/data/gtdb-taxa-bacteria.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Feature ID Taxon
GB_GCA_000007325.1 d__Bacteria;p__Fusobacteriota;c__Fusobacteriia;o__Fusobacteriales;f__Fusobacteriaceae;g__Fusobacterium;s__Fusobacterium nucleatum
GB_GCA_000008885.1 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales_A;f__Enterobacteriaceae_A;g__Wigglesworthia;s__Wigglesworthia glossinidia_A
Loading

0 comments on commit fcfb9fe

Please sign in to comment.