Skip to content

Commit

Permalink
Add module GTDB-Tk: a software toolkit for assigning objective taxono…
Browse files Browse the repository at this point in the history
…mic classifications to bacterial and archaeal genomes (MultiQC#2970)

* GTDBTK module minimum run

* parsing and plotting a table

* added defaults to prevent crashes if there is no classification

* removed redundant column

* cleaned up the table

* formatting and docstrings

* importing TableConfig

* allow for multiple log files

* fix no sample found error

* filter None

* Refactor

* Do not strip leading spaces to assure same number of columns

* Hide long classifications and method by default

* Lint

---------

Co-authored-by: Vlad Savelyev <[email protected]>
  • Loading branch information
harper357 and vladsavelyev authored Dec 12, 2024
1 parent 0c8e447 commit fa8f476
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 3 deletions.
1 change: 1 addition & 0 deletions multiqc/config_defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,7 @@ module_order:
- truvari
- megahit
- ganon
- gtdbtk
# Alignment tool stats
- bbmap
- bismark
Expand Down
4 changes: 2 additions & 2 deletions multiqc/modules/checkm/checkm.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def parse_file(self, f, data_by_sample):

# Check which format the data is in so we can grab the correct columns later
column_names = []
cols = re.split(r"\t| {3,}", header.rstrip())
cols = re.split(r"\t| {3,}", header.rstrip("\n"))
format_different_column = cols[5]
if format_different_column == "0":
column_names = column_names_format_1
Expand All @@ -123,7 +123,7 @@ def parse_file(self, f, data_by_sample):
return

for line in lines[1:]:
row = re.split(r"\t| {3,}", line.rstrip())
row = re.split(r"\t| {3,}", line.rstrip("\n"))
sname = row[0]
if sname in data_by_sample:
log.debug(f"Duplicate sample name found! Overwriting: {sname}")
Expand Down
2 changes: 1 addition & 1 deletion multiqc/modules/checkm2/checkm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def parse_file(self, f, data_by_sample):
log.warning(f"Skipping file {f['fn']} because it has no data")
return
for line in lines[1:]:
row = line.rstrip().split("\t")
row = line.rstrip("\n").split("\t")
if len(row) != len(column_names):
log.warning(f"Skipping line {line} because it has {len(row)} columns instead of {len(column_names)}")
continue
Expand Down
3 changes: 3 additions & 0 deletions multiqc/modules/gtdbtk/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .gtdbtk import MultiqcModule

__all__ = ["MultiqcModule"]
166 changes: 166 additions & 0 deletions multiqc/modules/gtdbtk/gtdbtk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import logging

from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
from multiqc.plots import table
from multiqc.plots.table_object import TableConfig

log = logging.getLogger(__name__)


class MultiqcModule(BaseMultiqcModule):
"""
The module parses `summary.tsv` outputs from GTDB-Tk's `classify.py` and `classify_wf.py`.
The module only works for version >= 2.4.0 because column names changed.
`classify.py` and `classify_wf.py` are used to determine the taxonomic classification of input genomes.
"""

def __init__(self):
super(MultiqcModule, self).__init__(
name="GTDB-Tk",
anchor="gtdbtk",
href="https://ecogenomics.github.io/GTDBTk/index.html",
info="Toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes.",
doi=["10.1093/bioinformatics/btac672"],
)

data_by_sample = {}
for f in self.find_log_files("gtdbtk"):
self.parse_file(f, data_by_sample)
self.add_data_source(f)

data_by_sample = self.ignore_samples(data_by_sample)
if len(data_by_sample) == 0:
raise ModuleNoSamplesFound
log.info(f"Found {len(data_by_sample)} reports")

# Superfluous function call to confirm that it is used in this module
# Replace None with actual version if it is available
self.add_software_version()

self.write_data_file(data_by_sample, "multiqc_gtdbtk")

self.closest_taxa_table(data_by_sample)

def parse_file(self, f, data_by_sample):
"""Parse the summary.tsv outputs."""
column_names = (
"user_genome",
"classification",
"closest_genome_reference",
"closest_genome_reference_radius",
"closest_genome_taxonomy",
"closest_genome_ani",
"closest_genome_af",
"closest_placement_reference",
"closest_placement_radius",
"closest_placement_taxonomy",
"closest_placement_ani",
"closest_placement_af",
"pplacer_taxonomy",
"classification_method",
"note",
"other_related_references",
"msa_percent",
"translation_table",
"red_value",
"warnings",
)
lines = f["f"].splitlines()
if len(lines) <= 1:
log.warning(f"Skipping file {f['fn']} because it has no data")
return
for line in lines[1:]:
row = line.rstrip("\n").split("\t")
if len(row) != len(column_names):
log.warning(f"Skipping line {line} because it has {len(row)} columns instead of {len(column_names)}")
continue
sname = row[0]
if sname in data_by_sample:
log.debug(f"Duplicate sample name found! Overwriting: {sname}")
data_by_sample[sname] = {k: v for k, v in zip(column_names[1:], row[1:]) if v}

def closest_taxa_table(self, data_by_sample):
"""Add a table showing the closest taxa for each query genome."""
classication_method_translate_dict = {
"taxonomic classification defined by topology and ANI": ["closest_genome_ani", "closest_genome_af"],
"ani_screen": ["closest_genome_ani", "closest_genome_af"],
"taxonomic classification fully defined by topology": [
"closest_placement_ani",
"closest_placement_af",
],
"taxonomic novelty determined using RED": [None, None],
}
table_data = {}
for sname, data in data_by_sample.items():
classification_value_columns = classication_method_translate_dict.get(
data.get("classification_method"), [None, None]
)
classification = data.get("classification", "").split(";")
last_classification = classification[-1]
table_data[sname] = {
"classification": last_classification,
"full_classification": "; ".join(classification),
"classification_method": data.get("classification_method", None),
"ANI": data.get(classification_value_columns[0], None),
"AF": data.get(classification_value_columns[1], None),
"red_value": data.get("red_value", None),
"note": data.get("note"),
"warnings": data.get("warnings"),
}
headers = {
"classification": {
"title": "Classification",
"description": "GTDB taxonomy string inferred by the GTDB-Tk.",
},
"full_classification": {
"title": "Full classification",
"description": "Full GTDB taxonomy string inferred by the GTDB-Tk.",
"hidden": True,
},
"classification_method": {
"title": "Classification method",
"description": "Indicates the rule used to classify the genome.",
"hidden": True,
},
"ANI": {
"title": "ANI to closest genome",
"description": "Depending on the classification method, either the 'closest_genome_ani' or 'closest_placement_ani'.",
"min": 0,
"max": 100,
},
"AF": {
"title": "AF to closest genome",
"description": "Depending on the classification method, either the 'closest_genome_af' or 'closest_placement_af'.",
"min": 0,
"max": 1,
"scale": "Purples",
},
"red_value": {
"title": "RED",
"description": "Indicates the relative evolutionary divergence (RED) for a query genome. RED is not calculated when a query genome can be classified based on ANI.",
"min": 0,
"max": 1,
},
"warnings": {
"title": "Warnings",
"description": "Indicates unusual characteristics of the query genome that may impact the taxonomic assignment.",
},
"note": {
"title": "Notes",
"description": "Provides additional information regarding the classification of the genome.",
},
}
pconfig = TableConfig(
title="Taxonomy classifications",
id="gtdbtk-first-table",
col1_header="User genome",
)
self.add_section(
name="MAG taxonomy",
anchor="gtdbtk-taxonomy",
description="The taxonomy of a MAG as found by GTDB.",
helptext="GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. It is designed to work with recent advances that allow hundreds or thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also be applied to isolate and single-cell genomes. ",
plot=table.plot(data=table_data, headers=headers, pconfig=pconfig),
)
3 changes: 3 additions & 0 deletions multiqc/search_patterns.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,9 @@ goleft_indexcov/ped:
fn: "*-indexcov.ped"
gopeaks:
fn: "*_gopeaks.json"
gtdbtk:
contents: "user_genome classification closest_genome_reference closest_genome_reference_radius closest_genome_taxonomy closest_genome_ani"
num_lines: 10
haplocheck:
contents: '"Sample" "Contamination Status" "Contamination Level" "Distance" "Sample Coverage"'
num_lines: 10
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ gatk = "multiqc.modules.gatk:MultiqcModule"
glimpse = "multiqc.modules.glimpse:MultiqcModule"
goleft_indexcov = "multiqc.modules.goleft_indexcov:MultiqcModule"
gopeaks = "multiqc.modules.gopeaks:MultiqcModule"
gtdbtk = "multiqc.modules.gtdbtk:MultiqcModule"
haplocheck = "multiqc.modules.haplocheck:MultiqcModule"
happy = "multiqc.modules.happy:MultiqcModule"
hicexplorer = "multiqc.modules.hicexplorer:MultiqcModule"
Expand Down

0 comments on commit fa8f476

Please sign in to comment.