Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: expand predict-genes-prodigal to take SampleData[Contigs] as input #236

Merged
merged 8 commits into from
Feb 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion q2_annotate/eggnog/tests/test_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def test_fetch_ncbi_taxonomy(self, mock_os_rm, mock_run, mock_md5):
]

# Check that commands are run as expected
mock_os_rm.assert_called_once_with(zip_path)
mock_os_rm.assert_any_call(zip_path)
mock_run.assert_has_calls(
expected_calls,
any_order=False
Expand Down
22 changes: 11 additions & 11 deletions q2_annotate/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1208,10 +1208,10 @@
plugin.methods.register_function(
function=q2_annotate.prodigal.predict_genes_prodigal,
inputs={
'mags': FeatureData[MAG] | SampleData[MAGs]
'sequences': FeatureData[MAG] | SampleData[MAGs] | SampleData[Contigs]
},
input_descriptions={
'mags': 'MAGs for which one wishes to predict genes.'
'sequences': 'MAGs or contigs for which one wishes to predict genes.'
},
parameters={
"translation_table_number": Str % Choices([
Expand All @@ -1234,15 +1234,15 @@
('proteins', GenomeData[Proteins])
],
output_descriptions={
'loci': "Gene coordinates files (one per MAG) listing the location of "
"each predicted gene as well as some additional scoring "
"information. ",
'genes': "Fasta files (one per MAG) with the nucleotide sequences of "
"the predicted genes.",
'proteins': "Fasta files (one per MAG) with the protein translation "
"of the predicted genes."
},
name='Predict gene sequences from MAGs using Prodigal.',
'loci': "Gene coordinates files (one per MAG or sample) listing the "
"location of each predicted gene as well as some additional "
"scoring information. ",
'genes': "Fasta files (one per MAG or sample) with the nucleotide "
"sequences of the predicted genes.",
'proteins': "Fasta files (one per MAG or sample) with the protein "
"translation of the predicted genes."
},
name='Predict gene sequences from MAGs or contigs using Prodigal.',
description="Prodigal (PROkaryotic DYnamic programming "
"Gene-finding ALgorithm), a gene prediction algorithm "
"designed for improved gene structure prediction, translation "
Expand Down
26 changes: 16 additions & 10 deletions q2_annotate/prodigal/prodigal.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,16 @@
from typing import Union
from .._utils import run_command
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt, ContigSequencesDirFmt
from q2_types.genome_data import (
LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat,
)


def predict_genes_prodigal(
mags: Union[MAGSequencesDirFmt, MultiMAGSequencesDirFmt],
sequences: Union[
MAGSequencesDirFmt, MultiMAGSequencesDirFmt, ContigSequencesDirFmt
],
translation_table_number: str = "11",
) -> (LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat):

Expand All @@ -33,7 +35,7 @@ def predict_genes_prodigal(
"-f", "gff"
]

def _run_prodigal(path_to_input: str, mag_id: str, subdir: str = None):
def _run_prodigal(path_to_input: str, _id: str, subdir: str = None):
# If a subdirectory is given, append a "/" so that the output paths in
# the command below are built correctly; otherwise use an empty string.
subdir = subdir + "/" if subdir else ""
Expand All @@ -42,18 +44,22 @@ def _run_prodigal(path_to_input: str, mag_id: str, subdir: str = None):
cmd = cp.deepcopy(base_cmd)
cmd.extend([
"-i", path_to_input,
"-o", os.path.join(loci.path, f"{subdir}{mag_id}.gff"),
"-a", os.path.join(proteins.path, f"{subdir}{mag_id}.fasta"),
"-d", os.path.join(genes.path, f"{subdir}{mag_id}.fasta")
"-o", os.path.join(loci.path, f"{subdir}{_id}.gff"),
"-a", os.path.join(proteins.path, f"{subdir}{_id}.fasta"),
"-d", os.path.join(genes.path, f"{subdir}{_id}.fasta")
])
run_command(cmd)

if isinstance(mags, MAGSequencesDirFmt):
for mag_id, mag_fp in mags.feature_dict().items():
if isinstance(sequences, MAGSequencesDirFmt):
for mag_id, mag_fp in sequences.feature_dict().items():
_run_prodigal(mag_fp, mag_id)

elif isinstance(mags, MultiMAGSequencesDirFmt):
for sample_id, mags_dict in mags.sample_dict().items():
elif isinstance(sequences, ContigSequencesDirFmt):
for sample_id, contigs_fp in sequences.sample_dict().items():
_run_prodigal(contigs_fp, sample_id)

elif isinstance(sequences, MultiMAGSequencesDirFmt):
for sample_id, mags_dict in sequences.sample_dict().items():
# Make sample_id folders in output locations
for output_object in [loci, genes, proteins]:
os.makedirs(os.path.join(output_object.path, sample_id))
Expand Down
30 changes: 23 additions & 7 deletions q2_annotate/prodigal/tests/test_prodigal.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from q2_annotate.prodigal.prodigal import predict_genes_prodigal
from qiime2.plugin.testing import TestPluginBase
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt, ContigSequencesDirFmt
from unittest.mock import patch, call
from q2_types.genome_data import (
LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat,
Expand All @@ -22,9 +22,9 @@ class TestBUSCO(TestPluginBase):
@patch("subprocess.run")
def test_run_prodigal_feature_data_1_mag(self, subp_run):
# Run prodigal with dummy data
p = self.get_data_path("dir_with_1_mag")
p = self.get_data_path("mags/dir_with_1_mag")
mags = MAGSequencesDirFmt(path=p, mode="r")
loci, genes, proteins = predict_genes_prodigal(mags=mags)
loci, genes, proteins = predict_genes_prodigal(sequences=mags)

# Check that output is correct type
self.assertIsInstance(loci, LociDirectoryFormat)
Expand Down Expand Up @@ -55,9 +55,9 @@ def test_run_prodigal_feature_data_1_mag(self, subp_run):
@patch("subprocess.run")
def test_run_prodigal_feature_data_3_mag(self, subp_run):
# Run prodigal with dummy data
p = self.get_data_path("dir_with_3_mag")
p = self.get_data_path("mags/dir_with_3_mag")
mags = MAGSequencesDirFmt(path=p, mode="r")
loci, genes, proteins = predict_genes_prodigal(mags=mags)
loci, genes, proteins = predict_genes_prodigal(sequences=mags)

# Check that output is correct type
self.assertIsInstance(loci, LociDirectoryFormat)
Expand Down Expand Up @@ -88,9 +88,9 @@ def test_run_prodigal_feature_data_3_mag(self, subp_run):

@patch("subprocess.run")
def test_run_prodigal_sample_data(self, subp_run):
p = self.get_data_path("")
p = self.get_data_path("mags")
mags = MultiMAGSequencesDirFmt(path=p, mode="r")
loci, genes, prot = predict_genes_prodigal(mags=mags)
loci, genes, prot = predict_genes_prodigal(sequences=mags)

# Check that output is correct type
self.assertIsInstance(loci, LociDirectoryFormat)
Expand All @@ -117,3 +117,19 @@ def test_run_prodigal_sample_data(self, subp_run):

# Assert that the expected calls were made (in any order)
subp_run.assert_has_calls(calls, any_order=True)

@patch("subprocess.run")
def test_run_prodigal_contigs(self, subp_run):
    """Check gene prediction on per-sample contigs.

    With ``subprocess.run`` patched out, ``predict_genes_prodigal`` is
    expected to return the three genome-data directory formats and to
    issue exactly one prodigal command for the ``sample1`` contigs file,
    writing flat (non-nested) outputs named after the sample ID.
    """
    # Run prodigal with dummy data
    p = self.get_data_path("contigs")
    contigs = ContigSequencesDirFmt(p, mode="r")
    loci, genes, prot = predict_genes_prodigal(sequences=contigs)

    # Check that output is correct type (mirrors the MAG-based tests)
    self.assertIsInstance(loci, LociDirectoryFormat)
    self.assertIsInstance(genes, GenesDirectoryFormat)
    self.assertIsInstance(prot, ProteinsDirectoryFormat)

    # Exactly one call is expected; outputs are named after the sample ID,
    # not the contigs file name.
    expected_cmd = [
        "prodigal",
        "-g", "11",
        "-f", "gff",
        "-i", os.path.join(contigs.path, "sample1_contigs.fasta"),
        "-o", os.path.join(loci.path, "sample1.gff"),
        "-a", os.path.join(prot.path, "sample1.fasta"),
        "-d", os.path.join(genes.path, "sample1.fasta"),
    ]
    subp_run.assert_called_once_with(expected_cmd, check=True)
Loading